From 138ecf24aa3a6cc5b64a2a38f5ccfb33cc4aae98 Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Fri, 21 Aug 2020 15:21:49 +0800 Subject: [PATCH] support Baidu Kunlun AI Accelerator (#25959) * support Baidu AI Accelerator * test=kunlun * minor * test=kunlun * support xpu op in separate file * test=kunlun * update XPU error message and remove duplicated code * test=kunlun * minor * test=kunlun * minor * test=kunlun --- CMakeLists.txt | 5 +- cmake/configure.cmake | 5 + cmake/external/xpu.cmake | 54 +++++ cmake/operators.cmake | 18 +- cmake/third_party.cmake | 5 + paddle/fluid/framework/dlpack_tensor.cc | 5 + paddle/fluid/framework/executor.cc | 17 +- paddle/fluid/framework/garbage_collector.cc | 9 + paddle/fluid/framework/garbage_collector.h | 10 + paddle/fluid/framework/library_type.h | 2 + paddle/fluid/framework/op_registry.h | 9 + paddle/fluid/framework/operator.cc | 21 ++ paddle/fluid/framework/parallel_executor.cc | 3 + paddle/fluid/framework/tensor_util.cc | 197 ++++++++++++++---- .../fluid/imperative/gradient_accumulator.cc | 7 + paddle/fluid/imperative/prepared_operator.cc | 7 + paddle/fluid/memory/allocation/CMakeLists.txt | 2 + .../memory/allocation/allocator_facade.cc | 37 ++++ .../allocation/naive_best_fit_allocator.cc | 97 +++++++++ paddle/fluid/memory/memcpy.cc | 167 +++++++++++++++ paddle/fluid/operators/eye_op.cc | 3 +- paddle/fluid/operators/math/math_function.cc | 7 + paddle/fluid/operators/xpu/mul_xpu_op.cc | 183 ++++++++++++++++ paddle/fluid/platform/CMakeLists.txt | 12 +- paddle/fluid/platform/device_context.cc | 54 ++++- paddle/fluid/platform/device_context.h | 33 +++ .../fluid/platform/device_context_xpu_test.cc | 53 +++++ paddle/fluid/platform/init.cc | 19 ++ paddle/fluid/platform/init_test.cc | 14 +- paddle/fluid/platform/place.cc | 7 + paddle/fluid/platform/place.h | 42 +++- paddle/fluid/platform/place_test.cc | 13 ++ paddle/fluid/platform/xpu_header.h | 23 ++ paddle/fluid/platform/xpu_info.cc | 107 ++++++++++ paddle/fluid/platform/xpu_info.h | 33 +++ paddle/fluid/pybind/imperative.cc | 36 +++- paddle/fluid/pybind/pybind.cc | 114 +++++++++- paddle/fluid/pybind/tensor_py.h | 100 +++++++-- python/paddle/fluid/__init__.py | 3 +- python/paddle/fluid/framework.py | 16 ++ .../fluid/tests/unittests/test_mul_op.py | 52 +++++ python/setup.py.in | 17 ++ 42 files changed, 1533 insertions(+), 85 deletions(-) create mode 100644 cmake/external/xpu.cmake create mode 100644 paddle/fluid/operators/xpu/mul_xpu_op.cc create mode 100644 paddle/fluid/platform/device_context_xpu_test.cc create mode 100644 paddle/fluid/platform/xpu_header.h create mode 100644 paddle/fluid/platform/xpu_info.cc create mode 100644 paddle/fluid/platform/xpu_info.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 265ddc95041..fb796103350 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,10 @@ include(generic) # simplify cmake module # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) - +option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF) +if (WITH_GPU AND WITH_XPU) + message(FATAL_ERROR "Error when compile GPU and XPU at the same time") +endif() # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them. if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15)) message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. " diff --git a/cmake/configure.cmake b/cmake/configure.cmake index bb57b42dcc7..cf458d97706 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -63,6 +63,11 @@ if(WITH_BOX_PS) add_definitions(-DPADDLE_WITH_BOX_PS) endif() +if(WITH_XPU) + message(STATUS "Compile with XPU!") + add_definitions(-DPADDLE_WITH_XPU) +endif() + if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake new file mode 100644 index 00000000000..8a927d8e282 --- /dev/null +++ b/cmake/external/xpu.cmake @@ -0,0 +1,54 @@ +if (NOT WITH_XPU) + return() +endif() + +INCLUDE(ExternalProject) +SET(XPU_PROJECT "extern_xpu") +SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE) +SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") +SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") +SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") +SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include") +SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include") +SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") + +SET(XPU_API_LIB_NAME "libxpuapi.so") +SET(XPU_RT_LIB_NAME "libxpurt.so") +SET(XPU_SIM_LIB_NAME "libxpusim.so") +SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") +SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") +SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}") + +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") + +INCLUDE_DIRECTORIES(${XPU_API_INC_DIR}) +INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR}) + +FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(XPU)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY xpu/api xpu/runtime xpu/lib \n" + " DESTINATION ${XPU_INSTALL_DIR})\n") + +ExternalProject_Add( + ${XPU_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${XPU_SOURCE_DIR} + DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz + && tar xvf xpu.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} +) + +ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL) +set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}") + +# generate a static dummy target to track xpulib dependencies +# for cc_library(xxx SRCS xxx.c DEPS xpulib) +generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake") + +TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB}) +ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index e927fae63f0..f60a6dc3f0c 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -8,6 +8,7 @@ function(op_library TARGET) set(hip_cu_srcs) set(miopen_hip_cc_srcs) set(cu_cc_srcs) + set(xpu_cc_srcs) set(cudnn_cu_cc_srcs) set(cudnn_cu_srcs) set(CUDNN_FILE) @@ -60,6 +61,12 @@ function(op_library TARGET) list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc) endif() endif() + if(WITH_XPU) + string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc) + list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc) + endif() + endif() else() foreach(src ${op_library_SRCS}) if (${src} MATCHES ".*\\.hip.cu$") @@ -76,6 +83,8 @@ function(op_library TARGET) list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") list(APPEND cu_cc_srcs ${src}) + elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$") + list(APPEND xpu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() @@ -109,7 +118,7 @@ function(op_library TARGET) hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() @@ -150,10 +159,11 @@ function(op_library TARGET) list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) + list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH hip_cu_srcs hip_cu_srcs_len) list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len) if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND - ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0) + ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -179,6 +189,9 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") endif() + if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") + endif() # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator @@ -228,6 +241,7 @@ function(register_operators) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE "_mkldnn" "" OPS "${OPS}") + string(REPLACE "_xpu" "" OPS "${OPS}") string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 5b2c0f51cd7..c9442e8f843 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -250,6 +250,11 @@ if(WITH_GPU) file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage endif(WITH_GPU) +if(WITH_XPU) + include(external/xpu) # download, build, install xpu + list(APPEND third_party_deps extern_xpu) +endif(WITH_XPU) + if(WITH_PSLIB) include(external/pslib) # download, build, install pslib list(APPEND third_party_deps extern_pslib) diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index f2421248e33..180b33d0cb7 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { return ctx; } + inline ::DLContext operator()(const platform::XPUPlace &place) const { + PADDLE_THROW( + platform::errors::Unimplemented("platform::XPUPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #ifdef PADDLE_WITH_CUDA ::DLContext ctx; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8e2e1d38a66..f11edb9a41b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; if (!ctx->force_disable_gc_ && max_memory_size >= 0) { -#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { +#ifdef PADDLE_WITH_CUDA if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); @@ -453,13 +453,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, gc.reset(new DefaultStreamGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); } - } else if (platform::is_cpu_place(place_)) { +#else + PADDLE_THROW( + platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle")); #endif + } else if (platform::is_cpu_place(place_)) { gc.reset(new CPUGarbageCollector( BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size)); -#ifdef PADDLE_WITH_CUDA - } + } else if (platform::is_xpu_place(place_)) { +#ifdef PADDLE_WITH_XPU + gc.reset(new XPUGarbageCollector( + BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size)); +#else + PADDLE_THROW( + platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); #endif + } } for (int64_t i = start_op_index; i < end_op_index; ++i) { diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index ac892443de3..f69ada08067 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function &callback) { callback(); } +#ifdef PADDLE_WITH_XPU +XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} +void XPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} +#endif + #ifdef PADDLE_WITH_CUDA UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 2212122c03d..4f773965282 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector { void ClearCallback(const std::function &callback) override; }; +#ifdef PADDLE_WITH_XPU +class XPUGarbageCollector : public GarbageCollector { + public: + XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; +#endif + #ifdef PADDLE_WITH_CUDA class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index d46f8a574c0..4307e51862d 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { // CPU, CUDA, PLAIN are same library type. } else if (s == std::string("CPU")) { return LibraryType::kPlain; + } else if (s == std::string("XPU")) { + return LibraryType::kPlain; } else if (s == std::string("CUDA")) { return LibraryType::kPlain; } else { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 0f842637a58..d8159d6a5c2 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { + PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]), + platform::errors::Unavailable( + "XPU is not supported in ParallelExecutor")); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); member_->use_cuda_ = exec_strategy.use_cuda_; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index a56ca342ad1..829c182c991 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -54,18 +54,43 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); } +#ifdef PADDLE_WITH_XPU + else if (platform::is_xpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + } else if (platform::is_cpu_place(src_place) && + platform::is_xpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + } else if (platform::is_xpu_place(src_place) && + platform::is_xpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #ifdef PADDLE_WITH_CUDA else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_cuda_pinned_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); - } else if (platform::is_gpu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); auto ctx_place = ctx.GetPlace(); @@ -75,8 +100,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); - } else if (platform::is_cpu_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); auto ctx_place = ctx.GetPlace(); @@ -86,8 +112,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); - } else if (platform::is_gpu_place(src_place) && // NOLINT - platform::is_cuda_pinned_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_cuda_pinned_place = BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place); @@ -108,8 +135,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, reinterpret_cast(ctx).stream(); memory::Copy(dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); - } else if (platform::is_cuda_pinned_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_cuda_pinned_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_cuda_pinned_place = BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); @@ -130,8 +158,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); - } else if (platform::is_gpu_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); auto ctx_place = ctx.GetPlace(); @@ -154,7 +183,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); } } - } else { + } + else { // NOLINT PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); } #endif @@ -200,44 +230,74 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); } +#ifdef PADDLE_WITH_XPU + else if (platform::is_xpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + } else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_xpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + } else if (platform::is_xpu_place(src_place) && // NOLINT + platform::is_xpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + } else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #ifdef PADDLE_WITH_CUDA else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_cuda_pinned_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); - } else if (platform::is_gpu_place(src_place) && // NOLINT - platform::is_cuda_pinned_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src_place), src_ptr, size, nullptr); - } else if (platform::is_gpu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); - } else if (platform::is_cpu_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); - } else if (platform::is_gpu_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); - } else if (platform::is_cuda_pinned_place(src_place) && - platform::is_gpu_place(dst_place)) { + } + else if (platform::is_cuda_pinned_place(src_place) && // NOLINT + platform::is_gpu_place(dst_place)) { auto src_pinned_place = BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, nullptr); - } else { + } + else { // NOLINT PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); } #endif @@ -276,6 +336,19 @@ class AnyVisitor : public boost::static_visitor { const framework::Tensor& tensor_; Predicate predicate_; + bool GetResultHelper(const framework::Tensor& out, + const platform::Place& place) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto ctx = platform::DeviceContextPool::Instance().Get(place); + ctx->Wait(); + TensorCopy(out, cpu, *ctx, &tmp); + ctx->Wait(); + return GetResult(tmp, cpu); + } + public: AnyVisitor(const framework::Tensor& tensor, Predicate predicate) : tensor_(tensor), predicate_(std::move(predicate)) {} @@ -290,17 +363,14 @@ class AnyVisitor : public boost::static_visitor { return this->GetResult(out, place); } + bool GetResult(const framework::Tensor& out, + const platform::XPUPlace& xpu) const { + return GetResultHelper(out, xpu); + } + bool GetResult(const framework::Tensor& out, const platform::CUDAPlace& gpu) const { - platform::CPUPlace cpu; - framework::Tensor tmp; - tmp.Resize({1}); - tmp.mutable_data(cpu); - auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); - gpuctx->Wait(); - TensorCopy(out, cpu, *gpuctx, &tmp); - gpuctx->Wait(); - return GetResult(tmp, cpu); + return GetResultHelper(out, gpu); } bool GetResult(const framework::Tensor& out, @@ -418,6 +488,10 @@ struct BothFalseVisitor : public boost::static_visitor<> { VisitorImpl(place); } + void VisitorImpl(const platform::XPUPlace& xpu) const { + PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); + } + void VisitorImpl(const platform::CUDAPlace& gpu) const { #ifdef PADDLE_WITH_CUDA auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu); @@ -498,6 +572,28 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); +#endif + } else if (platform::is_xpu_place(tensor.place())) { +#ifdef PADDLE_WITH_XPU + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& xpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::XPUPlace, tensor.place()), + reinterpret_cast(data), size_to_write); + xpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "XPUPlace is not supported when not compiled with XPU")); #endif } else { os.write(static_cast(data_ptr), @@ -552,8 +648,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, void* buf; auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); - if (platform::is_gpu_place(dev_ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(dev_ctx.GetPlace()) || + platform::is_xpu_place(dev_ctx.GetPlace())) { +#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -563,8 +660,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); #else - PADDLE_THROW(platform::errors::Unimplemented( - "CUDAPlace is not supported when not compiled with CUDA")); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "CUDAPlace is not supported when not compiled with CUDA")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "XPUPlace is not supported when not compiled with XPU")); + } #endif } else { framework::VisitDataType( @@ -603,8 +705,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, void* buf; auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); - if (platform::is_gpu_place(dev_ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(dev_ctx.GetPlace()) || + platform::is_xpu_place(dev_ctx.GetPlace())) { +#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -614,8 +717,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); #else - PADDLE_THROW(platform::errors::Unimplemented( - "CUDAPlace is not supported when not compiled with CUDA")); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "CUDAPlace is not supported when not compiled with CUDA")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "XPUPlace is not supported when not compiled with XPU")); + } #endif } else { framework::VisitDataType( @@ -700,6 +808,9 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { reinterpret_cast(*ctx).stream()); } #endif +#ifdef PADDLE_WITH_XPU + PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); +#endif } template diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index f5fc5944709..7caeb4378ce 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> { blas.AXPY(numel_, 1., x_, y_); } + void operator()(const platform::XPUPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } + #ifdef PADDLE_WITH_CUDA void operator()(const platform::CUDAPlace& place) { platform::CUDADeviceContext* ctx = diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 0336325bef6..4e0e95dd012 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -100,6 +100,13 @@ PreparedOp PrepareOpImpl(const NameVarMap& ins, VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_XPU + if (kernel_iter == kernels.end() && + is_xpu_place(expected_kernel_key.place_)) { + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } +#endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), platform::errors::NotFound( diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index bd1908ac655..9cc7c267454 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) if (WITH_GPU) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) +elseif(WITH_XPU) + set(AllocatorFacadeDeps xpu_info) else () set(AllocatorFacadeDeps) endif() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 2ab0d69ef80..3213684c140 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -39,6 +39,9 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_info.h" +#endif DEFINE_int64( gpu_allocator_retry_time, 10000, @@ -62,6 +65,11 @@ class AllocatorFacadePrivate { switch (strategy) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_CUDA for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { @@ -74,6 +82,11 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_CUDA for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { @@ -86,6 +99,11 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kThreadLocal: { InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_CUDA for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { @@ -127,6 +145,13 @@ class AllocatorFacadePrivate { private: void InitSystemAllocators() { system_allocators_[platform::CPUPlace()] = std::make_shared(); +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::XPUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif #ifdef PADDLE_WITH_CUDA system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); @@ -164,6 +189,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_XPU + void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + class ZeroSizeAllocator : public Allocator { public: explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} @@ -191,6 +222,12 @@ class AllocatorFacadePrivate { } places.emplace_back(platform::CUDAPinnedPlace()); #endif +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::XPUPlace(dev_id)); + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 907a266e7b2..92e3933a072 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -29,6 +29,9 @@ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_header.h" +#endif DEFINE_bool(init_allocated_mem, false, "It is a mistake that the values of the memory allocated by " @@ -101,6 +104,100 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +template <> +void *Alloc(const platform::XPUPlace &place, size_t size) { +#ifdef PADDLE_WITH_XPU + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + void *p = nullptr; + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + ret = xpu_set_device(place.device); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_malloc(reinterpret_cast(&p), size); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (FLAGS_init_allocated_mem) { + PADDLE_THROW(platform::errors::Unimplemented( + "xpu memory FLAGS_init_allocated_mem is not implemented.")); + } + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + VLOG(10) << " pointer=" << p; + return p; +#else + PADDLE_THROW( + platform::errors::PermissionDenied("'XPUPlace' is not supported.")); + return nullptr; +#endif +} + +template <> +void Free(const platform::XPUPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_XPU + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + ret = xpu_set_device(place.device); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + xpu_free(p); + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); +#else + PADDLE_THROW( + platform::errors::PermissionDenied("'XPUPlace' is not supported.")); +#endif +} + +template <> +size_t Used(const platform::XPUPlace &place) { +#ifdef PADDLE_WITH_XPU + printf("Used func return 0 for XPUPlace\n"); + return 0; +#else + PADDLE_THROW( + platform::errors::PermissionDenied("'XPUPlace' is not supported.")); +#endif +} + #ifdef PADDLE_WITH_CUDA class GPUBuddyAllocatorList { private: diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index b19f02db1c0..225b6858cc1 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -18,6 +18,10 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_header.h" +#endif + namespace paddle { namespace memory { @@ -29,6 +33,169 @@ void Copy(platform::CPUPlace, void* dst, std::memcpy(dst, src, num); } +#ifdef PADDLE_WITH_XPU +template <> +void Copy(platform::XPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num) { + if (num <= 0) { + VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; + return; + } + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + if (dev_id != dst_place.device) { + ret = xpu_set_device(dst_place.device); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } + ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id != dst_place.device) { + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } +} + +template <> +void Copy(platform::CPUPlace dst_place, + void* dst, + platform::XPUPlace src_place, + const void* src, size_t num) { + if (num <= 0) { + VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; + return; + } + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + if (dev_id != src_place.device) { + ret = xpu_set_device(src_place.device); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } + ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id != src_place.device) { + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } +} + +template <> +void Copy(platform::XPUPlace dst_place, + void* dst, + platform::XPUPlace src_place, + const void* src, size_t num) { + if (num <= 0) { + VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; + return; + } + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + if (dev_id != src_place.device || dev_id != dst_place.device) { + ret = xpu_set_device(src_place.device); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + void* tmp = malloc(num); + ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_set_device(dst_place.device); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + free(tmp); + } else { + int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } +} +#endif + #ifdef PADDLE_WITH_CUDA static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index 2cf08e5c340..793519b4018 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -83,7 +83,6 @@ Return an identity tensor whose shape is [num_rows, num_columns]. namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -using float16 = paddle::platform::float16; REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, @@ -93,4 +92,4 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel, ops::EyeKernel, ops::EyeKernel, ops::EyeKernel, - ops::EyeKernel); + ops::EyeKernel); diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 44b04104419..6748d0ab43f 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -73,6 +73,13 @@ struct TensorSetConstantCPU { float value_; }; +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); +} + template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, diff --git a/paddle/fluid/operators/xpu/mul_xpu_op.cc b/paddle/fluid/operators/xpu/mul_xpu_op.cc new file mode 100644 index 00000000000..79aae71c304 --- /dev/null +++ b/paddle/fluid/operators/xpu/mul_xpu_op.cc @@ -0,0 +1,183 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include +#include +#include +#include +#include "paddle/fluid/operators/mul_op.h" + +namespace paddle { +namespace operators { + +using framework::OpKernelType; +using framework::Tensor; + +template +class MulXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const Tensor* y = context.Input("Y"); + Tensor* z = context.Output("Out"); + const Tensor x_matrix = + x->dims().size() > 2 + ? framework::ReshapeToMatrix( + *x, context.template Attr("x_num_col_dims")) + : *x; + const Tensor y_matrix = + y->dims().size() > 2 + ? framework::ReshapeToMatrix( + *y, context.template Attr("y_num_col_dims")) + : *y; + z->mutable_data(context.GetPlace()); + auto z_dim = z->dims(); + if (z_dim.size() != 2) { + z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + bool trans_a = false; + bool trans_b = false; + int m = x_matrix.dims()[0]; + int k = x_matrix.dims()[1]; + int k1 = y_matrix.dims()[0]; + int n = y_matrix.dims()[1]; + PADDLE_ENFORCE_EQ( + k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + const T* data_a = x_matrix.data(); + const T* data_b = y_matrix.data(); + T* data_c = z->data(); + auto& dev_ctx = context.template device_context(); + int ret = xpu::fc_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, + alpha, data_a, data_b, beta, data_c); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + if (z_dim.size() != 2) { + z->Resize(z_dim); + } + } +}; + +template +class MulGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int x_num_col_dims = ctx.template Attr("x_num_col_dims"); + int y_num_col_dims = ctx.template Attr("y_num_col_dims"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : static_cast(*x); + auto y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : static_cast(*y); + auto* dout = ctx.Input(framework::GradVarName("Out")); + Tensor dout_mat; + dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + if (dx != nullptr) { + dx->set_lod(x->lod()); + } + if (dy != nullptr) { + dy->set_lod(y->lod()); + } + auto& dev_ctx = ctx.template device_context(); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + Tensor dx_matrix = dx->dims().size() > 2 + ? framework::ReshapeToMatrix(*dx, x_num_col_dims) + : *dx; + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + // blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); + bool trans_a = false; + bool trans_b = true; + int m = dout_mat.dims()[0]; + int k = dout_mat.dims()[1]; + int n = y_matrix.dims()[0]; + int k1 = y_matrix.dims()[1]; + PADDLE_ENFORCE_EQ( + k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); + int lda = (!trans_a) ? k : m; + int ldb = (!trans_b) ? n : k; + int ldc = n; + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + const T* data_a = dout->data(); + const T* data_b = y_matrix.data(); + T* data_c = dx_matrix.data(); + int ret = + xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha, + data_a, lda, data_b, ldb, beta, data_c, ldc); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + ret)); + } + + if (dy) { + dy->mutable_data(ctx.GetPlace()); + Tensor dy_matrix = dy->dims().size() > 2 + ? framework::ReshapeToMatrix(*dy, y_num_col_dims) + : *dy; + // dy = x' * dout. dy K x N, dout : M x N, x : M x K + // blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); + bool trans_a = true; + bool trans_b = false; + int k = x_matrix.dims()[0]; + int m = x_matrix.dims()[1]; + int k1 = dout_mat.dims()[0]; + int n = dout_mat.dims()[1]; + PADDLE_ENFORCE_EQ( + k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); + int lda = (!trans_a) ? k : m; + int ldb = (!trans_b) ? n : k; + int ldc = n; + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + const T* data_a = x_matrix.data(); + const T* data_b = dout->data(); + T* data_c = dy_matrix.data(); + int ret = + xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha, + data_a, lda, data_b, ldb, beta, data_c, ldc); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + ret)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + mul, ops::MulXPUKernel); +REGISTER_OP_XPU_KERNEL( + mul_grad, ops::MulGradXPUKernel) +#endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 6a28b975f8e..15530738593 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -4,6 +4,12 @@ if(WITH_GPU) proto_library(cuda_error_proto SRCS cuda_error.proto) endif(WITH_GPU) +if(WITH_XPU) + set(XPU_CTX_DEPS xpulib) +ELSE() + set(XPU_CTX_DEPS) +endif(WITH_XPU) + if (WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -50,6 +56,10 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cu cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) +if(WITH_XPU) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce) +endif() + add_subdirectory(dynload) add_subdirectory(stream) @@ -84,7 +94,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} - ${dgc_deps} dlpack cudnn_workspace_helper) + ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto device_context enforce) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 38b0894c3f7..9a502810769 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -61,7 +61,8 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { if (it == device_contexts_.end()) { PADDLE_THROW(platform::errors::Unimplemented( "Place %s is not supported. Please check that your paddle compiles " - "with WITH_GPU option or check that your train process hold the " + "with WITH_GPU or WITH_XPU option or check that your train process " + "hold the " "correct gpu_id if you use Executor.", place)); } @@ -115,6 +116,14 @@ DeviceContextPool::DeviceContextPool( PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported. Please re-compile with WITH_GPU " "option.")); +#endif + } else if (platform::is_xpu_place(p)) { +#ifdef PADDLE_WITH_XPU + EmplaceDeviceContext(&device_contexts_, p); +#else + PADDLE_THROW( + platform::errors::Unimplemented("XPUPlace is not supported. Please " + "re-compile with WITH_XPU option.")); #endif } } @@ -134,6 +143,49 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { Place CPUDeviceContext::GetPlace() const { return place_; } +#ifdef PADDLE_WITH_XPU +XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); } + +XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); } + +XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) { + int dev_id = -1; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + ret = xpu_set_device(place.device); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + context_ = xpu::create_context(); + ret = xpu_set_device(dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); +} + +void XPUDeviceContext::Wait() const { + int ret = xpu_set_device(place_.device); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + xpu_wait(); +} + +Place XPUDeviceContext::GetPlace() const { return place_; } + +xpu::Context* XPUDeviceContext::x_context() const { return context_; } +#endif + #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice : public Eigen::StreamInterface { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7511edb9ccf..3c476f4f08b 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -43,6 +43,10 @@ limitations under the License. */ #endif #include "unsupported/Eigen/CXX11/Tensor" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_header.h" +#endif + namespace paddle { namespace platform { @@ -76,6 +80,35 @@ struct DefaultDeviceContextType { using TYPE = CPUDeviceContext; }; +#ifdef PADDLE_WITH_XPU +class XPUDeviceContext : public DeviceContext { + public: + XPUDeviceContext(); + explicit XPUDeviceContext(XPUPlace place); + virtual ~XPUDeviceContext(); + Eigen::DefaultDevice* eigen_device() const { return nullptr; } + Place GetPlace() const override; + xpu::Context* x_context() const; + + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + private: + XPUPlace place_; + xpu::Context* context_; + + // Need to be the same with other DeviceContext, + // Eventhough eigen_device_ is not used in XPU + std::unique_ptr eigen_device_; + DISABLE_COPY_AND_ASSIGN(XPUDeviceContext); +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = XPUDeviceContext; +}; +#endif + #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice; diff --git a/paddle/fluid/platform/device_context_xpu_test.cc b/paddle/fluid/platform/device_context_xpu_test.cc new file mode 100644 index 00000000000..3de2e3957a9 --- /dev/null +++ b/paddle/fluid/platform/device_context_xpu_test.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/platform/device_context.h" + +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +TEST(Device, Init) { + using paddle::platform::DeviceContext; + using paddle::platform::XPUDeviceContext; + using paddle::platform::XPUPlace; + + int count = paddle::platform::GetXPUDeviceCount(); + for (int i = 0; i < count; i++) { + XPUDeviceContext* device_context = new XPUDeviceContext(XPUPlace(i)); + xpu::Context* ctx = device_context->x_context(); + ASSERT_NE(nullptr, ctx); + delete device_context; + } +} + +TEST(Device, DeviceContextPool) { + using paddle::platform::DeviceContextPool; + using paddle::platform::XPUDeviceContext; + using paddle::platform::Place; + using paddle::platform::CPUPlace; + using paddle::platform::XPUPlace; + + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); + + std::vector xpu_places; + int count = paddle::platform::GetXPUDeviceCount(); + for (int i = 0; i < count; ++i) { + auto dev_ctx = pool.Get(XPUPlace(i)); + ASSERT_NE(dev_ctx, nullptr); + } +} diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 261f6e807a2..2e708e44fd0 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -33,6 +33,11 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu_info.h" +#endif + DECLARE_int32(paddle_num_threads); DEFINE_int32(multiple_of_cupti_buffer_size, 1, "Multiple of the CUPTI device buffer size. If the timestamps have " @@ -151,6 +156,14 @@ void InitDevices(bool init_p2p) { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } +#endif +#ifdef PADDLE_WITH_XPU + try { + // use user specified XPUs in single-node multi-process mode. + devices = platform::GetXPUSelectedDevices(); + } catch (const std::exception &exp) { + LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime."; + } #endif InitDevices(init_p2p, devices); } @@ -165,7 +178,13 @@ void InitDevices(bool init_p2p, const std::vector devices) { LOG(WARNING) << "Invalid devices id."; continue; } + +#ifdef PADDLE_WITH_CUDA places.emplace_back(platform::CUDAPlace(devices[i])); +#endif +#ifdef PADDLE_WITH_XPU + places.emplace_back(platform::XPUPlace(devices[i])); +#endif } if (init_p2p) { InitP2P(devices); diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 6392c4f4c42..f14fbdd74f9 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -20,7 +20,7 @@ TEST(InitDevices, CPU) { using paddle::framework::InitDevices; using paddle::platform::DeviceContextPool; -#ifndef PADDLE_WITH_CUDA +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) InitDevices(true); DeviceContextPool& pool = DeviceContextPool::Instance(); ASSERT_EQ(pool.size(), 1U); @@ -39,6 +39,18 @@ TEST(InitDevices, CUDA) { #endif } +TEST(InitDevices, XPU) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifdef PADDLE_WITH_XPU + int count = paddle::platform::GetXPUDeviceCount(); + InitDevices(true); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U + static_cast(count)); +#endif +} + #ifndef _WIN32 TEST(SignalHandle, SignalHandle) { std::string msg = "Signal raises"; diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 195acc1b6d1..b80d2fd1632 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -32,6 +32,7 @@ class PlacePrinter : public boost::static_visitor<> { void operator()(const CUDAPlace &p) { os_ << "CUDAPlace(" << p.device << ")"; } + void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } private: @@ -44,6 +45,10 @@ bool is_gpu_place(const Place &p) { return boost::apply_visitor(IsCUDAPlace(), p); } +bool is_xpu_place(const Place &p) { + return boost::apply_visitor(IsXPUPlace(), p); +} + bool is_cpu_place(const Place &p) { return boost::apply_visitor(IsCPUPlace(), p); } @@ -60,6 +65,8 @@ bool is_same_place(const Place &p1, const Place &p2) { if (places_are_same_class(p1, p2)) { if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) { return true; + } else if (is_xpu_place(p1)) { + return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2); } else { return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2); } diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index eeda10a633b..f95f6954a32 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -58,31 +58,58 @@ struct CUDAPinnedPlace { inline bool operator<(const CUDAPinnedPlace &) const { return false; } }; +// Place for Baidu Kunlun Accelerator +struct XPUPlace { + XPUPlace() : XPUPlace(0) {} + explicit XPUPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const XPUPlace &o) const { return device == o.device; } + inline bool operator!=(const XPUPlace &o) const { return !(*this == o); } + inline bool operator<(const XPUPlace &o) const { return device < o.device; } + + int device; +}; + struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } bool operator()(const CUDAPlace &gpu) const { return true; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; struct IsCPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &cpu) const { return true; } + bool operator()(const XPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; struct IsCUDAPinnedPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; } }; -class Place : public boost::variant { +struct IsXPUPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &xpu) const { return true; } + bool operator()(const CUDAPlace &) const { return false; } + bool operator()(const CUDAPinnedPlace &) const { return false; } +}; + +class Place + : public boost::variant { private: - using PlaceBase = boost::variant; + using PlaceBase = + boost::variant; public: Place() = default; Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT + Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT : PlaceBase(cuda_pinned_place) {} @@ -98,6 +125,7 @@ class Place : public boost::variant { using PlaceList = std::vector; bool is_gpu_place(const Place &); +bool is_xpu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); @@ -115,6 +143,16 @@ struct PlaceVisitorWrapper return visitor_(cpu); } + typename Visitor::result_type operator()(const XPUPlace &xpu) const { +#ifdef PADDLE_WITH_XPU + return visitor_(xpu); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with XPU. Cannot visit xpu device")); + return typename Visitor::result_type(); +#endif + } + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { #ifdef PADDLE_WITH_CUDA return visitor_(cuda); diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc index e4c1d3def90..13f28c73f45 100644 --- a/paddle/fluid/platform/place_test.cc +++ b/paddle/fluid/platform/place_test.cc @@ -18,19 +18,32 @@ TEST(Place, Equality) { paddle::platform::CPUPlace cpu; paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); + paddle::platform::XPUPlace x0(0), x1(1), xx0(0); EXPECT_EQ(cpu, cpu); EXPECT_EQ(g0, g0); EXPECT_EQ(g1, g1); EXPECT_EQ(g0, gg0); + EXPECT_EQ(x0, x0); + EXPECT_EQ(x1, x1); + EXPECT_EQ(x0, xx0); EXPECT_NE(g0, g1); + EXPECT_NE(x0, x1); EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0)); + EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0)); EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu)); + EXPECT_FALSE(paddle::platform::places_are_same_class(x0, cpu)); + EXPECT_FALSE(paddle::platform::places_are_same_class(g0, x0)); } TEST(Place, Print) { + { + std::stringstream ss; + ss << paddle::platform::XPUPlace(1); + EXPECT_EQ("XPUPlace(1)", ss.str()); + } { std::stringstream ss; ss << paddle::platform::CUDAPlace(1); diff --git a/paddle/fluid/platform/xpu_header.h b/paddle/fluid/platform/xpu_header.h new file mode 100644 index 00000000000..d8c5f85f9cf --- /dev/null +++ b/paddle/fluid/platform/xpu_header.h @@ -0,0 +1,23 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_XPU +#include "xpu/api.h" +#include "xpu/runtime.h" +#include "xpu/runtime_ex.h" + +namespace xpu = baidu::xpu::api; +#endif diff --git a/paddle/fluid/platform/xpu_info.cc b/paddle/fluid/platform/xpu_info.cc new file mode 100644 index 00000000000..f88248fda7e --- /dev/null +++ b/paddle/fluid/platform/xpu_info.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/platform/xpu_info.h" + +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/string/split.h" + +DEFINE_string(selected_xpus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (XPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between XPU devices, use XPU_VISIBLE_DEVICES can only use" + "share-memory only."); + +namespace paddle { +namespace platform { + +static int GetXPUDeviceCountImpl() { + const auto *xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); + if (xpu_visible_devices != nullptr) { + std::string xpu_visible_devices_str(xpu_visible_devices); + if (std::all_of(xpu_visible_devices_str.begin(), + xpu_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected."; + return 0; + } + } + + int count = 0; + int ret = xpu_device_count(&count); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + return count; +} + +int GetXPUDeviceCount() { + static auto dev_cnt = GetXPUDeviceCountImpl(); + return dev_cnt; +} + +int GetXPUCurrentDeviceId() { + int dev_id; + int ret = xpu_current_device(&dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + return dev_id; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetXPUSelectedDevices() { + // use user specified XPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_xpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetXPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void SetXPUDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, GetXPUDeviceCount(), + platform::errors::InvalidArgument("id must less than XPU count")); + int ret = xpu_set_device(id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/xpu_info.h b/paddle/fluid/platform/xpu_info.h new file mode 100644 index 00000000000..efaba13453e --- /dev/null +++ b/paddle/fluid/platform/xpu_info.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_XPU +#include + +namespace paddle { +namespace platform { + +//! Get the total number of XPU devices in system. +int GetXPUDeviceCount(); + +//! Get the current XPU device id in system. +int GetXPUCurrentDeviceId(); + +//! Get a list of device ids from environment variable or use all. +std::vector GetXPUSelectedDevices(); + +//! Set the XPU device id for next execution. +void SetXPUDeviceId(int device_id); + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index be55201595e..021d10ca7fa 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -66,11 +66,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); + } else if (py::isinstance(place_obj)) { + return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace")); + "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace")); } } @@ -92,6 +94,9 @@ static void InitTensorForVarBase(imperative::VarBase *self, if (platform::is_cpu_place(place)) { SetTensorFromPyArray( tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy); + } else if (platform::is_xpu_place(place)) { + SetTensorFromPyArray( + tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), zero_copy); } else if (platform::is_gpu_place(place)) { SetTensorFromPyArray( tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), zero_copy); @@ -101,7 +106,7 @@ static void InitTensorForVarBase(imperative::VarBase *self, zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace")); + "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace")); } if (stop_gradient != -1) { self->SetOverridedStopGradient(stop_gradient); @@ -588,6 +593,10 @@ void BindImperative(py::module *m_ptr) { py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("zero_copy") = false, py::arg("name") = "", py::arg("stop_gradient") = -1) + .def("__init__", &InitVarBaseFromNumpyWithArg, + py::arg("value"), py::arg("place"), py::arg("persistable") = false, + py::arg("zero_copy") = false, py::arg("name") = "", + py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArg, py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("zero_copy") = false, py::arg("name") = "", @@ -823,6 +832,10 @@ void BindImperative(py::module *m_ptr) { const platform::CUDAPinnedPlace &place, bool blocking) { return self.NewVarBase(place, blocking); }, py::return_value_policy::copy) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::XPUPlace &place, + bool blocking) { return self.NewVarBase(place, blocking); }, + py::return_value_policy::copy) .def("_copy_to", [](const imperative::VarBase &self, const platform::CUDAPlace &place, bool blocking) { return self.NewVarBase(place, blocking); }, @@ -890,6 +903,9 @@ void BindImperative(py::module *m_ptr) { if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); + } else if (py::isinstance(obj)) { + auto p = obj.cast(); + self.SetExpectedPlace(*p); } else if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); @@ -898,7 +914,8 @@ void BindImperative(py::module *m_ptr) { self.SetExpectedPlace(*p); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Incompatible Place Type: supports CUDAPlace, CPUPlace, " + "Incompatible Place Type: supports XPUPlace, CUDAPlace, " + "CPUPlace, " "and CUDAPinnedPlace, " "but got Unknown Type!")); } @@ -928,6 +945,19 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetAllowOps()), *(imperative::AmpOperators::Instance().GetBlockOps())); }) + .def("trace", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::XPUPlace &place, + bool trace_backward) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp(type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 635a81dff0d..13aa8d3f8e1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -90,6 +90,10 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu_info.h" +#endif + #ifdef PADDLE_WITH_DISTRIBUTE #include "paddle/fluid/pybind/communicator_py.h" #endif @@ -118,6 +122,14 @@ bool IsCompiledWithCUDA() { #endif } +bool IsCompiledWithXPU() { +#ifndef PADDLE_WITH_XPU + return false; +#else + return true; +#endif +} + bool IsCompiledWithMKLDNN() { #ifndef PADDLE_WITH_MKLDNN return false; @@ -471,6 +483,10 @@ PYBIND11_MODULE(core_noavx, m) { [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) + .def("_alloc_float", + [](Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_float", [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); @@ -483,6 +499,10 @@ PYBIND11_MODULE(core_noavx, m) { [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) + .def("_alloc_int", + [](Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_int", [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); @@ -500,6 +520,11 @@ PYBIND11_MODULE(core_noavx, m) { paddle::framework::proto::VarType::Type type) { return reinterpret_cast(self.mutable_data(place, type)); }) + .def("_mutable_data", + [](Tensor &self, paddle::platform::XPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast(self.mutable_data(place, type)); + }) .def("_mutable_data", [](Tensor &self, paddle::platform::CUDAPlace &place, paddle::framework::proto::VarType::Type type) { @@ -513,6 +538,8 @@ PYBIND11_MODULE(core_noavx, m) { .def("_clear", &Tensor::clear) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, @@ -522,7 +549,7 @@ PYBIND11_MODULE(core_noavx, m) { Args: lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|CUDAPinnedPlace): The place where the + place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the LoDTensor is to be set. zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False. @@ -1230,6 +1257,18 @@ All parameter, weight, gradient are variables in Paddle. -> paddle::platform::DeviceContext* { return new paddle::platform::CPUDeviceContext(); }) + .def_static("create", + [](paddle::platform::XPUPlace& place) + -> paddle::platform::DeviceContext* { +#ifndef PADDLE_WITH_XPU + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot use XPUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with XPU support.")); +#else + return new paddle::platform::XPUDeviceContext(place); +#endif + }) .def_static("create", [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { @@ -1331,6 +1370,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_get_device_id", @@ -1338,6 +1378,60 @@ All parameter, weight, gradient are variables in Paddle. #endif .def("__str__", string::to_string); + py::class_(m, "XPUPlace", R"DOC( + **Note**: + Examples: + .. code-block:: python + import paddle.fluid as fluid + xpu_place = fluid.XPUPlace(0) + )DOC") + .def("__init__", + [](platform::XPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_XPU + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) { + if (platform::GetXPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use XPU because there is no XPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), must inside [0, %d), because XPU " + "number on your machine is %d", + dev_id, platform::GetXPUDeviceCount(), + platform::GetXPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::XPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use XPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use XPU, please try to install XPU version " + "PaddlePaddle by: pip install paddlepaddle-xpu\n" + "If you only have CPU, please change XPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__str__", string::to_string); + py::class_(m, "CPUPlace", R"DOC( CPUPlace is a descriptor of a device. It represents a CPU device allocated or to be allocated with Tensor or LoDTensor. @@ -1352,6 +1446,7 @@ All parameter, weight, gradient are variables in Paddle. .def(py::init<>()) .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1386,6 +1481,8 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1398,11 +1495,14 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("is_gpu_place", [](platform::Place &self) { return platform::is_gpu_place(self); }) .def("is_cpu_place", [](platform::Place &self) { return platform::is_cpu_place(self); }) + .def("is_xpu_place", + [](platform::Place &self) { return platform::is_xpu_place(self); }) .def("is_cuda_pinned_place", [](platform::Place &self) { return platform::is_cuda_pinned_place(self); @@ -1411,12 +1511,20 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self) { return BOOST_GET_CONST(platform::CUDAPlace, self).device; }) + .def("xpu_device_id", + [](platform::Place &self) { + return BOOST_GET_CONST(platform::XPUPlace, self).device; + }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; }) + .def("set_place", + [](platform::Place &self, const platform::XPUPlace &xpu_place) { + self = xpu_place; + }) .def("set_place", [](platform::Place &self, const platform::CUDAPlace &gpu_place) { self = gpu_place; @@ -1444,6 +1552,9 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](OperatorBase &self, const Scope &scope, const platform::CPUPlace &place) { self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::XPUPlace &place) { self.Run(scope, place); }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CUDAPlace &place) { self.Run(scope, place); }) @@ -1544,6 +1655,7 @@ All parameter, weight, gradient are variables in Paddle. [](bool init_p2p) { framework::InitDevices(init_p2p); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_dist", IsCompiledWithDIST); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index c16b22b9fc3..0b4e4502bb8 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -145,8 +145,14 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { T b = static_cast(0); if (platform::is_cpu_place(self.place())) { b = self.data()[offset]; + } else if (platform::is_xpu_place(self.place())) { +#ifdef PADDLE_WITH_XPU + const T *a = self.data(); + auto p = BOOST_GET_CONST(platform::XPUPlace, self.place()); + paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T)); +#endif + } else if (platform::is_gpu_place(self.place())) { #ifdef PADDLE_WITH_CUDA - } else { const T *a = self.data(); auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place()); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), @@ -163,8 +169,14 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { "The offset exceeds the size of tensor.")); if (platform::is_cpu_place(self->place())) { self->mutable_data(self->place())[offset] = elem; + } else if (platform::is_xpu_place(self->place())) { +#ifdef PADDLE_WITH_XPU + auto p = BOOST_GET_CONST(platform::XPUPlace, self->place()); + T *a = self->mutable_data(p); + paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T)); +#endif + } else if (platform::is_gpu_place(self->place())) { #ifdef PADDLE_WITH_CUDA - } else { auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place()); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), @@ -194,6 +206,16 @@ void SetTensorFromPyArrayT( auto dst = self->mutable_data(place); std::memcpy(dst, array.data(), array.nbytes()); } + } else if (paddle::platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + auto dst = self->mutable_data(place); + xpu_memcpy(dst, array.data(), array.nbytes(), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use XPUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with XPU support.")); +#endif } else { #ifdef PADDLE_WITH_CUDA auto dst = self->mutable_data(place); @@ -354,8 +376,13 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, if (platform::is_cpu_place(place)) { output->mutable_data(BOOST_GET_CONST(platform::CPUPlace, place), self.type()); -#ifdef PADDLE_WITH_CUDA + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place), + self.type()); +#endif } else { +#ifdef PADDLE_WITH_CUDA if (platform::is_cuda_pinned_place(place)) { output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place), self.type()); @@ -516,6 +543,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, return py::array(); } bool is_gpu_tensor = platform::is_gpu_place(tensor.place()); + bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = tensor.type(); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -534,7 +562,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type()); - if (!is_gpu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor) { if (!need_deep_copy) { return py::array(py::buffer_info( const_cast(tensor_buf_ptr), sizeof_dtype, py_dtype_str, @@ -557,28 +585,54 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, copy_bytes); return py_arr; } - } - + } else if (is_xpu_tensor) { +#ifdef PADDLE_WITH_XPU + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + auto p = BOOST_GET_CONST(platform::XPUPlace, tensor.place()); + paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, + tensor_buf_ptr, copy_bytes); + return py_arr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use XPUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (is_gpu_tensor) { #ifdef PADDLE_WITH_CUDA - py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); - PADDLE_ENFORCE_EQ(py_arr.writeable(), true, - platform::errors::InvalidArgument( - "PyArray is not writable, in which case memory leak " - "or double free would occur")); - PADDLE_ENFORCE_EQ(py_arr.owndata(), true, - platform::errors::InvalidArgument( - "PyArray does not own data, in which case memory leak " - "or double free would occur")); - - size_t copy_bytes = sizeof_dtype * numel; - paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr, - copy_bytes, cudaMemcpyDeviceToHost); - return py_arr; + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr, + copy_bytes, cudaMemcpyDeviceToHost); + return py_arr; #else - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use CUDAPlace in CPU only version, " - "Please recompile or reinstall Paddle with CUDA support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CUDAPlace in CPU only version, " + "Please recompile or reinstall Paddle with CUDA support.")); #endif + } + PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); + return py::array(); } } // namespace pybind diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7e0d8c0de5b..2ed8642c86d 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -68,7 +68,7 @@ from .input import embedding, one_hot from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope +from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope from .incubate import fleet from .incubate import data_generator from .transpiler import DistributeTranspiler, \ @@ -119,6 +119,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'LoDTensor', 'LoDTensorArray', 'CPUPlace', + 'XPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', 'Tensor', diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index e844c74c106..3169cc9dae8 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -48,6 +48,7 @@ __all__ = [ 'cuda_pinned_places', 'in_dygraph_mode', 'is_compiled_with_cuda', + 'is_compiled_with_xpu', 'Variable', 'ComplexVariable', 'load_op_library', @@ -310,6 +311,21 @@ def _cuda_ids(): return device_ids +def is_compiled_with_xpu(): + """ + Whether this whl package can be used to run the model on XPU. + + Returns (bool): support xpu or not. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + support_xpu = fluid.is_compiled_with_xpu() + """ + return core.is_compiled_with_xpu() + + def is_compiled_with_cuda(): """ Whether this whl package can be used to run the model on GPU. diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index 8ca06aa9521..5f223de1954 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -175,5 +175,57 @@ class TestFP16MulOp2(TestMulOp2): no_grad_set=set('Y')) +@unittest.skipIf(not core.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUMulOp1(TestMulOp): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_output(self): + place = core.XPUPlace(0) + self.check_output_with_place(place, atol=1e-1) + + def test_check_grad_normal(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + + +@unittest.skipIf(not core.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUMulOp2(TestMulOp2): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_output(self): + place = core.XPUPlace(0) + self.check_output_with_place(place, atol=2e-1) + + def test_check_grad_normal(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.9) + + def test_check_grad_ingore_x(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = core.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y')) + + if __name__ == "__main__": unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 4706099c0c3..5b206296bd6 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -303,6 +303,23 @@ if '${WITH_MKLDNN}' == 'ON': else: package_data['paddle.libs']+=['mkldnn.dll'] +if '${WITH_XPU}' == 'ON': + # only change rpath in Release mode, + if '${CMAKE_BUILD_TYPE}' == 'Release': + if os.name != 'nt': + if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}" + else: + command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}" + if os.system(command) != 0: + raise Exception("patch ${XPU_API_LIB} failed, command: %s" % command) + shutil.copy('${XPU_API_LIB}', libs_path) + shutil.copy('${XPU_RT_LIB}', libs_path) + shutil.copy('${XPU_SIM_LIB}', libs_path) + package_data['paddle.libs']+=['${XPU_API_LIB_NAME}', + '${XPU_RT_LIB_NAME}', + '${XPU_SIM_LIB_NAME}'] + # copy libfuild_framework.so to libs if os.name != 'nt' and sys.platform != 'darwin': paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}' -- GitLab