Unverified commit 138ecf24, authored by QingshuChen and committed by GitHub

support Baidu Kunlun AI Accelerator (#25959)

* support Baidu AI Accelerator
  * test=kunlun

* minor
  * test=kunlun

* support xpu op in separate file
  * test=kunlun

* update XPU error message and remove duplicated code
  * test=kunlun

* minor
  * test=kunlun

* minor
  * test=kunlun
Parent 17bcaef4
......@@ -28,7 +28,10 @@ include(generic) # simplify cmake module
# TODO(Shibo Tao): remove find_package(CUDA) completely.
find_package(CUDA QUIET)
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "WITH_GPU and WITH_XPU cannot be enabled at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
......
......@@ -63,6 +63,11 @@ if(WITH_BOX_PS)
add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
endif()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
......
if (NOT WITH_XPU)
return()
endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include")
SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_SIM_LIB_NAME "libxpusim.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR})
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/api xpu/runtime xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
&& tar xvf xpu.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
)
ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
# generate a static dummy target to track xpulib dependencies
# for cc_library(xxx SRCS xxx.c DEPS xpulib)
generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB})
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
......@@ -8,6 +8,7 @@ function(op_library TARGET)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(xpu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(cudnn_cu_srcs)
set(CUDNN_FILE)
......@@ -60,6 +61,12 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
endif()
endif()
if(WITH_XPU)
string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
......@@ -76,6 +83,8 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
......@@ -109,7 +118,7 @@ function(op_library TARGET)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
......@@ -150,10 +159,11 @@ function(op_library TARGET)
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
......@@ -179,6 +189,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
......@@ -228,6 +241,7 @@ function(register_operators)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
......
......@@ -250,6 +250,11 @@ if(WITH_GPU)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
endif(WITH_GPU)
if(WITH_XPU)
include(external/xpu) # download, build, install xpu
list(APPEND third_party_deps extern_xpu)
endif(WITH_XPU)
if(WITH_PSLIB)
include(external/pslib) # download, build, install pslib
list(APPEND third_party_deps extern_pslib)
......
......@@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
return ctx;
}
inline ::DLContext operator()(const platform::XPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;
......
......@@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector> gc;
if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
#ifdef PADDLE_WITH_CUDA
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
......@@ -453,13 +453,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
gc.reset(new DefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
}
} else if (platform::is_cpu_place(place_)) {
#else
PADDLE_THROW(
platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
#endif
} else if (platform::is_cpu_place(place_)) {
gc.reset(new CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
#ifdef PADDLE_WITH_CUDA
}
} else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
gc.reset(new XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
}
}
for (int64_t i = start_op_index; i < end_op_index; ++i) {
......
......@@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
callback();
}
#ifdef PADDLE_WITH_XPU
XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place,
size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
callback();
}
#endif
#ifdef PADDLE_WITH_CUDA
UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
const platform::CUDAPlace &place, size_t max_memory_size)
......
......@@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector {
void ClearCallback(const std::function<void()> &callback) override;
};
#ifdef PADDLE_WITH_XPU
class XPUGarbageCollector : public GarbageCollector {
public:
XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
#ifdef PADDLE_WITH_CUDA
class UnsafeFastGPUGarbageCollector : public GarbageCollector {
public:
......
......@@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
// CPU, CUDA, PLAIN are same library type.
} else if (s == std::string("CPU")) {
return LibraryType::kPlain;
} else if (s == std::string("XPU")) {
return LibraryType::kPlain;
} else if (s == std::string("CUDA")) {
return LibraryType::kPlain;
} else {
......
......@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
......@@ -298,6 +301,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/**
* Macro to mark what Operator and Kernel
* we will use and tell the compiler to
......
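For illustration, a minimal sketch of how the new REGISTER_OP_XPU_KERNEL macro is meant to be used together with the xpu/*_xpu_op.cc naming convention added in cmake/operators.cmake above. The relu op and ReluXPUKernel below are hypothetical placeholders, not part of this commit:

// Hypothetical file: paddle/fluid/operators/xpu/relu_xpu_op.cc (sketch only)
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReluXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Obtain the XPU device context and call the corresponding xpu::* API here.
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    (void)dev_ctx;
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
    relu, ops::ReluXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif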
......@@ -34,6 +34,9 @@ limitations under the License. */
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -165,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::SetDeviceId(dev_id);
#endif
} else if (platform::is_xpu_place(place)) {
#ifndef PADDLE_WITH_XPU
PADDLE_THROW(platform::errors::Unimplemented(
"Cannot run operator on place %s", place));
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif
}
......@@ -1109,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", type_,
......
......@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
platform::errors::Unavailable(
"XPU is not supported in ParallelExecutor"));
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size());
member_->use_cuda_ = exec_strategy.use_cuda_;
......
......@@ -54,18 +54,43 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) &&
platform::is_xpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_xpu_place(src_place) &&
platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
auto ctx_place = ctx.GetPlace();
......@@ -75,8 +100,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
auto ctx_place = ctx.GetPlace();
......@@ -86,8 +112,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_cuda_pinned_place =
BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place);
......@@ -108,8 +135,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_cuda_pinned_place =
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
......@@ -130,8 +158,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size,
stream);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
auto ctx_place = ctx.GetPlace();
......@@ -154,7 +183,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
}
}
} else {
}
else { // NOLINT
PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
}
#endif
......@@ -200,44 +230,74 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPlace, src_place), src_ptr, size,
nullptr);
} else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) {
}
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_gpu_place(dst_place)) {
auto src_pinned_place =
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
nullptr);
} else {
}
else { // NOLINT
PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
}
#endif
......@@ -276,6 +336,19 @@ class AnyVisitor : public boost::static_visitor<bool> {
const framework::Tensor& tensor_;
Predicate predicate_;
bool GetResultHelper(const framework::Tensor& out,
const platform::Place& place) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto ctx = platform::DeviceContextPool::Instance().Get(place);
ctx->Wait();
TensorCopy(out, cpu, *ctx, &tmp);
ctx->Wait();
return GetResult(tmp, cpu);
}
public:
AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
......@@ -290,17 +363,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
return this->GetResult(out, place);
}
bool GetResult(const framework::Tensor& out,
const platform::XPUPlace& xpu) const {
return GetResultHelper(out, xpu);
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPlace& gpu) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
TensorCopy(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
......@@ -418,6 +488,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
VisitorImpl(place);
}
void VisitorImpl(const platform::XPUPlace& xpu) const {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
......@@ -498,6 +572,28 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
#endif
} else if (platform::is_xpu_place(tensor.place())) {
#ifdef PADDLE_WITH_XPU
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& xpu_dev_ctx =
static_cast<const platform::XPUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
BOOST_GET_CONST(platform::XPUPlace, tensor.place()),
reinterpret_cast<const void*>(data), size_to_write);
xpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
......@@ -552,8 +648,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
void* buf;
auto ctx = platform::CPUDeviceContext();
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType(
......@@ -563,8 +660,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
}
#endif
} else {
framework::VisitDataType(
......@@ -603,8 +705,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
void* buf;
auto ctx = platform::CPUDeviceContext();
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
......@@ -614,8 +717,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
}
#endif
} else {
framework::VisitDataType(
......@@ -700,6 +808,9 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
reinterpret_cast<const platform::CUDADeviceContext&>(*ctx).stream());
}
#endif
#ifdef PADDLE_WITH_XPU
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
}
template <typename T>
......
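As a usage note, a minimal sketch (illustrative only, not part of the diff above) of round-tripping a tensor through the new XPU branches of TensorCopySync; it assumes a build with PADDLE_WITH_XPU and that Kunlun device 0 is available:

// Hedged sketch: CPU -> XPU -> CPU copy via TensorCopySync.
#include "paddle/fluid/framework/tensor_util.h"

void XpuCopyRoundTrip() {
  paddle::platform::CPUPlace cpu;
  paddle::platform::XPUPlace xpu(0);

  paddle::framework::Tensor cpu_src, xpu_dst, cpu_back;
  cpu_src.Resize({8});
  float* p = cpu_src.mutable_data<float>(cpu);
  for (int i = 0; i < 8; ++i) p[i] = static_cast<float>(i);

  // Host -> device and device -> host go through the XPU cases added above.
  paddle::framework::TensorCopySync(cpu_src, xpu, &xpu_dst);
  paddle::framework::TensorCopySync(xpu_dst, cpu, &cpu_back);
}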
......@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_);
}
void operator()(const platform::XPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx =
......
......@@ -100,6 +100,13 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(
......
......@@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
elseif(WITH_XPU)
set(AllocatorFacadeDeps xpu_info)
else ()
set(AllocatorFacadeDeps)
endif()
......
......@@ -39,6 +39,9 @@
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
DEFINE_int64(
gpu_allocator_retry_time, 10000,
......@@ -62,6 +65,11 @@ class AllocatorFacadePrivate {
switch (strategy) {
case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
......@@ -74,6 +82,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kAutoGrowth: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
......@@ -86,6 +99,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kThreadLocal: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
......@@ -127,6 +145,13 @@ class AllocatorFacadePrivate {
private:
void InitSystemAllocators() {
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::XPUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUDA
system_allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CPUPinnedAllocator>();
......@@ -164,6 +189,12 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_XPU
void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
......@@ -191,6 +222,12 @@ class AllocatorFacadePrivate {
}
places.emplace_back(platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::XPUPlace(dev_id));
}
#endif
for (auto& p : places) {
zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
......
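A small usage sketch (illustrative only, not part of the diff): with the XPU allocators registered above, device memory can be requested by place through the allocator facade. The entry point is assumed here to be paddle::memory::Alloc from paddle/fluid/memory/malloc.h:

// Hedged sketch, assuming PADDLE_WITH_XPU and Kunlun device 0 is present.
#include "paddle/fluid/memory/malloc.h"

void AllocOnXpu() {
  paddle::platform::XPUPlace place(0);
  auto allocation = paddle::memory::Alloc(place, 4096);  // 4 KB on card 0
  void* ptr = allocation->ptr();  // raw device pointer, usable with memory::Copy
  (void)ptr;
}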
......@@ -29,6 +29,9 @@
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
......@@ -101,6 +104,100 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void *p = nullptr;
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (FLAGS_init_allocated_mem) {
PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented."));
}
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
VLOG(10) << " pointer=" << p;
return p;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
return nullptr;
#endif
}
template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
xpu_free(p);
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
template <>
size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
printf("Used func return 0 for XPUPlace\n");
return 0;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
#ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList {
private:
......
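For reference, a stripped-down sketch (illustrative only, not part of the diff) of the raw Kunlun runtime calls the Alloc/Free specializations above are built on; it assumes PADDLE_WITH_XPU and omits the device save/restore and the detailed error messages shown above:

// Hedged sketch of xpu_malloc / xpu_free usage, error handling reduced.
#include "paddle/fluid/platform/xpu_header.h"

void* RawXpuAlloc(size_t size) {
  void* p = nullptr;
  if (xpu_set_device(0) != XPU_SUCCESS) return nullptr;   // select card 0
  if (xpu_malloc(&p, size) != XPU_SUCCESS) return nullptr; // device allocation
  return p;
}

void RawXpuFree(void* p) { xpu_free(p); }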
......@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
namespace paddle {
namespace memory {
......@@ -29,6 +33,169 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
std::memcpy(dst, src, num);
}
#ifdef PADDLE_WITH_XPU
template <>
void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != dst_place.device) {
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != dst_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != src_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device || dev_id != dst_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
void* tmp = malloc(num);
ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
free(tmp);
} else {
int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
#endif
#ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
......
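A usage sketch (illustrative only, not part of the diff) of the host/device copy specializations added above, assuming PADDLE_WITH_XPU, Kunlun device 0, and a device buffer of at least 1 KB obtained e.g. via xpu_malloc:

// Hedged sketch: synchronous host <-> device copies via memory::Copy.
#include <vector>
#include "paddle/fluid/memory/memcpy.h"

void RawXpuCopy(void* xpu_buffer) {  // xpu_buffer: device memory, >= 1 KB
  paddle::platform::CPUPlace cpu;
  paddle::platform::XPUPlace xpu(0);
  std::vector<float> host(256, 1.0f);  // 256 * 4 bytes = 1 KB

  // Host -> device (XPU_HOST_TO_DEVICE), then device -> host.
  paddle::memory::Copy(xpu, xpu_buffer, cpu, host.data(),
                       host.size() * sizeof(float));
  paddle::memory::Copy(cpu, host.data(), xpu, xpu_buffer,
                       host.size() * sizeof(float));
}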
......@@ -83,7 +83,6 @@ Return an identity tensor whose shape is [num_rows, num_columns].
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using float16 = paddle::platform::float16;
REGISTER_OPERATOR(
eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference,
......@@ -93,4 +92,4 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel<CPU, float>,
ops::EyeKernel<CPU, double>,
ops::EyeKernel<CPU, int64_t>, ops::EyeKernel<CPU, int>,
ops::EyeKernel<CPU, float16>);
ops::EyeKernel<CPU, paddle::platform::float16>);
......@@ -73,6 +73,13 @@ struct TensorSetConstantCPU {
float value_;
};
template <>
void set_constant_with_place<platform::XPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/mul_op.h"
namespace paddle {
namespace operators {
using framework::OpKernelType;
using framework::Tensor;
template <typename DeviceContext, typename T>
class MulXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
const Tensor* y = context.Input<Tensor>("Y");
Tensor* z = context.Output<Tensor>("Out");
const Tensor x_matrix =
x->dims().size() > 2
? framework::ReshapeToMatrix(
*x, context.template Attr<int>("x_num_col_dims"))
: *x;
const Tensor y_matrix =
y->dims().size() > 2
? framework::ReshapeToMatrix(
*y, context.template Attr<int>("y_num_col_dims"))
: *y;
z->mutable_data<T>(context.GetPlace());
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
bool trans_a = false;
bool trans_b = false;
int m = x_matrix.dims()[0];
int k = x_matrix.dims()[1];
int k1 = y_matrix.dims()[0];
int n = y_matrix.dims()[1];
PADDLE_ENFORCE_EQ(
k, k1, platform::errors::InvalidArgument("Shape mismatch in mul_op"));
T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0);
const T* data_a = x_matrix.data<T>();
const T* data_b = y_matrix.data<T>();
T* data_c = z->data<T>();
auto& dev_ctx = context.template device_context<DeviceContext>();
int ret = xpu::fc_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k,
alpha, data_a, data_b, beta, data_c);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
};
template <typename DeviceContext, typename T>
class MulGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: static_cast<const Tensor&>(*y);
auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
Tensor dout_mat;
dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
// blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
bool trans_a = false;
bool trans_b = true;
int m = dout_mat.dims()[0];
int k = dout_mat.dims()[1];
int n = y_matrix.dims()[0];
int k1 = y_matrix.dims()[1];
PADDLE_ENFORCE_EQ(
k, k1, platform::errors::InvalidArgument("Shape mismatch in mul_op"));
int lda = (!trans_a) ? k : m;
int ldb = (!trans_b) ? n : k;
int ldc = n;
T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0);
const T* data_a = dout->data<T>();
const T* data_b = y_matrix.data<T>();
T* data_c = dx_matrix.data<T>();
int ret =
xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
data_a, lda, data_b, ldb, beta, data_c, ldc);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
ret));
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_matrix = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
// blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
bool trans_a = true;
bool trans_b = false;
int k = x_matrix.dims()[0];
int m = x_matrix.dims()[1];
int k1 = dout_mat.dims()[0];
int n = dout_mat.dims()[1];
PADDLE_ENFORCE_EQ(
k, k1, platform::errors::InvalidArgument("Shape mismatch in mul_op"));
int lda = (!trans_a) ? k : m;
int ldb = (!trans_b) ? n : k;
int ldc = n;
T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0);
const T* data_a = x_matrix.data<T>();
const T* data_b = dout->data<T>();
T* data_c = dy_matrix.data<T>();
int ret =
xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
data_a, lda, data_b, ldb, beta, data_c, ldc);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
ret));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
mul, ops::MulXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
mul_grad, ops::MulGradXPUKernel<paddle::platform::XPUDeviceContext, float>)
#endif
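To make the shape handling in the kernel above concrete, a small sketch (illustrative only, not part of the diff) of how x_num_col_dims flattens an input before the GEMM call; for dims [2, 3, 4], num_col_dims = 1 yields a [2, 12] matrix and num_col_dims = 2 yields [6, 4]:

// Hedged sketch of the flattening used by ReshapeToMatrix above.
#include "paddle/fluid/framework/ddim.h"

void FlattenExample() {
  auto dims = paddle::framework::make_ddim({2, 3, 4});
  auto m1 = paddle::framework::flatten_to_2d(dims, 1);  // [2, 12]
  auto m2 = paddle::framework::flatten_to_2d(dims, 2);  // [6, 4]
  (void)m1;
  (void)m2;
}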
......@@ -4,6 +4,12 @@ if(WITH_GPU)
proto_library(cuda_error_proto SRCS cuda_error.proto)
endif(WITH_GPU)
if(WITH_XPU)
set(XPU_CTX_DEPS xpulib)
else()
set(XPU_CTX_DEPS)
endif(WITH_XPU)
if (WITH_PYTHON)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
......@@ -50,6 +56,10 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cu
cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
if(WITH_XPU)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce)
endif()
add_subdirectory(dynload)
add_subdirectory(stream)
......@@ -84,7 +94,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper)
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto device_context enforce)
......
......@@ -61,7 +61,8 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU option or check that your train process hold the "
"with WITH_GPU or WITH_XPU option or check that your train process "
"hold the "
"correct gpu_id if you use Executor.",
place));
}
......@@ -115,6 +116,14 @@ DeviceContextPool::DeviceContextPool(
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_xpu_place(p)) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<XPUDeviceContext, XPUPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
}
}
......@@ -134,6 +143,49 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
Place CPUDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); }
XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); }
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
context_ = xpu::create_context();
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
void XPUDeviceContext::Wait() const {
int ret = xpu_set_device(place_.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
xpu_wait();
}
Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice : public Eigen::StreamInterface {
......
......@@ -43,6 +43,10 @@ limitations under the License. */
#endif
#include "unsupported/Eigen/CXX11/Tensor"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
namespace paddle {
namespace platform {
......@@ -76,6 +80,35 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
using TYPE = CPUDeviceContext;
};
#ifdef PADDLE_WITH_XPU
class XPUDeviceContext : public DeviceContext {
public:
XPUDeviceContext();
explicit XPUDeviceContext(XPUPlace place);
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
xpu::Context* x_context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
private:
XPUPlace place_;
xpu::Context* context_;
// Needs to be the same as in other DeviceContexts,
// even though eigen_device_ is not used on XPU.
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
DISABLE_COPY_AND_ASSIGN(XPUDeviceContext);
};
template <>
struct DefaultDeviceContextType<platform::XPUPlace> {
using TYPE = XPUDeviceContext;
};
#endif
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice;
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
TEST(Device, Init) {
using paddle::platform::DeviceContext;
using paddle::platform::XPUDeviceContext;
using paddle::platform::XPUPlace;
int count = paddle::platform::GetXPUDeviceCount();
for (int i = 0; i < count; i++) {
XPUDeviceContext* device_context = new XPUDeviceContext(XPUPlace(i));
xpu::Context* ctx = device_context->x_context();
ASSERT_NE(nullptr, ctx);
delete device_context;
}
}
TEST(Device, DeviceContextPool) {
using paddle::platform::DeviceContextPool;
using paddle::platform::XPUDeviceContext;
using paddle::platform::Place;
using paddle::platform::CPUPlace;
using paddle::platform::XPUPlace;
DeviceContextPool& pool = DeviceContextPool::Instance();
auto cpu_dev_ctx1 = pool.Get(CPUPlace());
auto cpu_dev_ctx2 = pool.Get(CPUPlace());
ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
std::vector<Place> xpu_places;
int count = paddle::platform::GetXPUDeviceCount();
for (int i = 0; i < count; ++i) {
auto dev_ctx = pool.Get(XPUPlace(i));
ASSERT_NE(dev_ctx, nullptr);
}
}
......@@ -33,6 +33,11 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/piece.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#include "paddle/fluid/platform/xpu_info.h"
#endif
DECLARE_int32(paddle_num_threads);
DEFINE_int32(multiple_of_cupti_buffer_size, 1,
"Multiple of the CUPTI device buffer size. If the timestamps have "
......@@ -151,6 +156,14 @@ void InitDevices(bool init_p2p) {
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
}
#endif
#ifdef PADDLE_WITH_XPU
try {
// use user specified XPUs in single-node multi-process mode.
devices = platform::GetXPUSelectedDevices();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
}
#endif
InitDevices(init_p2p, devices);
}
......@@ -165,7 +178,13 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
LOG(WARNING) << "Invalid devices id.";
continue;
}
#ifdef PADDLE_WITH_CUDA
places.emplace_back(platform::CUDAPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_XPU
places.emplace_back(platform::XPUPlace(devices[i]));
#endif
}
if (init_p2p) {
InitP2P(devices);
......
......@@ -20,7 +20,7 @@ TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU)
InitDevices(true);
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U);
......@@ -39,6 +39,18 @@ TEST(InitDevices, CUDA) {
#endif
}
TEST(InitDevices, XPU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_XPU
int count = paddle::platform::GetXPUDeviceCount();
InitDevices(true);
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif
}
#ifndef _WIN32
TEST(SignalHandle, SignalHandle) {
std::string msg = "Signal raises";
......
......@@ -32,6 +32,7 @@ class PlacePrinter : public boost::static_visitor<> {
void operator()(const CUDAPlace &p) {
os_ << "CUDAPlace(" << p.device << ")";
}
void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
......@@ -44,6 +45,10 @@ bool is_gpu_place(const Place &p) {
return boost::apply_visitor(IsCUDAPlace(), p);
}
bool is_xpu_place(const Place &p) {
return boost::apply_visitor(IsXPUPlace(), p);
}
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
......@@ -60,6 +65,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
if (places_are_same_class(p1, p2)) {
if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
return true;
} else if (is_xpu_place(p1)) {
return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
} else {
return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
}
......
......@@ -58,31 +58,58 @@ struct CUDAPinnedPlace {
inline bool operator<(const CUDAPinnedPlace &) const { return false; }
};
// Place for Baidu Kunlun Accelerator
struct XPUPlace {
XPUPlace() : XPUPlace(0) {}
explicit XPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const XPUPlace &o) const { return device == o.device; }
inline bool operator!=(const XPUPlace &o) const { return !(*this == o); }
inline bool operator<(const XPUPlace &o) const { return device < o.device; }
int device;
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const CUDAPlace &gpu) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &cpu) const { return true; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
class Place : public boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> {
struct IsXPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &xpu) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
class Place
: public boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace> {
private:
using PlaceBase = boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace>;
using PlaceBase =
boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace>;
public:
Place() = default;
Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT
Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT
Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT
: PlaceBase(cuda_pinned_place) {}
......@@ -98,6 +125,7 @@ class Place : public boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> {
using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
......@@ -115,6 +143,16 @@ struct PlaceVisitorWrapper
return visitor_(cpu);
}
typename Visitor::result_type operator()(const XPUPlace &xpu) const {
#ifdef PADDLE_WITH_XPU
return visitor_(xpu);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with XPU. Cannot visit xpu device"));
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#ifdef PADDLE_WITH_CUDA
return visitor_(cuda);
......
......@@ -18,19 +18,32 @@
TEST(Place, Equality) {
paddle::platform::CPUPlace cpu;
paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
paddle::platform::XPUPlace x0(0), x1(1), xx0(0);
EXPECT_EQ(cpu, cpu);
EXPECT_EQ(g0, g0);
EXPECT_EQ(g1, g1);
EXPECT_EQ(g0, gg0);
EXPECT_EQ(x0, x0);
EXPECT_EQ(x1, x1);
EXPECT_EQ(x0, xx0);
EXPECT_NE(g0, g1);
EXPECT_NE(x0, x1);
EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0));
EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
EXPECT_FALSE(paddle::platform::places_are_same_class(x0, cpu));
EXPECT_FALSE(paddle::platform::places_are_same_class(g0, x0));
}
TEST(Place, Print) {
{
std::stringstream ss;
ss << paddle::platform::XPUPlace(1);
EXPECT_EQ("XPUPlace(1)", ss.str());
}
{
std::stringstream ss;
ss << paddle::platform::CUDAPlace(1);
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_XPU
#include "xpu/api.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
namespace xpu = baidu::xpu::api;
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/xpu_info.h"
#include <algorithm>
#include <cstdlib>
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/xpu_header.h"
#include "paddle/fluid/string/split.h"
DEFINE_string(selected_xpus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (XPU). If you want to use "
"all visible devices, set this to empty string. NOTE: the "
"reason of doing this is that we want to use P2P communication"
"between XPU devices, use XPU_VISIBLE_DEVICES can only use"
"share-memory only.");
namespace paddle {
namespace platform {
static int GetXPUDeviceCountImpl() {
const auto *xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
if (xpu_visible_devices != nullptr) {
std::string xpu_visible_devices_str(xpu_visible_devices);
if (std::all_of(xpu_visible_devices_str.begin(),
xpu_visible_devices_str.end(),
[](char ch) { return ch == ' '; })) {
VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
return 0;
}
}
int count = 0;
int ret = xpu_device_count(&count);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
return count;
}
int GetXPUDeviceCount() {
static auto dev_cnt = GetXPUDeviceCountImpl();
return dev_cnt;
}
int GetXPUCurrentDeviceId() {
int dev_id;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// Device ids >= 64 refer to the XPU simulator; subtract 64 to get the real device id.
dev_id -= 64;
}
return dev_id;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices() {
// use user specified XPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_xpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetXPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void SetXPUDeviceId(int id) {
PADDLE_ENFORCE_LT(
id, GetXPUDeviceCount(),
platform::errors::InvalidArgument("id must less than XPU count"));
int ret = xpu_set_device(id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
} // namespace platform
} // namespace paddle
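Taken together, GetXPUDeviceCountImpl and GetXPUSelectedDevices mean that device visibility is controlled by the XPU_VISIBLE_DEVICES environment variable, while the per-process device choice comes from the selected_xpus gflag. A minimal usage sketch follows; the FLAGS_* environment passthrough for selected_xpus is an assumption for illustration, not something this diff shows:
import os
# Hide all but XPU 0 from this process; read via std::getenv in
# GetXPUDeviceCountImpl above. Set before paddle.fluid initializes devices.
os.environ["XPU_VISIBLE_DEVICES"] = "0"
# Assumed: selected_xpus is also reachable through the usual FLAGS_* environment
# passthrough; otherwise pass it as a gflag on the command line.
os.environ["FLAGS_selected_xpus"] = "0"
import paddle.fluid as fluid
print(fluid.is_compiled_with_xpu())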
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <vector>
namespace paddle {
namespace platform {
//! Get the total number of XPU devices in system.
int GetXPUDeviceCount();
//! Get the current XPU device id in system.
int GetXPUCurrentDeviceId();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices();
//! Set the XPU device id for next execution.
void SetXPUDeviceId(int device_id);
} // namespace platform
} // namespace paddle
#endif
......@@ -66,11 +66,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
return place_obj.cast<platform::CPUPlace>();
} else if (py::isinstance<platform::CUDAPlace>(place_obj)) {
return place_obj.cast<platform::CUDAPlace>();
} else if (py::isinstance<platform::XPUPlace>(place_obj)) {
return place_obj.cast<platform::XPUPlace>();
} else if (py::isinstance<platform::CUDAPinnedPlace>(place_obj)) {
return place_obj.cast<platform::CUDAPinnedPlace>();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace"));
"Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
}
}
......@@ -92,6 +94,9 @@ static void InitTensorForVarBase(imperative::VarBase *self,
if (platform::is_cpu_place(place)) {
SetTensorFromPyArray<platform::CPUPlace>(
tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy);
} else if (platform::is_xpu_place(place)) {
SetTensorFromPyArray<platform::XPUPlace>(
tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), zero_copy);
} else if (platform::is_gpu_place(place)) {
SetTensorFromPyArray<platform::CUDAPlace>(
tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), zero_copy);
......@@ -101,7 +106,7 @@ static void InitTensorForVarBase(imperative::VarBase *self,
zero_copy);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace"));
"Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
}
if (stop_gradient != -1) {
self->SetOverridedStopGradient(stop_gradient);
......@@ -588,6 +593,10 @@ void BindImperative(py::module *m_ptr) {
py::arg("value"), py::arg("place"), py::arg("persistable") = false,
py::arg("zero_copy") = false, py::arg("name") = "",
py::arg("stop_gradient") = -1)
.def("__init__", &InitVarBaseFromNumpyWithArg<platform::XPUPlace>,
py::arg("value"), py::arg("place"), py::arg("persistable") = false,
py::arg("zero_copy") = false, py::arg("name") = "",
py::arg("stop_gradient") = -1)
.def("__init__", &InitVarBaseFromNumpyWithArg<platform::CUDAPlace>,
py::arg("value"), py::arg("place"), py::arg("persistable") = false,
py::arg("zero_copy") = false, py::arg("name") = "",
......@@ -823,6 +832,10 @@ void BindImperative(py::module *m_ptr) {
const platform::CUDAPinnedPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
py::return_value_policy::copy)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::XPUPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
py::return_value_policy::copy)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CUDAPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
......@@ -890,6 +903,9 @@ void BindImperative(py::module *m_ptr) {
if (py::isinstance<platform::CUDAPlace>(obj)) {
auto p = obj.cast<platform::CUDAPlace *>();
self.SetExpectedPlace(*p);
} else if (py::isinstance<platform::XPUPlace>(obj)) {
auto p = obj.cast<platform::XPUPlace *>();
self.SetExpectedPlace(*p);
} else if (py::isinstance<platform::CPUPlace>(obj)) {
auto p = obj.cast<platform::CPUPlace *>();
self.SetExpectedPlace(*p);
......@@ -898,7 +914,8 @@ void BindImperative(py::module *m_ptr) {
self.SetExpectedPlace(*p);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Incompatible Place Type: supports CUDAPlace, CPUPlace, "
"Incompatible Place Type: supports XPUPlace, CUDAPlace, "
"CPUPlace, "
"and CUDAPinnedPlace, "
"but got Unknown Type!"));
}
......@@ -928,6 +945,19 @@ void BindImperative(py::module *m_ptr) {
*(imperative::AmpOperators::Instance().GetAllowOps()),
*(imperative::AmpOperators::Instance().GetBlockOps()));
})
.def("trace",
[](imperative::Tracer &self, const std::string &type,
const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
framework::AttributeMap attrs, const platform::XPUPlace &place,
bool trace_backward) {
auto ins_map = ConvertToNameVarBaseMap(ins);
auto outs_map = ConvertToNameVarBaseMap(outs);
{
py::gil_scoped_release release;
self.TraceOp(type, std::move(ins_map), std::move(outs_map),
std::move(attrs), place, trace_backward);
}
})
.def("trace",
[](imperative::Tracer &self, const std::string &type,
const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
......
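The imperative bindings above (InitVarBaseFromNumpyWithArg&lt;XPUPlace&gt;, the _copy_to overload, SetExpectedPlace for XPUPlace and the extra trace overload) are what make dygraph work on an XPUPlace. A rough sketch of the intended usage, assuming the mul XPU kernel added elsewhere in this change is available; illustrative only:
import numpy as np
import paddle.fluid as fluid
if fluid.is_compiled_with_xpu():
    # Tensors created under the guard live on XPU 0; ops are traced through
    # the XPUPlace overload of Tracer::TraceOp added above.
    with fluid.dygraph.guard(fluid.XPUPlace(0)):
        x = fluid.dygraph.to_variable(np.ones((2, 3), "float32"))
        w = fluid.dygraph.to_variable(np.ones((3, 4), "float32"))
        out = fluid.layers.mul(x, w)
        print(out.numpy())  # copied back to host for printing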
......@@ -90,6 +90,10 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/pybind/communicator_py.h"
#endif
......@@ -118,6 +122,14 @@ bool IsCompiledWithCUDA() {
#endif
}
bool IsCompiledWithXPU() {
#ifndef PADDLE_WITH_XPU
return false;
#else
return true;
#endif
}
bool IsCompiledWithMKLDNN() {
#ifndef PADDLE_WITH_MKLDNN
return false;
......@@ -471,6 +483,10 @@ PYBIND11_MODULE(core_noavx, m) {
[](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place);
......@@ -483,6 +499,10 @@ PYBIND11_MODULE(core_noavx, m) {
[](Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place);
......@@ -500,6 +520,11 @@ PYBIND11_MODULE(core_noavx, m) {
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("_mutable_data",
[](Tensor &self, paddle::platform::XPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("_mutable_data",
[](Tensor &self, paddle::platform::CUDAPlace &place,
paddle::framework::proto::VarType::Type type) {
......@@ -513,6 +538,8 @@ PYBIND11_MODULE(core_noavx, m) {
.def("_clear", &Tensor::clear)
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
......@@ -522,7 +549,7 @@ PYBIND11_MODULE(core_noavx, m) {
Args:
lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|CUDAPinnedPlace): The place where the
place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the
LoDTensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
......@@ -1230,6 +1257,18 @@ All parameter, weight, gradient are variables in Paddle.
-> paddle::platform::DeviceContext* {
return new paddle::platform::CPUDeviceContext();
})
.def_static("create",
[](paddle::platform::XPUPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_XPU
PADDLE_THROW(
platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#else
return new paddle::platform::XPUDeviceContext(place);
#endif
})
.def_static("create",
[](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* {
......@@ -1331,6 +1370,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id",
......@@ -1338,6 +1378,60 @@ All parameter, weight, gradient are variables in Paddle.
#endif
.def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<platform::XPUPlace>(m, "XPUPlace", R"DOC(
**Note**:
XPUPlace is a descriptor of a Baidu Kunlun XPU device. The argument is the XPU device id (counting from 0).
Examples:
.. code-block:: python
import paddle.fluid as fluid
xpu_place = fluid.XPUPlace(0)
)DOC")
.def("__init__",
[](platform::XPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_XPU
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) {
if (platform::GetXPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use XPU because there is no XPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), must inside [0, %d), because XPU "
"number on your machine is %d",
dev_id, platform::GetXPUDeviceCount(),
platform::GetXPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::XPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use XPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use XPU, please try to install XPU version "
"PaddlePaddle by: pip install paddlepaddle-xpu\n"
"If you only have CPU, please change XPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::XPUPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::XPUPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device.
It represents a CPU device allocated or to be allocated with Tensor or LoDTensor.
......@@ -1352,6 +1446,7 @@ All parameter, weight, gradient are variables in Paddle.
.def(py::init<>())
.def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals",
......@@ -1386,6 +1481,8 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals",
......@@ -1398,11 +1495,14 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::Place, platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
.def("is_cpu_place",
[](platform::Place &self) { return platform::is_cpu_place(self); })
.def("is_xpu_place",
[](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_cuda_pinned_place",
[](platform::Place &self) {
return platform::is_cuda_pinned_place(self);
......@@ -1411,12 +1511,20 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) {
return BOOST_GET_CONST(platform::CUDAPlace, self).device;
})
.def("xpu_device_id",
[](platform::Place &self) {
return BOOST_GET_CONST(platform::XPUPlace, self).device;
})
.def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; })
.def("set_place",
[](platform::Place &self, const platform::CPUPlace &cpu_place) {
self = cpu_place;
})
.def("set_place",
[](platform::Place &self, const platform::XPUPlace &xpu_place) {
self = xpu_place;
})
.def("set_place",
[](platform::Place &self, const platform::CUDAPlace &gpu_place) {
self = gpu_place;
......@@ -1444,6 +1552,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CPUPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::XPUPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CUDAPlace &place) { self.Run(scope, place); })
......@@ -1544,6 +1655,7 @@ All parameter, weight, gradient are variables in Paddle.
[](bool init_p2p) { framework::InitDevices(init_p2p); });
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
m.def("is_compiled_with_dist", IsCompiledWithDIST);
......
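On the Python side, the new bindings behave like the existing place types. A small sketch exercising XPUPlace, Place.set_place, is_xpu_place and xpu_device_id as bound above, guarded so it is a no-op on non-XPU builds:
import paddle.fluid as fluid
import paddle.fluid.core as core
if fluid.is_compiled_with_xpu():
    xpu0 = fluid.XPUPlace(0)
    print(xpu0)                            # XPUPlace(0)
    p = core.Place()
    p.set_place(xpu0)
    print(p.is_xpu_place())                # True
    print(p.xpu_device_id())               # 0
    print(xpu0._equals(fluid.CPUPlace()))  # False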
......@@ -145,8 +145,14 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
T b = static_cast<T>(0);
if (platform::is_cpu_place(self.place())) {
b = self.data<T>()[offset];
} else if (platform::is_xpu_place(self.place())) {
#ifdef PADDLE_WITH_XPU
const T *a = self.data<T>();
auto p = BOOST_GET_CONST(platform::XPUPlace, self.place());
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
#endif
} else if (platform::is_gpu_place(self.place())) {
#ifdef PADDLE_WITH_CUDA
} else {
const T *a = self.data<T>();
auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
......@@ -163,8 +169,14 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
"The offset exceeds the size of tensor."));
if (platform::is_cpu_place(self->place())) {
self->mutable_data<T>(self->place())[offset] = elem;
} else if (platform::is_xpu_place(self->place())) {
#ifdef PADDLE_WITH_XPU
auto p = BOOST_GET_CONST(platform::XPUPlace, self->place());
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
#endif
} else if (platform::is_gpu_place(self->place())) {
#ifdef PADDLE_WITH_CUDA
} else {
auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
......@@ -194,6 +206,16 @@ void SetTensorFromPyArrayT(
auto dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), array.nbytes());
}
} else if (paddle::platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
auto dst = self->mutable_data<T>(place);
xpu_memcpy(dst, array.data(), array.nbytes(),
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else {
#ifdef PADDLE_WITH_CUDA
auto dst = self->mutable_data<T>(place);
......@@ -354,8 +376,13 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
if (platform::is_cpu_place(place)) {
output->mutable_data(BOOST_GET_CONST(platform::CPUPlace, place),
self.type());
#ifdef PADDLE_WITH_CUDA
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place),
self.type());
#endif
} else {
#ifdef PADDLE_WITH_CUDA
if (platform::is_cuda_pinned_place(place)) {
output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
self.type());
......@@ -516,6 +543,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
return py::array();
}
bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
const auto &tensor_dims = tensor.dims();
auto tensor_dtype = tensor.type();
size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
......@@ -534,7 +562,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type());
if (!is_gpu_tensor) {
if (!is_gpu_tensor && !is_xpu_tensor) {
if (!need_deep_copy) {
return py::array(py::buffer_info(
const_cast<void *>(tensor_buf_ptr), sizeof_dtype, py_dtype_str,
......@@ -557,28 +585,54 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
copy_bytes);
return py_arr;
}
}
} else if (is_xpu_tensor) {
#ifdef PADDLE_WITH_XPU
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(
py_arr.owndata(), true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
auto p = BOOST_GET_CONST(platform::XPUPlace, tensor.place());
paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p,
tensor_buf_ptr, copy_bytes);
return py_arr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (is_gpu_tensor) {
#ifdef PADDLE_WITH_CUDA
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(py_arr.owndata(), true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr,
copy_bytes, cudaMemcpyDeviceToHost);
return py_arr;
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(
py_arr.owndata(), true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr,
copy_bytes, cudaMemcpyDeviceToHost);
return py_arr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#endif
}
PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
return py::array();
}
} // namespace pybind
......
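tensor_py.h now round-trips data between numpy and XPU memory: set() goes through xpu_memcpy, and TensorToPyArray copies device memory back into a fresh numpy array. A hedged sketch of that round trip; converting a LoDTensor with np.array() is assumed to take the TensorToPyArray path, as it does for CPU/GPU tensors:
import numpy as np
import paddle.fluid as fluid
if fluid.is_compiled_with_xpu():
    data = np.random.random((2, 3)).astype("float32")
    t = fluid.LoDTensor()
    t.set(data, fluid.XPUPlace(0))   # host -> XPU copy (xpu_memcpy above)
    back = np.array(t)               # XPU -> host copy via TensorToPyArray
    assert np.allclose(back, data)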
......@@ -68,7 +68,7 @@ from .input import embedding, one_hot
from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
from .incubate import fleet
from .incubate import data_generator
from .transpiler import DistributeTranspiler, \
......@@ -119,6 +119,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'LoDTensor',
'LoDTensorArray',
'CPUPlace',
'XPUPlace',
'CUDAPlace',
'CUDAPinnedPlace',
'Tensor',
......
......@@ -48,6 +48,7 @@ __all__ = [
'cuda_pinned_places',
'in_dygraph_mode',
'is_compiled_with_cuda',
'is_compiled_with_xpu',
'Variable',
'ComplexVariable',
'load_op_library',
......@@ -310,6 +311,21 @@ def _cuda_ids():
return device_ids
def is_compiled_with_xpu():
"""
Whether this whl package can be used to run the model on XPU.
Returns (bool): whether XPU is supported or not.
Examples:
.. code-block:: python
import paddle.fluid as fluid
support_xpu = fluid.is_compiled_with_xpu()
"""
return core.is_compiled_with_xpu()
def is_compiled_with_cuda():
"""
Whether this whl package can be used to run the model on GPU.
......
......@@ -175,5 +175,57 @@ class TestFP16MulOp2(TestMulOp2):
no_grad_set=set('Y'))
@unittest.skipIf(not core.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp1(TestMulOp):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = core.XPUPlace(0)
self.check_output_with_place(place, atol=1e-1)
def test_check_grad_normal(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.5)
def test_check_grad_ingore_x(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
@unittest.skipIf(not core.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp2(TestMulOp2):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = core.XPUPlace(0)
self.check_output_with_place(place, atol=2e-1)
def test_check_grad_normal(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.9)
def test_check_grad_ingore_x(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
if __name__ == "__main__":
unittest.main()
......@@ -303,6 +303,23 @@ if '${WITH_MKLDNN}' == 'ON':
else:
package_data['paddle.libs']+=['mkldnn.dll']
if '${WITH_XPU}' == 'ON':
# only change rpath in Release mode.
if '${CMAKE_BUILD_TYPE}' == 'Release':
if os.name != 'nt':
if "@APPLE@" == "1":
command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}"
else:
command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}"
if os.system(command) != 0:
raise Exception("patch ${XPU_API_LIB} failed, command: %s" % command)
shutil.copy('${XPU_API_LIB}', libs_path)
shutil.copy('${XPU_RT_LIB}', libs_path)
shutil.copy('${XPU_SIM_LIB}', libs_path)
package_data['paddle.libs']+=['${XPU_API_LIB_NAME}',
'${XPU_RT_LIB_NAME}',
'${XPU_SIM_LIB_NAME}']
# copy libfuild_framework.so to libs
if os.name != 'nt' and sys.platform != 'darwin':
paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}'
......