Unverified commit c1e5a393, authored by: W Wilber, committed by: GitHub

[PTEN] Add xpu context. (#39098)

Parent commit: b2a7261d
......@@ -86,5 +86,12 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
using TYPE = pten::CPUContext;
};
#ifdef PADDLE_WITH_XPU
// Maps the fluid XPUDeviceContext onto its pten-side equivalent so generic
// kernel code can translate framework contexts into pten contexts.
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
  using TYPE = pten::XPUContext;
};
#endif
} // namespace framework
} // namespace paddle
......@@ -94,11 +94,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
inverse_scale = 0.0;
}
paddle::platform::XPUVersion version = dev_ctx.xpu_version();
auto version = dev_ctx.xpu_version();
framework::Tensor float_x;
framework::Tensor float_out;
if (std::is_same<T, paddle::platform::float16>::value &&
(version == paddle::platform::XPUVersion::XPU1)) {
(version == pten::backends::xpu::XPUVersion::XPU1)) {
float_x.mutable_data<MPDType>(dev_ctx.GetPlace(),
x->numel() * sizeof(MPDType));
float_out.mutable_data<MPDType>(dev_ctx.GetPlace(),
......
......@@ -107,8 +107,8 @@ class DropoutGradXPUKernel : public framework::OpKernel<T> {
return;
}
paddle::platform::XPUVersion version = dev_ctx.xpu_version();
if (version == paddle::platform::XPUVersion::XPU1) {
auto version = dev_ctx.xpu_version();
if (version == pten::backends::xpu::XPUVersion::XPU1) {
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
XPUType* mask_new = RAII_GUARD.alloc_l3_or_gm<XPUType>(mask->numel());
float scale =
......
......@@ -448,7 +448,8 @@ class ReshapeKernel {
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out);
pten::ReshapeKernel(static_cast<const pten::XPUContext &>(dev_ctx),
*pt_x.get(), pt_scalar_shape, pt_out);
}
#endif
// non-inplace need move all result from pt_out to out, inplace need set
......@@ -485,7 +486,8 @@ class ReshapeGradKernel {
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get());
pten::ReshapeGradKernel(static_cast<const pten::XPUContext &>(dev_ctx),
*pt_d_out.get(), pt_d_x.get());
}
#endif
}
......@@ -516,7 +518,9 @@ class ReshapeDoubleGradKernel {
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get());
pten::ReshapeDoubleGradKernel(
static_cast<const pten::XPUContext &>(dev_ctx), *pt_dd_x.get(),
pt_dd_out.get());
}
#endif
}
......
......@@ -45,8 +45,8 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> {
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = XPU_SUCCESS;
paddle::platform::XPUVersion version = dev_ctx.xpu_version();
if (version == paddle::platform::XPUVersion::XPU1) {
auto version = dev_ctx.xpu_version();
if (version == pten::backends::xpu::XPUVersion::XPU1) {
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
XPUType* clip_x_data_l3 = RAII_GUARD.alloc_l3_or_gm<XPUType>(x->numel());
r = xpu::clip_v2(dev_ctx.x_context(),
......
......@@ -121,6 +121,9 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} cpu_context)
if(WITH_XPU)
target_link_libraries(device_context xpu_context)
endif()
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
if(WITH_ASCEND_CL)
......
......@@ -4,7 +4,7 @@ endif()
set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info)
cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context)
add_subdirectory(tests)
......@@ -15,177 +15,36 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/enforce.h"
#include "xpu/bkcl.h"
#include "paddle/pten/backends/xpu/enforce_xpu.h"
namespace paddle {
namespace platform {
// Note: XPU runtime api return int, not XPUError_t
inline const char* xpuGetErrorString(int stat) {
switch (stat) {
case XPU_SUCCESS:
return "Success";
case XPUERR_INVALID_DEVICE:
return "Invalid XPU device";
case XPUERR_UNINIT:
return "XPU runtime not properly inited";
case XPUERR_NOMEM:
return "Device memory not enough";
case XPUERR_NOCPUMEM:
return "CPU memory not enough";
case XPUERR_INVALID_PARAM:
return "Invalid parameter";
case XPUERR_NOXPUFUNC:
return "Cannot get XPU Func";
case XPUERR_LDSO:
return "Error loading dynamic library";
case XPUERR_LDSYM:
return "Error loading func from dynamic library";
case XPUERR_SIMULATOR:
return "Error from XPU Simulator";
case XPUERR_NOSUPPORT:
return "Operation not supported";
case XPUERR_ABNORMAL:
return "Device abnormal due to previous error";
case XPUERR_KEXCEPTION:
return "Exception in kernel execution";
case XPUERR_TIMEOUT:
return "Kernel execution timed out";
case XPUERR_BUSY:
return "Resource busy";
case XPUERR_USEAFCLOSE:
return "Use a stream after closed";
case XPUERR_UCECC:
return "Uncorrectable ECC";
case XPUERR_OVERHEAT:
return "Overheat";
case XPUERR_UNEXPECT:
return "Execution error, reach unexpected control flow";
case XPUERR_DEVRESET:
return "Device is being reset, try again later";
case XPUERR_HWEXCEPTION:
return "Hardware module exception";
case XPUERR_HBM_INIT:
return "Error init HBM";
case XPUERR_DEVINIT:
return "Error init device";
case XPUERR_PEERRESET:
return "Device is being reset, try again later";
case XPUERR_MAXDEV:
return "Device count exceed limit";
case XPUERR_NOIOC:
return "Unknown IOCTL command";
case XPUERR_DMATIMEOUT:
return "DMA timed out, a reboot maybe needed";
case XPUERR_DMAABORT:
return "DMA aborted due to error, possibly wrong address or hardware "
"state";
case XPUERR_MCUUNINIT:
return "Firmware not initialized";
case XPUERR_OLDFW:
return "Firmware version too old (<15), please update.";
case XPUERR_PCIE:
return "Error in PCIE";
case XPUERR_FAULT:
return "Error copy between kernel and user space";
case XPUERR_INTERRUPTED:
return "Execution interrupted by user";
default:
return "unkonwn error";
}
return pten::backends::xpu::xpuGetErrorString(stat);
}
inline const char* bkclGetErrorString(BKCLResult_t stat) {
switch (stat) {
case BKCL_SUCCESS:
return "BKCL_SUCCESS";
case BKCL_INVALID_ARGUMENT:
return "BKCL_INVALID_ARGUMENT";
case BKCL_RUNTIME_ERROR:
return "BKCL_RUNTIME_ERROR";
case BKCL_SYSTEM_ERROR:
return "BKCL_SYSTEM_ERROR";
case BKCL_INTERNAL_ERROR:
return "BKCL_INTERNAL_ERROR";
default:
return "Unknown BKCL status";
}
return pten::backends::xpu::bkclGetErrorString(stat);
}
inline const char* xdnnGetErrorString(int stat) {
switch (stat) {
case xpu::Error_t::SUCCESS:
return "XDNN_SUCCESS";
case xpu::Error_t::INVALID_PARAM:
return "XDNN_INVALID_PARAM";
case xpu::Error_t::RUNTIME_ERROR:
return "XDNN_RUNTIME_ERROR";
case xpu::Error_t::NO_ENOUGH_WORKSPACE:
return "XDNN_NO_ENOUGH_WORKSPACE";
case xpu::Error_t::NOT_IMPLEMENT:
return "XDNN_NOT_IMPLEMENT";
default:
return "Unknown XDNN status";
}
return pten::backends::xpu::xdnnGetErrorString(stat);
}
inline std::string build_xpu_error_msg(int stat) {
std::string msg("XPU Error <" + std::to_string(stat) + ">, ");
return msg + xpuGetErrorString(stat) + " ";
return pten::backends::xpu::build_xpu_error_msg(stat);
}
inline std::string build_xpu_error_msg(BKCLResult_t stat) {
std::string msg("BKCL Error, ");
return msg + bkclGetErrorString(stat) + " ";
return pten::backends::xpu::build_xpu_error_msg(stat);
}
inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) {
return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " ";
return pten::backends::xpu::build_xpu_xdnn_error_msg(stat, msg);
}
namespace details {
template <typename T>
struct ExternalApiType {};
#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
template <> \
struct ExternalApiType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
#undef DEFINE_EXTERNAL_API_TYPE
} // namespace details
#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __XPU_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::ExternalApiType< \
__XPU_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = paddle::platform::errors::External( \
::paddle::platform::build_xpu_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG) \
do { \
auto __cond__ = (COND); \
if (UNLIKELY(__cond__ != xpu::Error_t::SUCCESS)) { \
auto __summary__ = paddle::platform::errors::External( \
::paddle::platform::build_xpu_xdnn_error_msg(__cond__, MSG)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
} // namespace platform
} // namespace paddle
......@@ -15,42 +15,5 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <map>
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xpu = baidu::xpu::api;
static std::map<int, std::string> XPUAPIErrorMsg = {
{xpu::Error_t::SUCCESS, "xpu api success"},
{xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
{xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
{xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
template <typename T>
class XPUTypeTrait {
public:
using Type = T;
};
template <>
class XPUTypeTrait<paddle::platform::float16> {
public:
using Type = float16;
};
template <>
class XPUTypeTrait<paddle::platform::bfloat16> {
public:
using Type = bfloat16;
};
#include "paddle/pten/backends/xpu/xpu_header.h"
#endif
......@@ -14,22 +14,14 @@ limitations under the License. */
#include <cstdlib>
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
PADDLE_DEFINE_EXPORTED_string(
selected_xpus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (XPU). If you want to use "
"all visible devices, set this to empty string. NOTE: the "
"reason of doing this is that we want to use P2P communication"
"between XPU devices, use XPU_VISIBLE_DEVICES can only use"
"share-memory only.");
#include "paddle/pten/backends/xpu/xpu_info.h"
namespace paddle {
namespace platform {
......@@ -37,101 +29,40 @@ namespace platform {
/**************************** Version Management **************************/
//! Get the version of XPU Driver
int GetDriverVersion() {
uint32_t driver_version_major = 0;
uint32_t driver_version_minor = 0;
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_get_driver_version(&driver_version_major, &driver_version_minor));
int driver_version = driver_version_major * 10 + driver_version_minor;
return driver_version;
}
int GetDriverVersion() { return pten::backends::xpu::GetDriverVersion(); }
//! Get the version of XPU Runtime
int GetRuntimeVersion() {
uint32_t rumtime_version_major = 0;
uint32_t rumtime_version_minor = 0;
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_get_runtime_version(&rumtime_version_major, &rumtime_version_minor));
int runtime_version = rumtime_version_major * 10 + rumtime_version_minor;
return runtime_version;
}
int GetRuntimeVersion() { return pten::backends::xpu::GetRuntimeVersion(); }
/**************************** Device Management **************************/
static int GetDeviceCountImpl() {
const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
if (xpu_visible_devices != nullptr) {
std::string xpu_visible_devices_str(xpu_visible_devices);
if (std::all_of(xpu_visible_devices_str.begin(),
xpu_visible_devices_str.end(),
[](char ch) { return ch == ' '; })) {
VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
return 0;
}
}
int count = 0;
PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count));
return count;
}
int GetXPUDeviceCount() {
static auto dev_cnt = GetDeviceCountImpl();
return dev_cnt;
}
int GetXPUDeviceCount() { return pten::backends::xpu::GetXPUDeviceCount(); }
int GetXPUCurrentDeviceId() {
int dev_id;
PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
return dev_id;
return pten::backends::xpu::GetXPUCurrentDeviceId();
}
void SetXPUDeviceId(int id) {
PADDLE_ENFORCE_LT(
id, GetXPUDeviceCount(),
platform::errors::InvalidArgument("id must less than XPU count"));
PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
}
void SetXPUDeviceId(int id) { pten::backends::xpu::SetXPUDeviceId(id); }
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices() {
// use user specified XPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_xpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetXPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
return pten::backends::xpu::GetXPUSelectedDevices();
}
/**************************** Memory Management **************************/
void MemcpySyncH2D(void* dst, const void* src, size_t count,
const platform::XPUPlace& dst_place) {
platform::XPUDeviceGuard guard(dst_place.device);
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
pten::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place);
}
void MemcpySyncD2H(void* dst, const void* src, size_t count,
const platform::XPUPlace& src_place) {
platform::XPUDeviceGuard guard(src_place.device);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
dev_ctx->Wait();
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
pten::backends::xpu::MemcpySyncD2H(dst, src, count, src_place, *dev_ctx);
}
// if src.device == dst.device and you need sync , after call this function,
......@@ -139,33 +70,16 @@ void MemcpySyncD2H(void* dst, const void* src, size_t count,
void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place,
const void* src, const platform::XPUPlace& src_place,
size_t count) {
int dev_id = GetXPUCurrentDeviceId();
if (dst_place.device == dev_id && src_place.device == dev_id) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
PADDLE_ENFORCE_XDNN_SUCCESS(
xpu::copy(dev_ctx->x_context(), static_cast<const int8_t*>(src),
static_cast<int8_t*>(dst), count),
"copy ");
} else {
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
pten::backends::xpu::MemcpySyncD2D(dst, dst_place, src, src_place, count,
*dev_ctx);
}
/**************************** Others **************************/
XPUVersion get_xpu_version(int dev_id) {
uint64_t v = 0;
PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
if (v == K100 || v == K200) {
VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
return XPU1;
} else {
VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n";
return XPU2;
}
pten::backends::xpu::XPUVersion get_xpu_version(int dev_id) {
return pten::backends::xpu::get_xpu_version(dev_id);
}
} // namespace platform
......
......@@ -13,6 +13,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <vector>
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/backends/xpu/xpu_info.h"
namespace paddle {
namespace platform {
......@@ -50,31 +51,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place,
const void *src, const platform::XPUPlace &src_place,
size_t count);
class XPUDeviceGuard {
public:
explicit inline XPUDeviceGuard(int dev_id) {
int prev_id = platform::GetXPUCurrentDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
platform::SetXPUDeviceId(dev_id);
}
}
using XPUDeviceGuard = pten::backends::xpu::XPUDeviceGuard;
inline ~XPUDeviceGuard() {
if (prev_id_ != -1) {
platform::SetXPUDeviceId(prev_id_);
}
}
XPUDeviceGuard(const XPUDeviceGuard &o) = delete;
XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete;
private:
int prev_id_{-1};
};
enum XPUVersion { XPU1, XPU2 };
XPUVersion get_xpu_version(int dev_id);
pten::backends::xpu::XPUVersion get_xpu_version(int dev_id);
} // namespace platform
} // namespace paddle
......
......@@ -24,7 +24,7 @@ namespace platform {
bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device);
if (v == XPU2) {
if (v == pten::backends::xpu::XPUVersion::XPU2) {
ops = get_kl2_ops();
}
......@@ -74,10 +74,11 @@ bool is_in_xpu_black_list(const std::string& op_name) {
return false;
}
std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
XPUVersion version) {
std::vector<vartype::Type> get_xpu_op_support_type(
const std::string& op_name, pten::backends::xpu::XPUVersion version) {
std::vector<vartype::Type> res;
auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops();
auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops()
: get_kl2_ops();
if (ops.find(op_name) != ops.end()) {
XPUKernelSet& type_set = ops[op_name];
for (auto& item : type_set) {
......@@ -87,9 +88,10 @@ std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
return res;
}
XPUOpListMap get_xpu_op_list(XPUVersion version) {
XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version) {
XPUOpListMap res;
auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops();
auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops()
: get_kl2_ops();
for (auto& op : ops) {
std::vector<vartype::Type> op_vartypes;
for (auto& item : op.second) {
......
......@@ -27,9 +27,9 @@ using XPUOpListMap =
bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type);
bool is_in_xpu_black_list(const std::string& op_name);
std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
XPUVersion version);
XPUOpListMap get_xpu_op_list(XPUVersion version);
std::vector<vartype::Type> get_xpu_op_support_type(
const std::string& op_name, pten::backends::xpu::XPUVersion version);
XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version);
} // namespace platform
} // namespace paddle
......
......@@ -246,52 +246,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() {
context_ = xpu::create_context();
xpu_version_ = get_xpu_version(place_.device);
}
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {}
XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
platform::XPUDeviceGuard guard(place.device);
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place_.device);
context_ = xpu::create_context();
const int MAX_XPU_NUM = 16;
static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
int l3_size = 13.5 * 1024 * 1024;
if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
}
auto selected_xpus = GetXPUSelectedDevices();
for (unsigned int i = 0; i < selected_xpus.size(); i++) {
if (place.device == selected_xpus[i]) {
if (l3ptrs[place.device] == nullptr) {
xpu_malloc(static_cast<void**>(&l3ptrs[place.device]), l3_size,
XPU_MEM_L3);
}
if (l3ptrs[place.device] != nullptr) {
context_->_l3_mgr.set(l3ptrs[place.device], l3_size);
VLOG(3) << "xpu place " << place.device << " set l3 size " << l3_size;
}
break;
}
}
<< static_cast<int>(place.device);
}
void XPUDeviceContext::Wait() const {
platform::SetXPUDeviceId(place_.device);
xpu_wait(context_->xpu_stream);
}
Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -78,6 +78,7 @@ struct GpuDevice;
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/pten/backends/xpu/xpu_context.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -171,39 +172,12 @@ struct DefaultDeviceContextType<platform::MLUPlace>;
#ifdef PADDLE_WITH_XPU
namespace xpu = baidu::xpu::api;
class XPUDeviceContext : public DeviceContext {
class XPUDeviceContext : public pten::XPUContext {
public:
XPUDeviceContext();
explicit XPUDeviceContext(XPUPlace place);
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
XPUVersion xpu_version() const { return xpu_version_; }
Place GetPlace() const override;
xpu::Context* x_context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
#ifdef PADDLE_WITH_XPU_BKCL
/*! \brief Return bkcl context. */
BKCLContext_t bkcl_context() const { return bkcl_context_; }
/*! \brief Set bkcl context. */
void set_bkcl_context(BKCLContext_t context) { bkcl_context_ = context; }
#endif
private:
XPUPlace place_;
XPUVersion xpu_version_;
xpu::Context* context_;
#ifdef PADDLE_WITH_XPU_BKCL
BKCLContext_t bkcl_context_;
#endif
// Need to be the same with other DeviceContext,
// Eventhough eigen_device_ is not used in XPU
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
DISABLE_COPY_AND_ASSIGN(XPUDeviceContext);
};
template <>
......
......@@ -1756,27 +1756,30 @@ All parameter, weight, gradient are variables in Paddle.
.def("__repr__", string::to_string<const platform::XPUPlace &>)
.def("__str__", string::to_string<const platform::XPUPlace &>);
#ifdef PADDLE_WITH_XPU
py::enum_<platform::XPUVersion>(m, "XPUVersion", py::arithmetic())
.value("XPU1", platform::XPUVersion::XPU1)
.value("XPU2", platform::XPUVersion::XPU2)
py::enum_<pten::backends::xpu::XPUVersion>(m, "XPUVersion", py::arithmetic())
.value("XPU1", pten::backends::xpu::XPUVersion::XPU1)
.value("XPU2", pten::backends::xpu::XPUVersion::XPU2)
.export_values();
m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
m.def("get_xpu_device_version",
[](int device_id) { return platform::get_xpu_version(device_id); });
m.def("get_xpu_device_op_support_types",
[](const std::string &op_name, platform::XPUVersion version) {
return platform::get_xpu_op_support_type(op_name, version);
});
m.def("get_xpu_device_op_list", [](platform::XPUVersion version) {
m.def(
"get_xpu_device_op_support_types",
[](const std::string &op_name, pten::backends::xpu::XPUVersion version) {
return platform::get_xpu_op_support_type(op_name, version);
});
m.def("get_xpu_device_op_list", [](pten::backends::xpu::XPUVersion version) {
return platform::get_xpu_op_list(version);
});
m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool {
// XPUs with Compute Capability > xpu2 support float16 and bfloat16
return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1;
return platform::get_xpu_version(place.device) >
pten::backends::xpu::XPUVersion::XPU1;
});
m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool {
// XPUs with Compute Capability > xpu2 support float16 and bfloat16
return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1;
return platform::get_xpu_version(place.device) >
pten::backends::xpu::XPUVersion::XPU1;
});
#endif
......
......@@ -2,4 +2,12 @@ add_subdirectory(dynload)
add_subdirectory(cpu)
cc_library(pten_context SRCS all_context.cc DEPS device_context)
if(WITH_XPU)
add_subdirectory(xpu)
endif()
cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
if(WITH_XPU)
add_dependencies(pten_context xpu_context)
endif()
......@@ -18,16 +18,11 @@
// NOTE: The paddle framework should add WITH_EIGEN option to support compile
// without eigen.
#include "paddle/pten/core/device_context.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace pten {
struct CPUContext::CPUImpl {
Eigen::DefaultDevice* device_{nullptr};
CPUContextResource res_;
CPUPlace place_;
CPUImpl() { device_ = new Eigen::DefaultDevice(); }
// Users need to manage external resources.
......@@ -36,7 +31,7 @@ struct CPUContext::CPUImpl {
}
~CPUImpl() {
if (res_.device == nullptr) {
if (res_.device == nullptr && device_ != nullptr) {
delete device_;
device_ = nullptr;
}
......@@ -56,27 +51,28 @@ struct CPUContext::CPUImpl {
}
Place GetPlace() const { return place_; }
Eigen::DefaultDevice* device_{nullptr};
CPUContextResource res_;
CPUPlace place_;
};
CPUContext::CPUContext() : DeviceContext(), cpu_impl_(nullptr) {
CPUContext::CPUContext() : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
}
CPUContext::CPUContext(const CPUContext& other)
: DeviceContext(), cpu_impl_(nullptr) {
CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
cpu_impl_->SetEigenDevice(other.eigen_device());
}
CPUContext::CPUContext(CPUContext&& other)
: DeviceContext(), cpu_impl_(nullptr) {
CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
cpu_impl_ = std::move(other.cpu_impl_);
}
CPUContext::~CPUContext() = default;
CPUContext::CPUContext(const CPUContextResource& ctx_res)
: DeviceContext(), cpu_impl_(nullptr) {
CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
}
......
cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place)
cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/backends/xpu/xpu_header.h"
#include "xpu/bkcl.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace backends {
namespace xpu {
// Note: XPU runtime api return int, not XPUError_t.
// Maps an XPU runtime status code to a human-readable description.
// Returns a pointer to a static string literal; the caller must not free it.
// Unrecognized codes fall through to a generic "unknown error" message.
inline const char* xpuGetErrorString(int stat) {
  switch (stat) {
    case XPU_SUCCESS:
      return "Success";
    case XPUERR_INVALID_DEVICE:
      return "Invalid XPU device";
    case XPUERR_UNINIT:
      return "XPU runtime not properly inited";
    case XPUERR_NOMEM:
      return "Device memory not enough";
    case XPUERR_NOCPUMEM:
      return "CPU memory not enough";
    case XPUERR_INVALID_PARAM:
      return "Invalid parameter";
    case XPUERR_NOXPUFUNC:
      return "Cannot get XPU Func";
    case XPUERR_LDSO:
      return "Error loading dynamic library";
    case XPUERR_LDSYM:
      return "Error loading func from dynamic library";
    case XPUERR_SIMULATOR:
      return "Error from XPU Simulator";
    case XPUERR_NOSUPPORT:
      return "Operation not supported";
    case XPUERR_ABNORMAL:
      return "Device abnormal due to previous error";
    case XPUERR_KEXCEPTION:
      return "Exception in kernel execution";
    case XPUERR_TIMEOUT:
      return "Kernel execution timed out";
    case XPUERR_BUSY:
      return "Resource busy";
    case XPUERR_USEAFCLOSE:
      return "Use a stream after closed";
    case XPUERR_UCECC:
      return "Uncorrectable ECC";
    case XPUERR_OVERHEAT:
      return "Overheat";
    case XPUERR_UNEXPECT:
      return "Execution error, reach unexpected control flow";
    case XPUERR_DEVRESET:
      return "Device is being reset, try again later";
    case XPUERR_HWEXCEPTION:
      return "Hardware module exception";
    case XPUERR_HBM_INIT:
      return "Error init HBM";
    case XPUERR_DEVINIT:
      return "Error init device";
    case XPUERR_PEERRESET:
      return "Device is being reset, try again later";
    case XPUERR_MAXDEV:
      return "Device count exceed limit";
    case XPUERR_NOIOC:
      return "Unknown IOCTL command";
    case XPUERR_DMATIMEOUT:
      return "DMA timed out, a reboot maybe needed";
    case XPUERR_DMAABORT:
      return "DMA aborted due to error, possibly wrong address or hardware "
             "state";
    case XPUERR_MCUUNINIT:
      return "Firmware not initialized";
    case XPUERR_OLDFW:
      return "Firmware version too old (<15), please update.";
    case XPUERR_PCIE:
      return "Error in PCIE";
    case XPUERR_FAULT:
      return "Error copy between kernel and user space";
    case XPUERR_INTERRUPTED:
      return "Execution interrupted by user";
    default:
      // Fixed typo in the fallback message: was "unkonwn error".
      return "unknown error";
  }
}
// Maps a BKCL status code to its printable enumerator name.
// Returns a pointer to a static string literal; never null.
inline const char* bkclGetErrorString(BKCLResult_t stat) {
  if (stat == BKCL_SUCCESS) return "BKCL_SUCCESS";
  if (stat == BKCL_INVALID_ARGUMENT) return "BKCL_INVALID_ARGUMENT";
  if (stat == BKCL_RUNTIME_ERROR) return "BKCL_RUNTIME_ERROR";
  if (stat == BKCL_SYSTEM_ERROR) return "BKCL_SYSTEM_ERROR";
  if (stat == BKCL_INTERNAL_ERROR) return "BKCL_INTERNAL_ERROR";
  return "Unknown BKCL status";
}
inline const char* xdnnGetErrorString(int stat) {
switch (stat) {
case baidu::xpu::api::Error_t::SUCCESS:
return "XDNN_SUCCESS";
case baidu::xpu::api::Error_t::INVALID_PARAM:
return "XDNN_INVALID_PARAM";
case baidu::xpu::api::Error_t::RUNTIME_ERROR:
return "XDNN_RUNTIME_ERROR";
case baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE:
return "XDNN_NO_ENOUGH_WORKSPACE";
case baidu::xpu::api::Error_t::NOT_IMPLEMENT:
return "XDNN_NOT_IMPLEMENT";
default:
return "Unknown XDNN status";
}
}
// Composes a full diagnostic of the form "XPU Error <code>, <description> ".
inline std::string build_xpu_error_msg(int stat) {
  return "XPU Error <" + std::to_string(stat) + ">, " +
         xpuGetErrorString(stat) + " ";
}
// Composes a full diagnostic of the form "BKCL Error, <status name> ".
inline std::string build_xpu_error_msg(BKCLResult_t stat) {
  return std::string("BKCL Error, ") + bkclGetErrorString(stat) + " ";
}
inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) {
return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " ";
}
namespace details {
// Primary template; only the specializations generated below are usable.
// PADDLE_ENFORCE_XPU_SUCCESS uses ExternalApiType<T>::kSuccess to learn the
// "success" value for the status type returned by the checked call.
template <typename T>
struct ExternalApiType {};
// Generates an ExternalApiType specialization binding a status type to the
// value that means success for that API family.
#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
  template <>                                         \
  struct ExternalApiType<type> {                      \
    using Type = type;                                \
    static constexpr Type kSuccess = success_value;   \
  }
DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS);            // XPU runtime APIs
DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);  // BKCL collective APIs
#undef DEFINE_EXTERNAL_API_TYPE
}  // namespace details
#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __XPU_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::pten::backends::xpu::details::ExternalApiType< \
__XPU_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = paddle::platform::errors::External( \
::pten::backends::xpu::build_xpu_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
// Checks that COND (an XDNN kernel call) returned Error_t::SUCCESS and
// throws an External error combining the caller-provided MSG with the
// symbolic XDNN status name otherwise.
#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG)                             \
  do {                                                                     \
    auto __cond__ = (COND);                                                \
    if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) {         \
      auto __summary__ = paddle::platform::errors::External(               \
          ::pten::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \
      __THROW_ERROR_INTERNAL__(__summary__);                               \
    }                                                                      \
  } while (0)
} // namespace xpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Forward-declares.
#pragma once
// Forward declaration of xpu context.
namespace baidu {
namespace xpu {
namespace api {
// Opaque XDNN execution context; defined in the XPU SDK headers.
struct Context;
// Opaque handle to a BKCL communicator. NOTE(review): declared here as
// void* — confirm it stays in sync with the typedef in the BKCL headers.
typedef void* BKCLContext_t;
}  // namespace api
}  // namespace xpu
}  // namespace baidu
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/pten/backends/xpu/xpu_context.h"
#include <memory>
#include "paddle/pten/api/ext/exception.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xpu = baidu::xpu::api;
namespace pten {
// Pimpl holding the XDNN context (possibly owned) and a non-owning BKCL
// communicator handle for XPUContext.
struct XPUContext::XPUImpl {
  // Attach (and lazily allocate) the per-device L3 cache buffer to the
  // XDNN context. Size defaults to 13.5 MB and can be overridden via the
  // XPU_PADDLE_L3_SIZE environment variable (bytes).
  void SetL3Cache() {
    const int MAX_XPU_NUM = 16;
    // One cached L3 buffer per physical device, shared across contexts.
    // NOTE(review): indexed by device id — assumes ids are < 16; confirm.
    static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
    int l3_size = 13.5 * 1024 * 1024;
    if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
      l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
    }
    auto selected_xpus = backends::xpu::GetXPUSelectedDevices();
    for (unsigned int i = 0; i < selected_xpus.size(); i++) {
      if (place_.GetDeviceId() == selected_xpus[i]) {
        if (l3ptrs[place_.GetDeviceId()] == nullptr) {
          // Allocation failure is tolerated: the nullptr check below
          // simply skips L3 and the context runs without it.
          xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
                     l3_size,
                     XPU_MEM_L3);
        }
        if (l3ptrs[place_.GetDeviceId()] != nullptr) {
          context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size);
          VLOG(3) << "xpu place " << place_.GetDeviceId() << " set l3 size "
                  << l3_size;
        }
        break;
      }
    }
  }
  // Default: owns a freshly created XDNN context on the default place.
  // Unlike the place-taking constructor, no device guard or L3 setup here.
  XPUImpl() {
    context_ = xpu::create_context();
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
  }
  // Owns a freshly created XDNN context bound to `place`, with L3 set up.
  explicit XPUImpl(XPUPlace place) : place_(place) {
    backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
    LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
                            << static_cast<int>(place_.device);
    context_ = xpu::create_context();
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
    SetL3Cache();
  }
  // Users need to manage external resources.
  explicit XPUImpl(const XPUContextResource& ctx_res,
                   const XPUPlace& place = XPUPlace(0))
      : res_(ctx_res), place_(place) {
    context_ = res_.context;
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
    SetL3Cache();
  }
  // Destroys the XDNN context only when this impl owns it (i.e. it was
  // not injected from outside via XPUContextResource or SetXContext).
  ~XPUImpl() {
    if (res_.context == nullptr && context_ != nullptr) {
      xpu::destroy_context(context_);
      context_ = nullptr;
    }
  }
  Place GetPlace() const { return place_; }
  backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; }
  // Raw XDNN handle; throws (PD_CHECK) when the context is missing.
  xpu::Context* GetXContext() const {
    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
    return context_;
  }
  xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; }
  // Block the host until all work queued on this context's stream is done.
  void Wait() const {
    backends::xpu::SetXPUDeviceId(place_.GetDeviceId());
    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
    xpu_wait(context_->xpu_stream);
  }
  // Adopt an externally managed XDNN context (not owned afterwards).
  // Fix: destroy the context we currently own before adopting the new
  // one; previously the context created in the constructor leaked when
  // this setter was used (e.g. by the XPUContext copy constructor).
  void SetXContext(xpu::Context* context) {
    if (context == nullptr) {
      return;
    }
    if (res_.context == nullptr && context_ != nullptr &&
        context_ != context) {
      xpu::destroy_context(context_);
    }
    res_.context = context;
    context_ = context;
  }
  void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }

  XPUContextResource res_;
  XPUPlace place_;
  backends::xpu::XPUVersion xpu_version_;
  xpu::Context* context_{nullptr};

  // NOTE: Distributed communicator, distributed framework manages its
  // resources, XPUContext only holds references.
  xpu::BKCLContext_t bkcl_context_{nullptr};
};
// Creates and owns a fresh XDNN context (default place, no L3 setup).
XPUContext::XPUContext() : DeviceContext() {
  impl_ = std::make_unique<XPUImpl>();
}
// Creates and owns an XDNN context bound to `place`, with L3 cache set up.
XPUContext::XPUContext(const XPUPlace& place) {
  impl_ = std::make_unique<XPUImpl>(place);
}
// Shares `other`'s XDNN/BKCL contexts without taking ownership.
// NOTE(review): the impl created here first allocates its own XDNN
// context, then SetXContext swaps in other's — verify that the initially
// created context is released and not leaked.
XPUContext::XPUContext(const XPUContext& other) : DeviceContext() {
  impl_ = std::make_unique<XPUImpl>();
  impl_->SetXContext(other.x_context());
  impl_->SetBkclContext(other.bkcl_context());
}
// Steals `other`'s impl; `other` has no usable impl afterwards.
// NOTE(review): the DeviceContext base is default-constructed rather than
// moved — confirm base state (allocators) does not need to transfer.
XPUContext::XPUContext(XPUContext&& other) : DeviceContext() {
  impl_ = std::move(other.impl_);
}
XPUContext::~XPUContext() = default;
// External-resource mode: the caller owns the XDNN context in `ctx_res`.
XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() {
  impl_ = std::make_unique<XPUImpl>(ctx_res);
}
Place XPUContext::GetPlace() const { return impl_->GetPlace(); }
backends::xpu::XPUVersion XPUContext::xpu_version() const {
  return impl_->GetXpuVersion();
}
xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); }
xpu::BKCLContext_t XPUContext::bkcl_context() const {
  return impl_->GetBkclContext();
}
// Blocks the host until all queued work on this context's stream finishes.
void XPUContext::Wait() const { impl_->Wait(); }
void XPUContext::set_x_context(xpu::Context* context) {
  impl_->SetXContext(context);
}
void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) {
  impl_->SetBkclContext(context);
}
} // namespace pten
......@@ -14,13 +14,60 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <memory>
#include "paddle/pten/backends/xpu/forwards.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/device_context.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/xpu/xpu_header.h"
#include "paddle/pten/backends/xpu/xpu_info.h"
namespace xpu = baidu::xpu::api;
namespace pten {
using XPUContext = paddle::platform::XPUDeviceContext;
} // namespace pten
#endif // PADDLE_WITH_XPU
// Externally owned resources an XPUContext can be constructed around
// (inference scenario); the context pointer is not owned by XPUContext.
struct XPUContextResource {
  xpu::Context* context{nullptr};
};
// Device context for Baidu Kunlun XPU devices: wraps an XDNN context and,
// optionally, a BKCL communicator handle.
class XPUContext : public DeviceContext {
 public:
  // NOTE: DeviceContext hold resources. Used in training scenarios.
  XPUContext();

  explicit XPUContext(const XPUPlace&);

  // NOTE: Share the same underlying resources, please ensure that resources are
  // not released.
  XPUContext(const XPUContext&);

  XPUContext(XPUContext&&);

  virtual ~XPUContext();

  // Place (device) this context is bound to.
  Place GetPlace() const override;

  // Hardware generation (XPU1 / XPU2) of the bound device.
  backends::xpu::XPUVersion xpu_version() const;

  // Raw XDNN context handle.
  xpu::Context* x_context() const;

  // Return bkcl context.
  xpu::BKCLContext_t bkcl_context() const;

  // Wait for all operations completion in the stream.
  void Wait() const override;

 public:
  // NOTE: External users manage resources. Used in inference scenarios.
  explicit XPUContext(const XPUContextResource&);

  void set_x_context(xpu::Context*);

  void set_bkcl_context(xpu::BKCLContext_t context);

 private:
  struct XPUImpl;
  std::unique_ptr<XPUImpl> impl_;
};
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <map>
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/float16.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xpu = baidu::xpu::api;
// Human-readable descriptions for XDNN API error codes.
// NOTE(review): `static` at namespace scope in a header gives every
// translation unit its own copy of this map — consider `inline` (C++17)
// or moving the definition to a .cc file; confirm the project's standard.
static std::map<int, std::string> XPUAPIErrorMsg = {
    {xpu::Error_t::SUCCESS, "xpu api success"},
    {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
    {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
    {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
// Maps a framework element type to the type the XPU SDK kernels expect.
// The primary template passes the type through unchanged; the
// specializations translate pten's half-precision wrappers into the XPU
// SDK's own float16 / bfloat16 types.
template <typename T>
struct XPUTypeTrait {
  using Type = T;
};

template <>
struct XPUTypeTrait<pten::dtype::float16> {
  using Type = float16;
};

template <>
struct XPUTypeTrait<pten::dtype::bfloat16> {
  using Type = bfloat16;
};
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/xpu/xpu_info.h"
#include <algorithm>
#include <cstdlib>
#include <string>
#include "paddle/pten/backends/xpu/enforce_xpu.h"
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/backends/xpu/xpu_header.h"
#include "paddle/pten/common/place.h"
// TODO(wilber): The pten computing library requires a component to manage
// flags.
#include "paddle/fluid/platform/flags.h"
// Comma-separated list of XPU device ids this process may use; empty means
// all visible devices.
// Fix: the adjacent string literals below concatenate with no separator —
// the help text previously rendered "communicationbetween" and
// "useshare-memory"; trailing spaces added.
PADDLE_DEFINE_EXPORTED_string(
    selected_xpus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (XPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason of doing this is that we want to use P2P communication "
    "between XPU devices, use XPU_VISIBLE_DEVICES can only use "
    "share-memory only.");
namespace pten {
class XPUContext;
namespace backends {
namespace xpu {
/**************************** Version Management **************************/
//! Get the version of XPU Driver
int GetDriverVersion() {
  uint32_t major = 0;
  uint32_t minor = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_get_driver_version(&major, &minor));
  // Encode "major.minor" as a single integer, e.g. 2.1 -> 21.
  return static_cast<int>(major * 10 + minor);
}
//! Get the version of XPU Runtime
int GetRuntimeVersion() {
  uint32_t major = 0;
  uint32_t minor = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_get_runtime_version(&major, &minor));
  // Encode "major.minor" as a single integer, e.g. 2.1 -> 21.
  return static_cast<int>(major * 10 + minor);
}
/**************************** Device Management **************************/
// Probe the number of visible XPU devices, honoring XPU_VISIBLE_DEVICES:
// a value consisting only of blanks means "no devices".
static int GetDeviceCountImpl() {
  if (const char* visible = std::getenv("XPU_VISIBLE_DEVICES")) {
    std::string value(visible);
    const bool all_blank = std::all_of(
        value.begin(), value.end(), [](char ch) { return ch == ' '; });
    if (all_blank) {
      VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
      return 0;
    }
  }
  int count = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count));
  return count;
}
int GetXPUDeviceCount() {
  // Probe once and cache: the device count is fixed for the process.
  static const int device_count = GetDeviceCountImpl();
  return device_count;
}
int GetXPUCurrentDeviceId() {
  int dev_id = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id));
  // Simulator devices report ids offset by 64; map back to the real id.
  return dev_id >= 64 ? dev_id - 64 : dev_id;
}
// Makes `id` the current XPU device for subsequent runtime calls.
// Throws InvalidArgument when `id` is not below the visible device count.
void SetXPUDeviceId(int id) {
  PADDLE_ENFORCE_LT(
      id,
      GetXPUDeviceCount(),
      paddle::platform::errors::InvalidArgument("id must less than XPU count"));
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
}
// Split `original` on `separator`, dropping empty tokens.
static inline std::vector<std::string> Split(std::string const& original,
                                             char separator) {
  std::vector<std::string> pieces;
  std::string::size_type begin = 0;
  while (begin <= original.size()) {
    std::string::size_type end = original.find(separator, begin);
    if (end == std::string::npos) {
      end = original.size();
    }
    if (end > begin) {
      pieces.push_back(original.substr(begin, end - begin));
    }
    begin = end + 1;
  }
  return pieces;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices() {
  // use user specified XPUs in single-node multi-process mode.
  std::vector<int> devices;
  if (FLAGS_selected_xpus.empty()) {
    // No explicit selection: use every visible device.
    const int count = GetXPUDeviceCount();
    for (int dev = 0; dev < count; ++dev) {
      devices.push_back(dev);
    }
  } else {
    for (const auto& token : Split(FLAGS_selected_xpus, ',')) {
      devices.push_back(atoi(token.c_str()));
    }
  }
  return devices;
}
/**************************** Memory Management **************************/
// Synchronously copies `count` bytes from host memory `src` to device
// memory `dst` located on `dst_place`.
void MemcpySyncH2D(void* dst,
                   const void* src,
                   size_t count,
                   const pten::XPUPlace& dst_place) {
  // Make sure the runtime call targets the destination device.
  XPUDeviceGuard guard(dst_place.device);
  PADDLE_ENFORCE_XPU_SUCCESS(
      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
}
// Synchronously copies `count` bytes from device memory `src` on
// `src_place` back to host memory `dst`.
void MemcpySyncD2H(void* dst,
                   const void* src,
                   size_t count,
                   const pten::XPUPlace& src_place,
                   const pten::XPUContext& dev_ctx) {
  XPUDeviceGuard guard(src_place.GetDeviceId());
  // Drain the context's stream so the source data is complete before
  // reading it back.
  dev_ctx.Wait();
  PADDLE_ENFORCE_XPU_SUCCESS(
      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
}
// if src.device == dst.device and you need sync , after call this function,
// need to call xpu_wait()
void MemcpySyncD2D(void* dst,
                   const pten::XPUPlace& dst_place,
                   const void* src,
                   const pten::XPUPlace& src_place,
                   size_t count,
                   const pten::XPUContext& dev_ctx) {
  int dev_id = GetXPUCurrentDeviceId();
  if (dst_place.device == dev_id && src_place.device == dev_id) {
    // Same-device copy is issued via XDNN on the context's stream; the
    // caller must xpu_wait() if synchronization is needed (see note above).
    PADDLE_ENFORCE_XDNN_SUCCESS(
        baidu::xpu::api::copy(dev_ctx.x_context(),
                              static_cast<const int8_t*>(src),
                              static_cast<int8_t*>(dst),
                              count),
        "copy ");
  } else {
    // Cross-device copy goes through the runtime's peer-to-peer path.
    PADDLE_ENFORCE_XPU_SUCCESS(
        xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
  }
}
/**************************** Others **************************/
// Determines the hardware generation of device `dev_id` by reading its
// MODEL attribute: K100/K200 models are first-generation (XPU1),
// everything else is treated as second-generation (XPU2).
XPUVersion get_xpu_version(int dev_id) {
  uint64_t v = 0;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
  if (v == K100 || v == K200) {
    VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
    return XPU1;
  } else {
    VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n";
    return XPU2;
  }
}
} // namespace xpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/pten/common/place.h"
namespace pten {
class XPUContext;
namespace backends {
namespace xpu {
/***** Version Management *****/
//! Get the version of XPU Driver
int GetDriverVersion();
//! Get the version of XPU Runtime
int GetRuntimeVersion();
/***** Device Management *****/
//! Get the total number of XPU devices in system.
int GetXPUDeviceCount();
//! Set the XPU device id for next execution.
void SetXPUDeviceId(int device_id);
//! Get the current XPU device id in system.
int GetXPUCurrentDeviceId();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices();
/***** Memory Management *****/
//! Copy memory from address src to dst synchronously.
void MemcpySyncH2D(void *dst,
                   const void *src,
                   size_t count,
                   const pten::XPUPlace &dst_place);
//! Copy device memory to host synchronously (waits on dev_ctx first).
void MemcpySyncD2H(void *dst,
                   const void *src,
                   size_t count,
                   const pten::XPUPlace &src_place,
                   const pten::XPUContext &dev_ctx);
//! Copy between XPU device buffers; a same-device copy is issued on
//! dev_ctx's stream and may require an explicit xpu_wait() to synchronize.
void MemcpySyncD2D(void *dst,
                   const pten::XPUPlace &dst_place,
                   const void *src,
                   const pten::XPUPlace &src_place,
                   size_t count,
                   const pten::XPUContext &dev_ctx);
// RAII guard that switches the current XPU device to `dev_id` and restores
// the previous device when leaving scope. A no-op if `dev_id` is already
// current. Non-copyable.
class XPUDeviceGuard {
 public:
  explicit XPUDeviceGuard(int dev_id) {
    const int current = GetXPUCurrentDeviceId();
    if (current != dev_id) {
      prev_id_ = current;
      SetXPUDeviceId(dev_id);
    }
  }

  ~XPUDeviceGuard() {
    // prev_id_ stays -1 when no switch happened in the constructor.
    if (prev_id_ != -1) {
      SetXPUDeviceId(prev_id_);
    }
  }

  XPUDeviceGuard(const XPUDeviceGuard& o) = delete;
  XPUDeviceGuard& operator=(const XPUDeviceGuard& o) = delete;

 private:
  int prev_id_{-1};
};
// Hardware generations of Kunlun XPU devices.
enum XPUVersion { XPU1, XPU2 };
// Query which generation the device `dev_id` belongs to.
XPUVersion get_xpu_version(int dev_id);
} // namespace xpu
} // namespace backends
} // namespace pten
......@@ -13,28 +13,45 @@
// limitations under the License.
#include "paddle/pten/core/device_context.h"
#include "paddle/pten/api/ext/exception.h"
namespace pten {
// Pimpl holding the allocators a DeviceContext exposes. All pointers are
// non-owning; the allocators are managed by whoever installed them.
struct DeviceContext::Impl {
  Allocator* allocator_{nullptr};

  Impl() = default;
  ~Impl() = default;

  void SetAllocator(Allocator* allocator) { allocator_ = allocator; }

  void SetDeviceAllocator(Allocator* allocator) {
    device_allocator_ = allocator;
  }

  void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; }

  const Allocator& GetDeviceAllocator() const {
    PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr.");
    return *device_allocator_;
  }

  // Fix: check before dereferencing, for consistency with the host/device
  // getters; previously a missing allocator was dereferenced unchecked.
  const Allocator& GetAllocator() const {
    PD_CHECK(allocator_ != nullptr, "the allocator is nullptr.");
    return *allocator_;
  }

  const Allocator& GetHostAllocator() const {
    PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr.");
    return *host_allocator_;
  }

  // TODO(Wilber): Add impl. It seems that tensorbase not have interface to
  // communicate with allocator.
  void Alloc(TensorBase* tensor) {}
  void HostAlloc(TensorBase* tensor) {}
  void DeviceAlloc(TensorBase* tensor) {}

  Allocator* device_allocator_{nullptr};
  Allocator* host_allocator_{nullptr};
};
// Default construction: create the pimpl with no allocators installed.
DeviceContext::DeviceContext() : impl_(std::make_unique<Impl>()) {}
// Copy construction: share `other`'s allocator pointers (non-owning).
DeviceContext::DeviceContext(const DeviceContext& other) {
  // Fix: impl_ must be created before it is dereferenced — the previous
  // version called SetAllocator through a default-initialized (null)
  // unique_ptr, which is undefined behavior.
  impl_ = std::make_unique<Impl>();
  impl_->SetAllocator(const_cast<Allocator*>(&other.GetAllocator()));
  impl_->SetDeviceAllocator(
      const_cast<Allocator*>(&other.GetDeviceAllocator()));
  impl_->SetHostAllocator(const_cast<Allocator*>(&other.GetHostAllocator()));
}
DeviceContext::DeviceContext(DeviceContext&& other) {
......@@ -43,14 +60,26 @@ DeviceContext::DeviceContext(DeviceContext&& other) {
DeviceContext::~DeviceContext() = default;
void DeviceContext::SetAllocator(Allocator* allocator) {
impl_->SetAllocator(allocator);
// Installs the host-memory allocator (non-owning pointer).
void DeviceContext::SetHostAllocator(Allocator* allocator) {
  impl_->SetHostAllocator(allocator);
}

// Installs the device-memory allocator (non-owning pointer).
void DeviceContext::SetDeviceAllocator(Allocator* allocator) {
  impl_->SetDeviceAllocator(allocator);
}

// Returns the host allocator; PD_CHECK fails if none was installed.
const Allocator& DeviceContext::GetHostAllocator() const {
  return impl_->GetHostAllocator();
}
const Allocator& DeviceContext::GetAllocator() const {
return impl_->GetAllocator();
// Returns the device allocator; PD_CHECK fails if none was installed.
const Allocator& DeviceContext::GetDeviceAllocator() const {
  return impl_->GetDeviceAllocator();
}

// Allocation entry points; currently no-ops (see the TODO in Impl).
void DeviceContext::Alloc(TensorBase* tensor) { impl_->Alloc(tensor); }
void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); }
void DeviceContext::DeviceAlloc(TensorBase* tensor) {
  impl_->DeviceAlloc(tensor);
}
} // namespace pten
......@@ -57,19 +57,38 @@ class DeviceContext {
*
* @param allocator
*/
void SetAllocator(Allocator*);
void SetDeviceAllocator(Allocator*);
/**
* @brief Get the const Allocator object.
* @brief Get the const device-related Allocator object.
*
* @return Allocator
*/
const Allocator& GetAllocator() const;
const Allocator& GetDeviceAllocator() const;
/**
* @brief Allocate memory for tensor.
* @brief Allocate device memory for tensor.
*/
void Alloc(pten::TensorBase*);
void DeviceAlloc(pten::TensorBase*);
/**
* @brief Set the host Allocator object.
*
* @param allocator
*/
void SetHostAllocator(Allocator*);
/**
* @brief Get the const host Allocator object.
*
* @return Allocator
*/
const Allocator& GetHostAllocator() const;
/**
* @brief Allocate host memory for tensor.
*/
void HostAlloc(pten::TensorBase*);
// TODO(wilber): Just for the convenience of migrating the code, it will be
// modified or removed later.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册