未验证 提交 e1187e50 编写于 作者: Q Qi Li 提交者: GitHub

[XPU] update XPU device info, test=develop (#37884)

上级 c0c54ba3
......@@ -110,9 +110,7 @@ void OpHandleBase::InitXPU() {
"%s should have only one dev_ctx.", Name()));
auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
PADDLE_ENFORCE_EQ(
xpu_set_device(dev_id), XPU_SUCCESS,
platform::errors::PreconditionNotMet("xpu_set_device failed"));
platform::SetXPUDeviceId(dev_id);
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
......
......@@ -122,7 +122,8 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
#endif
#ifdef PADDLE_WITH_XPU
xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::XPUDeviceGuard guard(dev_id);
#endif
auto& block = program.Block(0);
......@@ -343,7 +344,8 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
#endif
#ifdef PADDLE_WITH_XPU
auto place = thread_tensor->place();
xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::XPUDeviceGuard guard(dev_id);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::DeviceContext* dev_ctx = pool.Get(place);
......@@ -370,7 +372,8 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
#endif
#ifdef PADDLE_WITH_XPU
auto place = root_tensor->place();
xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::XPUDeviceGuard guard(dev_id);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::DeviceContext* dev_ctx = pool.Get(place);
......@@ -416,7 +419,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
std::shared_ptr<HeterServiceContext> context = object_pool_.Get();
if (!context->scope_) {
int num = rand() % places_.size();
int num = rand_r() % places_.size();
context->place_num_ = num;
auto place = places_[num];
context->scope_ = &(place_scopes_[num]->NewScope());
......
......@@ -29,15 +29,7 @@
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false,
......@@ -153,24 +145,9 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void *p = nullptr;
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
platform::XPUDeviceGuard gurad(place.device);
int ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
if (ret != XPU_SUCCESS) {
std::cout << "xpu memory malloc(" << size << ") failed, try again\n";
xpu_wait();
......@@ -184,12 +161,6 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented."));
}
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
VLOG(10) << " pointer=" << p;
return p;
#else
......@@ -205,30 +176,9 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
platform::XPUDeviceGuard gurad(place.device);
xpu_free(p);
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
......
......@@ -19,12 +19,7 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
DECLARE_bool(use_pinned_memory);
......
......@@ -14,18 +14,10 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
namespace paddle {
namespace memory {
......@@ -74,41 +66,7 @@ void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != dst_place.device) {
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != dst_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
platform::MemcpySyncH2D(dst, src, num, dst_place.device);
}
template <>
......@@ -120,46 +78,7 @@ void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
dev_ctx->Wait();
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != src_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
platform::MemcpySyncD2H(dst, src, num, src_place.device);
}
template <>
......@@ -171,69 +90,7 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device || dev_id != dst_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
void* tmp = malloc(num);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
dev_ctx->Wait();
ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
free(tmp);
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
int ret = xpu::copy(dev_ctx->x_context(), static_cast<const int8_t*>(src),
static_cast<int8_t*>(dst), num);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External(
"XPU API return wrong value[%d %s]",
ret, XPUAPIErrorMsg[ret]));
}
platform::MemcpySyncD2D(dst, dst_place.device, src, src_place.device, num);
}
#endif
......
......@@ -12,6 +12,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/masked_select_op.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace operators {
......@@ -41,13 +42,8 @@ class MaskedSelectXPUKernel : public framework::OpKernel<T> {
int* out_size = RAII_GUARD.alloc_l3_or_gm<int32_t>(1);
int out_size_cpu;
int ret = xpu::nonzero_count(dev_ctx.x_context(), mask_data, out_size,
mask->numel());
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU nonzero_count kernel return wrong value[%d %s]",
ret, XPUAPIErrorMsg[ret]));
PADDLE_ENFORCE_XPU_SUCCESS(xpu::nonzero_count(
dev_ctx.x_context(), mask_data, out_size, mask->numel()));
memory::Copy(platform::CPUPlace(), static_cast<void*>(&out_size_cpu),
BOOST_GET_CONST(platform::XPUPlace, mask->place()),
static_cast<void*>(out_size), sizeof(int32_t));
......@@ -59,12 +55,9 @@ class MaskedSelectXPUKernel : public framework::OpKernel<T> {
auto input_shape = framework::vectorize<int>(input_dim);
auto mask_shape = framework::vectorize<int>(mask_dim);
ret = xpu::masked_select(dev_ctx.x_context(), input_data, mask_data,
out_data, input_shape, mask_shape);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU masked_select kernel return wrong value[%d %s]",
ret, XPUAPIErrorMsg[ret]));
PADDLE_ENFORCE_XPU_SUCCESS(
xpu::masked_select(dev_ctx.x_context(), input_data, mask_data, out_data,
input_shape, mask_shape));
}
};
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
namespace paddle {
......@@ -292,17 +293,8 @@ BKCLComm* BKCLCommContext::CreateComm(BKCLUniqueId* bkcl_id, int nranks,
"Expected dev_id >= 0. But received dev_id is %d.", dev_id));
BKCLContext_t comm = nullptr;
auto ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::PreconditionNotMet(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
ret, XPUAPIErrorMsg[ret]));
ret = bkcl_init_rank(&comm, rank, nranks, bkcl_id);
PADDLE_ENFORCE_EQ(ret, BKCL_SUCCESS,
platform::errors::PreconditionNotMet(
"bkcl_init_rank failed, got wrong value [%d].", ret));
platform::SetXPUDeviceId(dev_id);
PADDLE_ENFORCE_XPU_SUCCESS(bkcl_init_rank(&comm, rank, nranks, bkcl_id));
auto* comm_wrapper = AssignBKCLComm(comm, nranks, rank, dev_id, ring_id);
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/**************************** Enforce Wrapper **************************/
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
......@@ -6,3 +6,5 @@ set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib)
cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context)
add_subdirectory(tests)
......@@ -26,8 +26,8 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/place.h"
#include "xpu/bkcl.h"
......@@ -73,13 +73,9 @@ struct InitBKCLPara {
static void *init_bkcl_context_func(void *args) {
struct InitBKCLPara *para = (struct InitBKCLPara *)args;
PADDLE_ENFORCE_EQ(xpu_set_device(para->dev_id), XPU_SUCCESS,
platform::errors::PreconditionNotMet(
"xpu_set_device failed[%d]", para->dev_id));
PADDLE_ENFORCE_EQ(
bkcl_init_rank(para->ctx, para->rank, para->nranks, para->bkcl_id),
BKCL_SUCCESS,
platform::errors::PreconditionNotMet("bkcl_init_rank failed"));
platform::SetXPUDeviceId(para->dev_id);
PADDLE_ENFORCE_XPU_SUCCESS(
bkcl_init_rank(para->ctx, para->rank, para->nranks, para->bkcl_id));
return nullptr;
}
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/enforce.h"
#include "xpu/bkcl.h"
namespace paddle {
namespace platform {
// Note: the XPU runtime API returns a plain int status, not XPUError_t, which
// is why this helper takes an int.
//
// Maps an XPU runtime status code to a static, human-readable description.
// The returned pointer refers to a string literal and must not be freed.
// Any code without a dedicated case below yields "unknown error".
inline const char* xpuGetErrorString(int stat) {
  switch (stat) {
    case XPU_SUCCESS:
      return "Success";
    case XPUERR_INVALID_DEVICE:
      return "Invalid XPU device";
    case XPUERR_UNINIT:
      return "XPU runtime not properly inited";
    case XPUERR_NOMEM:
      return "Device memory not enough";
    case XPUERR_NOCPUMEM:
      return "CPU memory not enough";
    case XPUERR_INVALID_PARAM:
      return "Invalid parameter";
    case XPUERR_NOXPUFUNC:
      return "Cannot get XPU Func";
    case XPUERR_LDSO:
      return "Error loading dynamic library";
    case XPUERR_LDSYM:
      return "Error loading func from dynamic library";
    case XPUERR_SIMULATOR:
      return "Error from XPU Simulator";
    case XPUERR_NOSUPPORT:
      return "Operation not supported";
    case XPUERR_ABNORMAL:
      return "Device abnormal due to previous error";
    case XPUERR_KEXCEPTION:
      return "Exception in kernel execution";
    case XPUERR_TIMEOUT:
      return "Kernel execution timed out";
    case XPUERR_BUSY:
      return "Resource busy";
    case XPUERR_USEAFCLOSE:
      return "Use a stream after closed";
    case XPUERR_UCECC:
      return "Uncorrectable ECC";
    case XPUERR_OVERHEAT:
      return "Overheat";
    case XPUERR_UNEXPECT:
      return "Execution error, reach unexpected control flow";
    case XPUERR_DEVRESET:
      return "Device is being reset, try again later";
    case XPUERR_HWEXCEPTION:
      return "Hardware module exception";
    case XPUERR_HBM_INIT:
      return "Error init HBM";
    case XPUERR_DEVINIT:
      return "Error init device";
    case XPUERR_PEERRESET:
      return "Device is being reset, try again later";
    case XPUERR_MAXDEV:
      return "Device count exceed limit";
    case XPUERR_NOIOC:
      return "Unknown IOCTL command";
    case XPUERR_DMATIMEOUT:
      return "DMA timed out, a reboot maybe needed";
    case XPUERR_DMAABORT:
      return "DMA aborted due to error, possibly wrong address or hardware "
             "state";
    case XPUERR_MCUUNINIT:
      return "Firmware not initialized";
    case XPUERR_OLDFW:
      return "Firmware version too old (<15), please update.";
    case XPUERR_PCIE:
      return "Error in PCIE";
    case XPUERR_FAULT:
      return "Error copy between kernel and user space";
    case XPUERR_INTERRUPTED:
      return "Execution interrupted by user";
    default:
      // Fallback for status codes this table does not know about.
      return "unknown error";
  }
}
// Translates a BKCL result code into its symbolic name. Values without a
// dedicated case keep the fallback text "Unknown BKCL status".
inline const char* bkclGetErrorString(BKCLResult_t stat) {
  const char* name = "Unknown BKCL status";
  switch (stat) {
    case BKCL_SUCCESS:
      name = "BKCL_SUCCESS";
      break;
    case BKCL_INVALID_ARGUMENT:
      name = "BKCL_INVALID_ARGUMENT";
      break;
    case BKCL_RUNTIME_ERROR:
      name = "BKCL_RUNTIME_ERROR";
      break;
    case BKCL_SYSTEM_ERROR:
      name = "BKCL_SYSTEM_ERROR";
      break;
    case BKCL_INTERNAL_ERROR:
      name = "BKCL_INTERNAL_ERROR";
      break;
    default:
      break;
  }
  return name;
}
// Formats an XPU runtime status as "XPU Error <code>, <description> "
// (note the trailing space), using xpuGetErrorString for the description.
inline std::string build_xpu_error_msg(int stat) {
  std::string text = "XPU Error <";
  text += std::to_string(stat);
  text += ">, ";
  text += xpuGetErrorString(stat);
  text += " ";
  return text;
}
// Formats a BKCL status as "BKCL Error, <symbolic name> " (note the trailing
// space), using bkclGetErrorString for the name.
inline std::string build_xpu_error_msg(BKCLResult_t stat) {
  std::string text = "BKCL Error, ";
  text.append(bkclGetErrorString(stat));
  text.append(" ");
  return text;
}
namespace details {
// Trait describing an external status type. The primary template is
// deliberately empty: only the status types specialized below expose
// `Type` and `kSuccess`.
template <typename T>
struct ExternalApiType {};
// Generates an ExternalApiType specialization that records the status type
// and the value of that type which denotes success.
#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
template <> \
struct ExternalApiType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
// int status codes succeed with XPU_SUCCESS.
DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS);
// BKCLResult_t status codes succeed with BKCL_SUCCESS.
DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
// The generator macro is only needed for the two specializations above.
#undef DEFINE_EXTERNAL_API_TYPE
} // namespace details
// Checks the status produced by COND.
//
// COND is evaluated exactly once and its result type selects the success
// value through details::ExternalApiType<...>::kSuccess. When the result
// differs from that success value, an External error is built from
// build_xpu_error_msg(result) and raised via __THROW_ERROR_INTERNAL__.
// The do { ... } while (0) wrapper lets the macro be used as a single
// statement followed by a semicolon.
#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __XPU_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::ExternalApiType< \
__XPU_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = paddle::platform::errors::External( \
::paddle::platform::build_xpu_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
} // namespace platform
} // namespace paddle
cc_test(enforce_xpu_test SRCS enforce_xpu_test.cc DEPS stringpiece)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#include "gtest/gtest.h"
// Test helper: runs PADDLE_ENFORCE_XPU_SUCCESS on `value` and returns true
// when the macro does not throw. Nothing is caught here, so an error raised
// by the macro propagates to the caller. `msg` is not used by the body.
template <typename T>
bool CheckXPUStatusSuccess(T value, const std::string& msg = "success") {
PADDLE_ENFORCE_XPU_SUCCESS(value);
return true;
}
// Test helper: expects PADDLE_ENFORCE_XPU_SUCCESS(value) to throw
// EnforceNotMet. Returns true only when it throws and the exception text
// contains `msg`; returns false when no exception is raised. The caught
// message is echoed to stdout.
template <typename T>
bool CheckXPUStatusFailure(T value, const std::string& msg) {
  bool matched = false;
  try {
    PADDLE_ENFORCE_XPU_SUCCESS(value);
  } catch (paddle::platform::EnforceNotMet& error) {
    const std::string what_text = error.what();
    std::cout << what_text << std::endl;
    matched = (what_text.find(msg) != std::string::npos);
  }
  return matched;
}
// Verifies PADDLE_ENFORCE_XPU_SUCCESS for int status codes: XPU_SUCCESS must
// pass, and each listed XPUERR_* code must throw an error whose message
// contains the expected description text.
TEST(enforce, xpu_status) {
EXPECT_TRUE(CheckXPUStatusSuccess(static_cast<int>(XPU_SUCCESS)));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_INVALID_DEVICE),
"Invalid XPU device"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_UNINIT),
"XPU runtime not properly inited"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_NOMEM),
"Device memory not enough"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_NOCPUMEM),
"CPU memory not enough"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_INVALID_PARAM),
"Invalid parameter"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_NOXPUFUNC),
"Cannot get XPU Func"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_LDSO),
"Error loading dynamic library"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_LDSYM),
"Error loading func from dynamic library"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_SIMULATOR),
"Error from XPU Simulator"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_NOSUPPORT),
"Operation not supported"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_ABNORMAL),
"Device abnormal due to previous error"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_KEXCEPTION),
"Exception in kernel execution"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_TIMEOUT),
"Kernel execution timed out"));
EXPECT_TRUE(
CheckXPUStatusFailure(static_cast<int>(XPUERR_BUSY), "Resource busy"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_USEAFCLOSE),
"Use a stream after closed"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_UCECC),
"Uncorrectable ECC"));
EXPECT_TRUE(
CheckXPUStatusFailure(static_cast<int>(XPUERR_OVERHEAT), "Overheat"));
EXPECT_TRUE(
CheckXPUStatusFailure(static_cast<int>(XPUERR_UNEXPECT),
"Execution error, reach unexpected control flow"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_DEVRESET),
"Device is being reset, try again later"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_HWEXCEPTION),
"Hardware module exception"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_HBM_INIT),
"Error init HBM"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_DEVINIT),
"Error init device"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_PEERRESET),
"Device is being reset, try again later"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_MAXDEV),
"Device count exceed limit"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_NOIOC),
"Unknown IOCTL command"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_DMATIMEOUT),
"DMA timed out, a reboot maybe needed"));
EXPECT_TRUE(CheckXPUStatusFailure(
static_cast<int>(XPUERR_DMAABORT),
"DMA aborted due to error, possibly wrong address or hardware state"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_MCUUNINIT),
"Firmware not initialized"));
EXPECT_TRUE(
CheckXPUStatusFailure(static_cast<int>(XPUERR_OLDFW),
"Firmware version too old (<15), please update."));
EXPECT_TRUE(
CheckXPUStatusFailure(static_cast<int>(XPUERR_PCIE), "Error in PCIE"));
EXPECT_TRUE(
CheckXPUStatusFailure(static_cast<int>(XPUERR_FAULT),
"Error copy between kernel and user space"));
EXPECT_TRUE(CheckXPUStatusFailure(static_cast<int>(XPUERR_INTERRUPTED),
"Execution interrupted by user"));
}
// Verifies PADDLE_ENFORCE_XPU_SUCCESS for BKCLResult_t values: BKCL_SUCCESS
// must pass, and each listed error value must throw an error whose message
// contains its symbolic name.
TEST(enforce, bkcl_status) {
EXPECT_TRUE(CheckXPUStatusSuccess(BKCL_SUCCESS));
EXPECT_TRUE(
CheckXPUStatusFailure(BKCL_INVALID_ARGUMENT, "BKCL_INVALID_ARGUMENT"));
EXPECT_TRUE(CheckXPUStatusFailure(BKCL_RUNTIME_ERROR, "BKCL_RUNTIME_ERROR"));
EXPECT_TRUE(CheckXPUStatusFailure(BKCL_SYSTEM_ERROR, "BKCL_SYSTEM_ERROR"));
EXPECT_TRUE(
CheckXPUStatusFailure(BKCL_INTERNAL_ERROR, "BKCL_INTERNAL_ERROR"));
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
......@@ -21,37 +21,14 @@
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/float16.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"
namespace xpu = baidu::xpu::api;
// Helper for translating activation names into XPU activation enums.
class XPUActHelper {
public:
// Converts an activation name to the matching xpu::Activation_t.
// Recognized names are: "linear", "relu", "sigmoid", "tanh", "gelu",
// "leaky_relu", "sqrt" and "square". Any other name fails the
// PADDLE_ENFORCE_NE check below with an InvalidArgument error.
static xpu::Activation_t ConvertToXpuActType(
const std::string& act_type_str) {
// Lookup table, built once on first call (function-local static).
static std::unordered_map<std::string, xpu::Activation_t> str2act = {
{"linear", xpu::Activation_t::LINEAR},
{"relu", xpu::Activation_t::RELU},
{"sigmoid", xpu::Activation_t::SIGMOID},
{"tanh", xpu::Activation_t::TANH},
{"gelu", xpu::Activation_t::GELU},
{"leaky_relu", xpu::Activation_t::LEAKY_RELU},
{"sqrt", xpu::Activation_t::SQRT},
{"square", xpu::Activation_t::SQUARE}};
auto res = str2act.find(act_type_str);
// An end() iterator means the name is not in the table.
PADDLE_ENFORCE_NE(res, str2act.end(),
paddle::platform::errors::InvalidArgument(
"Invalid activation type(%s) in XPU", act_type_str));
return res->second;
}
};
static std::map<int, std::string> XPUAPIErrorMsg = {
{xpu::Error_t::SUCCESS, "xpu api success"},
{xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
......
......@@ -14,8 +14,8 @@ limitations under the License. */
#include <cstdlib>
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h"
PADDLE_DEFINE_EXPORTED_string(
......@@ -31,7 +31,31 @@ PADDLE_DEFINE_EXPORTED_string(
namespace paddle {
namespace platform {
static int GetXPUDeviceCountImpl() {
/**************************** Version Management **************************/
//! Get the version of the XPU driver, encoded as major * 10 + minor.
int GetDriverVersion() {
  uint32_t ver_major = 0;
  uint32_t ver_minor = 0;
  // Throws through PADDLE_ENFORCE_XPU_SUCCESS if the runtime query fails.
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_get_driver_version(&ver_major, &ver_minor));
  const int encoded = ver_major * 10 + ver_minor;
  return encoded;
}
//! Get the version of the XPU runtime, encoded as major * 10 + minor.
int GetRuntimeVersion() {
  uint32_t ver_major = 0;
  uint32_t ver_minor = 0;
  // Throws through PADDLE_ENFORCE_XPU_SUCCESS if the runtime query fails.
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_get_runtime_version(&ver_major, &ver_minor));
  const int encoded = ver_major * 10 + ver_minor;
  return encoded;
}
/**************************** Device Management **************************/
static int GetDeviceCountImpl() {
const auto *xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
if (xpu_visible_devices != nullptr) {
std::string xpu_visible_devices_str(xpu_visible_devices);
......@@ -44,29 +68,18 @@ static int GetXPUDeviceCountImpl() {
}
int count = 0;
int ret = xpu_device_count(&count);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count));
return count;
}
int GetXPUDeviceCount() {
static auto dev_cnt = GetXPUDeviceCountImpl();
static auto dev_cnt = GetDeviceCountImpl();
return dev_cnt;
}
int GetXPUCurrentDeviceId() {
int dev_id;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
......@@ -74,6 +87,13 @@ int GetXPUCurrentDeviceId() {
return dev_id;
}
// Makes XPU device `id` the current device.
// Fails with InvalidArgument when `id` is not less than GetXPUDeviceCount(),
// and with an External error (via PADDLE_ENFORCE_XPU_SUCCESS) when
// xpu_set_device does not report success.
// NOTE(review): only the upper bound is checked here; a negative id is left
// for xpu_set_device to reject — confirm that is intended.
void SetXPUDeviceId(int id) {
PADDLE_ENFORCE_LT(
id, GetXPUDeviceCount(),
platform::errors::InvalidArgument("id must less than XPU count"));
PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices() {
// use user specified XPUs in single-node multi-process mode.
......@@ -92,24 +112,38 @@ std::vector<int> GetXPUSelectedDevices() {
return devices;
}
void SetXPUDeviceId(int id) {
PADDLE_ENFORCE_LT(
id, GetXPUDeviceCount(),
platform::errors::InvalidArgument("id must less than XPU count"));
int ret = xpu_set_device(id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
/**************************** Memory Management **************************/
// Copies `count` bytes from host memory `src` to memory `dst` on XPU device
// `dev_id` using xpu_memcpy; a failing status is raised through
// PADDLE_ENFORCE_XPU_SUCCESS.
void MemcpySyncH2D(void *dst, const void *src, size_t count, int dev_id) {
  // Select dev_id through XPUDeviceGuard before issuing the copy.
  platform::XPUDeviceGuard device_scope(dev_id);
  const auto direction = XPUMemcpyKind::XPU_HOST_TO_DEVICE;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(dst, src, count, direction));
}
// Copies `count` bytes from memory `src` on XPU device `dev_id` to host
// memory `dst` using xpu_memcpy; a failing status is raised through
// PADDLE_ENFORCE_XPU_SUCCESS.
void MemcpySyncD2H(void *dst, const void *src, size_t count, int dev_id) {
  // Select dev_id through XPUDeviceGuard before issuing the copy.
  platform::XPUDeviceGuard device_scope(dev_id);
  const auto direction = XPUMemcpyKind::XPU_DEVICE_TO_HOST;
  PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(dst, src, count, direction));
}
//! Device-to-device copy of `count` from src (on device src_id) to dst (on
//! device dst_id).
//! When both ids equal the current device the copy goes through xpu_memcpy
//! with XPU_DEVICE_TO_DEVICE; in every other case it goes through
//! xpu_memcpy_peer. Either call is enforced to succeed.
void MemcpySyncD2D(void *dst, int dst_id, const void *src, int src_id,
                   size_t count) {
  const int current_id = GetXPUCurrentDeviceId();

  // At least one side is not on the current device: use the peer copy.
  if (dst_id != current_id || src_id != current_id) {
    PADDLE_ENFORCE_XPU_SUCCESS(
        xpu_memcpy_peer(dst_id, dst, src_id, src, count));
    return;
  }

  // Both buffers are on the current device: plain device-to-device copy.
  platform::XPUDeviceGuard device_scope(current_id);
  PADDLE_ENFORCE_XPU_SUCCESS(
      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE));
}
/**************************** Others **************************/
XPUVersion get_xpu_version(int dev_id) {
uint64_t v = 0;
int ret = xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"xpu_device_get_attr return wrong value[%d]", ret));
PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
if (v == K100 || v == K200) {
VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
......
......@@ -16,17 +16,35 @@ limitations under the License. */
namespace paddle {
namespace platform {
/***** Version Management *****/

//! Get the version of XPU Driver
int GetDriverVersion();

//! Get the version of XPU Runtime
int GetRuntimeVersion();

/***** Device Management *****/

//! Get the total number of XPU devices in system.
int GetXPUDeviceCount();

//! Set the XPU device id for next execution.
void SetXPUDeviceId(int device_id);

//! Get the current XPU device id in system.
int GetXPUCurrentDeviceId();

//! Get a list of device ids from environment variable or use all.
std::vector<int> GetXPUSelectedDevices();

/***** Memory Management *****/

//! Copy memory from address src to dst synchronously.
//! Host-to-device: src is the host source, dst the destination; the copy is
//! issued under a device guard for dev_id.
void MemcpySyncH2D(void *dst, const void *src, size_t count, int dev_id);

//! Copy memory from address src to dst synchronously.
//! Device-to-host: src is the device source, dst the destination; the copy is
//! issued under a device guard for dev_id.
void MemcpySyncD2H(void *dst, const void *src, size_t count, int dev_id);

//! Copy memory from address src to dst synchronously.
//! Device-to-device: src belongs to device src_id and dst to device dst_id.
//! A same-device copy is used when both ids are the current device, a peer
//! copy otherwise.
void MemcpySyncD2D(void *dst, int dst_id, const void *src, int src_id,
                   size_t count);
class XPUDeviceGuard {
public:
......@@ -44,8 +62,8 @@ class XPUDeviceGuard {
}
}
XPUDeviceGuard(const XPUDeviceGuard& o) = delete;
XPUDeviceGuard& operator=(const XPUDeviceGuard& o) = delete;
XPUDeviceGuard(const XPUDeviceGuard &o) = delete;
XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete;
private:
int prev_id_{-1};
......
......@@ -264,19 +264,7 @@ XPUDeviceContext::XPUDeviceContext() {
// The destructor performs no work of its own.
XPUDeviceContext::~XPUDeviceContext() = default;
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
platform::XPUDeviceGuard guard(place.device);
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " << place_.device;
......@@ -303,22 +291,10 @@ XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
break;
}
}
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
//! Select this context's device, then call xpu_wait on the context's stream.
//! The device selection is not restored before returning.
void XPUDeviceContext::Wait() const {
  // SetXPUDeviceId already range-checks the id and enforces that
  // xpu_set_device succeeds, so a separate raw xpu_set_device call with its
  // own hand-written check is redundant and has been dropped.
  platform::SetXPUDeviceId(place_.device);
  // NOTE(review): the return value of xpu_wait is not checked here — confirm
  // that ignoring it is intentional.
  xpu_wait(context_->xpu_stream);
}
......
......@@ -13,13 +13,7 @@
// limitations under the License.
#include "paddle/fluid/platform/stream_callback_manager.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace platform {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册