未验证 提交 41917fb5 编写于 作者: Q QingshuChen 提交者: GitHub

feat: support check_nan_inf for kunlun/xpu device (#29694) (#29898)

* feat: support check_nan_inf for kunlun device

* support kunlun stack

* minor
上级 f781ab08
......@@ -333,6 +333,33 @@ void CheckVarHasNanOrInf(const std::string& op_type,
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.",
var_name));
#endif
return;
} else if (platform::is_xpu_place(tensor->place())) {
#ifdef PADDLE_WITH_XPU
if (tensor->type() != proto::VarType::FP32) {
return;
}
float* cpu_data = new float[tensor->numel()];
xpu_memcpy(cpu_data, tensor->data<float>(), tensor->numel() * sizeof(float),
XPU_DEVICE_TO_HOST);
bool flag = false;
for (int i = 0; i < tensor->numel(); i++) {
if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
flag = true;
break;
}
}
delete[] cpu_data;
PADDLE_ENFORCE_NE(
flag, true,
platform::errors::Fatal("Operator %s output Tensor %s contains Inf.",
op_type, var_name));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Tensor[%s] use xpu place. PaddlePaddle must compile with XPU.",
var_name));
#endif
return;
}
......
......@@ -28,50 +28,34 @@ class StackXPUKernel : public framework::OpKernel<T> {
auto* y = ctx.Output<Tensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) {
axis += (x[0]->dims().size() + 1);
axis += x[0]->dims().size() + 1;
}
int n = static_cast<int>(x.size());
PADDLE_ENFORCE_LE(n, 24,
platform::errors::InvalidArgument(
"XPU only surpport at most 24 tensors for now"));
auto* y_data = y->mutable_data<T>(ctx.GetPlace());
int pre = 1, post = 1;
auto& dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) {
pre *= dim[i];
std::vector<int> xdims;
for (auto i = 0; i < dim.size(); ++i) {
xdims.push_back(dim[i]);
}
for (auto i = axis; i < dim.size(); ++i) {
post *= dim[i];
xdims.push_back(1);
std::vector<std::vector<int>> xdims_list;
int n = static_cast<int>(x.size());
for (int i = 0; i < n; i++) {
xdims_list.push_back(xdims);
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
void* x_datas_host = std::malloc(n * sizeof(void*));
void* x_datas_device = nullptr;
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&x_datas_device),
n * sizeof(void*)),
XPU_SUCCESS,
platform::errors::ResourceExhausted(
"\n\nOut of memory error on XPU, Cannot"
"allocate %s memory on XPU. \n\nPlease "
"check whether there is any other process "
"using XPU.\n",
string::HumanReadableSize(n * sizeof(void*))));
for (auto i = 0; i < n; ++i) {
((const void**)x_datas_host)[i] = x[i]->data<T>();
std::vector<const T*> x_list;
for (int i = 0; i < n; i++) {
x_list.push_back(x[i]->data<T>());
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
x_datas_device, platform::CPUPlace(), x_datas_host,
n * sizeof(void*));
int r = xpu::stack_forward<float>(dev_ctx.x_context(), pre, post, n,
x_datas_device, y_data);
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
auto& dev_ctx = ctx.template device_context<DeviceContext>();
int r =
xpu::concat<T>(dev_ctx.x_context(), x_list, y_data, xdims_list, axis);
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External(
"The stack XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
r));
dev_ctx.Wait();
std::free(x_datas_host);
xpu_free(x_datas_device);
"The stack XPU API return wrong value[%d %s]", r,
XPUAPIErrorMsg[r]));
}
};
......
......@@ -19,18 +19,19 @@ import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci
from op_test_xpu import XPUOpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
@skip_check_grad_ci(reason="There is no grad kernel for stack_xpu op.")
class TestStackOpBase(OpTest):
class TestStackOpBase(XPUOpTest):
def initDefaultParameters(self):
self.num_inputs = 4
self.input_dim = (5, 6, 7)
self.axis = 0
self.dtype = 'float64'
self.dtype = 'float32'
def initParameters(self):
pass
......@@ -73,7 +74,7 @@ class TestStackOp1(TestStackOpBase):
class TestStackOp2(TestStackOpBase):
def initParameters(self):
self.num_inputs = 20
self.num_inputs = 30
class TestStackOp3(TestStackOpBase):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册