Unverified commit 59b47f3b, authored by QingshuChen, committed by GitHub

feat: support check_nan_inf for kunlun/xpu device (#29694)

* feat: support check_nan_inf for kunlun device

* support kunlun stack

* minor
Parent 7498df25
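For context, Paddle's NaN/Inf checking is enabled through the FLAGS_check_nan_inf flag; this commit extends the check to tensors living on Kunlun/XPU devices. A minimal usage sketch (assuming a Paddle build with XPU support; the tensor math below is illustrative only):

    import os

    # The flag must be set before paddle is imported so the framework
    # picks it up; every op output is then scanned for NaN/Inf values.
    os.environ["FLAGS_check_nan_inf"] = "1"

    import paddle

    paddle.set_device("xpu")           # assumes a Kunlun/XPU build
    x = paddle.to_tensor([1.0, 2.0, 3.0])
    y = paddle.log(x - 2.0)            # produces NaN and -Inf entries
    # With the flag set, the run aborts here with an error naming the
    # offending operator and tensor instead of propagating bad values.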
@@ -333,6 +333,33 @@ void CheckVarHasNanOrInf(const std::string& op_type,
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.",
         var_name));
+#endif
+    return;
+  } else if (platform::is_xpu_place(tensor->place())) {
+#ifdef PADDLE_WITH_XPU
+    if (tensor->type() != proto::VarType::FP32) {
+      return;
+    }
+    float* cpu_data = new float[tensor->numel()];
+    xpu_memcpy(cpu_data, tensor->data<float>(), tensor->numel() * sizeof(float),
+               XPU_DEVICE_TO_HOST);
+    bool flag = false;
+    for (int i = 0; i < tensor->numel(); i++) {
+      if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
+        flag = true;
+        break;
+      }
+    }
+    delete[] cpu_data;
+    PADDLE_ENFORCE_NE(
+        flag, true,
+        platform::errors::Fatal("Operator %s output Tensor %s contains Inf.",
+                                op_type, var_name));
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Tensor[%s] use xpu place. PaddlePaddle must compile with XPU.",
+        var_name));
 #endif
     return;
   }
...
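The branch added above copies the tensor back to the host and scans it elementwise, and only FP32 tensors are checked. A rough numpy equivalent of that scan, for readers who don't want to trace the C++ (hypothetical helper name):

    import numpy as np

    def has_nan_or_inf(cpu_data):
        # Mirrors the loop above: flag the first NaN or Inf element.
        for v in cpu_data.ravel():
            if np.isnan(v) or np.isinf(v):
                return True
        return False

    assert has_nan_or_inf(np.array([1.0, float("nan")], dtype=np.float32))
    assert not has_nan_or_inf(np.zeros((2, 3), dtype=np.float32))

Note that the raised message always says "contains Inf." even when the offending value is a NaN; the check itself catches both.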
@@ -28,50 +28,34 @@ class StackXPUKernel : public framework::OpKernel<T> {
     auto* y = ctx.Output<Tensor>("Y");
     int axis = ctx.Attr<int>("axis");
     if (axis < 0) {
-      axis += (x[0]->dims().size() + 1);
+      axis += x[0]->dims().size() + 1;
     }
-    int n = static_cast<int>(x.size());
-    PADDLE_ENFORCE_LE(n, 24,
-                      platform::errors::InvalidArgument(
-                          "XPU only surpport at most 24 tensors for now"));
     auto* y_data = y->mutable_data<T>(ctx.GetPlace());
-    int pre = 1, post = 1;
     auto& dim = x[0]->dims();
-    for (auto i = 0; i < axis; ++i) {
-      pre *= dim[i];
+    std::vector<int> xdims;
+    for (auto i = 0; i < dim.size(); ++i) {
+      xdims.push_back(dim[i]);
     }
-    for (auto i = axis; i < dim.size(); ++i) {
-      post *= dim[i];
+    xdims.push_back(1);
+    std::vector<std::vector<int>> xdims_list;
+    int n = static_cast<int>(x.size());
+    for (int i = 0; i < n; i++) {
+      xdims_list.push_back(xdims);
     }
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    void* x_datas_host = std::malloc(n * sizeof(void*));
-    void* x_datas_device = nullptr;
-    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&x_datas_device),
-                                 n * sizeof(void*)),
-                      XPU_SUCCESS,
-                      platform::errors::ResourceExhausted(
-                          "\n\nOut of memory error on XPU, Cannot"
-                          "allocate %s memory on XPU. \n\nPlease "
-                          "check whether there is any other process "
-                          "using XPU.\n",
-                          string::HumanReadableSize(n * sizeof(void*))));
-    for (auto i = 0; i < n; ++i) {
-      ((const void**)x_datas_host)[i] = x[i]->data<T>();
+
+    std::vector<const T*> x_list;
+    for (int i = 0; i < n; i++) {
+      x_list.push_back(x[i]->data<T>());
     }
-    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
-                 x_datas_device, platform::CPUPlace(), x_datas_host,
-                 n * sizeof(void*));
-    int r = xpu::stack_forward<float>(dev_ctx.x_context(), pre, post, n,
-                                      x_datas_device, y_data);
-    PADDLE_ENFORCE_EQ(
-        r, xpu::Error_t::SUCCESS,
-        platform::errors::External(
-            "The stack XPU API return wrong value[%d], please check "
-            "where Baidu Kunlun Card is properly installed.",
-            r));
-    dev_ctx.Wait();
-    std::free(x_datas_host);
-    xpu_free(x_datas_device);
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    int r =
+        xpu::concat<T>(dev_ctx.x_context(), x_list, y_data, xdims_list, axis);
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::External(
+                          "The stack XPU API return wrong value[%d %s]", r,
+                          XPUAPIErrorMsg[r]));
   }
 };
...
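The rewrite above replaces the hand-rolled device-pointer table and xpu::stack_forward call with a single xpu::concat, and drops the old 24-tensor limit. It relies on a layout identity: stacking contiguous tensors along axis k yields the same flat buffer as concatenating them along k after appending a trailing unit dimension to each shape. A numpy sketch of that identity (illustrative only, not the XPU API):

    import numpy as np

    xs = [np.random.rand(5, 6, 7).astype(np.float32) for _ in range(4)]
    for axis in range(4):  # the stack axis ranges over 0..rank inclusive
        stacked = np.stack(xs, axis=axis)
        # Reshape each input to (5, 6, 7, 1) -- the xdims built above --
        # and concatenate; the flat memory matches the stacked result.
        concated = np.concatenate(
            [x.reshape(5, 6, 7, 1) for x in xs], axis=axis)
        assert np.array_equal(stacked.ravel(), concated.ravel())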
@@ -19,18 +19,19 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest, skip_check_grad_ci
+from op_test_xpu import XPUOpTest
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 
 
 @skip_check_grad_ci(reason="There is no grad kernel for stack_xpu op.")
-class TestStackOpBase(OpTest):
+class TestStackOpBase(XPUOpTest):
     def initDefaultParameters(self):
         self.num_inputs = 4
         self.input_dim = (5, 6, 7)
         self.axis = 0
-        self.dtype = 'float64'
+        self.dtype = 'float32'
 
     def initParameters(self):
         pass
...
@@ -73,7 +74,7 @@ class TestStackOp1(TestStackOpBase):
 class TestStackOp2(TestStackOpBase):
     def initParameters(self):
-        self.num_inputs = 20
+        self.num_inputs = 30
 
 
 class TestStackOp3(TestStackOpBase):
...
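The test now derives from XPUOpTest and uses float32 (the dtype the XPU kernel checks for), and bumping num_inputs to 30 exercises removal of the old 24-tensor cap. A standalone smoke test of the same kernel path might look like this (a sketch assuming an XPU build, not the verbatim test body):

    import numpy as np
    import paddle

    if paddle.is_compiled_with_xpu():
        paddle.set_device("xpu")
        xs = [paddle.to_tensor(np.random.rand(5, 6, 7).astype("float32"))
              for _ in range(30)]    # 30 inputs: past the old 24 limit
        out = paddle.stack(xs, axis=0)
        ref = np.stack([x.numpy() for x in xs], axis=0)
        np.testing.assert_allclose(out.numpy(), ref, rtol=1e-6)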