Unverified commit f354e1d6, authored by Leo Chen, committed by GitHub

[NPU] fix some op bugs (#31855)

* fix some op bugs

* fix some bugs

* follow comments

* fix log level

* add ut
Parent 9754d0a7
......
@@ -364,4 +364,5 @@ REGISTER_OP_NPU_KERNEL(
 REGISTER_OP_NPU_KERNEL(
     square, ops::SquareNPUKernel<paddle::platform::NPUDeviceContext, float>,
     ops::SquareNPUKernel<paddle::platform::NPUDeviceContext,
-                         paddle::platform::float16>);
+                         paddle::platform::float16>,
+    ops::SquareNPUKernel<paddle::platform::NPUDeviceContext, int>);
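
With the added registration, square also dispatches for int32 inputs on NPU. A minimal usage sketch (assuming a Paddle build with NPU support; the device id and tensor values are illustrative):

    import paddle

    paddle.set_device("npu:0")  # assumes an NPU build and an available device
    x = paddle.to_tensor([1, 2, 3], dtype="int32")
    y = paddle.square(x)  # now served by the int SquareNPUKernel registered above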
......
@@ -148,13 +148,13 @@ class LazyZerosNPU {
     for (size_t i = 0; i < xs.size(); ++i) {
       auto* out = outs[i];
       if (found_inf_vec[0]) {
-        VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
+        VLOG(4) << "-- UpdateLossScaling: Find infinite grads. --";
         auto place = dev_ctx.GetPlace();
         auto stream = dev_ctx.stream();
-        auto g = out->mutable_data<int>(place);
+        auto g = out->mutable_data<T>(place);
         platform::NPUMemsetAsync(static_cast<void*>(g), 0,
-                                 out->numel() * sizeof(int), stream);
+                                 out->numel() * sizeof(T), stream);
       }
     }
   }
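
The sizeof(int) → sizeof(T) change matters because NPUMemsetAsync takes a byte count, which must match the element type of the buffer being zeroed. A numpy illustration of the mismatch (hypothetical 8-element tensor):

    import numpy as np

    numel = 8  # hypothetical gradient tensor size
    old_bytes = numel * np.dtype(np.int32).itemsize    # 32: what sizeof(int) always gave
    fp16_bytes = numel * np.dtype(np.float16).itemsize # 16: what a fp16 buffer actually holds
    assert old_bytes != fp16_bytes  # the old code zeroed the wrong byte count when T != int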
......
......
@@ -71,15 +71,6 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
     auto outs =
         ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
-    {
-      auto dx = outs;
-      auto x = ins;
-      for (size_t i = 0; i < dx.size(); ++i) {
-        if (dx[i] != nullptr) {
-          dx[i]->set_lod(x[i]->lod());
-        }
-      }
-    }
     PADDLE_ENFORCE_NOT_NULL(ins[0],
                             platform::errors::NotFound(
                                 "The first input tensor is not initialized."));
......
@@ -88,26 +79,39 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
     axis = ComputeAxis(static_cast<int64_t>(axis),
                        static_cast<int64_t>(ins[0]->dims().size()));
-    // get output tensor that the name is not kEmptyVarName
-    std::vector<framework::Tensor> outputs;
-    std::vector<int> sizes;
+    int offset = 0;
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
     for (size_t j = 0; j < outs.size(); ++j) {
+      // For stop gradient
       // get output tensor that the name is not kEmptyVarName
       if (out_var_names[j] != framework::kEmptyVarName &&
           outs[j]->numel() != 0UL) {
         outs[j]->mutable_data<T>(ctx.GetPlace());
-        outputs.push_back(*outs[j]);
-        sizes.push_back(outs[j]->dims()[axis]);
+        std::vector<int> offsets;
+        std::vector<int> sizes;
+        for (int dim = 0; dim < ins[j]->dims().size(); ++dim) {
+          if (dim == axis) {
+            offsets.push_back(offset);
+            sizes.push_back(ins[j]->dims()[dim]);
+          } else {
+            offsets.push_back(0);
+            sizes.push_back(ins[j]->dims()[dim]);
+          }
+        }
+        auto runner =
+            NpuOpRunner("SliceD", {*out_grad}, {*outs[j]},
+                        {{"offsets", offsets}, {"size", sizes}});
+        runner.Run(stream);
       }
+      if (ins[j]->numel() != 0UL) {
+        offset += ins[j]->dims()[axis];
+      }
     }
-    auto runner =
-        NpuOpRunner("SplitVD", {*out_grad}, outputs,
-                    {{"split_dim", axis},
-                     {"size_splits", sizes},
-                     {"num_split", static_cast<int>(outputs.size())}});
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
   }
 };
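
The rewrite replaces a single SplitVD call, which must produce an output for every split, with one SliceD call per gradient, so inputs whose gradient variable is kEmptyVarName (stop_gradient) can simply be skipped while offset still advances past them. A numpy sketch of the equivalent computation (hypothetical shapes, concatenation along axis 1):

    import numpy as np

    ins = [np.ones((2, 3)), np.ones((2, 4)), np.ones((2, 5))]  # hypothetical inputs
    out_grad = np.random.rand(2, 12)  # gradient of concat(ins, axis=1)
    axis = 1

    offset, grads = 0, []
    for x in ins:
        size = x.shape[axis]
        # SliceD equivalent: copy this input's span of out_grad; a skipped
        # (stop_gradient) input would only advance offset.
        grads.append(out_grad[:, offset:offset + size])
        offset += size
    assert [g.shape for g in grads] == [(2, 3), (2, 4), (2, 5)]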
......
......
@@ -34,23 +34,58 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(ctx.GetPlace());

     // special case
     if (x->dims().size() == 1 && keep_dims == false) {
       keep_dims = true;
     }

     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();

+    framework::Tensor cast_x;
+    framework::Tensor cast_out;
+    // NOTE: ReduceSumD only supports fp32 and fp16
+    if (x->type() != framework::proto::VarType::FP32 &&
+        x->type() != framework::proto::VarType::FP16) {
+      cast_x.Resize(x->dims());
+      cast_x.mutable_data<float>(ctx.GetPlace());
+      auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
+      auto runner_cast = NpuOpRunner(
+          "Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
+      runner_cast.Run(stream);
+
+      cast_out.Resize(out->dims());
+      cast_out.mutable_data<float>(ctx.GetPlace());
+    } else {
+      cast_x.ShareDataWith(*x);
+      cast_out.ShareDataWith(*out);
+    }
+
     if (reduce_all) {
       std::vector<int> dim_vec;
       for (int i = 0; i < x->dims().size(); i++) {
         dim_vec.push_back(i);
       }

-      auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out},
+      auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
                                 {{"axes", dim_vec}, {"keep_dims", keep_dims}});
       runner.Run(stream);
     } else {
-      auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out},
+      auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
                                 {{"axes", dims}, {"keep_dims", keep_dims}});
       runner.Run(stream);
     }
+
+    if (x->type() != framework::proto::VarType::FP32 &&
+        x->type() != framework::proto::VarType::FP16) {
+      auto dst_dtype = ConvertToNpuDtype(out->type());
+      auto runner_cast =
+          NpuOpRunner("Cast", {cast_out}, {*out},
+                      {{"dst_type", static_cast<int>(dst_dtype)}});
+      runner_cast.Run(stream);
+    }
   }
 };
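
For dtypes that ReduceSumD does not support (e.g. int32), the kernel now follows a cast-compute-cast pattern: cast to fp32, reduce, cast back. The same idea in numpy:

    import numpy as np

    x = np.arange(6, dtype=np.int32).reshape(2, 3)
    acc = x.astype(np.float32).sum(axis=0)  # reduce in fp32, as the NPU op requires
    out = acc.astype(x.dtype)               # cast back to the output dtype
    # out == [3, 5, 7] as int32; exact here because fp32 represents
    # small integers without rounding.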
......
......
@@ -36,7 +36,15 @@ void UpdateAttr(const framework::DDim in_dims, const std::vector<int> axes,
     if (axis == i) {
       start = starts[cnt];
-      end = ends[cnt] <= in_dims[i] ? ends[cnt] : end;
+      if (start < 0) {
+        start = (start + in_dims[i]);
+      }
+      start = std::max(start, static_cast<int>(0));
+
+      end = ends[cnt];
+      if (end < 0) {
+        end = (end + in_dims[i]);
+      }
+      end = std::min(end, static_cast<int>(in_dims[i]));
       cnt++;
     }
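
The old ternary only clamped an out-of-range upper bound, so negative Python-style indices passed through unchanged. A sketch of the fixed normalization (mirroring the C++ above):

    def normalize_range(start, end, dim):
        # Wrap negative indices, then clamp to [0, dim], as the fixed UpdateAttr does.
        if start < 0:
            start += dim
        start = max(start, 0)
        if end < 0:
            end += dim
        end = min(end, dim)
        return start, end

    assert normalize_range(-3, -1, 5) == (2, 4)  # matches Python's x[-3:-1]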
......
......
@@ -32,7 +32,7 @@ class TestConcat(OpTest):
     def setUp(self):
         self.set_npu()
         self.op_type = "concat"
-        self.place = paddle.NPUPlace(4)
+        self.place = paddle.NPUPlace(0)
         self.init_dtype()
         self.init_test_data()
......
@@ -66,7 +66,7 @@ class TestConcat(OpTest):
     def test_check_grad(self):
         self.check_grad_with_place(
-            self.place, ['x0'], 'Out', check_dygraph=False)
+            self.place, ['x0', 'x2'], 'Out', check_dygraph=False)
         self.check_grad_with_place(
             self.place, ['x1'], 'Out', check_dygraph=False)
         self.check_grad_with_place(
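
Checking ['x0', 'x2'] in a single call leaves x1 without a gradient output, which exercises the kEmptyVarName branch handled by the SliceD rewrite above; the old SplitVD-based kernel could not skip an output that way.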
......
@@ -77,7 +77,7 @@ class TestConcatFP16(OpTest):
     def setUp(self):
         self.set_npu()
         self.op_type = "concat"
-        self.place = paddle.NPUPlace(4)
+        self.place = paddle.NPUPlace(0)
         self.init_dtype()
         self.init_test_data()
......
......
@@ -32,6 +32,7 @@ class TestReduceSum(OpTest):
     def setUp(self):
         np.random.seed(SEED)
         self.set_npu()
+        self.init_dtype()
         self.place = paddle.NPUPlace(0)
         self.init_op_type()
         self.initTestCase()
......
@@ -42,7 +43,7 @@ class TestReduceSum(OpTest):
             'keep_dim': self.keep_dim,
             'reduce_all': self.reduce_all
         }
-        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
+        self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)}
         if self.attrs['reduce_all']:
             self.outputs = {'Out': self.inputs['X'].sum()}
         else:
......
@@ -78,6 +79,11 @@ class TestReduceSum(OpTest):
 #
+class TestReduceSum2(OpTest):
+    def init_dtype(self):
+        self.dtype = np.int32
+
+
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestReduceSumNet(unittest.TestCase):
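
TestReduceSum2 sets init_dtype to np.int32, exercising the new cast-compute-cast path in the reduce_sum kernel above.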
......
......
@@ -63,10 +63,22 @@ class TestSliceOp(OpTest):
         self.check_output_with_place(self.place, check_dygraph=False)

     def test_check_grad_normal(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad_with_place(
             self.place, ['Input'], 'Out', check_dygraph=False)

+class TestSliceOp2(TestSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+        self.starts = [1, 0, -3]
+        self.ends = [3, 3, -1]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1:3, 0:3, -3:-1, :]
+
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestSliceOpFp16(TestSliceOp):
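
TestSliceOp2 uses negative starts and ends, covering the negative-index normalization added to UpdateAttr above.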
......