From e23dfed9ea00f69acd4ba90583d5e355cdaa3b59 Mon Sep 17 00:00:00 2001
From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com>
Date: Wed, 16 Nov 2022 14:26:40 +0800
Subject: [PATCH] Fix paddle rec, kim, dsin models' bugs (#47792)

* add stat tool

* add roll and roll_grad kernels and strided_slice and strided_slice_grad kernels, test=kunlun

* embedding and embedding_grad add int32 input, test=kunlun
---
 .../kernels/funcs/selected_rows_functor.cc    | 64 ++++++++++++++
 paddle/phi/kernels/xpu/add_n_kernel.cc        | 65 +++++++++++---
 .../phi/kernels/xpu/embedding_grad_kernel.cc  | 13 ++-
 paddle/phi/kernels/xpu/embedding_kernel.cc    | 12 ++-
 .../tests/unittests/xpu/test_sum_op_xpu.py    | 85 +++++++++++++++++++
 5 files changed, 224 insertions(+), 15 deletions(-)

diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc
index e1d45eef549..de362d45a8b 100644
--- a/paddle/phi/kernels/funcs/selected_rows_functor.cc
+++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc
@@ -320,11 +320,75 @@ struct SelectedRowsAddToTensor<phi::CPUContext, T> {
   }
 };
 
+#ifdef PADDLE_WITH_XPU
+template <typename T>
+struct SelectedRowsAddToTensor<phi::XPUContext, T> {
+  void operator()(const phi::XPUContext& context,
+                  const phi::SelectedRows& input1,
+                  phi::DenseTensor* input2) {
+    if (UNLIKELY(input1.rows().size() == 0)) {
+      LOG(WARNING) << "input selected rows is empty!";
+      return;
+    }
+    using XPUType = typename XPUTypeTrait<T>::Type;
+    auto in1_height = input1.height();
+    const auto& in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(
+        in1_height,
+        in2_dims[0],
+        phi::errors::InvalidArgument("The two inputs height must be equal."
+                                     "But received first input height = "
+                                     "[%d], second input height = [%d]",
+                                     in1_height,
+                                     in2_dims[0]));
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+    int64_t* in1_rows_data = nullptr;
+    xpu::VectorParam<int64_t> in1_rows_vec{
+        in1_rows.data(), static_cast<int>(in1_rows.size()), in1_rows_data};
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel,
+        input2->numel() / in1_height,
+        phi::errors::InvalidArgument(
+            "The two inputs width must be equal."
+            "But received first input width = [%d], second input width = [%d]",
+            in1_row_numel,
+            input2->numel() / in1_height));
+
+    auto* in1_data = in1_value.data<T>();
+    auto* out_data = input2->data<T>();
+
+    int h = in1_rows.size();
+    int w = in1_row_numel;
+    const std::vector<int> xshape{h, w};
+
+    int r = xpu::scatter(
+        context.x_context(),
+        nullptr,
+        reinterpret_cast<const XPUType*>(in1_data),
+        reinterpret_cast<XPUType*>(out_data),
+        in1_rows_vec,
+        xshape,
+        0,
+        false);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "scatter");
+  }
+};
+
+#endif
+
 template struct SelectedRowsAddToTensor<phi::CPUContext, float>;
 template struct SelectedRowsAddToTensor<phi::CPUContext, double>;
 template struct SelectedRowsAddToTensor<phi::CPUContext, int>;
 template struct SelectedRowsAddToTensor<phi::CPUContext, int64_t>;
 template struct SelectedRowsAddToTensor<phi::CPUContext, phi::dtype::bfloat16>;
+
+#ifdef PADDLE_WITH_XPU
+template struct SelectedRowsAddToTensor<phi::XPUContext, float>;
+#endif
 
 // This is a separated namespace for manipulate SelectedRows typed
 // data. Like merge duplicated rows, adding two SelectedRows etc.
 //
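The hunk above adds a scatter-add specialization: every row slice stored in the
SelectedRows payload is accumulated into the matching row of the dense output,
with the trailing false passed to xpu::scatter apparently selecting accumulate
rather than overwrite semantics. A minimal host-side sketch of that contract,
using a hypothetical helper name (illustration only, not part of the patch):

    #include <cstdint>
    #include <vector>

    // rows:  indices of the stored rows (n entries)
    // value: n x w SelectedRows payload, row-major
    // dense: height x w destination tensor, row-major
    void ScatterAddRows(const std::vector<int64_t>& rows,
                        const std::vector<float>& value,
                        std::vector<float>* dense,
                        int64_t w) {
      for (size_t i = 0; i < rows.size(); ++i) {
        for (int64_t j = 0; j < w; ++j) {
          // Row i of the payload lands on row rows[i] of the output.
          (*dense)[rows[i] * w + j] += value[i * w + j];
        }
      }
    }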
diff --git a/paddle/phi/kernels/xpu/add_n_kernel.cc b/paddle/phi/kernels/xpu/add_n_kernel.cc
index 324ced03d98..c1411a10d54 100644
--- a/paddle/phi/kernels/xpu/add_n_kernel.cc
+++ b/paddle/phi/kernels/xpu/add_n_kernel.cc
@@ -17,6 +17,8 @@
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+#include "paddle/phi/kernels/funcs/selected_rows_functor.h"
+
 namespace phi {
 
 template <typename T, typename Context>
@@ -25,6 +27,8 @@ void AddNKernel(const Context& dev_ctx,
                 DenseTensor* out) {
   using XPUType = typename XPUTypeTrait<T>::Type;
   size_t in_num = x.size();
+  dev_ctx.template Alloc<T>(out);
+
   bool in_place = false;
   if (x.size() > 0 && x[0]->initialized() && DenseTensor::classof(x[0])) {
     if ((static_cast<const DenseTensor*>(x[0]))->Holder() == out->Holder()) {
@@ -33,26 +37,61 @@ void AddNKernel(const Context& dev_ctx,
   }
 
   if (!in_place) {
-    dev_ctx.template Alloc<T>(out);
+    int r = xpu::constant(dev_ctx.x_context(),
+                          reinterpret_cast<XPUType*>(out->data<T>()),
+                          out->numel(),
+                          XPUType(0));
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
   }
 
+  std::vector<const XPUType*> ptrs;
+  phi::funcs::SelectedRowsAddToTensor<Context, T> functor;
   for (size_t i = 0; i < in_num; ++i) {
-    PADDLE_ENFORCE_EQ(DenseTensor::classof(x[i]),
-                      true,
-                      errors::InvalidArgument("XPU only support DensorTensor"));
+    if (DenseTensor::classof(x[i])) {
+      auto& in_t = *(static_cast<const DenseTensor*>(x[i]));
+      if (!in_t.initialized() || in_t.numel() == 0) {
+        continue;
+      }
+      ptrs.push_back(reinterpret_cast<const XPUType*>(in_t.data<T>()));
+    } else if (SelectedRows::classof(x[i])) {
+      PADDLE_ENFORCE_EQ(x[i]->dtype(),
+                        DataType::FLOAT32,
+                        errors::InvalidArgument("SelectedRowsAdd(scatter) only",
+                                                "supports float type"));
 
-    auto& in_t = *(static_cast<const DenseTensor*>(x[i]));
-    if (in_t.numel() == 0) {
-      continue;
+      auto& in_t = *(static_cast<const SelectedRows*>(x[i]));
+      functor(dev_ctx, in_t, out);
+    } else {
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "Expected type of Input(X) of %d-th must be Tensor, "
+          "SelectedRows. But got "
+          "unsupport type: %s.",
+          x[i]->type_info().name()));
     }
-    ptrs.push_back(reinterpret_cast<const XPUType*>(in_t.data<T>()));
   }
 
-  int r = xpu::sum(dev_ctx.x_context(),
-                   ptrs,
-                   reinterpret_cast<XPUType*>(out->data<T>()),
-                   out->numel());
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
+  if (ptrs.empty()) {
+    return;
+  } else if (ptrs.size() < x.size()) {
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    XPUType* out_t = RAII_GUARD.alloc_l3_or_gm<XPUType>(out->numel());
+    int r = xpu::sum(dev_ctx.x_context(), ptrs, out_t, out->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
+
+    r = xpu::add(dev_ctx.x_context(),
+                 reinterpret_cast<const XPUType*>(out->data<T>()),
+                 out_t,
+                 reinterpret_cast<XPUType*>(out->data<T>()),
+                 out->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "add");
+  } else {
+    int r = xpu::sum(dev_ctx.x_context(),
+                     ptrs,
+                     reinterpret_cast<XPUType*>(out->data<T>()),
+                     out->numel());
+
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
+  }
 }
 
 template <typename T, typename Context>
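With this change AddNKernel takes one of three paths: if every usable input is
dense, a single xpu::sum writes straight into out; if SelectedRows inputs are
mixed in, out is first zero-filled (unless it aliases an input), each
SelectedRows operand is scatter-added into out through the functor, the dense
operands are summed into a scratch buffer, and the scratch is added onto out;
if no input survives filtering, the kernel returns early. A host-side sketch
of the mixed path's ordering, with hypothetical names (illustration only, not
part of the patch):

    #include <cstddef>
    #include <vector>

    // dense_inputs: dense operands, empties already filtered out.
    // scattered:    out after zero-fill plus the SelectedRows scatter-adds,
    //               i.e. the state the kernel reaches before xpu::sum runs.
    std::vector<float> AddNMixed(
        const std::vector<std::vector<float>>& dense_inputs,
        std::vector<float> scattered) {
      const size_t numel = scattered.size();
      std::vector<float> scratch(numel, 0.0f);
      for (const auto& in : dense_inputs) {   // mirrors xpu::sum into scratch
        for (size_t i = 0; i < numel; ++i) scratch[i] += in[i];
      }
      for (size_t i = 0; i < numel; ++i) {    // mirrors the final xpu::add
        scattered[i] += scratch[i];
      }
      return scattered;
    }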
But got " + "unsupport type: %s.", + x[i]->type_info().name())); } - ptrs.push_back(reinterpret_cast(in_t.data())); } - int r = xpu::sum(dev_ctx.x_context(), - ptrs, - reinterpret_cast(out->data()), - out->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum"); + if (ptrs.empty()) { + return; + } else if (ptrs.size() < x.size()) { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + XPUType* out_t = RAII_GUARD.alloc_l3_or_gm(out->numel()); + int r = xpu::sum(dev_ctx.x_context(), ptrs, out_t, out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum"); + + r = xpu::add(dev_ctx.x_context(), + reinterpret_cast(out->data()), + out_t, + reinterpret_cast(out->data()), + out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + } else { + int r = xpu::sum(dev_ctx.x_context(), + ptrs, + reinterpret_cast(out->data()), + out->numel()); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum"); + } } template diff --git a/paddle/phi/kernels/xpu/embedding_grad_kernel.cc b/paddle/phi/kernels/xpu/embedding_grad_kernel.cc index 53b5cdb9016..cd3b920feff 100644 --- a/paddle/phi/kernels/xpu/embedding_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/embedding_grad_kernel.cc @@ -43,7 +43,18 @@ void EmbeddingGradKernel(const Context& ctx, "number of ids in LookupTableV2GradXPUKernel.")); auto& dev_ctx = ctx; - const int64_t* ids_data = ids_t->data(); + xpu::ctx_guard RAII_GUARD(ctx.x_context()); + const int64_t* ids_data; + if (ids_t->dtype() == phi::DataType::INT64) { + ids_data = ids_t->data(); + } else { + int64_t* ids_tt = RAII_GUARD.alloc_l3_or_gm(ids_t->numel()); + int r = xpu::cast( + ctx.x_context(), ids_t->data(), ids_tt, ids_t->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + ids_data = reinterpret_cast(ids_tt); + } + const T* d_output_data = d_output_t->data(); T* d_table_data = dev_ctx.template Alloc(d_table_t); int xm = d_table_t->dims()[0]; diff --git a/paddle/phi/kernels/xpu/embedding_kernel.cc b/paddle/phi/kernels/xpu/embedding_kernel.cc index d0e531f8c13..ace2116cdc9 100644 --- a/paddle/phi/kernels/xpu/embedding_kernel.cc +++ b/paddle/phi/kernels/xpu/embedding_kernel.cc @@ -42,7 +42,17 @@ void EmbeddingKernel(const Context &ctx, auto *table = table_t->data(); auto *output = dev_ctx.template Alloc(output_t); - const int64_t *ids = ids_t->data(); + xpu::ctx_guard RAII_GUARD(ctx.x_context()); + const int64_t *ids; + if (ids_t->dtype() == phi::DataType::INT64) { + ids = ids_t->data(); + } else { + int64_t *ids_tt = RAII_GUARD.alloc_l3_or_gm(ids_t->numel()); + int r = xpu::cast( + ctx.x_context(), ids_t->data(), ids_tt, ids_t->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + ids = reinterpret_cast(ids_tt); + } PADDLE_ENFORCE_EQ( ids_numel <= std::numeric_limits::max(), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py index 778cb66b9a8..84be81718fb 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py @@ -197,6 +197,91 @@ class TestSumOpError(unittest.TestCase): self.assertRaises(Exception, test_list_of_none_input) +class TestLoDTensorAndSelectedRowsOp(unittest.TestCase): + def setUp(self): + self.height = 10 + self.row_numel = 12 + self.rows = [0, 1, 2, 3, 4, 5, 6] + self.dtype = np.float32 + self.init_kernel_type() + + def check_with_place(self, place, inplace): + self.check_input_and_optput(place, inplace, True, True, True) + + def init_kernel_type(self): + pass + + def _get_array(self, rows, row_numel): + array = np.ones((len(rows), 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
index 778cb66b9a8..84be81718fb 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
@@ -197,6 +197,91 @@ class TestSumOpError(unittest.TestCase):
         self.assertRaises(Exception, test_list_of_none_input)
 
 
+class TestLoDTensorAndSelectedRowsOp(unittest.TestCase):
+    def setUp(self):
+        self.height = 10
+        self.row_numel = 12
+        self.rows = [0, 1, 2, 3, 4, 5, 6]
+        self.dtype = np.float32
+        self.init_kernel_type()
+
+    def check_with_place(self, place, inplace):
+        self.check_input_and_optput(place, inplace, True, True, True)
+
+    def init_kernel_type(self):
+        pass
+
+    def _get_array(self, rows, row_numel):
+        array = np.ones((len(rows), row_numel)).astype(self.dtype)
+        for i in range(len(rows)):
+            array[i] *= rows[i]
+        return array
+
+    def check_input_and_optput(
+        self,
+        place,
+        inplace,
+        w1_has_data=False,
+        w2_has_data=False,
+        w3_has_data=False,
+    ):
+        paddle.disable_static()
+        w1 = self.create_lod_tensor(place)
+        w2 = self.create_selected_rows(place, w2_has_data)
+
+        x = [w1, w2]
+        out = paddle.add_n(x)
+
+        result = np.ones((1, self.height)).astype(np.int32).tolist()[0]
+        for ele in self.rows:
+            result[ele] += 1
+
+        out_t = np.array(out)
+        self.assertEqual(out_t.shape[0], self.height)
+        np.testing.assert_array_equal(
+            out_t,
+            self._get_array([i for i in range(self.height)], self.row_numel)
+            * np.tile(np.array(result).reshape(self.height, 1), self.row_numel),
+        )
+
+        paddle.enable_static()
+
+    def create_selected_rows(self, place, has_data):
+        # create and initialize W Variable
+        if has_data:
+            rows = self.rows
+        else:
+            rows = []
+
+        w_array = self._get_array(self.rows, self.row_numel)
+        var = core.eager.Tensor(
+            core.VarDesc.VarType.FP32,
+            w_array.shape,
+            "selected_rows",
+            core.VarDesc.VarType.SELECTED_ROWS,
+            True,
+        )
+
+        w_selected_rows = var.value().get_selected_rows()
+        w_selected_rows.set_height(self.height)
+        w_selected_rows.set_rows(rows)
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(w_array, place)
+
+        return var
+
+    def create_lod_tensor(self, place):
+        w_array = self._get_array(
+            [i for i in range(self.height)], self.row_numel
+        )
+        return paddle.to_tensor(w_array)
+
+    def test_w_is_selected_rows(self):
+        places = [core.XPUPlace(0)]
+        for place in places:
+            self.check_with_place(place, True)
+
+
 support_types = get_xpu_op_support_types('sum')
 for stype in support_types:
     create_test_class(globals(), XPUTestSumOp, stype)
-- 
GitLab