diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index a8144f835c8d17975220e67b39140fb7b1df9b76..87db2affdd75303790b4333d31de156fc760b3e6 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -527,6 +527,7 @@ XPUOpMap& get_kl2_ops() {
     {"reduce_max", XPUKernelSet({phi::DataType::FLOAT32})},
     {"reduce_mean_grad", XPUKernelSet({phi::DataType::FLOAT32})},
     {"reduce_mean", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"reduce_min_grad", XPUKernelSet({phi::DataType::FLOAT32})},
     {"reduce_min", XPUKernelSet({phi::DataType::FLOAT32})},
     {"reduce_prod", XPUKernelSet({phi::DataType::FLOAT32})},
     {"reduce_sum_grad", XPUKernelSet({phi::DataType::FLOAT32})},
diff --git a/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..25fb123e11a4f30b7b3fd27deb1db90dc27fee9e
--- /dev/null
+++ b/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc
@@ -0,0 +1,122 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_min_grad_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/phi/kernels/xpu/reduce.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ReduceMinGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out,
+                         const DenseTensor& out_grad,
+                         const IntArray& dims_arr,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DenseTensor* x_grad) {
+  reduce_all = recompute_reduce_all(x, dims_arr, reduce_all);
+  auto dims = dims_arr.GetData();
+
+  dev_ctx.template Alloc<T>(x_grad);
+  const T* x_data = x.data<T>();
+  const T* out_data = out.data<T>();
+  const T* out_grad_data = out_grad.data<T>();
+  auto* x_grad_data = x_grad->data<T>();
+  const auto& input_dim_size = x.dims().size();
+  std::vector<int> true_dims;
+  for (size_t i = 0; i < dims.size(); ++i) {
+    if (dims[i] < 0) {
+      true_dims.push_back(dims[i] + input_dim_size);
+    } else {
+      true_dims.push_back(dims[i]);
+    }
+  }
+  std::vector<int> ydims(input_dim_size);
+  std::vector<int> xdims((input_dim_size));
+  std::set<int> dims_set(true_dims.begin(), true_dims.end());
+  for (auto i = 0; i < input_dim_size; i++) {
+    xdims[i] = x.dims()[i];
+    if (dims_set.find(i) != dims_set.end() || reduce_all) {
+      ydims[i] = 1;
+    } else {
+      ydims[i] = x.dims()[i];
+    }
+  }
+
+  T* brocast1 = nullptr;
+  T* brocast2 = nullptr;
+  bool* equal = nullptr;
+  PADDLE_ENFORCE_EQ(
+      xpu_malloc(reinterpret_cast<void**>(&brocast1), x.numel() * sizeof(T)),
+      XPU_SUCCESS,
+      errors::ResourceExhausted("XPU has no enough memory"));
+  PADDLE_ENFORCE_EQ(
+      xpu_malloc(reinterpret_cast<void**>(&equal), x.numel() * sizeof(bool)),
+      XPU_SUCCESS,
+      errors::ResourceExhausted("XPU has no enough memory"));
+  PADDLE_ENFORCE_EQ(
+      xpu_malloc(reinterpret_cast<void**>(&brocast2), x.numel() * sizeof(T)),
+      XPU_SUCCESS,
+      errors::ResourceExhausted("XPU has no enough memory"));
+
+  // use [1] to replace [], because xpu does not support []
+  if (xdims.size() == 0) {
+    xdims = std::vector<int>({1});
+  }
+  if (ydims.size() == 0) {
+    ydims = std::vector<int>({1});
+  }
+
+  // step 1. broadcast out and out_grad to the shape of x
+  int r =
+      xpu::broadcast<T>(dev_ctx.x_context(), out_data, brocast1, ydims, xdims);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
+
+  r = xpu::broadcast<T>(
+      dev_ctx.x_context(), out_grad_data, brocast2, ydims, xdims);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
+
+  // step 2. compare the broadcast out with x
+  r = xpu::equal<T>(dev_ctx.x_context(), x_data, brocast1, equal, x.numel());
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "equal");
+  // step 3. get x_grad
+  r = xpu::constant<T>(dev_ctx.x_context(), brocast1, x.numel(), 0);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+  r = xpu::select<T>(dev_ctx.x_context(),
+                     equal,
+                     brocast2,
+                     brocast1,
+                     x_grad_data,
+                     xdims,
+                     xdims);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "select");
+
+  if (dev_ctx.x_context()->xpu_stream) {
+    dev_ctx.Wait();
+  }
+  xpu_free(brocast1);
+  xpu_free(brocast2);
+  xpu_free(equal);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(min_grad, XPU, ALL_LAYOUT, phi::ReduceMinGradKernel, float) {
+}
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
index 462e7457afae11f9e76c46da0a7870779b88631a..dd00a711f85ac06788afc3d6e8033f431ef8191b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
@@ -47,6 +47,7 @@ class XPUTestReduceMaxOp(XPUOpTestWrapper):
                 'use_xpu': True,
                 'reduce_all': self.reduce_all,
                 'keep_dim': self.keep_dim,
+                'dim': self.axis,
             }
             self.inputs = {'X': np.random.random(self.shape).astype("float32")}
             if self.attrs['reduce_all']:
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py
index 0227ffb77d1994f8226f5e609d7bf09727cf9232..87ab399863596fd55167b85562f4e5774a7d6a6e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py
@@ -47,6 +47,7 @@ class XPUTestReduceMinOp(XPUOpTestWrapper):
                 'use_xpu': True,
                 'reduce_all': self.reduce_all,
                 'keep_dim': self.keep_dim,
+                'dim': self.axis,
             }
             self.inputs = {'X': np.random.random(self.shape).astype("float32")}
             if self.attrs['reduce_all']:
@@ -68,15 +69,50 @@ class XPUTestReduceMinOp(XPUOpTestWrapper):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            pass
+            self.check_grad_with_place(self.place, ['X'], 'Out')
 
     class XPUTestReduceMinCase1(XPUTestReduceMinBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (0,)
+            self.reduce_all = False
+            self.keep_dim = False
+
+    class XPUTestReduceMinCase2(XPUTestReduceMinBase):
         def init_case(self):
             self.shape = (5, 6, 10)
             self.axis = (0,)
             self.reduce_all = False
             self.keep_dim = True
 
+    class XPUTestReduceMinCase3(XPUTestReduceMinBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (0,)
+            self.reduce_all = True
+            self.keep_dim = False
+
+    class XPUTestReduceMinCase4(XPUTestReduceMinBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (1,)
+            self.reduce_all = False
+            self.keep_dim = False
+
+    class XPUTestReduceMinCase5(XPUTestReduceMinBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (1,)
+            self.reduce_all = False
+            self.keep_dim = True
+
+    class XPUTestReduceMinCase6(XPUTestReduceMinBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (1,)
+            self.reduce_all = True
+            self.keep_dim = False
+
 
 support_types = get_xpu_op_support_types('reduce_min')
 for stype in support_types:
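Note (not part of the diff): the backward rule the new XPU kernel implements can be sketched in NumPy as below — broadcast `out` and `out_grad` back to the shape of `x`, mark the positions where `x` equals the broadcast minimum, and pass the upstream gradient through only at those positions (tied minima each receive the full gradient, matching the kernel's `equal` + `select` path). The function and variable names here are illustrative only, not APIs from the PR.

```python
import numpy as np


def reduce_min_grad_reference(x, out_grad, axis):
    # Forward min, kept with reduced dims so it broadcasts against x
    # (mirrors "step 1. broadcast out and out_grad" in the kernel).
    out = x.min(axis=axis, keepdims=True)
    # Step 2: positions where x holds the minimum.
    equal = x == out
    # Step 3: route the upstream gradient only to those positions, zero elsewhere.
    grad = np.broadcast_to(out_grad.reshape(out.shape), x.shape)
    return np.where(equal, grad, 0.0)


x = np.random.random((5, 6, 10)).astype("float32")
out_grad = np.ones((6, 10), dtype="float32")  # gradient w.r.t. x.min(axis=0)
print(reduce_min_grad_reference(x, out_grad, axis=0).shape)  # (5, 6, 10), same as x
```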