From d8314ff5dfe2c80ceaf3e59f71290c0e296ed7de Mon Sep 17 00:00:00 2001 From: Lin Manhui Date: Wed, 26 Oct 2022 17:37:59 +0800 Subject: [PATCH] [Fix] Fix paddle.pow() Gets Incorrect Result When Broadcasting Is Triggered (#47307) * Fix paddle.pow() bugs * Add unittest cases * Fix ut cases * Add ut cases on multiple devices --- paddle/phi/kernels/cpu/elementwise_kernel.cc | 11 +- .../phi/kernels/funcs/elementwise_functor.h | 32 +++ .../paddle/fluid/tests/unittests/test_pow.py | 185 +++++++++++------- 3 files changed, 150 insertions(+), 78 deletions(-) diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index b7c3f3c848..3e16d75377 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -91,8 +91,15 @@ void ElementwisePowRawKernel(const Context& dev_ctx, DenseTensor* out) { // allocate memory for out dev_ctx.template Alloc(out); - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::ElementwisePowFunctor(), out); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::ElementwisePowFunctor(), out); + } else { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::ElementwiseInversePowFunctor(), out); + } } template diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 1304fedbd2..b98247fdf0 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -606,6 +606,28 @@ struct ElementwisePowFunctor { } }; +template +struct ElementwiseInversePowFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { +// TODO(wujionghao): A potential speed improvement is supporting different +// types in C++. +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) + // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and + // it will return a float number like 2.99... , which floor to 2 + // when cast to int by default and it is wrong. + // Use llrint to cast it to the nearest integer, which is 3. + if (std::is_integral::value) { + return std::llrint( + std::pow(static_cast(b), static_cast(a))); + } +#endif +#ifdef PADDLE_WITH_XPU_KP + return pow(b, a); +#endif + return std::pow(b, a); + } +}; + template <> struct ElementwisePowFunctor { inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a, @@ -616,5 +638,15 @@ struct ElementwisePowFunctor { } }; +template <> +struct ElementwiseInversePowFunctor { + inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a, + const dtype::float16 b) const { + float f_a = static_cast(a); + float f_b = static_cast(b); + return static_cast(std::pow(f_b, f_a)); + } +}; + } // namespace funcs } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_pow.py b/python/paddle/fluid/tests/unittests/test_pow.py index 79282f3460..ea42c6f3e1 100755 --- a/python/paddle/fluid/tests/unittests/test_pow.py +++ b/python/paddle/fluid/tests/unittests/test_pow.py @@ -18,15 +18,18 @@ import numpy as np import paddle from paddle.static import Program, program_guard +import paddle.fluid.core as core DYNAMIC = 1 STATIC = 2 -def _run_power(mode, x, y): +def _run_power(mode, x, y, device='cpu'): # dynamic mode if mode == DYNAMIC: paddle.disable_static() + # Set device + paddle.set_device(device) # y is scalar if isinstance(y, (int, float)): x_ = paddle.to_tensor(x) @@ -48,7 +51,11 @@ def _run_power(mode, x, y): x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype) y_ = y res = paddle.pow(x_, y_) - place = paddle.CPUPlace() + place = ( + paddle.CPUPlace() + if device == 'cpu' + else paddle.CUDAPlace(0) + ) exe = paddle.static.Executor(place) outs = exe.run(feed={'x': x}, fetch_list=[res]) return outs[0] @@ -58,7 +65,11 @@ def _run_power(mode, x, y): x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype) y_ = paddle.static.data(name="y", shape=y.shape, dtype=y.dtype) res = paddle.pow(x_, y_) - place = paddle.CPUPlace() + place = ( + paddle.CPUPlace() + if device == 'cpu' + else paddle.CUDAPlace(0) + ) exe = paddle.static.Executor(place) outs = exe.run(feed={'x': x, 'y': y}, fetch_list=[res]) return outs[0] @@ -67,82 +78,104 @@ def _run_power(mode, x, y): class TestPowerAPI(unittest.TestCase): """TestPowerAPI.""" + def setUp(self): + self.places = ['cpu'] + if core.is_compiled_with_cuda(): + self.places.append('gpu') + def test_power(self): """test_power.""" np.random.seed(7) - # test 1-d float tensor ** float scalar - dims = (np.random.randint(200, 300),) - x = (np.random.rand(*dims) * 10).astype(np.float64) - y = np.random.rand() * 10 - res = _run_power(DYNAMIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - res = _run_power(STATIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - - # test 1-d float tensor ** int scalar - dims = (np.random.randint(200, 300),) - x = (np.random.rand(*dims) * 10).astype(np.float64) - y = int(np.random.rand() * 10) - res = _run_power(DYNAMIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - res = _run_power(STATIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - - x = (np.random.rand(*dims) * 10).astype(np.int64) - y = int(np.random.rand() * 10) - res = _run_power(DYNAMIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - res = _run_power(STATIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - - # test 1-d float tensor ** 1-d float tensor - dims = (np.random.randint(200, 300),) - x = (np.random.rand(*dims) * 10).astype(np.float64) - y = (np.random.rand(*dims) * 10).astype(np.float64) - res = _run_power(DYNAMIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - res = _run_power(STATIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - - # test 1-d int tensor ** 1-d int tensor - dims = (np.random.randint(200, 300),) - x = (np.random.rand(*dims) * 10).astype(np.int64) - y = (np.random.rand(*dims) * 10).astype(np.int64) - res = _run_power(DYNAMIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - res = _run_power(STATIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - - # test 1-d int tensor ** 1-d int tensor - dims = (np.random.randint(200, 300),) - x = (np.random.rand(*dims) * 10).astype(np.int32) - y = (np.random.rand(*dims) * 10).astype(np.int32) - res = _run_power(DYNAMIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - res = _run_power(STATIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - - # test 1-d int tensor ** 1-d int tensor - dims = (np.random.randint(200, 300),) - x = (np.random.rand(*dims) * 10).astype(np.float32) - y = (np.random.rand(*dims) * 10).astype(np.float32) - res = _run_power(DYNAMIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - res = _run_power(STATIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - - # test broadcast - dims = ( - np.random.randint(1, 10), - np.random.randint(5, 10), - np.random.randint(5, 10), - ) - x = (np.random.rand(*dims) * 10).astype(np.float64) - y = (np.random.rand(dims[-1]) * 10).astype(np.float64) - res = _run_power(DYNAMIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) - res = _run_power(STATIC, x, y) - np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + for place in self.places: + # test 1-d float tensor ** float scalar + dims = (np.random.randint(200, 300),) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = np.random.rand() * 10 + res = _run_power(DYNAMIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + res = _run_power(STATIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + + # test 1-d float tensor ** int scalar + dims = (np.random.randint(200, 300),) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = int(np.random.rand() * 10) + res = _run_power(DYNAMIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + res = _run_power(STATIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + + x = (np.random.rand(*dims) * 10).astype(np.int64) + y = int(np.random.rand() * 10) + res = _run_power(DYNAMIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + res = _run_power(STATIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + + # test 1-d float tensor ** 1-d float tensor + dims = (np.random.randint(200, 300),) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = (np.random.rand(*dims) * 10).astype(np.float64) + res = _run_power(DYNAMIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + res = _run_power(STATIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300),) + x = (np.random.rand(*dims) * 10).astype(np.int64) + y = (np.random.rand(*dims) * 10).astype(np.int64) + res = _run_power(DYNAMIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + res = _run_power(STATIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300),) + x = (np.random.rand(*dims) * 10).astype(np.int32) + y = (np.random.rand(*dims) * 10).astype(np.int32) + res = _run_power(DYNAMIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + res = _run_power(STATIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + + # test 1-d int tensor ** 1-d int tensor + dims = (np.random.randint(200, 300),) + x = (np.random.rand(*dims) * 10).astype(np.float32) + y = (np.random.rand(*dims) * 10).astype(np.float32) + res = _run_power(DYNAMIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + res = _run_power(STATIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + + # test float scalar ** 2-d float tensor + dims = (np.random.randint(2, 10), np.random.randint(5, 10)) + x = np.random.rand() * 10 + y = (np.random.rand(*dims) * 10).astype(np.float32) + res = _run_power(DYNAMIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + + # test 2-d float tensor ** float scalar + dims = (np.random.randint(2, 10), np.random.randint(5, 10)) + x = (np.random.rand(*dims) * 10).astype(np.float32) + y = np.random.rand() * 10 + res = _run_power(DYNAMIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + res = _run_power(STATIC, x, y, place) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + + # test broadcast + dims = ( + np.random.randint(1, 10), + np.random.randint(5, 10), + np.random.randint(5, 10), + ) + x = (np.random.rand(*dims) * 10).astype(np.float64) + y = (np.random.rand(dims[-1]) * 10).astype(np.float64) + res = _run_power(DYNAMIC, x, y) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) + res = _run_power(STATIC, x, y) + np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05) class TestPowerError(unittest.TestCase): -- GitLab