From d8314ff5dfe2c80ceaf3e59f71290c0e296ed7de Mon Sep 17 00:00:00 2001
From: Lin Manhui
Date: Wed, 26 Oct 2022 17:37:59 +0800
Subject: [PATCH] [Fix] Fix paddle.pow() Gets Incorrect Result When
 Broadcasting Is Triggered (#47307)

* Fix paddle.pow() bugs

* Add unittest cases

* Fix ut cases

* Add ut cases on multiple devices
---
 paddle/phi/kernels/cpu/elementwise_kernel.cc  |  11 +-
 .../phi/kernels/funcs/elementwise_functor.h   |  32 +++
 .../paddle/fluid/tests/unittests/test_pow.py  | 185 +++++++++++-------
 3 files changed, 150 insertions(+), 78 deletions(-)

diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc
index b7c3f3c8482..3e16d75377e 100644
--- a/paddle/phi/kernels/cpu/elementwise_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc
@@ -91,8 +91,15 @@ void ElementwisePowRawKernel(const Context& dev_ctx,
                              DenseTensor* out) {
   // allocate memory for out
   dev_ctx.template Alloc<T>(out);
-  funcs::ElementwiseCompute<funcs::ElementwisePowFunctor<T>, T>(
-      dev_ctx, x, y, axis, funcs::ElementwisePowFunctor<T>(), out);
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+  if (x_dims.size() >= y_dims.size()) {
+    funcs::ElementwiseCompute<funcs::ElementwisePowFunctor<T>, T>(
+        dev_ctx, x, y, axis, funcs::ElementwisePowFunctor<T>(), out);
+  } else {
+    funcs::ElementwiseCompute<funcs::ElementwiseInversePowFunctor<T>, T>(
+        dev_ctx, x, y, axis, funcs::ElementwiseInversePowFunctor<T>(), out);
+  }
 }
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h
index 1304fedbd2e..b98247fdf0c 100644
--- a/paddle/phi/kernels/funcs/elementwise_functor.h
+++ b/paddle/phi/kernels/funcs/elementwise_functor.h
@@ -606,6 +606,28 @@ struct ElementwisePowFunctor {
   }
 };
 
+template <typename T>
+struct ElementwiseInversePowFunctor {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+// TODO(wujionghao): A potential speed improvement is supporting different
+// types in C++.
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
+    // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
+    // it will return a float number like 2.99... , which floor to 2
+    // when cast to int by default and it is wrong.
+    // Use llrint to cast it to the nearest integer, which is 3.
+    if (std::is_integral<T>::value) {
+      return std::llrint(
+          std::pow(static_cast<double>(b), static_cast<double>(a)));
+    }
+#endif
+#ifdef PADDLE_WITH_XPU_KP
+    return pow(b, a);
+#endif
+    return std::pow(b, a);
+  }
+};
+
 template <>
 struct ElementwisePowFunctor<dtype::float16> {
   inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a,
@@ -616,5 +638,15 @@
   }
 };
 
+template <>
+struct ElementwiseInversePowFunctor<dtype::float16> {
+  inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a,
+                                              const dtype::float16 b) const {
+    float f_a = static_cast<float>(a);
+    float f_b = static_cast<float>(b);
+    return static_cast<dtype::float16>(std::pow(f_b, f_a));
+  }
+};
+
 }  // namespace funcs
 }  // namespace phi
diff --git a/python/paddle/fluid/tests/unittests/test_pow.py b/python/paddle/fluid/tests/unittests/test_pow.py
index 79282f3460f..ea42c6f3e12 100755
--- a/python/paddle/fluid/tests/unittests/test_pow.py
+++ b/python/paddle/fluid/tests/unittests/test_pow.py
@@ -18,15 +18,18 @@
 import numpy as np
 
 import paddle
 from paddle.static import Program, program_guard
+import paddle.fluid.core as core
 
 DYNAMIC = 1
 STATIC = 2
 
 
-def _run_power(mode, x, y):
+def _run_power(mode, x, y, device='cpu'):
     # dynamic mode
     if mode == DYNAMIC:
         paddle.disable_static()
+        # Set device
+        paddle.set_device(device)
         # y is scalar
         if isinstance(y, (int, float)):
             x_ = paddle.to_tensor(x)
@@ -48,7 +51,11 @@
             x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype)
             y_ = y
             res = paddle.pow(x_, y_)
-            place = paddle.CPUPlace()
+            place = (
+                paddle.CPUPlace()
+                if device == 'cpu'
+                else paddle.CUDAPlace(0)
+            )
             exe = paddle.static.Executor(place)
             outs = exe.run(feed={'x': x}, fetch_list=[res])
             return outs[0]
@@ -58,7 +65,11 @@
             x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype)
             y_ = paddle.static.data(name="y", shape=y.shape, dtype=y.dtype)
             res = paddle.pow(x_, y_)
-            place = paddle.CPUPlace()
+            place = (
+                paddle.CPUPlace()
+                if device == 'cpu'
+                else paddle.CUDAPlace(0)
+            )
             exe = paddle.static.Executor(place)
             outs = exe.run(feed={'x': x, 'y': y}, fetch_list=[res])
             return outs[0]
@@ -67,82 +78,104 @@
 class TestPowerAPI(unittest.TestCase):
     """TestPowerAPI."""
 
+    def setUp(self):
+        self.places = ['cpu']
+        if core.is_compiled_with_cuda():
+            self.places.append('gpu')
+
     def test_power(self):
         """test_power."""
         np.random.seed(7)
-        # test 1-d float tensor ** float scalar
-        dims = (np.random.randint(200, 300),)
-        x = (np.random.rand(*dims) * 10).astype(np.float64)
-        y = np.random.rand() * 10
-        res = _run_power(DYNAMIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-        res = _run_power(STATIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-
-        # test 1-d float tensor ** int scalar
-        dims = (np.random.randint(200, 300),)
-        x = (np.random.rand(*dims) * 10).astype(np.float64)
-        y = int(np.random.rand() * 10)
-        res = _run_power(DYNAMIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-        res = _run_power(STATIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-
-        x = (np.random.rand(*dims) * 10).astype(np.int64)
-        y = int(np.random.rand() * 10)
-        res = _run_power(DYNAMIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-        res = _run_power(STATIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-
-        # test 1-d float tensor ** 1-d float tensor
-        dims = (np.random.randint(200, 300),)
-        x = (np.random.rand(*dims) * 10).astype(np.float64)
-        y = (np.random.rand(*dims) * 10).astype(np.float64)
-        res = _run_power(DYNAMIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-        res = _run_power(STATIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-
-        # test 1-d int tensor ** 1-d int tensor
-        dims = (np.random.randint(200, 300),)
-        x = (np.random.rand(*dims) * 10).astype(np.int64)
-        y = (np.random.rand(*dims) * 10).astype(np.int64)
-        res = _run_power(DYNAMIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-        res = _run_power(STATIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-
-        # test 1-d int tensor ** 1-d int tensor
-        dims = (np.random.randint(200, 300),)
-        x = (np.random.rand(*dims) * 10).astype(np.int32)
-        y = (np.random.rand(*dims) * 10).astype(np.int32)
-        res = _run_power(DYNAMIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-        res = _run_power(STATIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-
-        # test 1-d int tensor ** 1-d int tensor
-        dims = (np.random.randint(200, 300),)
-        x = (np.random.rand(*dims) * 10).astype(np.float32)
-        y = (np.random.rand(*dims) * 10).astype(np.float32)
-        res = _run_power(DYNAMIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-        res = _run_power(STATIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-
-        # test broadcast
-        dims = (
-            np.random.randint(1, 10),
-            np.random.randint(5, 10),
-            np.random.randint(5, 10),
-        )
-        x = (np.random.rand(*dims) * 10).astype(np.float64)
-        y = (np.random.rand(dims[-1]) * 10).astype(np.float64)
-        res = _run_power(DYNAMIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
-        res = _run_power(STATIC, x, y)
-        np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+        for place in self.places:
+            # test 1-d float tensor ** float scalar
+            dims = (np.random.randint(200, 300),)
+            x = (np.random.rand(*dims) * 10).astype(np.float64)
+            y = np.random.rand() * 10
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+            res = _run_power(STATIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+
+            # test 1-d float tensor ** int scalar
+            dims = (np.random.randint(200, 300),)
+            x = (np.random.rand(*dims) * 10).astype(np.float64)
+            y = int(np.random.rand() * 10)
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+            res = _run_power(STATIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+
+            x = (np.random.rand(*dims) * 10).astype(np.int64)
+            y = int(np.random.rand() * 10)
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+            res = _run_power(STATIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+
+            # test 1-d float tensor ** 1-d float tensor
+            dims = (np.random.randint(200, 300),)
+            x = (np.random.rand(*dims) * 10).astype(np.float64)
+            y = (np.random.rand(*dims) * 10).astype(np.float64)
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+            res = _run_power(STATIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+
+            # test 1-d int tensor ** 1-d int tensor
+            dims = (np.random.randint(200, 300),)
+            x = (np.random.rand(*dims) * 10).astype(np.int64)
+            y = (np.random.rand(*dims) * 10).astype(np.int64)
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+            res = _run_power(STATIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+
+            # test 1-d int tensor ** 1-d int tensor
+            dims = (np.random.randint(200, 300),)
+            x = (np.random.rand(*dims) * 10).astype(np.int32)
+            y = (np.random.rand(*dims) * 10).astype(np.int32)
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+            res = _run_power(STATIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+
+            # test 1-d float tensor ** 1-d float tensor
+            dims = (np.random.randint(200, 300),)
+            x = (np.random.rand(*dims) * 10).astype(np.float32)
+            y = (np.random.rand(*dims) * 10).astype(np.float32)
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+            res = _run_power(STATIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+
+            # test float scalar ** 2-d float tensor
+            dims = (np.random.randint(2, 10), np.random.randint(5, 10))
+            x = np.random.rand() * 10
+            y = (np.random.rand(*dims) * 10).astype(np.float32)
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+
+            # test 2-d float tensor ** float scalar
+            dims = (np.random.randint(2, 10), np.random.randint(5, 10))
+            x = (np.random.rand(*dims) * 10).astype(np.float32)
+            y = np.random.rand() * 10
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+            res = _run_power(STATIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+
+            # test broadcast
+            dims = (
+                np.random.randint(1, 10),
+                np.random.randint(5, 10),
+                np.random.randint(5, 10),
+            )
+            x = (np.random.rand(*dims) * 10).astype(np.float64)
+            y = (np.random.rand(dims[-1]) * 10).astype(np.float64)
+            res = _run_power(DYNAMIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
+            res = _run_power(STATIC, x, y, place)
+            np.testing.assert_allclose(res, np.power(x, y), rtol=1e-05)
 
 
 class TestPowerError(unittest.TestCase):
-- 
GitLab
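
For reference, the user-visible behavior this patch addresses: when y has more
dimensions than x, funcs::ElementwiseCompute swaps the operands internally, so
before this fix paddle.pow(x, y) was effectively evaluated as pow(y, x) on the
broadcast path. ElementwiseInversePowFunctor computes pow(b, a) to undo that
swap. The snippet below is a minimal sketch, not part of the patch, and assumes
a Paddle build that includes this fix:

    import numpy as np
    import paddle

    # x has fewer dimensions than y, which triggers the broadcast path where
    # ElementwiseCompute swaps its operands. With this patch the CPU kernel
    # dispatches to ElementwiseInversePowFunctor (pow(b, a)), so the result
    # still equals x ** y elementwise.
    x = np.random.rand(5).astype(np.float64)                # shape (5,)
    y = (np.random.rand(3, 4, 5) * 10).astype(np.float64)  # shape (3, 4, 5)

    out = paddle.pow(paddle.to_tensor(x), paddle.to_tensor(y))
    np.testing.assert_allclose(out.numpy(), np.power(x, y), rtol=1e-05)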