diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
index 345eb2539523f623d00ee37fae3d4929fd5ba55a..1e0eab493802b9f0d5825d8c1fa5f60942e80407 100644
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
@@ -12,6 +12,7 @@ limitations under the License. */
 
 #pragma once
 
 #include <cmath>
+#include <type_traits>
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
@@ -20,7 +21,18 @@ namespace operators {
 
 template <typename T>
 struct PowFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); }
+  inline HOSTDEVICE T operator()(T a, T b) const {
+#ifdef __CUDA_ARCH__
+    // On CUDAPlace, std::pow(3, 1) calls pow(float, float) and may
+    // return a float like 2.99..., which truncates to 2 when cast
+    // to int by default, giving the wrong result.
+    // Use std::llrint to round to the nearest integer, which is 3.
+    if (std::is_integral<T>::value) {
+      return std::llrint(std::pow(a, b));
+    }
+#endif
+    return std::pow(a, b);
+  }
 };
 
 template <typename T>
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
index e6a065889c7b1a0c445b85c1282f3d6311caf816..ffdd6857a9b1f83581d90ffa63bf2c5e26582b5c 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid as fluid
 
 
 class TestElementwisePowOp(OpTest):
@@ -114,5 +115,48 @@ class TestElementwisePowOp_broadcast_4(TestElementwisePowOp):
         self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
 
 
+class TestElementwisePowOpInt(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {'X': np.asarray([1, 3, 6]), 'Y': np.asarray([1, 1, 1])}
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestElementwisePowGradOpInt(unittest.TestCase):
+    def setUp(self):
+        self.x = np.asarray([1, 3, 6])
+        self.y = np.asarray([1, 1, 1])
+        self.res = self.x**self.y
+        # dout = 1
+        self.grad_res = np.asarray([1, 1, 1])
+        # dx = dout * y * pow(x, y-1)
+        self.grad_x = self.grad_res * self.y * (self.x
+                                                **(self.y - 1)).astype("int")
+        # dy = dout * log(x) * pow(x, y)
+        self.grad_y = (self.grad_res * np.log(self.x) *
+                       (self.x**self.y)).astype("int")
+        print(self.grad_res, self.grad_x, self.grad_y)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if fluid.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            with fluid.dygraph.guard(place):
+                x = fluid.dygraph.to_variable(self.x, zero_copy=False)
+                y = fluid.dygraph.to_variable(self.y, zero_copy=False)
+                print(x, y)
+                x.stop_gradient = False
+                y.stop_gradient = False
+                res = x**y
+                res.backward()
+                self.assertTrue(np.array_equal(res.gradient(), self.grad_res))
+                self.assertTrue(np.array_equal(x.gradient(), self.grad_x))
+                self.assertTrue(np.array_equal(y.gradient(), self.grad_y))
+
+
 if __name__ == '__main__':
     unittest.main()
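
Note (editor): a minimal sketch, not part of the patch, of why the functor rounds with std::llrint instead of truncating. The value 2.9999998 below is an illustrative stand-in for what a single-precision pow(3.0f, 1.0f) can produce on the GPU; the exact device output is an assumption.

# why_llrint.py -- hypothetical file name; plain Python, no Paddle needed
approx = 2.9999998    # assumed stand-in for a device pow(3, 1) result
print(int(approx))    # 2: truncation toward zero, the old buggy behavior
print(round(approx))  # 3: round to nearest, what std::llrint does in the fix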
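Note (editor): a quick numeric check, separate from the patch, that the closed-form gradients asserted in TestElementwisePowGradOpInt (dx = dout * y * pow(x, y-1), dy = dout * log(x) * pow(x, y)) agree with a finite-difference estimate; the float inputs, eps, and tolerances are illustrative choices.

# check_pow_grads.py -- hypothetical helper, plain numpy
import numpy as np

x = np.asarray([1.0, 3.0, 6.0])   # float version of the test's X
y = np.asarray([1.0, 1.0, 1.0])   # float version of the test's Y
dout = np.ones_like(x)            # upstream gradient of ones, as in the test
eps = 1e-6

grad_x = dout * y * x**(y - 1)    # d(x**y)/dx = y * x**(y-1)
grad_y = dout * np.log(x) * x**y  # d(x**y)/dy = log(x) * x**y

# one-sided finite differences in x and y
num_grad_x = ((x + eps)**y - x**y) / eps
num_grad_y = (x**(y + eps) - x**y) / eps

assert np.allclose(grad_x, num_grad_x, atol=1e-3)
assert np.allclose(grad_y, num_grad_y, atol=1e-3)
print("gradient formulas match finite differences")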