diff --git a/paddle/fluid/operators/dist_op.cu b/paddle/fluid/operators/dist_op.cu
index 499f5572910dd7666973bf077bf919a0378cfe52..90674969e283f1cba816ad46802cdbf971bcc555 100644
--- a/paddle/fluid/operators/dist_op.cu
+++ b/paddle/fluid/operators/dist_op.cu
@@ -15,9 +15,18 @@
 #include "paddle/fluid/operators/dist_op.h"
 
 namespace ops = paddle::operators;
+#ifdef PADDLE_WITH_HIP
+// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
+// does not support double on the HIPCC platform (to be fixed in Eigen3)
+REGISTER_OP_CUDA_KERNEL(
+    dist, ops::DistKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    dist_grad, ops::DistGradKernel<paddle::platform::CUDADeviceContext, float>);
+#else
 REGISTER_OP_CUDA_KERNEL(
     dist, ops::DistKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DistKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     dist_grad, ops::DistGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DistGradKernel<paddle::platform::CUDADeviceContext, double>);
+#endif
diff --git a/paddle/fluid/operators/math/math_cuda_utils.h b/paddle/fluid/operators/math/math_cuda_utils.h
index fbb84226478937f056b1322326b04b90dcc3f02e..e97dbd20ca142af75420ccf3ce349c1bdc928b09 100644
--- a/paddle/fluid/operators/math/math_cuda_utils.h
+++ b/paddle/fluid/operators/math/math_cuda_utils.h
@@ -214,7 +214,7 @@ __inline__ __device__ T warpReduceMax(T val, unsigned lane_mask) {
 template <typename T>
 __inline__ __device__ T warpReduceMin(T val, unsigned lane_mask) {
   for (int mask = HALF_WARP; mask > 0; mask >>= 1)
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
     val = min(val, __shfl_xor_sync(lane_mask, val, mask, warpSize));
 #else
     val = min(val, __shfl_xor(val, mask, warpSize));
@@ -226,7 +226,7 @@ __inline__ __device__ T warpReduceMin(T val, unsigned lane_mask) {
  * threads are less than warpSize.*/
 template <typename T>
 __inline__ __device__ T PartialWarpReduceMin(T val, unsigned lane_mask) {
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
   T warp_val = __shfl_sync(lane_mask, val, 0, warpSize);
 #else
   T warp_val = __shfl(
@@ -235,7 +235,7 @@ __inline__ __device__ T PartialWarpReduceMin(T val, unsigned lane_mask) {
   warp_val = val;
 
   for (int offset = HALF_WARP; offset > 0; offset >>= 1)
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
     warp_val =
         min(warp_val, __shfl_down_sync(lane_mask, warp_val, offset, warpSize));
 #else
@@ -298,9 +298,15 @@ __inline__ __device__ T PartialBlockReduceMin(T val, unsigned mask) {
   __syncthreads();
 
   shared[lane] = PartialWarpReduceMin(shared[lane], mask);
+#if defined(PADDLE_WITH_HIP)
+  // HIP does not support __syncwarp; using __syncthreads() instead is correct,
+  // although it brings a slight performance decrease.
+  __syncthreads();
+#else
   __syncwarp();
+#endif
 
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
   val = __shfl_sync(mask, shared[lane], 0, warpSize);
 #else
   val = __shfl(shared[lane], 0, warpSize);
diff --git a/python/paddle/fluid/tests/unittests/dist_test.sh b/python/paddle/fluid/tests/unittests/dist_test.sh
index d5a6490042b20a4f9160c55fbb93e9f2f8092eae..69a893a7ddc13949bd5e0c4aedb0e81392126736 100644
--- a/python/paddle/fluid/tests/unittests/dist_test.sh
+++ b/python/paddle/fluid/tests/unittests/dist_test.sh
@@ -1,4 +1,19 @@
 #!/bin/bash
+
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 unset https_proxy http_proxy
 export FLAGS_rpc_disable_reuse_port=1
 
@@ -50,14 +65,30 @@ do
     cat -n ${log}
 done
 
+# check CUDA or ROCm env
+GPU_SYS_INFO_CMD=nvidia-smi
+
+which ${GPU_SYS_INFO_CMD}
+exit_code=$?
+if [[ $exit_code -ne 0 ]]; then
+    GPU_SYS_INFO_CMD=rocm-smi
+fi
+
+which ${GPU_SYS_INFO_CMD}
+exit_code=$?
+if [[ $exit_code -ne 0 ]]; then
+    echo "nvidia-smi or rocm-smi failed with ${exit_code}"
+    exit ${exit_code}
+fi
+
 #display system context
 for i in {1..2}; do
     sleep 3
     ps -aux
     netstat -anlp
 
-    if hash "nvidia-smi" > /dev/null; then
-        nvidia-smi
+    if hash "${GPU_SYS_INFO_CMD}" > /dev/null; then
+        ${GPU_SYS_INFO_CMD}
     fi
 done
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py b/python/paddle/fluid/tests/unittests/test_dist_op.py
index 0f71027d274018a48e769a28ff9679204251c1d3..b9b8ea92cb3a845afd3d090e1999b3e149850952 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_op.py
@@ -39,9 +39,10 @@ class TestDistOp(OpTest):
         self.op_type = 'dist'
         self.attrs = {}
         self.init_case()
+        self.init_data_type()
         self.inputs = {
-            "X": np.random.random(self.x_shape).astype("float64"),
-            "Y": np.random.random(self.y_shape).astype("float64")
+            "X": np.random.random(self.x_shape).astype(self.data_type),
+            "Y": np.random.random(self.y_shape).astype(self.data_type)
         }
 
         self.attrs["p"] = self.p
@@ -55,6 +56,10 @@ class TestDistOp(OpTest):
         self.y_shape = (120)
         self.p = 0.
 
+    def init_data_type(self):
+        self.data_type = np.float32 if core.is_compiled_with_rocm(
+        ) else np.float64
+
     def calc_gradient(self):
         x = self.inputs["X"]
         y = self.inputs["Y"]
@@ -143,15 +148,20 @@ class TestDistOpCase5(TestDistOp):
 
 
 class TestDistAPI(unittest.TestCase):
+    def init_data_type(self):
+        self.data_type = 'float32' if core.is_compiled_with_rocm(
+        ) else 'float64'
+
     def test_api(self):
+        self.init_data_type()
         main_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(main_program, startup_program):
-            x = fluid.data(name='x', shape=[2, 3, 4, 5], dtype='float64')
-            y = fluid.data(name='y', shape=[3, 1, 5], dtype='float64')
+            x = fluid.data(name='x', shape=[2, 3, 4, 5], dtype=self.data_type)
+            y = fluid.data(name='y', shape=[3, 1, 5], dtype=self.data_type)
             p = 2
-            x_i = np.random.random((2, 3, 4, 5)).astype("float64")
-            y_i = np.random.random((3, 1, 5)).astype("float64")
+            x_i = np.random.random((2, 3, 4, 5)).astype(self.data_type)
+            y_i = np.random.random((3, 1, 5)).astype(self.data_type)
             result = paddle.dist(x, y, p)
             place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else fluid.CPUPlace()
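
Note on the math_cuda_utils.h guards: they all follow one pattern, keeping the synchronized warp primitives (__shfl_*_sync, __syncwarp) behind PADDLE_WITH_CUDA while HIP builds fall back to the legacy shuffles and a plain __syncthreads(). The sketch below is a minimal, standalone illustration of that pattern and is not part of the patch; it assumes PADDLE_WITH_CUDA / PADDLE_WITH_HIP are supplied by the build (in Paddle they come from CMake), and the names WarpReduceMin, BlockMin, FULL_MASK and the gpu* aliases are introduced here only for the example.

// portable_warp_min.cu -- hypothetical sketch, not part of the patch.
// Build (flags assumed): nvcc -DPADDLE_WITH_CUDA portable_warp_min.cu
//                    or: hipcc -DPADDLE_WITH_HIP portable_warp_min.cu
#include <cstdio>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#define gpuMalloc hipMalloc
#define gpuMemcpy hipMemcpy
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#else
#include <cuda_runtime.h>
#define gpuMalloc cudaMalloc
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#endif

#define HALF_WARP 16
#define FULL_MASK 0xffffffffu  // name introduced for this sketch

template <typename T>
__inline__ __device__ T WarpReduceMin(T val, unsigned lane_mask) {
  for (int mask = HALF_WARP; mask > 0; mask >>= 1)
#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
    // CUDA 9+ on sm_35+: synchronized shuffle with an explicit lane mask.
    val = min(val, __shfl_xor_sync(lane_mask, val, mask, warpSize));
#else
    // HIP (and older CUDA): only the legacy, mask-less shuffle is available.
    val = min(val, __shfl_xor(val, mask, warpSize));
#endif
  return val;
}

__global__ void BlockMin(const float* in, float* out) {
  float v = WarpReduceMin(in[threadIdx.x], FULL_MASK);
#ifdef PADDLE_WITH_HIP
  // Mirror the patch's fallback: HIP has no __syncwarp(), so a full
  // __syncthreads() is used as a correct but slightly slower stand-in.
  __syncthreads();
#else
  __syncwarp();
#endif
  if (threadIdx.x == 0) *out = v;
}

int main() {
  float h_in[32], h_out = 0.f, *d_in = nullptr, *d_out = nullptr;
  for (int i = 0; i < 32; ++i) h_in[i] = 32.f - i;
  gpuMalloc(reinterpret_cast<void**>(&d_in), sizeof(h_in));
  gpuMalloc(reinterpret_cast<void**>(&d_out), sizeof(float));
  gpuMemcpy(d_in, h_in, sizeof(h_in), gpuMemcpyHostToDevice);
  BlockMin<<<1, 32>>>(d_in, d_out);
  gpuMemcpy(&h_out, d_out, sizeof(float), gpuMemcpyDeviceToHost);
  printf("warp min = %f\n", h_out);  // expected: 1.000000
  return 0;
}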