机器未来 / Paddle · forked from PaddlePaddle / Paddle
Commit 133a914b (unverified)
Authored Mar 08, 2021 by Qi Li; committed via GitHub, Mar 08, 2021

[ROCM] fix test_dist_op ci test, test=develop (#31468)

Parent: f9377965
Showing 4 changed files with 68 additions and 12 deletions (+68, -12):

paddle/fluid/operators/dist_op.cu                    (+9,  -0)
paddle/fluid/operators/math/math_cuda_utils.h        (+10, -4)
python/paddle/fluid/tests/unittests/dist_test.sh     (+33, -2)
python/paddle/fluid/tests/unittests/test_dist_op.py  (+16, -6)
paddle/fluid/operators/dist_op.cu

@@ -15,9 +15,18 @@
 #include "paddle/fluid/operators/dist_op.h"

 namespace ops = paddle::operators;
+#ifdef PADDLE_WITH_HIP
+// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
+// do not support double in HIPCC platform (Eigen3 to be fixed)
+REGISTER_OP_CUDA_KERNEL(
+    dist, ops::DistKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    dist_grad,
+    ops::DistGradKernel<paddle::platform::CUDADeviceContext, float>);
+#else
 REGISTER_OP_CUDA_KERNEL(
     dist, ops::DistKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DistKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     dist_grad,
     ops::DistGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DistGradKernel<paddle::platform::CUDADeviceContext, double>);
+#endif
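The comment inside the hunk carries the rationale: Eigen's GPU tensor reduction (TensorReductionGpu.h) does not support double under HIPCC yet, so the ROCm build simply registers float-only dist/dist_grad kernels rather than patching the kernel body. The same compile-time opt-out can be sketched standalone; everything below is a hypothetical illustration (the Square kernel and file name are invented, only the PADDLE_WITH_HIP guard comes from the diff):

// toy_registration.cu -- hypothetical sketch, not Paddle code.
// PADDLE_WITH_HIP is assumed to be defined by the build system when
// targeting ROCm, as it is in Paddle's cmake setup.
template <typename T>
__global__ void Square(const T* in, T* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] * in[i];
}

// Mirrors the registration list above: one explicit instantiation per
// supported element type, with double pruned from the HIP build so the
// unsupported code path is never instantiated at all.
template __global__ void Square<float>(const float*, float*, int);
#ifndef PADDLE_WITH_HIP
template __global__ void Square<double>(const double*, double*, int);
#endif

Pruning at registration, rather than #ifdef-ing inside the kernel, keeps the kernel source identical on both platforms; the unsupported type simply ceases to exist as a dispatch target.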
paddle/fluid/operators/math/math_cuda_utils.h

@@ -214,7 +214,7 @@ __inline__ __device__ T warpReduceMax(T val, unsigned lane_mask) {
 template <typename T>
 __inline__ __device__ T warpReduceMin(T val, unsigned lane_mask) {
   for (int mask = HALF_WARP; mask > 0; mask >>= 1)
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
     val = min(val, __shfl_xor_sync(lane_mask, val, mask, warpSize));
 #else
     val = min(val, __shfl_xor(val, mask, warpSize));

@@ -226,7 +226,7 @@ __inline__ __device__ T warpReduceMin(T val, unsigned lane_mask) {
  * threads are less than warpSize.*/
 template <typename T>
 __inline__ __device__ T PartialWarpReduceMin(T val, unsigned lane_mask) {
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
   T warp_val = __shfl_sync(lane_mask, val, 0, warpSize);
 #else
   T warp_val = __shfl(

@@ -235,7 +235,7 @@ __inline__ __device__ T PartialWarpReduceMin(T val, unsigned lane_mask) {
   warp_val = val;
   for (int offset = HALF_WARP; offset > 0; offset >>= 1)
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
     warp_val =
         min(warp_val, __shfl_down_sync(lane_mask, warp_val, offset, warpSize));
 #else

@@ -298,9 +298,15 @@ __inline__ __device__ T PartialBlockReduceMin(T val, unsigned mask) {
   __syncthreads();
   shared[lane] = PartialWarpReduceMin(shared[lane], mask);
+#if defined(PADDLE_WITH_HIP)
+  // HIP do not support __syncwarp, using __syncthreads() instead is ok,
+  // although bringing a few performance decrease.
+  __syncthreads();
+#else
   __syncwarp();
+#endif
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
   val = __shfl_sync(mask, shared[lane], 0, warpSize);
 #else
   val = __shfl(shared[lane], 0, warpSize);
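All four hunks apply the same rule: the __shfl*_sync warp intrinsics exist only on the CUDA side (toolkit 9.0+), so each architecture check gains a defined(PADDLE_WITH_CUDA) term and the ROCm build reliably falls through to the unsynchronized __shfl* spellings that HIP does provide; the last hunk likewise substitutes a block-wide __syncthreads() for the __syncwarp() that HIP lacks. To make the reduction pattern concrete, here is a self-contained sketch of the butterfly warp-min these helpers implement (hypothetical demo file and names, assumes a CUDA 9.0+ toolkit; not Paddle code):

// warp_min_demo.cu -- hedged standalone sketch, not Paddle code.
#include <cstdio>
#include <cuda_runtime.h>

#define FULL_MASK 0xffffffffu
#define HALF_WARP 16

// Butterfly (XOR) reduction: each round exchanges values between lanes
// whose IDs differ in one bit (16, 8, 4, 2, 1), so after log2(warpSize)
// rounds every lane holds the warp-wide minimum -- the pattern used by
// warpReduceMin above. The patch guards this call because
// __shfl_xor_sync needs CUDA >= 9.0, while HIP keeps the
// unsynchronized __shfl_xor spelling.
__device__ float WarpReduceMinDemo(float val) {
  for (int mask = HALF_WARP; mask > 0; mask >>= 1)
    val = fminf(val, __shfl_xor_sync(FULL_MASK, val, mask, warpSize));
  return val;
}

__global__ void MinKernel(const float* in, float* out) {
  float v = WarpReduceMinDemo(in[threadIdx.x]);
  if (threadIdx.x == 0) *out = v;  // any lane would do; all hold the min
}

int main() {
  float h_in[32], h_out = 0.f, *d_in, *d_out;
  for (int i = 0; i < 32; ++i) h_in[i] = 100.0f - i;  // minimum is 69.0
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  MinKernel<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("warp min = %.1f (expected 69.0)\n", h_out);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Because every lane ends up holding the minimum, warpReduceMin can return val from any lane without a separate broadcast; that property is what the XOR (butterfly) exchange buys over a simple __shfl_down tree.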
python/paddle/fluid/tests/unittests/dist_test.sh

#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

unset https_proxy http_proxy
export FLAGS_rpc_disable_reuse_port=1

@@ -50,14 +65,30 @@ do
     cat -n ${log}
 done

+# check CUDA or ROCM env
+GPU_SYS_INFO_CMD=nvidia-smi
+
+which ${GPU_SYS_INFO_CMD}
+exit_code=$?
+if [[ $exit_code -ne 0 ]]; then
+    GPU_SYS_INFO_CMD=rocm-smi
+fi
+
+which ${GPU_SYS_INFO_CMD}
+exit_code=$?
+if [[ $exit_code -ne 0 ]]; then
+    echo "nvidia-smi or rocm-smi faild with ${exit_code}"
+    exit ${exit_code}
+fi
+
 #display system context
 for i in {1..2}; do
     sleep 3
     ps -aux
     netstat -anlp

-    if hash "nvidia-smi" > /dev/null; then
-        nvidia-smi
+    if hash "${GPU_SYS_INFO_CMD}" > /dev/null; then
+        ${GPU_SYS_INFO_CMD}
     fi
 done
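Net effect of the shell change: the script probes for nvidia-smi, falls back to rocm-smi when the first which fails, aborts with the exit code if neither tool exists, and then calls the detected ${GPU_SYS_INFO_CMD} inside the system-context loop, so one CI entry point serves both CUDA and ROCm runners.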
python/paddle/fluid/tests/unittests/test_dist_op.py

@@ -39,9 +39,10 @@ class TestDistOp(OpTest):
         self.op_type = 'dist'
         self.attrs = {}
         self.init_case()
+        self.init_data_type()
         self.inputs = {
-            "X": np.random.random(self.x_shape).astype("float64"),
-            "Y": np.random.random(self.y_shape).astype("float64")
+            "X": np.random.random(self.x_shape).astype(self.data_type),
+            "Y": np.random.random(self.y_shape).astype(self.data_type)
         }

         self.attrs["p"] = self.p

@@ -55,6 +56,10 @@ class TestDistOp(OpTest):
         self.y_shape = (120)
         self.p = 0.

+    def init_data_type(self):
+        self.data_type = np.float32 if core.is_compiled_with_rocm(
+        ) else np.float64
+
     def calc_gradient(self):
         x = self.inputs["X"]
         y = self.inputs["Y"]

@@ -143,15 +148,20 @@ class TestDistOpCase5(TestDistOp):
 class TestDistAPI(unittest.TestCase):
+    def init_data_type(self):
+        self.data_type = 'float32' if core.is_compiled_with_rocm(
+        ) else 'float64'
+
     def test_api(self):
+        self.init_data_type()
         main_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(main_program, startup_program):
-            x = fluid.data(name='x', shape=[2, 3, 4, 5], dtype='float64')
-            y = fluid.data(name='y', shape=[3, 1, 5], dtype='float64')
+            x = fluid.data(name='x', shape=[2, 3, 4, 5], dtype=self.data_type)
+            y = fluid.data(name='y', shape=[3, 1, 5], dtype=self.data_type)
             p = 2
-            x_i = np.random.random((2, 3, 4, 5)).astype("float64")
-            y_i = np.random.random((3, 1, 5)).astype("float64")
+            x_i = np.random.random((2, 3, 4, 5)).astype(self.data_type)
+            y_i = np.random.random((3, 1, 5)).astype(self.data_type)
             result = paddle.dist(x, y, p)
             place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else fluid.CPUPlace()
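The dtype switch here ties back to the first file in the commit: under ROCm only the float kernels of dist/dist_grad are registered, so the tests pick float32 (np.float32 in the op test, 'float32' in the API test) when core.is_compiled_with_rocm() is true and keep float64 everywhere else.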