diff --git a/paddle/fluid/operators/dist_op.cu b/paddle/fluid/operators/dist_op.cu
index 499f5572910dd7666973bf077bf919a0378cfe52..90674969e283f1cba816ad46802cdbf971bcc555 100644
--- a/paddle/fluid/operators/dist_op.cu
+++ b/paddle/fluid/operators/dist_op.cu
@@ -15,9 +15,18 @@
 #include "paddle/fluid/operators/dist_op.h"
 
 namespace ops = paddle::operators;
+#ifdef PADDLE_WITH_HIP
+// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
+// does not support double on the HIPCC platform (to be fixed in Eigen3)
+REGISTER_OP_CUDA_KERNEL(
+    dist, ops::DistKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    dist_grad, ops::DistGradKernel<paddle::platform::CUDADeviceContext, float>);
+#else
 REGISTER_OP_CUDA_KERNEL(
     dist, ops::DistKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DistKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     dist_grad, ops::DistGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DistGradKernel<paddle::platform::CUDADeviceContext, double>);
+#endif
diff --git a/paddle/fluid/operators/math/math_cuda_utils.h b/paddle/fluid/operators/math/math_cuda_utils.h
index fbb84226478937f056b1322326b04b90dcc3f02e..e97dbd20ca142af75420ccf3ce349c1bdc928b09 100644
--- a/paddle/fluid/operators/math/math_cuda_utils.h
+++ b/paddle/fluid/operators/math/math_cuda_utils.h
@@ -214,7 +214,7 @@ __inline__ __device__ T warpReduceMax(T val, unsigned lane_mask) {
 template <typename T>
 __inline__ __device__ T warpReduceMin(T val, unsigned lane_mask) {
   for (int mask = HALF_WARP; mask > 0; mask >>= 1)
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
     val = min(val, __shfl_xor_sync(lane_mask, val, mask, warpSize));
 #else
     val = min(val, __shfl_xor(val, mask, warpSize));
@@ -226,7 +226,7 @@ __inline__ __device__ T warpReduceMin(T val, unsigned lane_mask) {
  * threads are less than warpSize.*/
 template <typename T>
 __inline__ __device__ T PartialWarpReduceMin(T val, unsigned lane_mask) {
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
   T warp_val = __shfl_sync(lane_mask, val, 0, warpSize);
 #else
   T warp_val = __shfl(
@@ -235,7 +235,7 @@ __inline__ __device__ T PartialWarpReduceMin(T val, unsigned lane_mask) {
   warp_val = val;
 
   for (int offset = HALF_WARP; offset > 0; offset >>= 1)
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
     warp_val =
         min(warp_val, __shfl_down_sync(lane_mask, warp_val, offset, warpSize));
 #else
@@ -298,9 +298,15 @@ __inline__ __device__ T PartialBlockReduceMin(T val, unsigned mask) {
   __syncthreads();
 
   shared[lane] = PartialWarpReduceMin(shared[lane], mask);
+#if defined(PADDLE_WITH_HIP)
+  // HIP does not support __syncwarp; using __syncthreads() instead is correct,
+  // although it brings a slight performance decrease.
+  __syncthreads();
+#else
   __syncwarp();
+#endif
 
-#if __CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
   val = __shfl_sync(mask, shared[lane], 0, warpSize);
 #else
   val = __shfl(shared[lane], 0, warpSize);
diff --git a/python/paddle/fluid/tests/unittests/dist_test.sh b/python/paddle/fluid/tests/unittests/dist_test.sh
index d5a6490042b20a4f9160c55fbb93e9f2f8092eae..69a893a7ddc13949bd5e0c4aedb0e81392126736 100644
--- a/python/paddle/fluid/tests/unittests/dist_test.sh
+++ b/python/paddle/fluid/tests/unittests/dist_test.sh
@@ -1,4 +1,19 @@
 #!/bin/bash
+
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 unset https_proxy http_proxy
 export FLAGS_rpc_disable_reuse_port=1
 
@@ -50,14 +65,30 @@ do
     cat -n ${log}
 done
 
+# check CUDA or ROCm env
+GPU_SYS_INFO_CMD=nvidia-smi
+
+which ${GPU_SYS_INFO_CMD}
+exit_code=$?
+if [[ $exit_code -ne 0 ]]; then
+    GPU_SYS_INFO_CMD=rocm-smi
+fi
+
+which ${GPU_SYS_INFO_CMD}
+exit_code=$?
+if [[ $exit_code -ne 0 ]]; then
+    echo "nvidia-smi or rocm-smi failed with ${exit_code}"
+    exit ${exit_code}
+fi
+
 #display system context
 for i in {1..2}; do
     sleep 3
     ps -aux
     netstat -anlp
 
-    if hash "nvidia-smi" > /dev/null; then
-        nvidia-smi
+    if hash "${GPU_SYS_INFO_CMD}" > /dev/null; then
+        ${GPU_SYS_INFO_CMD}
     fi
 done
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py b/python/paddle/fluid/tests/unittests/test_dist_op.py
index 0f71027d274018a48e769a28ff9679204251c1d3..b9b8ea92cb3a845afd3d090e1999b3e149850952 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_op.py
@@ -39,9 +39,10 @@ class TestDistOp(OpTest):
         self.op_type = 'dist'
         self.attrs = {}
         self.init_case()
+        self.init_data_type()
         self.inputs = {
-            "X": np.random.random(self.x_shape).astype("float64"),
-            "Y": np.random.random(self.y_shape).astype("float64")
+            "X": np.random.random(self.x_shape).astype(self.data_type),
+            "Y": np.random.random(self.y_shape).astype(self.data_type)
         }
 
         self.attrs["p"] = self.p
@@ -55,6 +56,10 @@ class TestDistOp(OpTest):
         self.y_shape = (120)
         self.p = 0.
 
+    def init_data_type(self):
+        self.data_type = np.float32 if core.is_compiled_with_rocm(
+        ) else np.float64
+
     def calc_gradient(self):
         x = self.inputs["X"]
         y = self.inputs["Y"]
@@ -143,15 +148,20 @@ class TestDistOpCase5(TestDistOp):
 
 
 class TestDistAPI(unittest.TestCase):
+    def init_data_type(self):
+        self.data_type = 'float32' if core.is_compiled_with_rocm(
+        ) else 'float64'
+
     def test_api(self):
+        self.init_data_type()
         main_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(main_program, startup_program):
-            x = fluid.data(name='x', shape=[2, 3, 4, 5], dtype='float64')
-            y = fluid.data(name='y', shape=[3, 1, 5], dtype='float64')
+            x = fluid.data(name='x', shape=[2, 3, 4, 5], dtype=self.data_type)
+            y = fluid.data(name='y', shape=[3, 1, 5], dtype=self.data_type)
             p = 2
-            x_i = np.random.random((2, 3, 4, 5)).astype("float64")
-            y_i = np.random.random((3, 1, 5)).astype("float64")
+            x_i = np.random.random((2, 3, 4, 5)).astype(self.data_type)
+            y_i = np.random.random((3, 1, 5)).astype(self.data_type)
             result = paddle.dist(x, y, p)
             place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else fluid.CPUPlace()
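
Note on the math_cuda_utils.h guards: they all follow one pattern, keeping the synchronized warp primitives (__shfl_*_sync, __syncwarp) behind PADDLE_WITH_CUDA while HIP builds fall back to the legacy shuffles and a plain __syncthreads(). The sketch below is a minimal, standalone illustration of that pattern and is not part of the patch; it assumes PADDLE_WITH_CUDA / PADDLE_WITH_HIP are supplied by the build (in Paddle they come from CMake), and the names WarpReduceMin, BlockMin, FULL_MASK and the gpu* aliases are introduced here only for the example.

// portable_warp_min.cu -- hypothetical sketch, not part of the patch.
// Build (flags assumed): nvcc -DPADDLE_WITH_CUDA portable_warp_min.cu
//                    or: hipcc -DPADDLE_WITH_HIP portable_warp_min.cu
#include <cstdio>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#define gpuMalloc hipMalloc
#define gpuMemcpy hipMemcpy
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#else
#include <cuda_runtime.h>
#define gpuMalloc cudaMalloc
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#endif

#define HALF_WARP 16
#define FULL_MASK 0xffffffffu  // name introduced for this sketch

template <typename T>
__inline__ __device__ T WarpReduceMin(T val, unsigned lane_mask) {
  for (int mask = HALF_WARP; mask > 0; mask >>= 1)
#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
    // CUDA 9+ on sm_35+: synchronized shuffle with an explicit lane mask.
    val = min(val, __shfl_xor_sync(lane_mask, val, mask, warpSize));
#else
    // HIP (and older CUDA): only the legacy, mask-less shuffle is available.
    val = min(val, __shfl_xor(val, mask, warpSize));
#endif
  return val;
}

__global__ void BlockMin(const float* in, float* out) {
  float v = WarpReduceMin(in[threadIdx.x], FULL_MASK);
#ifdef PADDLE_WITH_HIP
  // Mirror the patch's fallback: HIP has no __syncwarp(), so a full
  // __syncthreads() is used as a correct but slightly slower stand-in.
  __syncthreads();
#else
  __syncwarp();
#endif
  if (threadIdx.x == 0) *out = v;
}

int main() {
  float h_in[32], h_out = 0.f, *d_in = nullptr, *d_out = nullptr;
  for (int i = 0; i < 32; ++i) h_in[i] = 32.f - i;
  gpuMalloc(reinterpret_cast<void**>(&d_in), sizeof(h_in));
  gpuMalloc(reinterpret_cast<void**>(&d_out), sizeof(float));
  gpuMemcpy(d_in, h_in, sizeof(h_in), gpuMemcpyHostToDevice);
  BlockMin<<<1, 32>>>(d_in, d_out);
  gpuMemcpy(&h_out, d_out, sizeof(float), gpuMemcpyDeviceToHost);
  printf("warp min = %f\n", h_out);  // expected: 1.000000
  return 0;
}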