From 7cdf6ea77081a4938182b1fdf26bcf341e5588a8 Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Wed, 3 Mar 2021 17:22:33 +0800
Subject: [PATCH] [ROCM] update fluid elementwise op for rocm (part10), test=develop (#31361)

* [ROCM] update fluid elementwise op for rocm (part10), test=develop
* update, test=develop
* address review comments, test=develop
---
 paddle/fluid/memory/memcpy.cc | 2 +-
 .../elementwise/elementwise_add_op.h | 12 +++--
 .../elementwise/elementwise_div_op.h | 2 +-
 .../elementwise/elementwise_floordiv_op.h | 12 ++++-
 .../elementwise/elementwise_mul_op.h | 2 +-
 .../elementwise/elementwise_op_function.cu.h | 18 ++++---
 .../elementwise/elementwise_op_function.h | 52 ++++++++++---------
 .../elementwise/elementwise_pow_op.h | 2 +-
 .../elementwise/elementwise_sub_op.h | 2 +-
 .../test_elementwise_add_grad_grad.cc | 2 +-
 .../test_elementwise_add_op_inplace.cc | 4 +-
 .../test_elementwise_div_grad_grad.cc | 2 +-
 .../test_elementwise_op_grad_grad.h | 8 ++-
 13 files changed, 76 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 6a1d44f6cf..7f871fab5a 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -239,7 +239,7 @@ void Copy(
   platform::SetDeviceId(src_place.device);
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by thream(" << stream << ")";
+          << dst_place << " by stream(" << stream << ")";
   if (stream) {
     platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
 #ifdef PADDLE_WITH_HIP
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index c46184f5ba..abea9da942 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -20,12 +20,18 @@ limitations under the License.
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef __NVCC__ #include #include #include "cub/cub.cuh" #endif +#ifdef __HIPCC__ +#include +#include +#include +namespace cub = hipcub; +#endif #endif namespace paddle { @@ -179,7 +185,7 @@ __global__ void MatrixColReduce(const T *__restrict__ in, T *__restrict__ out, } } -#if CUDA_VERSION >= 10000 +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000 template __global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in, __half2 *__restrict__ out, size_t width, @@ -287,7 +293,7 @@ bool static RunSpecialDims(const framework::DDim &dx_dims, return true; } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // cuda definition template typename std::enable_if< diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 5f4321f727..0be8d934b1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -144,7 +144,7 @@ elementwise_div_grad(const framework::ExecutionContext& ctx, ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // cuda definition template typename std::enable_if< diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h index 721c23e383..06eb0b1cc8 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -25,10 +25,14 @@ namespace operators { template struct FloorDivFunctor { inline HOSTDEVICE T operator()(T a, T b) const { -#ifdef __CUDA_ARCH__ +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) if (b == 0) { printf("Error: Divide by zero encounter in floor_divide\n"); +#ifdef __HIPCC__ + abort(); +#else asm("trap;"); +#endif } #else if (b == 0) @@ -42,10 +46,14 @@ struct FloorDivFunctor { template struct InverseFloorDivFunctor { inline HOSTDEVICE T operator()(T a, T b) const { -#ifdef __CUDA_ARCH__ +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) if (a == 0) { printf("Error: Divide by zero encounter in floor_divide\n"); +#ifdef __HIPCC__ + abort(); +#else asm("trap;"); +#endif } #else if (a == 0) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 3bc12fe16d..46a00268e4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -192,7 +192,7 @@ elementwise_mul_grad(const framework::ExecutionContext& ctx, ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // cuda definition template typename std::enable_if< diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h index afa87a0ad8..1121d0ef68 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h @@ -22,13 +22,19 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include +#ifdef PADDLE_CUDA_FP16 +#include +#endif #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_HIP +#include #ifdef PADDLE_CUDA_FP16 -#include +#include #endif +#endif // PADDLE_WITH_HIP -#if CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000 #define __h2div h2div #endif @@ -101,7 +107,7 @@ struct DivRangeFunctor< #ifdef PADDLE_CUDA_FP16 inline DEVICE half2 half2_add(const half2& a, const half2& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) return __hadd2(a, b); #else float a1 = __low2float(a); @@ -115,7 +121,7 @@ inline DEVICE half2 half2_add(const half2& a, const half2& b) { } inline DEVICE half2 half2_sub(const half2& a, const half2& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) return __hsub2(a, b); #else float a1 = __low2float(a); @@ -129,7 +135,7 @@ inline DEVICE half2 half2_sub(const half2& a, const half2& b) { } inline DEVICE half2 half2_mul(const half2& a, const half2& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) return __hmul2(a, b); #else float a1 = __low2float(a); @@ -143,7 +149,7 @@ inline DEVICE half2 half2_mul(const half2& a, const half2& b) { } inline DEVICE half2 half2_div(const half2& a, const half2& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) return __h2div(a, b); #else float a1 = __low2float(a); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 46b477afeb..923611143a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -29,8 +29,12 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/transform.h" +#if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ #include +#elif defined(__HIPCC__) +#include +#endif #include #include "paddle/fluid/platform/cuda_device_function.h" @@ -196,7 +200,7 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x, } } -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) template __global__ void ElementwiseKernel(const T *x, const T *y, OutType *out, int pre, int n, int post, int total, Functor func) { @@ -310,7 +314,7 @@ void CommonForwardBroadcastCUDA( y_data, out_data, out_size, max_dim, func, is_xsize_larger); } -#endif // __NVCC__ +#endif // __NVCC__ or __HIPCC__ template void CommonGradBroadcastCPU( @@ -382,7 +386,7 @@ inline void ComputeBroadcastTranspositionArray(const int *x_one_indexs, } } -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) template static __global__ void ElemwiseGradBroadcast1CUDAKernel( const T *x, const T *y, const T *out, const T *dout, int h, int w, @@ -1212,7 +1216,7 @@ void CommonGradBroadcastCUDA( } } -#endif // __NVCC__ +#endif // __NVCC__ or __HIPCC__ inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { @@ -1339,7 +1343,7 @@ class MidWiseTransformIterator int64_t post_; }; -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) template class RowwiseTransformIterator : public thrust::iterator_adaptor< @@ -1504,10 +1508,10 @@ static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const T *out, } } -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) template -static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x, +static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream, const T *x, const T *y, const T *out, const T *dout, int h, int w, bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { @@ -1577,7 +1581,7 @@ static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const T *out, } } -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) template static __global__ void ElemwiseGradBroadcast2CUDAKernel( const T *x, const T *y, const T *out, const T *dout, int pre, int n, @@ -1646,7 +1650,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( } template -static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T *x, +static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, const T *x, const T *y, const T *out, const T *dout, int pre, int n, int post, bool is_xsize_larger, DX_OP dx_op, @@ -1686,7 +1690,7 @@ void CommonElementwiseBroadcastBackward( << " ydim:" << framework::make_ddim(y_dims_array); if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) CommonGradBroadcastCUDA( x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), out_dims_array.data(), max_dim, @@ -1769,7 +1773,7 @@ void ElemwiseGradComputeWithBroadcast( } if (post == 1) { if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) ElemwiseGradBroadcast1CUDA( ctx.template device_context().stream(), x.data(), y.data(), out.data(), dout.data(), pre, n, is_xsize_larger, @@ -1786,7 +1790,7 @@ void ElemwiseGradComputeWithBroadcast( } } else { if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) ElemwiseGradBroadcast2CUDA( ctx.template device_context().stream(), x.data(), y.data(), out.data(), dout.data(), pre, n, post, @@ -1830,7 +1834,7 @@ void 
CommonElementwiseBroadcastForward( axis); if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) CommonForwardBroadcastCUDA( x, y, z, x_dims_array.data(), y_dims_array.data(), out_dims_array.data(), max_dim, @@ -1942,7 +1946,7 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, } if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) ComputeElementwiseCUDA( x, y, z, pre, n, post, ctx.template device_context(), func, @@ -2066,7 +2070,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, const T *y, int pre, } } -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) template static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel( @@ -2107,7 +2111,7 @@ static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel( template -static void FusedElemwiseAndActBroadcast1CUDA(cudaStream_t stream, const T *x, +static void FusedElemwiseAndActBroadcast1CUDA(gpuStream_t stream, const T *x, const T *y, CompoundFunctor compound_functor, int h, int w, T *out, @@ -2164,7 +2168,7 @@ static __global__ void FusedElemwiseAndActBroadcast2CUDAKernel( template -static void FusedElemwiseAndActBroadcast2CUDA(cudaStream_t stream, const T *x, +static void FusedElemwiseAndActBroadcast2CUDA(gpuStream_t stream, const T *x, const T *y, int pre, int n, int post, CompoundFunctor compound_functor, @@ -2219,7 +2223,7 @@ void FusedElemwiseAndActComputeWithBroadcast( int h = pre; int w = n; if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) FusedElemwiseAndActBroadcast1CUDA( @@ -2242,7 +2246,7 @@ void FusedElemwiseAndActComputeWithBroadcast( } } else { if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) FusedElemwiseAndActBroadcast2CUDA( @@ -2493,7 +2497,7 @@ static void FusedElemwiseAndActGradBroadcast2CPU( } } -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) template @@ -2593,7 +2597,7 @@ template static void FusedElemwiseAndActGradBroadcast1CUDA( - cudaStream_t stream, const T *x, const T *y, const T *intermediate_out, + gpuStream_t stream, const T *x, const T *y, const T *intermediate_out, const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); @@ -2708,7 +2712,7 @@ template static void FusedElemwiseAndActGradBroadcast2CUDA( - cudaStream_t stream, const T *x, const T *y, const T *intermediate_out, + gpuStream_t stream, const T *x, const T *y, const T *intermediate_out, const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy, T *dintermediate) { @@ -2748,7 +2752,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast( int w = n; if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) FusedElemwiseAndActGradBroadcast1CUDA( @@ -2774,7 +2778,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast( } } else { if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) FusedElemwiseAndActGradBroadcast2CUDA( diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h index 8cc4b166fc..ee718a3ecd 100755 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h +++ 
b/paddle/fluid/operators/elementwise/elementwise_pow_op.h @@ -25,7 +25,7 @@ struct PowFunctor { inline HOSTDEVICE T operator()(T a, T b) const { // TODO(wujionghao): A potential speed improvement is supporting different // types in C++. -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and // it will return a float number like 2.99... , which floor to 2 // when cast to int by default and it is wrong. diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 3e97366b61..4171d2eb9e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -86,7 +86,7 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx, ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // cuda definition template typename std::enable_if< diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index 15c31a4cec..12d8265436 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -60,7 +60,7 @@ TEST(test_elementwise_add_grad_grad_without_ddx, cpu_place) { TestElementwiseAddGradGradWithoutDDX test(p, dims); ASSERT_TRUE(test.Check()); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(test_elementwise_add_grad_grad_without_ddx, gpu_place) { framework::DDim dims({32, 64}); platform::CUDAPlace p(0); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index cf9e9dbb04..ab45b6f4de 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -32,6 +32,8 @@ static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); +#elif defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, n, hipMemcpyHostToDevice)); #else PADDLE_THROW( platform::errors::InvalidArgument("Check your paddle version, current " @@ -129,7 +131,7 @@ TEST(test_elementwise_add_not_inplace, cpu_place) { ASSERT_TRUE(TestMain(p, dims, false)); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(test_elementwise_add_inplace, gpu_place) { framework::DDim dims({32, 64}); platform::CUDAPlace p(0); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index e1f893dd2b..82448c681c 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -84,7 +84,7 @@ TEST(test_elementwise_div_grad_grad_without_dout, cpu_place) { ASSERT_TRUE(test.Check()); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(test_elementwise_div_grad_grad_without_dout, gpu_place) { framework::DDim dims({32, 64}); platform::CUDAPlace p(0); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h 
b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h index 54e7c7d1b6..8bfb566d49 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h +++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h @@ -88,7 +88,7 @@ class TestElementwiseOpGradGrad { auto dst_place = BOOST_GET_CONST(platform::CPUPlace, place_); memory::Copy(dst_place, dst, src_place, src, bytes); } else if (platform::is_gpu_place(place_)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto dst_place = BOOST_GET_CONST(platform::CUDAPlace, place_); memory::Copy(dst_place, dst, src_place, src, bytes, nullptr); #else @@ -126,8 +126,14 @@ class TestElementwiseOpGradGrad { } auto *out_ptr = cpu_out.data(); size_t numel = static_cast(framework::product(dims_)); +#ifdef PADDLE_WITH_HIP + auto is_equal = std::equal( + out_ptr, out_ptr + numel, expected_outs_[out_name].data(), + [](const float &l, const float &r) { return fabs(l - r) < 1e-8; }); +#else auto is_equal = std::equal(out_ptr, out_ptr + numel, expected_outs_[out_name].data()); +#endif if (!is_equal) { all_equal = false; break; -- GitLab
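
Background on the recurring pattern above, for reviewers who have not followed the earlier ROCm parts: the diff makes the same elementwise device code build under both NVCC and HIPCC. CUDA-only guards such as #ifdef PADDLE_WITH_CUDA, #ifdef __NVCC__ and #ifdef __CUDA_ARCH__ become dual guards that also accept PADDLE_WITH_HIP / __HIPCC__, cudaStream_t parameters become the backend-neutral gpuStream_t, and hipCUB is aliased into the cub namespace so existing CUB call sites compile unchanged. The snippet below is only a minimal sketch of that pattern, not code from the Paddle tree: VecAddKernel and LaunchVecAdd are made-up names, and gpuStream_t is defined locally just to keep the sketch self-contained (in the patch it comes from an included platform header).

// Minimal sketch (hypothetical kernel): one source file that builds with
// either nvcc or hipcc, following the guard pattern this patch applies.
#if defined(__NVCC__) || defined(__HIPCC__)

#ifdef __NVCC__
#include <cuda_runtime.h>
#include "cub/cub.cuh"                 // real CUB (vendored, or bundled with CUDA 11+)
using gpuStream_t = cudaStream_t;      // sketch-local alias only
#endif

#ifdef __HIPCC__
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;                // existing cub:: call sites compile unchanged
using gpuStream_t = hipStream_t;       // sketch-local alias only
#endif

// Hypothetical elementwise kernel; the real operators live in
// paddle/fluid/operators/elementwise/.
template <typename T>
__global__ void VecAddKernel(const T* x, const T* y, T* z, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) z[i] = x[i] + y[i];
}

// One host-side launcher for both backends: it takes the backend-neutral
// stream type instead of cudaStream_t, mirroring the signature changes in
// elementwise_op_function.h.
template <typename T>
void LaunchVecAdd(gpuStream_t stream, const T* x, const T* y, T* z, int n) {
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  VecAddKernel<T><<<blocks, threads, 0, stream>>>(x, y, z, n);
}

#endif  // __NVCC__ || __HIPCC__

Call sites are unchanged by this: as in the diff, they keep passing the device context's stream() to the launcher regardless of backend.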
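
One place where behavior, not just compilation, differs on ROCm is the grad-grad test helper: test_elementwise_op_grad_grad.h compares kernel outputs against the expected values with an absolute tolerance of 1e-8 on the HIP build instead of std::equal's default bit-exact comparison. A host-only sketch of that check, with a hypothetical helper name rather than the test's actual member:

#include <algorithm>
#include <cmath>
#include <vector>

// Tolerance-based comparison mirroring the predicate added in
// test_elementwise_op_grad_grad.h for the HIP build; OutputsMatch is a
// made-up name used only for illustration.
bool OutputsMatch(const std::vector<float>& actual,
                  const std::vector<float>& expected) {
  return actual.size() == expected.size() &&
         std::equal(actual.begin(), actual.end(), expected.begin(),
                    [](float l, float r) { return std::fabs(l - r) < 1e-8f; });
}

The tolerance presumably absorbs small numeric differences between the ROCm build and the CPU reference while leaving the CUDA comparison untouched.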