Unverified commit 7cdf6ea7, authored by Qi Li, committed by GitHub

[ROCM] update fluid elementwise op for rocm (part10), test=develop (#31361)

* [ROCM] update fluid elementwise op for rocm (part10), test=develop

* update, test=develop

* address review comments, test=develop
Parent 84639b61
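The recurring change throughout this diff is mechanical: every CUDA-only preprocessor gate gains a HIP twin, and CUDA-specific names such as `cudaStream_t` become the toolkit-neutral `gpuStream_t`. A minimal standalone sketch of that pattern (the alias mirrors what Paddle's platform headers provide; this is not the actual Paddle code):

```cpp
// Dual-backend guard: true for either GPU toolchain.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;   // AMD/ROCm stream handle
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;  // NVIDIA/CUDA stream handle
#endif
#endif
```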
@@ -239,7 +239,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::SetDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
<< dst_place << " by stream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
#ifdef PADDLE_WITH_HIP
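The HIP and CUDA runtimes differ only in the API prefix here, so the body of `Copy` stays shared. A hedged sketch of the branch this hunk guards (not the real `memory::Copy` specialization, which also sets the device and records profiler events):

```cpp
#include <cstddef>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// Device-to-host copy: async when a stream is given, blocking otherwise.
void GpuCopyD2H(void* dst, const void* src, size_t num, gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  if (stream) {
    hipMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream);
  } else {
    hipMemcpy(dst, src, num, hipMemcpyDeviceToHost);
  }
#else
  if (stream) {
    cudaMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
  } else {
    cudaMemcpy(dst, src, num, cudaMemcpyDeviceToHost);
  }
#endif
}
```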
@@ -20,12 +20,18 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef __NVCC__
#include <cuda.h>
#include <cuda_fp16.h>
#include "cub/cub.cuh"
#endif
+#ifdef __HIPCC__
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
#endif
namespace paddle {
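The `namespace cub = hipcub;` alias is what keeps the rest of this header unchanged: every existing `cub::` call resolves to hipCUB on ROCm, since hipCUB mirrors the CUB API. A sketch of the idea (error handling omitted; `DeviceSum` and its buffers are illustrative, not from the patch):

```cpp
#ifdef __HIPCC__
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;  // hipCUB mirrors CUB's namespace and signatures
#else
#include <cuda_runtime.h>
#include "cub/cub.cuh"
#endif

// Sum num_items device floats; the cub:: spelling compiles under both
// nvcc (CUB) and hipcc (hipCUB via the alias above).
void DeviceSum(const float* d_in, float* d_out, int num_items) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // First call only reports the scratch size needed...
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items);
#ifdef __HIPCC__
  hipMalloc(&d_temp, temp_bytes);
#else
  cudaMalloc(&d_temp, temp_bytes);
#endif
  // ...the second call runs the reduction.
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items);
}
```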
@@ -179,7 +185,7 @@ __global__ void MatrixColReduce(const T *__restrict__ in, T *__restrict__ out,
}
}
-#if CUDA_VERSION >= 10000
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
template <int SIZE>
__global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in,
__half2 *__restrict__ out, size_t width,
@@ -287,7 +293,7 @@ bool static RunSpecialDims(const framework::DDim &dx_dims,
return true;
}
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// cuda definition
template <typename DeviceContext, typename T>
typename std::enable_if<
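These paired `std::enable_if` definitions are SFINAE dispatch: the same grad function has a CPU overload and a GPU overload, and only the one whose DeviceContext matches is instantiated, so GPU-only code never reaches the CPU build. A self-contained toy version (`MyCPUCtx`/`MyGPUCtx` are hypothetical stand-ins for Paddle's device contexts):

```cpp
#include <iostream>
#include <type_traits>

struct MyCPUCtx {};
struct MyGPUCtx {};

template <typename DeviceContext>
typename std::enable_if<std::is_same<DeviceContext, MyCPUCtx>::value>::type
elementwise_grad() {
  std::cout << "CPU path\n";
}

template <typename DeviceContext>
typename std::enable_if<std::is_same<DeviceContext, MyGPUCtx>::value>::type
elementwise_grad() {
  std::cout << "GPU path\n";
}

int main() {
  elementwise_grad<MyCPUCtx>();  // prints "CPU path"
  elementwise_grad<MyGPUCtx>();  // prints "GPU path"
}
```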
@@ -144,7 +144,7 @@ elementwise_div_grad(const framework::ExecutionContext& ctx,
ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
}
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// cuda definition
template <typename DeviceContext, typename T>
typename std::enable_if<
@@ -25,10 +25,14 @@ namespace operators {
template <typename T>
struct FloorDivFunctor {
inline HOSTDEVICE T operator()(T a, T b) const {
-#ifdef __CUDA_ARCH__
+#if defined(__HIPCC__) || defined(__CUDA_ARCH__)
if (b == 0) {
printf("Error: Divide by zero encounter in floor_divide\n");
+#ifdef __HIPCC__
+abort();
+#else
asm("trap;");
+#endif
}
#else
if (b == 0)
@@ -42,10 +46,14 @@ struct FloorDivFunctor {
template <typename T>
struct InverseFloorDivFunctor {
inline HOSTDEVICE T operator()(T a, T b) const {
-#ifdef __CUDA_ARCH__
+#if defined(__HIPCC__) || defined(__CUDA_ARCH__)
if (a == 0) {
printf("Error: Divide by zero encounter in floor_divide\n");
+#ifdef __HIPCC__
+abort();
+#else
asm("trap;");
+#endif
}
#else
if (a == 0)
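Device code cannot throw, so the functor kills the kernel on a zero divisor: inline PTX `trap;` under nvcc, `abort()` under hipcc (ROCm has no PTX). A hedged sketch of that split as a shared helper (`DieOnDevice` is hypothetical, not part of the patch):

```cpp
#if defined(__HIPCC__) || defined(__CUDA_ARCH__)
// Terminate the running kernel; the host sees a launch error.
__device__ inline void DieOnDevice() {
#ifdef __HIPCC__
  abort();       // HIP device-side abort
#else
  asm("trap;");  // NVPTX trap instruction
#endif
}
#endif
```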
@@ -192,7 +192,7 @@ elementwise_mul_grad(const framework::ExecutionContext& ctx,
ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX<T>(), MulGradDY<T>());
}
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// cuda definition
template <typename DeviceContext, typename T>
typename std::enable_if<
@@ -22,13 +22,19 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#ifdef PADDLE_CUDA_FP16
#include <cuda_fp16.h>
#endif
#endif // PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#ifdef PADDLE_CUDA_FP16
+#include <hip/hip_fp16.h>
+#endif
+#endif  // PADDLE_WITH_HIP
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
#define __h2div h2div
#endif
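The `defined(PADDLE_WITH_CUDA) &&` prefix matters because an undefined identifier evaluates to 0 inside `#if`: on a ROCm build `CUDA_VERSION` is never defined, so the bare test would read `0 < 9000` and wrongly define `__h2div`. A tiny demonstration of that preprocessor rule:

```cpp
#include <cstdio>

// NEVER_DEFINED is not defined anywhere, yet the #if below compiles:
// unknown identifiers are replaced by 0 in preprocessor arithmetic.
#if NEVER_DEFINED < 9000
#define PICKED "branch taken (0 < 9000)"
#else
#define PICKED "branch not taken"
#endif

int main() { puts(PICKED); }  // prints "branch taken (0 < 9000)"
```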
@@ -101,7 +107,7 @@ struct DivRangeFunctor<
#ifdef PADDLE_CUDA_FP16
inline DEVICE half2 half2_add(const half2& a, const half2& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@@ -115,7 +121,7 @@ inline DEVICE half2 half2_add(const half2& a, const half2& b) {
}
inline DEVICE half2 half2_sub(const half2& a, const half2& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
return __hsub2(a, b);
#else
float a1 = __low2float(a);
@@ -129,7 +135,7 @@ inline DEVICE half2 half2_sub(const half2& a, const half2& b) {
}
inline DEVICE half2 half2_mul(const half2& a, const half2& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@@ -143,7 +149,7 @@ inline DEVICE half2 half2_mul(const half2& a, const half2& b) {
}
inline DEVICE half2 half2_div(const half2& a, const half2& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
return __h2div(a, b);
#else
float a1 = __low2float(a);
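All four half2 helpers share the same fallback for hardware without native fp16-pair arithmetic (CUDA below SM 5.3): unpack both lanes to float, operate, repack. A sketch of that device-side pattern using the standard fp16 intrinsics:

```cpp
#if defined(__CUDACC__)
#include <cuda_fp16.h>
#elif defined(__HIPCC__)
#include <hip/hip_fp16.h>
#endif

// Lane-wise float fallback: __low2float/__high2float unpack the two
// fp16 lanes, __floats2half2_rn rounds the results back into a half2.
__device__ inline half2 half2_add_fallback(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  return __floats2half2_rn(a1 + b1, a2 + b2);
}
```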
@@ -29,8 +29,12 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/transform.h"
+#if defined(__NVCC__) || defined(__HIPCC__)
#ifdef __NVCC__
#include <cuda.h>
+#elif defined(__HIPCC__)
+#include <hip/hip_runtime.h>
+#endif
#include <thrust/iterator/iterator_adaptor.h>
#include "paddle/fluid/platform/cuda_device_function.h"
@@ -196,7 +200,7 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x,
}
}
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
template <typename Functor, typename T, typename OutType>
__global__ void ElementwiseKernel(const T *x, const T *y, OutType *out, int pre,
int n, int post, int total, Functor func) {
@@ -310,7 +314,7 @@ void CommonForwardBroadcastCUDA(
y_data, out_data, out_size, max_dim, func, is_xsize_larger);
}
-#endif // __NVCC__
+#endif // __NVCC__ or __HIPCC__
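Widening `#ifdef __NVCC__` to the two-way test is sufficient because the kernel source itself is toolkit-neutral: hipcc compiles the same `__global__` C++ that nvcc does. A minimal functor-driven kernel in the style of `ElementwiseKernel` (a sketch, not the kernel above):

```cpp
// Grid-stride elementwise kernel: any launch geometry covers all n
// elements, and the arithmetic is supplied by the functor.
template <typename Functor, typename T>
__global__ void ElementwiseSketch(const T* x, const T* y, T* out, int n,
                                  Functor func) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (; idx < n; idx += gridDim.x * blockDim.x) {
    out[idx] = func(x[idx], y[idx]);
  }
}
```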
template <typename T, typename DX_OP, typename DY_OP>
void CommonGradBroadcastCPU(
@@ -382,7 +386,7 @@ inline void ComputeBroadcastTranspositionArray(const int *x_one_indexs,
}
}
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast1CUDAKernel(
const T *x, const T *y, const T *out, const T *dout, int h, int w,
@@ -1212,7 +1216,7 @@ void CommonGradBroadcastCUDA(
}
}
-#endif // __NVCC__
+#endif // __NVCC__ or __HIPCC__
inline framework::DDim trim_trailing_singular_dims(
const framework::DDim &dims) {
@@ -1339,7 +1343,7 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext>
int64_t post_;
};
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
template <typename T>
class RowwiseTransformIterator<T, platform::CUDADeviceContext>
: public thrust::iterator_adaptor<
@@ -1504,10 +1508,10 @@ static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const T *out,
}
}
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x,
+static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream, const T *x,
const T *y, const T *out, const T *dout,
int h, int w, bool is_xsize_larger,
DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) {
@@ -1577,7 +1581,7 @@ static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const T *out,
}
}
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast2CUDAKernel(
const T *x, const T *y, const T *out, const T *dout, int pre, int n,
@@ -1646,7 +1650,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
}
template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T *x,
+static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, const T *x,
const T *y, const T *out, const T *dout,
int pre, int n, int post,
bool is_xsize_larger, DX_OP dx_op,
@@ -1686,7 +1690,7 @@ void CommonElementwiseBroadcastBackward(
<< " ydim:" << framework::make_ddim(y_dims_array);
if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
CommonGradBroadcastCUDA<T, DX_OP, DY_OP>(
x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(),
out_dims_array.data(), max_dim,
@@ -1769,7 +1773,7 @@ void ElemwiseGradComputeWithBroadcast(
}
if (post == 1) {
if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
ElemwiseGradBroadcast1CUDA(
ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, is_xsize_larger,
@@ -1786,7 +1790,7 @@ void ElemwiseGradComputeWithBroadcast(
}
} else {
if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
ElemwiseGradBroadcast2CUDA(
ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post,
@@ -1830,7 +1834,7 @@ void CommonElementwiseBroadcastForward(
axis);
if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
CommonForwardBroadcastCUDA<Functor, T, OutType>(
x, y, z, x_dims_array.data(), y_dims_array.data(),
out_dims_array.data(), max_dim,
@@ -1942,7 +1946,7 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
}
if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
ComputeElementwiseCUDA<Functor, T, OutType>(
x, y, z, pre, n, post,
ctx.template device_context<platform::CUDADeviceContext>(), func,
@@ -2066,7 +2070,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, const T *y, int pre,
}
}
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
template <typename T, typename CompoundFunctor, bool BcastY,
bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel(
@@ -2107,7 +2111,7 @@ static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel(
template <typename T, typename CompoundFunctor, bool BcastY,
bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast1CUDA(cudaStream_t stream, const T *x,
+static void FusedElemwiseAndActBroadcast1CUDA(gpuStream_t stream, const T *x,
const T *y,
CompoundFunctor compound_functor,
int h, int w, T *out,
@@ -2164,7 +2168,7 @@ static __global__ void FusedElemwiseAndActBroadcast2CUDAKernel(
template <typename T, typename CompoundFunctor, bool BcastY,
bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast2CUDA(cudaStream_t stream, const T *x,
+static void FusedElemwiseAndActBroadcast2CUDA(gpuStream_t stream, const T *x,
const T *y, int pre, int n,
int post,
CompoundFunctor compound_functor,
@@ -2219,7 +2223,7 @@ void FusedElemwiseAndActComputeWithBroadcast(
int h = pre;
int w = n;
if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
FusedElemwiseAndActBroadcast1CUDA<T, CompoundFunctor, BcastY,
KeepIntermediateOut,
SameShapeOfIntermediateOutAndOut>(
@@ -2242,7 +2246,7 @@ void FusedElemwiseAndActComputeWithBroadcast(
}
} else {
if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
FusedElemwiseAndActBroadcast2CUDA<T, CompoundFunctor, BcastY,
KeepIntermediateOut,
SameShapeOfIntermediateOutAndOut>(
@@ -2493,7 +2497,7 @@ static void FusedElemwiseAndActGradBroadcast2CPU(
}
}
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool UseIntermediateOut, bool BcastY,
bool SameShapeOfIntermediateOutAndOut>
@@ -2593,7 +2597,7 @@ template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool UseIntermediateOut, bool BcastY,
bool SameShapeOfIntermediateOutAndOut>
static void FusedElemwiseAndActGradBroadcast1CUDA(
-cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
+gpuStream_t stream, const T *x, const T *y, const T *intermediate_out,
const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op,
DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
@@ -2708,7 +2712,7 @@ template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool UseIntermediateOut, bool BcastY,
bool SameShapeOfIntermediateOutAndOut>
static void FusedElemwiseAndActGradBroadcast2CUDA(
-cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
+gpuStream_t stream, const T *x, const T *y, const T *intermediate_out,
const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op,
DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy,
T *dintermediate) {
@@ -2748,7 +2752,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
int w = n;
if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
FusedElemwiseAndActGradBroadcast1CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
UseIntermediateOut, BcastY,
SameShapeOfIntermediateOutAndOut>(
@@ -2774,7 +2778,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
}
} else {
if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
FusedElemwiseAndActGradBroadcast2CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
UseIntermediateOut, BcastY,
SameShapeOfIntermediateOutAndOut>(
@@ -25,7 +25,7 @@ struct PowFunctor {
inline HOSTDEVICE T operator()(T a, T b) const {
// TODO(wujionghao): A potential speed improvement is supporting different
// types in C++.
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
// On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
// it will return a float number like 2.99... , which floor to 2
// when cast to int by default and it is wrong.
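The comment above describes a real rounding hazard: in device code an integral `pow` may resolve to the float overload, so 3^1 can come back as 2.99..., which truncates to 2 on the int cast. Rounding to nearest before casting sidesteps it, as this small host-side demonstration shows:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  float nearly3 = 2.9999995f;  // the kind of value pow(float) can return
  printf("%d\n", static_cast<int>(nearly3));  // 2 -- the cast truncates
  printf("%lld\n", std::llrint(nearly3));     // 3 -- round to nearest first
}
```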
@@ -86,7 +86,7 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx,
ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
}
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// cuda definition
template <typename DeviceContext, typename T>
typename std::enable_if<
@@ -60,7 +60,7 @@ TEST(test_elementwise_add_grad_grad_without_ddx, cpu_place) {
TestElementwiseAddGradGradWithoutDDX<float> test(p, dims);
ASSERT_TRUE(test.Check());
}
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(test_elementwise_add_grad_grad_without_ddx, gpu_place) {
framework::DDim dims({32, 64});
platform::CUDAPlace p(0);
@@ -32,6 +32,8 @@ static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice));
+#elif defined(PADDLE_WITH_HIP)
+PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, n, hipMemcpyHostToDevice));
#else
PADDLE_THROW(
platform::errors::InvalidArgument("Check your paddle version, current "
@@ -129,7 +131,7 @@ TEST(test_elementwise_add_not_inplace, cpu_place) {
ASSERT_TRUE(TestMain<float>(p, dims, false));
}
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(test_elementwise_add_inplace, gpu_place) {
framework::DDim dims({32, 64});
platform::CUDAPlace p(0);
@@ -84,7 +84,7 @@ TEST(test_elementwise_div_grad_grad_without_dout, cpu_place) {
ASSERT_TRUE(test.Check());
}
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(test_elementwise_div_grad_grad_without_dout, gpu_place) {
framework::DDim dims({32, 64});
platform::CUDAPlace p(0);
@@ -88,7 +88,7 @@ class TestElementwiseOpGradGrad {
auto dst_place = BOOST_GET_CONST(platform::CPUPlace, place_);
memory::Copy(dst_place, dst, src_place, src, bytes);
} else if (platform::is_gpu_place(place_)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto dst_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
memory::Copy(dst_place, dst, src_place, src, bytes, nullptr);
#else
@@ -126,8 +126,14 @@ class TestElementwiseOpGradGrad {
}
auto *out_ptr = cpu_out.data<T>();
size_t numel = static_cast<size_t>(framework::product(dims_));
+#ifdef PADDLE_WITH_HIP
+auto is_equal = std::equal(
+out_ptr, out_ptr + numel, expected_outs_[out_name].data(),
+[](const float &l, const float &r) { return fabs(l - r) < 1e-8; });
+#else
auto is_equal =
std::equal(out_ptr, out_ptr + numel, expected_outs_[out_name].data());
+#endif
if (!is_equal) {
all_equal = false;
break;
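On ROCm the test compares with an absolute tolerance instead of bitwise equality, using the three-iterator `std::equal` overload with a predicate. A self-contained sketch of that idiom:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> got{0.001f, 0.002f};
  std::vector<float> want{0.001f + 5e-9f, 0.002f};  // tiny numeric drift
  bool is_equal = std::equal(
      got.begin(), got.end(), want.begin(),
      [](float l, float r) { return std::fabs(l - r) < 1e-8; });
  std::puts(is_equal ? "match" : "mismatch");  // prints "match"
}
```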