Unverified commit 9b016c7c authored by Qi Li, committed by GitHub

[ROCM] update fluid operators for rocm (part2), test=develop (#31211)

Parent 2fd999d9
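Every hunk below applies the same porting pattern: CUDA-only build guards gain a ROCm counterpart (PADDLE_WITH_RCCL, PADDLE_WITH_HIP, __HIPCC__), NCCL-only paths are opened to RCCL, cub calls get hipcub equivalents, and raw cudaStream_t parameters become the portable gpuStream_t. A minimal sketch of that guard pattern, assuming the macros and PADDLE_ENFORCE_CUDA_SUCCESS behave as they do in the hunks below; SyncIfRequested is a hypothetical helper, not part of this patch:

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"  // one header serves both NCCL and RCCL builds
#endif

// Hypothetical helper mirroring the sync_mode branches added in allreduce_op.h
// and broadcast_op.h below: the ROCm build calls the HIP runtime, the CUDA
// build calls the CUDA runtime, and the error-checking macro is shared.
inline void SyncIfRequested(gpuStream_t stream, bool sync_mode) {
  if (!sync_mode) return;
#ifdef PADDLE_WITH_RCCL
  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
}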
......@@ -30,7 +30,7 @@ endforeach()
register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS})
if(WITH_NCCL)
if(WITH_NCCL OR WITH_RCCL)
set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common)
endif()
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -36,7 +36,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"AllReduce op can run on gpu place only for now."));
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
......@@ -73,7 +73,11 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
sendbuff, recvbuff, numel, static_cast<ncclDataType_t>(dtype), red_type,
comm, stream));
if (ctx.Attr<bool>("sync_mode")) {
#ifdef PADDLE_WITH_RCCL
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -39,7 +39,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
platform::errors::PreconditionNotMet(
"The place of ExecutionContext should be CUDAPlace."));
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device;
int root_dev_id = ctx.Attr<int>("root");
......@@ -68,7 +68,11 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
<< " From " << root_dev_id << " to " << dev_id;
if (ctx.Attr<bool>("sync_mode")) {
#ifdef PADDLE_WITH_RCCL
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
......
......@@ -30,7 +30,7 @@ class RefByTrainerIdKernel : public framework::OpKernel<T> {
int64_t trainer_id = 0;
auto* trainer_id_data = trainer_id_t->data<int64_t>();
if (platform::is_gpu_place(context.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto stream = context.cuda_device_context().stream();
memory::Copy<>(platform::CPUPlace(), &trainer_id,
BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#if __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#include "thrust/device_vector.h"
#endif
......@@ -87,7 +87,7 @@ struct KronOpFunctor {
const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr,
*p_stride_out = nullptr, *p_shape_y = nullptr;
#if __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
thrust::device_vector<int64_t> d_stride_x(ndims);
thrust::device_vector<int64_t> d_stride_y(ndims);
thrust::device_vector<int64_t> d_stride_out(ndims);
......@@ -326,7 +326,7 @@ struct KronGradOpFunctor {
const int64_t* p_stride_y = nullptr;
const int64_t* p_stride_dout = nullptr;
const int64_t* p_shape_y = nullptr;
#if __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
thrust::device_vector<int64_t> d_stride_x(ndims);
thrust::device_vector<int64_t> d_stride_y(ndims);
thrust::device_vector<int64_t> d_stride_dout(ndims);
......@@ -369,7 +369,19 @@ struct KronGradOpFunctor {
for_range(func);
// reduce_sum along axis 1
#if __NVCC__
#ifdef __HIPCC__
auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) {
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
dout_x, dx, {1}, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
}
if (dy) {
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
dout_y, dy, {1}, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
}
#elif defined(__NVCC__)
auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) {
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
......
......@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/complex_functors.h"
#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
#ifdef __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#endif
......@@ -45,7 +45,12 @@ template <typename DeviceContext, typename T>
void ReduceSumForMatmulGrad(const Tensor* input, Tensor* output,
const std::vector<int>& reduce_dims,
const paddle::framework::ExecutionContext& ctx) {
#ifdef __NVCC__
#ifdef __HIPCC__
auto stream = ctx.cuda_device_context().stream();
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
#elif defined(__NVCC__)
auto stream = ctx.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
......
......@@ -95,7 +95,7 @@ __global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr,
template <typename T>
class PreluOpGradFunctor {
public:
void operator()(cudaStream_t stream, const T* x, const T* alpha, const T* dy,
void operator()(gpuStream_t stream, const T* x, const T* alpha, const T* dy,
T* dx, T* dalpha, const framework::DDim& input_dims,
PRELU_MODE mode) {
size_t numel = 1;
......@@ -174,9 +174,15 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
reduce_dims.push_back(i);
}
#ifdef __HIPCC__
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
dalpha_tmp, dalpha, reduce_dims, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
#else
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
dalpha_tmp, dalpha, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream);
#endif
}
};
......
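The PreluOpGradFunctor above now takes a gpuStream_t instead of a cudaStream_t, and the TensorReduce helpers below make the same switch, so one functor signature compiles under both toolchains. The alias itself is not part of this diff; it lives in Paddle's platform layer, and the sketch below is only an assumption about what that definition reduces to:

#ifdef PADDLE_WITH_HIP
using gpuStream_t = hipStream_t;   // ROCm build
#else
using gpuStream_t = cudaStream_t;  // CUDA build
#endif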
......@@ -13,7 +13,7 @@ else()
register_operators()
endif()
if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu")
string(REPLACE ".part.cu" "" OPS "${OPS}")
......@@ -38,3 +38,7 @@ if(WITH_GPU)
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor)
endif()
endif()
if(WITH_ROCM)
hip_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor)
endif()
......@@ -20,7 +20,14 @@
#include <set>
#include <vector>
#include <cub/cub.cuh> // NOLINT
#ifdef __NVCC__
#include "cub/cub.cuh" // NOLINT
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
#endif
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
......@@ -64,7 +71,12 @@ template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init,
int reduce_num) {
#ifdef __HIPCC__
__shared__
typename hipcub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
#else
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
#endif
int idx_x = blockIdx.x * reduce_num;
int idx_y = threadIdx.x;
Ty reduce_var = init;
......@@ -73,8 +85,13 @@ __global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
reducer(reduce_var, static_cast<Ty>(transformer(x[idx_x + idx_y])));
__syncthreads();
#ifdef __HIPCC__
reduce_var = hipcub::BlockReduce<Ty, BlockDim>(temp_storage)
.Reduce(reduce_var, reducer);
#else
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
#endif
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
......@@ -90,7 +107,12 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
Array<int, ReduceRank> reduce_strides,
Array<int, Rank - ReduceRank> left_dim,
Array<int, Rank - ReduceRank> left_strides) {
#ifdef __HIPCC__
__shared__
typename hipcub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
#else
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
#endif
Array<int, Rank> sub_index;
int left_idx = blockIdx.x;
for (int i = 0; i < Rank - ReduceRank; ++i) {
......@@ -122,8 +144,13 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
}
__syncthreads();
#ifdef __HIPCC__
reduce_var = hipcub::BlockReduce<Ty, BlockDim>(temp_storage)
.Reduce(reduce_var, reducer);
#else
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
#endif
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
......@@ -188,7 +215,7 @@ static void TensorReduceImpl(
int left_num, int reduce_num, const std::vector<int>& x_strides,
const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
const std::vector<int>& left_dim, const std::vector<int>& left_strides,
cudaStream_t stream) {
gpuStream_t stream) {
#define CUB_RANK_CASE(i, ...) \
case i: { \
constexpr auto kRank = i; \
......@@ -211,17 +238,32 @@ static void TensorReduceImpl(
int rank = x_strides.size();
int reduce_rank = reduce_strides.size();
if (rank == reduce_rank) {
#ifdef __HIPCC__
hipcub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
#else
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
#endif
size_t temp_storage_bytes = 0;
#ifdef __HIPCC__
hipcub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
#else
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
#endif
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
place);
#ifdef __HIPCC__
hipcub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x,
y_data, reduce_num, reducer, init, stream);
#else
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
#endif
return;
}
if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
......@@ -280,7 +322,7 @@ template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
std::vector<int> origin_reduce_dims, const Ty& init,
const ReduceOp& reducer, const TransformOp& transformer,
cudaStream_t stream) {
gpuStream_t stream) {
auto x_dim = framework::vectorize<int>(x.dims());
std::vector<int> new_x_dim, new_reduce_dims;
int is_reduced = 0;
......@@ -362,11 +404,11 @@ struct TensorReduceFunctor {
const double& init;
const ReduceOp& reducer;
const TransformOp& transformer;
cudaStream_t stream;
gpuStream_t stream;
TensorReduceFunctor(const framework::Tensor& x, framework::Tensor* y,
std::vector<int> origin_reduce_dims, const double& init,
const ReduceOp& reducer, const TransformOp& transformer,
cudaStream_t stream)
gpuStream_t stream)
: x(x),
y(y),
origin_reduce_dims(origin_reduce_dims),
......
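hipCUB mirrors the CUB API (BlockReduce, DeviceReduce::Reduce, TransformInputIterator, Sum/Max), which is why the cub_reduce.h guards above only swap the namespace and the stream type while the surrounding logic stays unchanged. A hedged caller-side sketch of the TensorReduce entry point used by the kron, matmul_v2, prelu and trace kernels in this patch; the gpucub alias, the Identity functor and the SumAlongAxis1 wrapper are illustrative stand-ins, not code from the patch:

#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"

#ifdef __HIPCC__
namespace gpucub = hipcub;  // hipCUB exposes the same primitives as CUB
#else
namespace gpucub = cub;
#endif

// Stand-in for the pass-through functor used at the call sites above.
template <typename T>
struct Identity {
  __host__ __device__ T operator()(const T& x) const { return x; }
};

// Sum a [N, M] tensor along axis 1 into a length-N tensor, matching the
// reduce step in the kron gradient above.
void SumAlongAxis1(const paddle::framework::Tensor& src,
                   paddle::framework::Tensor* dst, gpuStream_t stream) {
  paddle::operators::TensorReduce<float, float, gpucub::Sum, Identity<float>>(
      src, dst, {1}, static_cast<float>(0), gpucub::Sum(), Identity<float>(),
      stream);
}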
......@@ -56,9 +56,15 @@ class ReduceMeanKernel : public framework::OpKernel<T> {
}
auto stream = context.cuda_device_context().stream();
#ifdef PADDLE_WITH_HIP
TensorReduce<T, T, hipcub::Sum, DivideFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), hipcub::Sum(),
DivideFunctor<T>(reduce_num), stream);
#else
TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
DivideFunctor<T>(reduce_num), stream);
#endif
}
};
......
......@@ -56,13 +56,25 @@ class ReduceSumKernel : public framework::OpKernel<T> {
if (out_dtype >= 0) {
framework::VisitDataTypeSmall(
static_cast<framework::proto::VarType::Type>(out_dtype),
#ifdef __HIPCC__
TensorReduceFunctor<T, hipcub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<double>(0.0),
hipcub::Sum(), IdentityFunctor<T>(), stream));
#else
TensorReduceFunctor<T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<double>(0.0), cub::Sum(),
IdentityFunctor<T>(), stream));
#endif
} else {
#ifdef __HIPCC__
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
#else
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream);
#endif
}
}
};
......
......@@ -14,7 +14,7 @@
#pragma once
#ifdef __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
......@@ -107,7 +107,7 @@ class SequenceMaskKernel : public framework::OpKernel<Tx> {
auto *x_data = x->data<Tx>();
auto x_numel = x->numel();
if (maxlen < 0) {
#ifdef __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
VLOG(10)
<< "SequenceMaskOp on GPU may be slow when maxlen is not provided.";
maxlen = static_cast<int>(
......
......@@ -130,13 +130,13 @@ class SequenceReverseOpKernel : public framework::OpKernel<T> {
const size_t *lod;
size_t lod_count = x.lod()[0].size();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
lod = x.lod()[0].CUDAData(ctx.GetPlace());
} else {
#endif
lod = x.lod()[0].data();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
}
#endif
......
......@@ -104,9 +104,18 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
#ifdef PADDLE_WITH_HIP
// MIOPEN does not support float64
REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace,
ops::SequenceSoftmaxCUDNNKernel<float>);
REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
ops::SequenceSoftmaxGradCUDNNKernel<float>);
#else
REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace,
ops::SequenceSoftmaxCUDNNKernel<float>,
ops::SequenceSoftmaxCUDNNKernel<double>);
REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
ops::SequenceSoftmaxGradCUDNNKernel<float>,
ops::SequenceSoftmaxGradCUDNNKernel<double>);
#endif
......@@ -36,7 +36,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
// choose cudnn kernel if the runtime supported.
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
bool runtime_cudnn_support = false;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
......@@ -132,7 +132,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
// choose cudnn kernel if the runtime supported.
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
bool runtime_cudnn_support = false;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
......
......@@ -13,7 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <cub/cub.cuh> // NOLINT
#ifdef __NVCC__
#include <cub/cub.cuh>
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
#endif
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
......@@ -23,7 +31,11 @@ namespace operators {
using LoDTensor = framework::LoDTensor;
template <typename T, int BlockDim>
#ifdef __HIPCC__
using BlockReduce = hipcub::BlockReduce<T, BlockDim>;
#else
using BlockReduce = cub::BlockReduce<T, BlockDim>;
#endif
template <typename T, int BlockDim>
using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
......@@ -45,8 +57,13 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod,
T ele = in_data[start + tid];
max_ele = max_ele > ele ? max_ele : ele;
}
#ifdef __HIPCC__
max_ele =
BlockReduce<T, BlockDim>(temp_storage).Reduce(max_ele, hipcub::Max());
#else
max_ele =
BlockReduce<T, BlockDim>(temp_storage).Reduce(max_ele, cub::Max());
#endif
if (threadIdx.x == 0) {
shared_max_data = max_ele;
}
......@@ -58,8 +75,13 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod,
T ele = in_data[start + tid];
sum_data += real_exp(ele - shared_max_data);
}
#ifdef __HIPCC__
sum_data =
BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, hipcub::Sum());
#else
sum_data =
BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, cub::Sum());
#endif
if (threadIdx.x == 0) {
shared_sum_data = sum_data;
}
......@@ -94,7 +116,12 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data,
T s_d = softmax_data[idx];
result += s_g_d * s_d;
}
#ifdef __HIPCC__
result =
BlockReduce<T, BlockDim>(temp_storage).Reduce(result, hipcub::Sum());
#else
result = BlockReduce<T, BlockDim>(temp_storage).Reduce(result, cub::Sum());
#endif
if (threadIdx.x == 0) {
shared_data = result;
}
......
......@@ -43,9 +43,15 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
auto stream = context.cuda_device_context().stream();
std::vector<int> reduce_dims;
reduce_dims.push_back(out->dims().size());
#ifdef __HIPCC__
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
diag, out, reduce_dims, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
#else
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
diag, out, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream);
#endif
}
}
};
......