Unverified commit f9377965, authored by Qi Li, committed by GitHub

[ROCM] fix dropout and remove hipcub, test=develop (#31455)

Parent fadabbe9
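The C++ side of this change follows one pattern throughout the hunks below: instead of keeping a parallel `#ifdef PADDLE_WITH_HIP` / `hipcub::` branch next to every `cub::` call, each file that includes `<hipcub/hipcub.hpp>` now declares the alias `namespace cub = hipcub;` once, so a single `cub::`-spelled call site compiles for both the CUDA and the ROCm toolchains. A minimal sketch of the idiom (illustrative only; the surrounding include logic is simplified, not copied from the tree):

    #ifdef __HIPCC__
    #include <hipcub/hipcub.hpp>
    namespace cub = hipcub;  // every cub:: reference below resolves to hipcub::
    #else
    #include <cub/cub.cuh>
    #endif

    // One call site serves both backends; no per-backend #ifdef is needed.
    template <typename KeyT>
    void SortDescendingSketch(const KeyT* keys_in, KeyT* keys_out,
                              const int* idx_in, int* idx_out, int num,
                              void* temp, size_t& temp_bytes) {
      cub::DeviceRadixSort::SortPairsDescending<KeyT, int>(
          temp, temp_bytes, keys_in, keys_out, idx_in, idx_out, num);
    }

With the alias in place, the hipcub-only branches become dead weight, which is why every `#ifdef PADDLE_WITH_HIP ... #else ... #endif` pair below collapses to the plain `cub::` lines.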
@@ -23,6 +23,7 @@ limitations under the License. */
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
 #include "paddle/fluid/platform/miopen_helper.h"
+namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -64,27 +65,16 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Determine temporary device storage requirements
   size_t temp_storage_bytes = 0;
-#ifdef PADDLE_WITH_HIP
-  hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
-      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
-#else
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
-#endif
   // Allocate temporary storage
   auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
   // Run sorting operation
-#ifdef PADDLE_WITH_HIP
-  hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
-      d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
-      idx_out, num);
-#else
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
       idx_out, num);
-#endif
 }
 
 template <typename T>
......
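For readers unfamiliar with the CUB/hipCUB device-wide API used in `SortDescending` above: `DeviceRadixSort::SortPairsDescending` is always called twice, first with a null workspace pointer so it only reports the required temporary-storage size, then again with the allocated workspace to actually sort. A self-contained sketch of that two-phase convention (hypothetical buffer names, raw `cudaMalloc` instead of Paddle's `memory::Alloc`):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Sorts `num` (score, index) pairs in descending score order.
    void SortPairsDescendingTwoPhase(const float* d_keys_in, float* d_keys_out,
                                     const int* d_vals_in, int* d_vals_out,
                                     int num) {
      size_t temp_bytes = 0;
      // Phase 1: null storage pointer -> only temp_bytes is written, no sorting.
      cub::DeviceRadixSort::SortPairsDescending(nullptr, temp_bytes, d_keys_in,
                                                d_keys_out, d_vals_in, d_vals_out,
                                                num);
      void* d_temp = nullptr;
      cudaMalloc(&d_temp, temp_bytes);
      // Phase 2: the same call with real storage performs the sort.
      cub::DeviceRadixSort::SortPairsDescending(d_temp, temp_bytes, d_keys_in,
                                                d_keys_out, d_vals_in, d_vals_out,
                                                num);
      cudaFree(d_temp);
    }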
@@ -14,6 +14,7 @@ limitations under the License. */
 #endif
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include <paddle/fluid/memory/allocation/allocator.h>
@@ -141,29 +142,17 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
-        nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
-        idx_out, total_roi_num);
-#else
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
        idx_out, total_roi_num);
-#endif
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
     // Run sorting operation
     // sort score to get corresponding index
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
-        keys_out, idx_in, idx_out, total_roi_num);
-#else
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
        keys_out, idx_in, idx_out, total_roi_num);
-#endif
     index_out_t.Resize({real_post_num});
     Tensor sorted_rois;
     sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
@@ -185,29 +174,17 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     out_id_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
     // Determine temporary device storage requirements
     temp_storage_bytes = 0;
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(
-        nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
-        batch_idx_in, index_out_t.data<int>(), real_post_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
        batch_idx_in, index_out_t.data<int>(), real_post_num);
-#endif
     // Allocate temporary storage
     d_temp_storage = memory::Alloc(place, temp_storage_bytes);
     // Run sorting operation
     // sort batch_id to get corresponding index
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
-        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
-#endif
     GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
......
@@ -17,6 +17,7 @@ limitations under the License. */
 #endif
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include <paddle/fluid/memory/allocation/allocator.h>
@@ -149,42 +150,24 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
-                                                 target_lvls_data, keys_out,
-                                                 idx_in, idx_out, roi_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
                                               target_lvls_data, keys_out,
                                               idx_in, idx_out, roi_num);
-#endif
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
     // Run sorting operation
     // sort target level to get corresponding index
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
-        idx_in, idx_out, roi_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
        idx_in, idx_out, roi_num);
-#endif
     int* restore_idx_data =
         restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
     // sort current index to get restore index
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
-        restore_idx_data, roi_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
        restore_idx_data, roi_num);
-#endif
     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
......
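The last `SortPairs` call in the hunk above (keys = `idx_out`, values = `idx_in`) deserves a note: sorting the permutation produced by the first sort, with the running positions as values, yields the inverse permutation, and that inverse is what `restore_idx_data` stores so the original RoI order can be recovered later. A small host-side illustration of the same trick (hypothetical example using `std::sort` in place of the device radix sort):

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      // perm[i] = original position of the element now at sorted position i.
      std::vector<int> perm = {2, 0, 3, 1};
      std::vector<int> pos(perm.size());
      std::iota(pos.begin(), pos.end(), 0);          // pos = {0, 1, 2, 3}
      // Sorting the positions with perm as the key produces the inverse
      // permutation: restore[j] = sorted slot that original element j went to.
      std::sort(pos.begin(), pos.end(),
                [&](int a, int b) { return perm[a] < perm[b]; });
      for (int v : pos) std::printf("%d ", v);       // prints: 1 3 0 2
      return 0;
    }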
@@ -17,6 +17,7 @@ limitations under the License. */
 #endif
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/group_norm_op.h"
@@ -46,18 +47,10 @@ enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
 template <typename T>
 __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
-#ifdef PADDLE_WITH_CUDA
   typedef cub::WarpReduce<T> WarpReduce;
-#else
-  typedef hipcub::WarpReduce<T> WarpReduce;
-#endif
   typename WarpReduce::TempStorage temp_storage;
   value = WarpReduce(temp_storage).Sum(value);
-#ifdef PADDLE_WITH_CUDA
   if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
-#else
-  if (hipcub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
-#endif
 }
 
 template <typename T>
......
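`CudaAtomicAddWithWarp` in the group_norm hunk above pairs a warp-level reduction with a single atomic: all lanes feed their values into `cub::WarpReduce`, and only lane 0 performs the atomic add, so each warp issues one atomic instead of one per thread. A minimal stand-alone CUDA sketch of the same pattern (hypothetical kernel; plain `atomicAdd` instead of Paddle's `platform::CudaAtomicAdd`):

    #include <cub/cub.cuh>

    // Each block sums its elements into *out with one atomicAdd per warp.
    __global__ void WarpAggregatedSum(const float* in, float* out, int n) {
      using WarpReduce = cub::WarpReduce<float>;
      // One TempStorage per warp; blockDim.x is assumed to be <= 128 here.
      __shared__ WarpReduce::TempStorage temp_storage[4];
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      float v = (i < n) ? in[i] : 0.0f;
      int warp_id = threadIdx.x / 32;
      float warp_sum = WarpReduce(temp_storage[warp_id]).Sum(v);
      if (cub::LaneId() == 0) atomicAdd(out, warp_sum);  // one atomic per warp
    }

A launch such as `WarpAggregatedSum<<<num_blocks, 128>>>(d_in, d_out, n)` with `*d_out` zero-initialized would accumulate the full sum.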
...@@ -369,19 +369,7 @@ struct KronGradOpFunctor { ...@@ -369,19 +369,7 @@ struct KronGradOpFunctor {
for_range(func); for_range(func);
// reduce_sum along aixs 1 // reduce_sum along aixs 1
#ifdef __HIPCC__ #if defined(__NVCC__) || defined(__HIPCC__)
auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) {
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
dout_x, dx, {1}, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
}
if (dy) {
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
dout_y, dy, {1}, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
}
#elif defined(__NVCC__)
auto stream = dev_ctx.stream(); // it is a cuda device_context auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) { if (dx) {
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>( TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
......
@@ -45,12 +45,7 @@ template <typename DeviceContext, typename T>
 void ReduceSumForMatmulGrad(const Tensor* input, Tensor* output,
                             const std::vector<int>& reduce_dims,
                             const paddle::framework::ExecutionContext& ctx) {
-#ifdef __HIPCC__
-  auto stream = ctx.cuda_device_context().stream();
-  TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
-      *input, output, reduce_dims, static_cast<T>(0), hipcub::Sum(),
-      IdentityFunctor<T>(), stream);
-#elif defined(__NVCC__)
+#if defined(__NVCC__) || defined(__HIPCC__)
   auto stream = ctx.cuda_device_context().stream();
   TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
       *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
......
@@ -213,12 +213,7 @@ class PoolKernel : public framework::OpKernel<T> {
       if (reduce_num > 0 &&
           adaptive) {  // for adaptive_avg_pool2d && output_size == 1
-#ifdef __HIPCC__
-        auto stream = dev_ctx.stream();
-        TensorReduce<T, T, hipcub::Sum, DivideFunctor<T>>(
-            *in_x, out, reduce_dim, static_cast<T>(0), hipcub::Sum(),
-            DivideFunctor<T>(reduce_num), stream);
-#elif defined(__NVCC__)
+#if defined(__HIPCC__) || defined(__NVCC__)
         auto stream = dev_ctx.stream();
         TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
             *in_x, out, reduce_dim, static_cast<T>(0), cub::Sum(),
......
@@ -174,15 +174,9 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
       reduce_dims.push_back(i);
     }
 
-#ifdef __HIPCC__
-    TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
-        dalpha_tmp, dalpha, reduce_dims, static_cast<T>(0), hipcub::Sum(),
-        IdentityFunctor<T>(), stream);
-#else
     TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
         dalpha_tmp, dalpha, reduce_dims, static_cast<T>(0), cub::Sum(),
         IdentityFunctor<T>(), stream);
-#endif
   }
 };
......
@@ -26,6 +26,7 @@
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include "paddle/fluid/framework/tensor.h"
@@ -71,12 +72,7 @@ template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
 __global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
                                TransformOp transformer, Ty init,
                                int reduce_num) {
-#ifdef __HIPCC__
-  __shared__
-      typename hipcub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#else
   __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#endif
   int idx_x = blockIdx.x * reduce_num;
   int idx_y = threadIdx.x;
   Ty reduce_var = init;
@@ -85,13 +81,8 @@ __global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
         reducer(reduce_var, static_cast<Ty>(transformer(x[idx_x + idx_y])));
   __syncthreads();
 
-#ifdef __HIPCC__
-  reduce_var = hipcub::BlockReduce<Ty, BlockDim>(temp_storage)
-                   .Reduce(reduce_var, reducer);
-#else
   reduce_var =
       cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-#endif
 
   if (threadIdx.x == 0) {
     y[blockIdx.x] = reduce_var;
@@ -107,12 +98,7 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
                              Array<int, ReduceRank> reduce_strides,
                              Array<int, Rank - ReduceRank> left_dim,
                              Array<int, Rank - ReduceRank> left_strides) {
-#ifdef __HIPCC__
-  __shared__
-      typename hipcub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#else
   __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#endif
   Array<int, Rank> sub_index;
   int left_idx = blockIdx.x;
   for (int i = 0; i < Rank - ReduceRank; ++i) {
@@ -144,13 +130,8 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
   }
   __syncthreads();
 
-#ifdef __HIPCC__
-  reduce_var = hipcub::BlockReduce<Ty, BlockDim>(temp_storage)
-                   .Reduce(reduce_var, reducer);
-#else
   reduce_var =
       cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-#endif
 
   if (threadIdx.x == 0) {
     y[blockIdx.x] = reduce_var;
...@@ -238,32 +219,17 @@ static void TensorReduceImpl( ...@@ -238,32 +219,17 @@ static void TensorReduceImpl(
int rank = x_strides.size(); int rank = x_strides.size();
int reduce_rank = reduce_strides.size(); int reduce_rank = reduce_strides.size();
if (rank == reduce_rank) { if (rank == reduce_rank) {
#ifdef __HIPCC__
hipcub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
#else
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x( cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer); x_data, transformer);
#endif
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
#ifdef __HIPCC__
hipcub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
#else
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream); reduce_num, reducer, init, stream);
#endif
framework::Tensor tmp; framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>( auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}), framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
place); place);
#ifdef __HIPCC__
hipcub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x,
y_data, reduce_num, reducer, init, stream);
#else
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream); reduce_num, reducer, init, stream);
#endif
return; return;
} }
if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
......
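In the `rank == reduce_rank` branch of `TensorReduceImpl` above, the whole tensor collapses to one value: a `cub::TransformInputIterator` applies the element-wise transform (an `IdentityFunctor`, `DivideFunctor`, and so on) on the fly while `cub::DeviceReduce::Reduce` consumes it, again with the size-query-then-run calling convention. A self-contained sketch of that fused transform-reduce (hypothetical `Square` functor):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    struct Square {
      __host__ __device__ float operator()(float x) const { return x * x; }
    };

    // Sum of squares of n elements, computed as one fused transform + reduce.
    void SumOfSquares(const float* d_in, float* d_out, int n,
                      cudaStream_t stream) {
      cub::TransformInputIterator<float, Square, const float*> it(d_in, Square());
      size_t temp_bytes = 0;
      cub::DeviceReduce::Reduce(nullptr, temp_bytes, it, d_out, n,
                                cub::Sum(), 0.0f, stream);   // size query only
      void* d_temp = nullptr;
      cudaMalloc(&d_temp, temp_bytes);
      cub::DeviceReduce::Reduce(d_temp, temp_bytes, it, d_out, n,
                                cub::Sum(), 0.0f, stream);   // actual reduction
      cudaFree(d_temp);
    }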
@@ -56,15 +56,9 @@ class ReduceMeanKernel : public framework::OpKernel<T> {
     }
 
     auto stream = context.cuda_device_context().stream();
-#ifdef PADDLE_WITH_HIP
-    TensorReduce<T, T, hipcub::Sum, DivideFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), hipcub::Sum(),
-        DivideFunctor<T>(reduce_num), stream);
-#else
     TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
         *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
         DivideFunctor<T>(reduce_num), stream);
-#endif
   }
 };
......
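Note how `ReduceMeanKernel` above reuses the generic sum reduction: the reducer stays `cub::Sum()` and the transform divides every element by the number of reduced elements, so the accumulated result is the mean. The transform functors are small callables along these lines (a sketch of the intended shape, not Paddle's exact definitions):

    template <typename T>
    struct IdentityFunctor {
      __host__ __device__ T operator()(const T& x) const { return x; }
    };

    template <typename T>
    struct DivideFunctor {
      explicit DivideFunctor(int n) : n_inv_(static_cast<T>(1.0 / n)) {}
      // Applied to every input element before the sum, so the accumulated
      // value is sum(x_i) / n, i.e. the mean.
      __host__ __device__ T operator()(const T& x) const { return x * n_inv_; }
     private:
      T n_inv_;
    };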
...@@ -56,25 +56,13 @@ class ReduceSumKernel : public framework::OpKernel<T> { ...@@ -56,25 +56,13 @@ class ReduceSumKernel : public framework::OpKernel<T> {
if (out_dtype >= 0) { if (out_dtype >= 0) {
framework::VisitDataTypeSmall( framework::VisitDataTypeSmall(
static_cast<framework::proto::VarType::Type>(out_dtype), static_cast<framework::proto::VarType::Type>(out_dtype),
#ifdef __HIPCC__
TensorReduceFunctor<T, hipcub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<double>(0.0),
hipcub::Sum(), IdentityFunctor<T>(), stream));
#else
TensorReduceFunctor<T, cub::Sum, IdentityFunctor<T>>( TensorReduceFunctor<T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<double>(0.0), cub::Sum(), *input, output, reduce_dims, static_cast<double>(0.0), cub::Sum(),
IdentityFunctor<T>(), stream)); IdentityFunctor<T>(), stream));
#endif
} else { } else {
#ifdef __HIPCC__
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
#else
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>( TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(), *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream); IdentityFunctor<T>(), stream);
#endif
} }
} }
}; };
......
@@ -20,6 +20,7 @@ limitations under the License. */
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/math.h"
@@ -31,11 +32,7 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 
 template <typename T, int BlockDim>
-#ifdef __HIPCC__
-using BlockReduce = hipcub::BlockReduce<T, BlockDim>;
-#else
 using BlockReduce = cub::BlockReduce<T, BlockDim>;
-#endif
 
 template <typename T, int BlockDim>
 using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
@@ -57,13 +54,8 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod,
     T ele = in_data[start + tid];
     max_ele = max_ele > ele ? max_ele : ele;
   }
-#ifdef __HIPCC__
-  max_ele =
-      BlockReduce<T, BlockDim>(temp_storage).Reduce(max_ele, hipcub::Max());
-#else
   max_ele =
       BlockReduce<T, BlockDim>(temp_storage).Reduce(max_ele, cub::Max());
-#endif
   if (threadIdx.x == 0) {
     shared_max_data = max_ele;
   }
...@@ -75,13 +67,8 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod, ...@@ -75,13 +67,8 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod,
T ele = in_data[start + tid]; T ele = in_data[start + tid];
sum_data += real_exp(ele - shared_max_data); sum_data += real_exp(ele - shared_max_data);
} }
#ifdef __HIPCC__
sum_data =
BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, hipcub::Sum());
#else
sum_data = sum_data =
BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, cub::Sum()); BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, cub::Sum());
#endif
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
shared_sum_data = sum_data; shared_sum_data = sum_data;
} }
...@@ -116,12 +103,7 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data, ...@@ -116,12 +103,7 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data,
T s_d = softmax_data[idx]; T s_d = softmax_data[idx];
result += s_g_d * s_d; result += s_g_d * s_d;
} }
#ifdef __HIPCC__
result =
BlockReduce<T, BlockDim>(temp_storage).Reduce(result, hipcub::Sum());
#else
result = BlockReduce<T, BlockDim>(temp_storage).Reduce(result, cub::Sum()); result = BlockReduce<T, BlockDim>(temp_storage).Reduce(result, cub::Sum());
#endif
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
shared_data = result; shared_data = result;
} }
......
@@ -43,15 +43,9 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
       auto stream = context.cuda_device_context().stream();
       std::vector<int> reduce_dims;
       reduce_dims.push_back(out->dims().size());
-#ifdef __HIPCC__
-      TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
-          diag, out, reduce_dims, static_cast<T>(0), hipcub::Sum(),
-          IdentityFunctor<T>(), stream);
-#else
       TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
           diag, out, reduce_dims, static_cast<T>(0), cub::Sum(),
           IdentityFunctor<T>(), stream);
-#endif
     }
   }
 };
......
@@ -41,7 +41,11 @@ struct GpuLaunchConfig {
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
     const platform::CUDADeviceContext& context, int element_count,
+#ifdef PADDLE_WITH_HIP
+    int max_threads = 256) {
+#else
     int max_threads = 1024) {
+#endif
   PADDLE_ENFORCE_GT(element_count, 0,
                     platform::errors::InvalidArgument(
                         "element count should be greater than 0,"
......
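The `GpuLaunchConfig` hunk above lowers the default `max_threads` of `GetGpuLaunchConfig1D` from 1024 to 256 when building with HIP, presumably because a smaller default block size behaves better on the ROCm backend. To make the parameter concrete, here is a simplified sketch of how such a 1D launch configuration is typically derived (hypothetical helper, not Paddle's exact implementation):

    #include <algorithm>

    struct LaunchConfig1D {
      int threads_per_block;
      int blocks;
    };

    // Clamp the block size to max_threads, then use enough blocks to cover
    // every element; callers are expected to pass element_count > 0.
    inline LaunchConfig1D GetLaunchConfig1DSketch(int element_count,
    #ifdef PADDLE_WITH_HIP
                                                  int max_threads = 256) {
    #else
                                                  int max_threads = 1024) {
    #endif
      LaunchConfig1D cfg;
      cfg.threads_per_block = std::min(element_count, max_threads);
      cfg.blocks = (element_count + cfg.threads_per_block - 1) /
                   cfg.threads_per_block;
      return cfg;
    }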
 # A image for building paddle binaries
 # Use rocm-terminal base image for both rocm environment
 # When you modify it, please be aware of rocm version
 #
-# Build: ROCM 3.9
+# Build: ROCM 4.0.1
 # cd Paddle/tools/dockerfile
 # docker build -f Dockerfile.rocm \
-#        --build-arg ROCM_VERSION=3.9 \
-#        -t paddlepaddle/paddle-centos-rocm39-dev:latest .
+#        --build-arg ROCM_VERSION=4.0.1 \
+#        -t paddlepaddle/paddle-centos-rocm401-dev:latest .
 #
 # docker run -it --device=/dev/kfd --device=/dev/dri \
 #      --security-opt seccomp=unconfined --group-add video \
-#      paddlepaddle/paddle-centos-rocm39-dev:latest /bin/bash
+#      paddlepaddle/paddle-centos-rocm401-dev:latest /bin/bash
 
 FROM centos:7.8.2003
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
@@ -21,7 +21,8 @@ ENV LANGUAGE en_US.UTF-8
 RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlite-devel \
     zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
-    make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
+    make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel \
+    net-tools numactl-devel chrpath
 
 # Install devtoolset-7
 RUN yum install -y yum-utils centos-release-scl && \
@@ -70,7 +71,7 @@ RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
     make -j8 && make install && \
     cd .. && rm -rf git-2.17.1.tar.gz && rm -rf git-2.17.1
 
 ENV GOROOT=/usr/local/go
 ENV GOPATH=/root/gopath
 ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
@@ -82,7 +83,7 @@ RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8
     mkdir /root/gopath/src
 
 # protobuf 3.6.1
 RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/protobuf-cpp-3.6.1.tar.gz && \
     tar xzf protobuf-cpp-3.6.1.tar.gz && \
     cd protobuf-3.6.1 && ./configure && make -j4 && make install && \
     cd .. && rm -f protobuf-cpp-3.6.1.tar.gz && rm -rf protobuf-3.6.1
@@ -91,28 +92,34 @@ RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/p
 RUN cd /opt && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && chmod +x Miniconda3-latest-Linux-x86_64.sh
 RUN mkdir /opt/conda && ./Miniconda3-latest-Linux-x86_64.sh -b -f -p "/opt/conda" && rm -rf Miniconda3-latest-Linux-x86_64.sh
 ENV PATH=/opt/conda/bin:${PATH}
-RUN conda init bash && \
-    conda create -n python2.7 python=2.7 && \
-    conda create -n python3.7 python=3.7
+RUN conda init bash && conda install -n base jupyter
 
-# install paddle requirement
+# install Paddle requirement
 RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt
 RUN /opt/conda/bin/pip install -r /root/requirements.txt && \
-    /opt/conda/envs/python2.7/bin/pip install -r /root/requirements.txt && \
-    /opt/conda/envs/python3.7/bin/pip install -r /root/requirements.txt && \
     rm -rf /root/requirements.txt
 
 RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/unittest_py/requirements.txt -O /root/requirements.txt
-RUN /opt/conda/bin/pip install -r /root/requirements.txt && \
-    /opt/conda/envs/python2.7/bin/pip install -r /root/requirements.txt && \
-    /opt/conda/envs/python3.7/bin/pip install -r /root/requirements.txt && \
-    rm -rf /root/requirements.txt
+RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt
+
+# install PaddleClas requirement
+RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleClas/develop/requirements.txt -O /root/requirements.txt
+RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt
+
+# install PaddleDetection requirement
+RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleDetection/develop/requirements.txt -O /root/requirements.txt
+RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt
 
 # configure ssh
 RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
     sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
     sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config
 
+# clang-format 3.8
+RUN wget https://copr.fedorainfracloud.org/coprs/alonid/llvm-3.8.0/repo/epel-7/alonid-llvm-3.8.0-epel-7.repo -P /etc/yum.repos.d/
+RUN yum install -y clang-3.8.0
+ENV PATH=/opt/llvm-3.8.0/bin:${PATH}
+
 # patchelf
 RUN yum install -y patchelf && \
     yum clean all && \
......