Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
f9377965
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f9377965
编写于
3月 08, 2021
作者:
Q
Qi Li
提交者:
GitHub
3月 08, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[ROCM] fix dropout and remove hipcub, test=develop (#31455)
上级
fadabbe9
变更
15
隐藏空白更改
内联
并排
Showing
15 changed file
with
47 addition
and
197 deletion
+47
-197
paddle/fluid/operators/detection/bbox_util.cu.h
paddle/fluid/operators/detection/bbox_util.cu.h
+2
-12
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+5
-28
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
.../fluid/operators/detection/distribute_fpn_proposals_op.cu
+4
-21
paddle/fluid/operators/group_norm_op.cu
paddle/fluid/operators/group_norm_op.cu
+1
-8
paddle/fluid/operators/kron_op.h
paddle/fluid/operators/kron_op.h
+1
-13
paddle/fluid/operators/matmul_v2_op.h
paddle/fluid/operators/matmul_v2_op.h
+1
-6
paddle/fluid/operators/pool_op.h
paddle/fluid/operators/pool_op.h
+1
-6
paddle/fluid/operators/prelu_op.cu
paddle/fluid/operators/prelu_op.cu
+0
-6
paddle/fluid/operators/reduce_ops/cub_reduce.h
paddle/fluid/operators/reduce_ops/cub_reduce.h
+1
-35
paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
+0
-6
paddle/fluid/operators/reduce_ops/reduce_sum_op.cu
paddle/fluid/operators/reduce_ops/reduce_sum_op.cu
+0
-12
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
+1
-19
paddle/fluid/operators/trace_op.cu
paddle/fluid/operators/trace_op.cu
+0
-6
paddle/fluid/platform/gpu_launch_config.h
paddle/fluid/platform/gpu_launch_config.h
+4
-0
tools/dockerfile/Dockerfile.rocm
tools/dockerfile/Dockerfile.rocm
+26
-19
未找到文件。
paddle/fluid/operators/detection/bbox_util.cu.h
浏览文件 @
f9377965
...
...
@@ -23,6 +23,7 @@ limitations under the License. */
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
#include "paddle/fluid/platform/miopen_helper.h"
namespace
cub
=
hipcub
;
#endif
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
...
...
@@ -64,27 +65,16 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
// Determine temporary device storage requirements
size_t
temp_storage_bytes
=
0
;
#ifdef PADDLE_WITH_HIP
hipcub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
nullptr
,
temp_storage_bytes
,
keys_in
,
keys_out
,
idx_in
,
idx_out
,
num
);
#else
cub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
nullptr
,
temp_storage_bytes
,
keys_in
,
keys_out
,
idx_in
,
idx_out
,
num
);
#endif
// Allocate temporary storage
auto
place
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
ctx
.
GetPlace
());
auto
d_temp_storage
=
memory
::
Alloc
(
place
,
temp_storage_bytes
);
// Run sorting operation
#ifdef PADDLE_WITH_HIP
hipcub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
keys_in
,
keys_out
,
idx_in
,
idx_out
,
num
);
#else
// Run sorting operation
cub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
keys_in
,
keys_out
,
idx_in
,
idx_out
,
num
);
#endif
}
template
<
typename
T
>
...
...
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
浏览文件 @
f9377965
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace
cub
=
hipcub
;
#endif
#include <paddle/fluid/memory/allocation/allocator.h>
...
...
@@ -141,29 +142,17 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
// Determine temporary device storage requirements
size_t
temp_storage_bytes
=
0
;
#ifdef PADDLE_WITH_HIP
hipcub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
nullptr
,
temp_storage_bytes
,
concat_scores
.
data
<
T
>
(),
keys_out
,
idx_in
,
idx_out
,
total_roi_num
);
#else
cub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
nullptr
,
temp_storage_bytes
,
concat_scores
.
data
<
T
>
(),
keys_out
,
idx_in
,
idx_out
,
total_roi_num
);
#endif
// Allocate temporary storage
auto
d_temp_storage
=
memory
::
Alloc
(
place
,
temp_storage_bytes
);
// Run sorting operation
// sort score to get corresponding index
#ifdef PADDLE_WITH_HIP
hipcub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
concat_scores
.
data
<
T
>
(),
keys_out
,
idx_in
,
idx_out
,
total_roi_num
);
#else
// Run sorting operation
// sort score to get corresponding index
cub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
concat_scores
.
data
<
T
>
(),
keys_out
,
idx_in
,
idx_out
,
total_roi_num
);
#endif
index_out_t
.
Resize
({
real_post_num
});
Tensor
sorted_rois
;
sorted_rois
.
mutable_data
<
T
>
({
real_post_num
,
kBBoxSize
},
dev_ctx
.
GetPlace
());
...
...
@@ -185,29 +174,17 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
out_id_t
.
mutable_data
<
int
>
({
real_post_num
},
dev_ctx
.
GetPlace
());
// Determine temporary device storage requirements
temp_storage_bytes
=
0
;
#ifdef PADDLE_WITH_HIP
hipcub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
nullptr
,
temp_storage_bytes
,
sorted_batch_id
.
data
<
int
>
(),
out_id_data
,
batch_idx_in
,
index_out_t
.
data
<
int
>
(),
real_post_num
);
#else
cub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
nullptr
,
temp_storage_bytes
,
sorted_batch_id
.
data
<
int
>
(),
out_id_data
,
batch_idx_in
,
index_out_t
.
data
<
int
>
(),
real_post_num
);
#endif
// Allocate temporary storage
d_temp_storage
=
memory
::
Alloc
(
place
,
temp_storage_bytes
);
// Run sorting operation
// sort batch_id to get corresponding index
#ifdef PADDLE_WITH_HIP
hipcub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
sorted_batch_id
.
data
<
int
>
(),
out_id_data
,
batch_idx_in
,
index_out_t
.
data
<
int
>
(),
real_post_num
);
#else
// Run sorting operation
// sort batch_id to get corresponding index
cub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
sorted_batch_id
.
data
<
int
>
(),
out_id_data
,
batch_idx_in
,
index_out_t
.
data
<
int
>
(),
real_post_num
);
#endif
GPUGather
<
T
>
(
dev_ctx
,
sorted_rois
,
index_out_t
,
fpn_rois
);
...
...
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
浏览文件 @
f9377965
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace
cub
=
hipcub
;
#endif
#include <paddle/fluid/memory/allocation/allocator.h>
...
...
@@ -149,42 +150,24 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
// Determine temporary device storage requirements
size_t
temp_storage_bytes
=
0
;
#ifdef PADDLE_WITH_HIP
hipcub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
nullptr
,
temp_storage_bytes
,
target_lvls_data
,
keys_out
,
idx_in
,
idx_out
,
roi_num
);
#else
cub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
nullptr
,
temp_storage_bytes
,
target_lvls_data
,
keys_out
,
idx_in
,
idx_out
,
roi_num
);
#endif
// Allocate temporary storage
auto
d_temp_storage
=
memory
::
Alloc
(
place
,
temp_storage_bytes
);
// Run sorting operation
// sort target level to get corresponding index
#ifdef PADDLE_WITH_HIP
hipcub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
target_lvls_data
,
keys_out
,
idx_in
,
idx_out
,
roi_num
);
#else
// Run sorting operation
// sort target level to get corresponding index
cub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
target_lvls_data
,
keys_out
,
idx_in
,
idx_out
,
roi_num
);
#endif
int
*
restore_idx_data
=
restore_index
->
mutable_data
<
int
>
({
roi_num
,
1
},
dev_ctx
.
GetPlace
());
// sort current index to get restore index
#ifdef PADDLE_WITH_HIP
hipcub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
idx_out
,
keys_out
,
idx_in
,
restore_idx_data
,
roi_num
);
#else
// sort current index to get restore index
cub
::
DeviceRadixSort
::
SortPairs
<
int
,
int
>
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
idx_out
,
keys_out
,
idx_in
,
restore_idx_data
,
roi_num
);
#endif
int
start
=
0
;
auto
multi_rois_num
=
ctx
.
MultiOutput
<
Tensor
>
(
"MultiLevelRoIsNum"
);
...
...
paddle/fluid/operators/group_norm_op.cu
浏览文件 @
f9377965
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace
cub
=
hipcub
;
#endif
#include "paddle/fluid/operators/group_norm_op.h"
...
...
@@ -46,18 +47,10 @@ enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
template
<
typename
T
>
__device__
__inline__
void
CudaAtomicAddWithWarp
(
T
*
sum
,
T
value
)
{
#ifdef PADDLE_WITH_CUDA
typedef
cub
::
WarpReduce
<
T
>
WarpReduce
;
#else
typedef
hipcub
::
WarpReduce
<
T
>
WarpReduce
;
#endif
typename
WarpReduce
::
TempStorage
temp_storage
;
value
=
WarpReduce
(
temp_storage
).
Sum
(
value
);
#ifdef PADDLE_WITH_CUDA
if
(
cub
::
LaneId
()
==
0
)
platform
::
CudaAtomicAdd
(
sum
,
value
);
#else
if
(
hipcub
::
LaneId
()
==
0
)
platform
::
CudaAtomicAdd
(
sum
,
value
);
#endif
}
template
<
typename
T
>
...
...
paddle/fluid/operators/kron_op.h
浏览文件 @
f9377965
...
...
@@ -369,19 +369,7 @@ struct KronGradOpFunctor {
for_range
(
func
);
// reduce_sum along aixs 1
#ifdef __HIPCC__
auto
stream
=
dev_ctx
.
stream
();
// it is a cuda device_context
if
(
dx
)
{
TensorReduce
<
T
,
T
,
hipcub
::
Sum
,
IdentityFunctor
<
T
>>
(
dout_x
,
dx
,
{
1
},
static_cast
<
T
>
(
0
),
hipcub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
);
}
if
(
dy
)
{
TensorReduce
<
T
,
T
,
hipcub
::
Sum
,
IdentityFunctor
<
T
>>
(
dout_y
,
dy
,
{
1
},
static_cast
<
T
>
(
0
),
hipcub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
);
}
#elif defined(__NVCC__)
#if defined(__NVCC__) || defined(__HIPCC__)
auto
stream
=
dev_ctx
.
stream
();
// it is a cuda device_context
if
(
dx
)
{
TensorReduce
<
T
,
T
,
cub
::
Sum
,
IdentityFunctor
<
T
>>
(
...
...
paddle/fluid/operators/matmul_v2_op.h
浏览文件 @
f9377965
...
...
@@ -45,12 +45,7 @@ template <typename DeviceContext, typename T>
void
ReduceSumForMatmulGrad
(
const
Tensor
*
input
,
Tensor
*
output
,
const
std
::
vector
<
int
>&
reduce_dims
,
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
{
#ifdef __HIPCC__
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
TensorReduce
<
T
,
T
,
hipcub
::
Sum
,
IdentityFunctor
<
T
>>
(
*
input
,
output
,
reduce_dims
,
static_cast
<
T
>
(
0
),
hipcub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
);
#elif defined(__NVCC__)
#if defined(__NVCC__) || defined(__HIPCC__)
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
TensorReduce
<
T
,
T
,
cub
::
Sum
,
IdentityFunctor
<
T
>>
(
*
input
,
output
,
reduce_dims
,
static_cast
<
T
>
(
0
),
cub
::
Sum
(),
...
...
paddle/fluid/operators/pool_op.h
浏览文件 @
f9377965
...
...
@@ -213,12 +213,7 @@ class PoolKernel : public framework::OpKernel<T> {
if
(
reduce_num
>
0
&&
adaptive
)
{
// for adaptive_avg_pool2d && output_size == 1
#ifdef __HIPCC__
auto
stream
=
dev_ctx
.
stream
();
TensorReduce
<
T
,
T
,
hipcub
::
Sum
,
DivideFunctor
<
T
>>
(
*
in_x
,
out
,
reduce_dim
,
static_cast
<
T
>
(
0
),
hipcub
::
Sum
(),
DivideFunctor
<
T
>
(
reduce_num
),
stream
);
#elif defined(__NVCC__)
#if defined(__HIPCC__) || defined(__NVCC__)
auto
stream
=
dev_ctx
.
stream
();
TensorReduce
<
T
,
T
,
cub
::
Sum
,
DivideFunctor
<
T
>>
(
*
in_x
,
out
,
reduce_dim
,
static_cast
<
T
>
(
0
),
cub
::
Sum
(),
...
...
paddle/fluid/operators/prelu_op.cu
浏览文件 @
f9377965
...
...
@@ -174,15 +174,9 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
reduce_dims
.
push_back
(
i
);
}
#ifdef __HIPCC__
TensorReduce
<
T
,
T
,
hipcub
::
Sum
,
IdentityFunctor
<
T
>>
(
dalpha_tmp
,
dalpha
,
reduce_dims
,
static_cast
<
T
>
(
0
),
hipcub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
);
#else
TensorReduce
<
T
,
T
,
cub
::
Sum
,
IdentityFunctor
<
T
>>
(
dalpha_tmp
,
dalpha
,
reduce_dims
,
static_cast
<
T
>
(
0
),
cub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
);
#endif
}
};
...
...
paddle/fluid/operators/reduce_ops/cub_reduce.h
浏览文件 @
f9377965
...
...
@@ -26,6 +26,7 @@
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace
cub
=
hipcub
;
#endif
#include "paddle/fluid/framework/tensor.h"
...
...
@@ -71,12 +72,7 @@ template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
__global__
void
ReduceKernel2D
(
const
Tx
*
x
,
Ty
*
y
,
ReduceOp
reducer
,
TransformOp
transformer
,
Ty
init
,
int
reduce_num
)
{
#ifdef __HIPCC__
__shared__
typename
hipcub
::
BlockReduce
<
Ty
,
BlockDim
>::
TempStorage
temp_storage
;
#else
__shared__
typename
cub
::
BlockReduce
<
Ty
,
BlockDim
>::
TempStorage
temp_storage
;
#endif
int
idx_x
=
blockIdx
.
x
*
reduce_num
;
int
idx_y
=
threadIdx
.
x
;
Ty
reduce_var
=
init
;
...
...
@@ -85,13 +81,8 @@ __global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
reducer
(
reduce_var
,
static_cast
<
Ty
>
(
transformer
(
x
[
idx_x
+
idx_y
])));
__syncthreads
();
#ifdef __HIPCC__
reduce_var
=
hipcub
::
BlockReduce
<
Ty
,
BlockDim
>
(
temp_storage
)
.
Reduce
(
reduce_var
,
reducer
);
#else
reduce_var
=
cub
::
BlockReduce
<
Ty
,
BlockDim
>
(
temp_storage
).
Reduce
(
reduce_var
,
reducer
);
#endif
if
(
threadIdx
.
x
==
0
)
{
y
[
blockIdx
.
x
]
=
reduce_var
;
...
...
@@ -107,12 +98,7 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
Array
<
int
,
ReduceRank
>
reduce_strides
,
Array
<
int
,
Rank
-
ReduceRank
>
left_dim
,
Array
<
int
,
Rank
-
ReduceRank
>
left_strides
)
{
#ifdef __HIPCC__
__shared__
typename
hipcub
::
BlockReduce
<
Ty
,
BlockDim
>::
TempStorage
temp_storage
;
#else
__shared__
typename
cub
::
BlockReduce
<
Ty
,
BlockDim
>::
TempStorage
temp_storage
;
#endif
Array
<
int
,
Rank
>
sub_index
;
int
left_idx
=
blockIdx
.
x
;
for
(
int
i
=
0
;
i
<
Rank
-
ReduceRank
;
++
i
)
{
...
...
@@ -144,13 +130,8 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
}
__syncthreads
();
#ifdef __HIPCC__
reduce_var
=
hipcub
::
BlockReduce
<
Ty
,
BlockDim
>
(
temp_storage
)
.
Reduce
(
reduce_var
,
reducer
);
#else
reduce_var
=
cub
::
BlockReduce
<
Ty
,
BlockDim
>
(
temp_storage
).
Reduce
(
reduce_var
,
reducer
);
#endif
if
(
threadIdx
.
x
==
0
)
{
y
[
blockIdx
.
x
]
=
reduce_var
;
...
...
@@ -238,32 +219,17 @@ static void TensorReduceImpl(
int
rank
=
x_strides
.
size
();
int
reduce_rank
=
reduce_strides
.
size
();
if
(
rank
==
reduce_rank
)
{
#ifdef __HIPCC__
hipcub
::
TransformInputIterator
<
Ty
,
TransformOp
,
const
Tx
*>
trans_x
(
x_data
,
transformer
);
#else
cub
::
TransformInputIterator
<
Ty
,
TransformOp
,
const
Tx
*>
trans_x
(
x_data
,
transformer
);
#endif
size_t
temp_storage_bytes
=
0
;
#ifdef __HIPCC__
hipcub
::
DeviceReduce
::
Reduce
(
nullptr
,
temp_storage_bytes
,
trans_x
,
y_data
,
reduce_num
,
reducer
,
init
,
stream
);
#else
cub
::
DeviceReduce
::
Reduce
(
nullptr
,
temp_storage_bytes
,
trans_x
,
y_data
,
reduce_num
,
reducer
,
init
,
stream
);
#endif
framework
::
Tensor
tmp
;
auto
*
temp_storage
=
tmp
.
mutable_data
<
uint8_t
>
(
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
temp_storage_bytes
)}),
place
);
#ifdef __HIPCC__
hipcub
::
DeviceReduce
::
Reduce
(
temp_storage
,
temp_storage_bytes
,
trans_x
,
y_data
,
reduce_num
,
reducer
,
init
,
stream
);
#else
cub
::
DeviceReduce
::
Reduce
(
temp_storage
,
temp_storage_bytes
,
trans_x
,
y_data
,
reduce_num
,
reducer
,
init
,
stream
);
#endif
return
;
}
if
(
rank
==
2
&&
reduce_rank
==
1
&&
reduce_dim
[
0
]
==
1
)
{
...
...
paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
浏览文件 @
f9377965
...
...
@@ -56,15 +56,9 @@ class ReduceMeanKernel : public framework::OpKernel<T> {
}
auto
stream
=
context
.
cuda_device_context
().
stream
();
#ifdef PADDLE_WITH_HIP
TensorReduce
<
T
,
T
,
hipcub
::
Sum
,
DivideFunctor
<
T
>>
(
*
input
,
output
,
reduce_dims
,
static_cast
<
T
>
(
0
),
hipcub
::
Sum
(),
DivideFunctor
<
T
>
(
reduce_num
),
stream
);
#else
TensorReduce
<
T
,
T
,
cub
::
Sum
,
DivideFunctor
<
T
>>
(
*
input
,
output
,
reduce_dims
,
static_cast
<
T
>
(
0
),
cub
::
Sum
(),
DivideFunctor
<
T
>
(
reduce_num
),
stream
);
#endif
}
};
...
...
paddle/fluid/operators/reduce_ops/reduce_sum_op.cu
浏览文件 @
f9377965
...
...
@@ -56,25 +56,13 @@ class ReduceSumKernel : public framework::OpKernel<T> {
if
(
out_dtype
>=
0
)
{
framework
::
VisitDataTypeSmall
(
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
out_dtype
),
#ifdef __HIPCC__
TensorReduceFunctor
<
T
,
hipcub
::
Sum
,
IdentityFunctor
<
T
>>
(
*
input
,
output
,
reduce_dims
,
static_cast
<
double
>
(
0.0
),
hipcub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
));
#else
TensorReduceFunctor
<
T
,
cub
::
Sum
,
IdentityFunctor
<
T
>>
(
*
input
,
output
,
reduce_dims
,
static_cast
<
double
>
(
0.0
),
cub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
));
#endif
}
else
{
#ifdef __HIPCC__
TensorReduce
<
T
,
T
,
hipcub
::
Sum
,
IdentityFunctor
<
T
>>
(
*
input
,
output
,
reduce_dims
,
static_cast
<
T
>
(
0
),
hipcub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
);
#else
TensorReduce
<
T
,
T
,
cub
::
Sum
,
IdentityFunctor
<
T
>>
(
*
input
,
output
,
reduce_dims
,
static_cast
<
T
>
(
0
),
cub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
);
#endif
}
}
};
...
...
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
浏览文件 @
f9377965
...
...
@@ -20,6 +20,7 @@ limitations under the License. */
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace
cub
=
hipcub
;
#endif
#include "paddle/fluid/operators/math.h"
...
...
@@ -31,11 +32,7 @@ namespace operators {
using
LoDTensor
=
framework
::
LoDTensor
;
template
<
typename
T
,
int
BlockDim
>
#ifdef __HIPCC__
using
BlockReduce
=
hipcub
::
BlockReduce
<
T
,
BlockDim
>
;
#else
using
BlockReduce
=
cub
::
BlockReduce
<
T
,
BlockDim
>
;
#endif
template
<
typename
T
,
int
BlockDim
>
using
BlockReduceTempStorage
=
typename
BlockReduce
<
T
,
BlockDim
>::
TempStorage
;
...
...
@@ -57,13 +54,8 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod,
T
ele
=
in_data
[
start
+
tid
];
max_ele
=
max_ele
>
ele
?
max_ele
:
ele
;
}
#ifdef __HIPCC__
max_ele
=
BlockReduce
<
T
,
BlockDim
>
(
temp_storage
).
Reduce
(
max_ele
,
hipcub
::
Max
());
#else
max_ele
=
BlockReduce
<
T
,
BlockDim
>
(
temp_storage
).
Reduce
(
max_ele
,
cub
::
Max
());
#endif
if
(
threadIdx
.
x
==
0
)
{
shared_max_data
=
max_ele
;
}
...
...
@@ -75,13 +67,8 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod,
T
ele
=
in_data
[
start
+
tid
];
sum_data
+=
real_exp
(
ele
-
shared_max_data
);
}
#ifdef __HIPCC__
sum_data
=
BlockReduce
<
T
,
BlockDim
>
(
temp_storage
).
Reduce
(
sum_data
,
hipcub
::
Sum
());
#else
sum_data
=
BlockReduce
<
T
,
BlockDim
>
(
temp_storage
).
Reduce
(
sum_data
,
cub
::
Sum
());
#endif
if
(
threadIdx
.
x
==
0
)
{
shared_sum_data
=
sum_data
;
}
...
...
@@ -116,12 +103,7 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data,
T
s_d
=
softmax_data
[
idx
];
result
+=
s_g_d
*
s_d
;
}
#ifdef __HIPCC__
result
=
BlockReduce
<
T
,
BlockDim
>
(
temp_storage
).
Reduce
(
result
,
hipcub
::
Sum
());
#else
result
=
BlockReduce
<
T
,
BlockDim
>
(
temp_storage
).
Reduce
(
result
,
cub
::
Sum
());
#endif
if
(
threadIdx
.
x
==
0
)
{
shared_data
=
result
;
}
...
...
paddle/fluid/operators/trace_op.cu
浏览文件 @
f9377965
...
...
@@ -43,15 +43,9 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
auto
stream
=
context
.
cuda_device_context
().
stream
();
std
::
vector
<
int
>
reduce_dims
;
reduce_dims
.
push_back
(
out
->
dims
().
size
());
#ifdef __HIPCC__
TensorReduce
<
T
,
T
,
hipcub
::
Sum
,
IdentityFunctor
<
T
>>
(
diag
,
out
,
reduce_dims
,
static_cast
<
T
>
(
0
),
hipcub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
);
#else
TensorReduce
<
T
,
T
,
cub
::
Sum
,
IdentityFunctor
<
T
>>
(
diag
,
out
,
reduce_dims
,
static_cast
<
T
>
(
0
),
cub
::
Sum
(),
IdentityFunctor
<
T
>
(),
stream
);
#endif
}
}
};
...
...
paddle/fluid/platform/gpu_launch_config.h
浏览文件 @
f9377965
...
...
@@ -41,7 +41,11 @@ struct GpuLaunchConfig {
inline
GpuLaunchConfig
GetGpuLaunchConfig1D
(
const
platform
::
CUDADeviceContext
&
context
,
int
element_count
,
#ifdef PADDLE_WITH_HIP
int
max_threads
=
256
)
{
#else
int
max_threads
=
1024
)
{
#endif
PADDLE_ENFORCE_GT
(
element_count
,
0
,
platform
::
errors
::
InvalidArgument
(
"element count should be greater than 0,"
...
...
tools/dockerfile/Dockerfile.rocm
浏览文件 @
f9377965
# A image for building paddle binaries
# Use rocm-terminal base image for both rocm environment
# When you modify it, please be aware of rocm version
#
# Build: ROCM
3.9
#
# Build: ROCM
4.0.1
# cd Paddle/tools/dockerfile
# docker build -f Dockerfile.rocm \
# --build-arg ROCM_VERSION=
3.9
\
# -t paddlepaddle/paddle-centos-rocm
39
-dev:latest .
#
# --build-arg ROCM_VERSION=
4.0.1
\
# -t paddlepaddle/paddle-centos-rocm
401
-dev:latest .
#
# docker run -it --device=/dev/kfd --device=/dev/dri \
# --security-opt seccomp=unconfined --group-add video \
# paddlepaddle/paddle-centos-rocm
39
-dev:latest /bin/bash
# paddlepaddle/paddle-centos-rocm
401
-dev:latest /bin/bash
FROM centos:7.8.2003
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
...
...
@@ -21,7 +21,8 @@ ENV LANGUAGE en_US.UTF-8
RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlite-devel \
zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel \
net-tools numactl-devel chrpath
# Install devtoolset-7
RUN yum install -y yum-utils centos-release-scl && \
...
...
@@ -70,7 +71,7 @@ RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
make -j8 && make install && \
cd .. && rm -rf git-2.17.1.tar.gz && rm -rf git-2.17.1
ENV GOROOT=/usr/local/go
ENV GOROOT=/usr/local/go
ENV GOPATH=/root/gopath
ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
...
...
@@ -82,7 +83,7 @@ RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8
mkdir /root/gopath/src
# protobuf 3.6.1
RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/protobuf-cpp-3.6.1.tar.gz && \
RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/protobuf-cpp-3.6.1.tar.gz && \
tar xzf protobuf-cpp-3.6.1.tar.gz && \
cd protobuf-3.6.1 && ./configure && make -j4 && make install && \
cd .. && rm -f protobuf-cpp-3.6.1.tar.gz && rm -rf protobuf-3.6.1
...
...
@@ -91,28 +92,34 @@ RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/p
RUN cd /opt && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && chmod +x Miniconda3-latest-Linux-x86_64.sh
RUN mkdir /opt/conda && ./Miniconda3-latest-Linux-x86_64.sh -b -f -p "/opt/conda" && rm -rf Miniconda3-latest-Linux-x86_64.sh
ENV PATH=/opt/conda/bin:${PATH}
RUN conda init bash && \
conda create -n python2.7 python=2.7 && \
conda create -n python3.7 python=3.7
RUN conda init bash && conda install -n base jupyter
# install
p
addle requirement
# install
P
addle requirement
RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt
RUN /opt/conda/bin/pip install -r /root/requirements.txt && \
/opt/conda/envs/python2.7/bin/pip install -r /root/requirements.txt && \
/opt/conda/envs/python3.7/bin/pip install -r /root/requirements.txt && \
rm -rf /root/requirements.txt
RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/unittest_py/requirements.txt -O /root/requirements.txt
RUN /opt/conda/bin/pip install -r /root/requirements.txt && \
/opt/conda/envs/python2.7/bin/pip install -r /root/requirements.txt && \
/opt/conda/envs/python3.7/bin/pip install -r /root/requirements.txt && \
rm -rf /root/requirements.txt
RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt
# install PaddleClas requirement
RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleClas/develop/requirements.txt -O /root/requirements.txt
RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt
# install PaddleDetection requirement
RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleDetection/develop/requirements.txt -O /root/requirements.txt
RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt
# configure ssh
RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config
# clang-format 3.8
RUN wget https://copr.fedorainfracloud.org/coprs/alonid/llvm-3.8.0/repo/epel-7/alonid-llvm-3.8.0-epel-7.repo -P /etc/yum.repos.d/
RUN yum install -y clang-3.8.0
ENV PATH=/opt/llvm-3.8.0/bin:${PATH}
# patchelf
RUN yum install -y patchelf && \
yum clean all && \
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录