Repository: 机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 28b356b9 (unverified)
Author:    Qi Li, Feb 26, 2021
Committer: GitHub, Feb 26, 2021

[ROCM] update fluid framework for rocm (part6), test=develop (#31015)

Parent: c8fac5ee
Showing 19 changed files with 229 additions and 97 deletions (+229 −97).
Changed files:

paddle/fluid/framework/tensor_test.cc              +3   −3
paddle/fluid/framework/tensor_util.cc              +10  −8
paddle/fluid/framework/tensor_util.h               +3   −3
paddle/fluid/framework/tensor_util_test.cc         +5   −5
paddle/fluid/framework/tensor_util_test.cu         +71  −0
paddle/fluid/framework/trainer.h                   +18  −15
paddle/fluid/framework/trainer_factory.cc          +5   −3
paddle/fluid/framework/var_type_traits.cc          +8   −0
paddle/fluid/framework/var_type_traits.h           +10  −4
paddle/fluid/framework/var_type_traits_test.cc     +8   −0
paddle/fluid/operators/nccl/CMakeLists.txt         +11  −3
paddle/fluid/operators/nccl/nccl_gpu_common.h      +4   −0
paddle/fluid/pybind/CMakeLists.txt                 +20  −20
paddle/fluid/pybind/global_value_getter_setter.cc  +2   −2
paddle/fluid/pybind/imperative.cc                  +5   −4
paddle/fluid/pybind/ps_gpu_wrapper_py.cc           +2   −1
paddle/fluid/pybind/ps_gpu_wrapper_py.h            +2   −1
paddle/fluid/pybind/pybind.cc                      +31  −19
paddle/fluid/pybind/tensor_py.h                    +11  −6
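All 19 files follow one port recipe: CUDA-only preprocessor guards widen to also accept the HIP backend, NCCL-only guards widen to accept RCCL, and vendor-specific types and launch syntax move behind portable spellings. The core rewrite — `#ifdef PADDLE_WITH_CUDA` becoming `#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)` — can be exercised in isolation; a minimal sketch (the `PADDLE_WITH_HIP` define here is illustrative, normally set by CMake):

```cpp
#include <cstdio>

// Illustrative only: in a real build, CMake defines PADDLE_WITH_CUDA or
// PADDLE_WITH_HIP (or neither, for a CPU-only build).
#define PADDLE_WITH_HIP

int main() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // This branch compiles for either GPU backend; the old
  // `#ifdef PADDLE_WITH_CUDA` guard admitted NVIDIA builds only.
  std::printf("GPU build (CUDA or HIP)\n");
#else
  std::printf("CPU-only build\n");
#endif
  return 0;
}
```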
paddle/fluid/framework/tensor_test.cc

@@ -118,7 +118,7 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(static_cast<int>(p2[0]), 1);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     float* p1 = nullptr;

@@ -174,7 +174,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     framework::Tensor dst_tensor;

@@ -212,7 +212,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
paddle/fluid/framework/tensor_util.cc

@@ -97,7 +97,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,

@@ -304,7 +304,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,

@@ -595,7 +595,7 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
   return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
 }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <typename T>
 static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
   CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }

@@ -618,7 +618,7 @@ struct BothFalseVisitor : public boost::static_visitor<> {
   }
   void VisitorImpl(const platform::CUDAPlace& gpu) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
     constexpr int MAX_BLOCK_DIM = 512;
     const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;

@@ -703,7 +703,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
                     platform::errors::ResourceExhausted(
                         "tensor size %d overflow when writing tensor", size));
   if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
     std::unique_ptr<char[]> buf(new char[kBufSize]);
     auto& gpu_dev_ctx =

@@ -802,7 +802,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(

@@ -859,7 +860,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(

@@ -954,7 +956,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
   if (dl_tensor.ctx.device_type == kDLCPU) {
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (dl_tensor.ctx.device_type == kDLGPU) {
     platform::CUDAPlace dst_place =
         platform::CUDAPlace(dl_tensor.ctx.device_id);
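The `BothFalse` kernel above iterates with `CUDA_KERNEL_LOOP`, Paddle's grid-stride-loop macro, which is what lets the visitor cap its launch at `MAX_BLOCK_DIM`/`MAX_GRID_DIM` threads and still cover every element. A standalone sketch of the idiom (the macro is re-derived here for illustration rather than copied from Paddle's headers):

```cpp
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so a bounded launch covers any n.
#define KERNEL_LOOP(i, n)                             \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); i += blockDim.x * gridDim.x)

template <typename T>
__global__ void BothFalse(const T* cmp, T* out, int n) {
  KERNEL_LOOP(i, n) { out[i] = (!cmp[i]) && (!out[i]); }
}
```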
paddle/fluid/framework/tensor_util.h

@@ -127,7 +127,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr,
                  src_place,

@@ -150,7 +150,7 @@ void TensorFromVector(const std::vector<T>& src,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr,
                  src_place,

@@ -187,7 +187,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
                  BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
                  size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(src.place())) {  // NOLINT
     memory::Copy(dst_place, dst_ptr,
                  BOOST_GET_CONST(platform::CUDAPlace, src.place()),
paddle/fluid/framework/tensor_util_test.cc

@@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) {
   }
   EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor src_tensor;
     Tensor gpu_tensor;

@@ -149,7 +149,7 @@ TEST(TensorFromVector, Tensor) {
     delete cpu_place;
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor cpu_tensor;

@@ -224,7 +224,7 @@ TEST(TensorToVector, Tensor) {
       EXPECT_EQ(src_ptr[i], dst[i]);
     }
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor gpu_tensor;

@@ -264,7 +264,7 @@ TEST(TensorFromDLPack, Tensor) {
     }
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor cpu_tensor;

@@ -430,7 +430,7 @@ TEST(Tensor, FromAndToStream) {
     EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
     delete place;
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor gpu_tensor;
     gpu_tensor.Resize({2, 3});
paddle/fluid/framework/tensor_util_test.cu

@@ -63,7 +63,11 @@ TEST(TensorContainsNAN, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }

@@ -71,7 +75,11 @@ TEST(TensorContainsNAN, GPU) {
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }

@@ -84,7 +92,11 @@ TEST(TensorContainsInf, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }

@@ -92,7 +104,11 @@ TEST(TensorContainsInf, GPU) {
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }

@@ -107,14 +123,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }

@@ -123,14 +147,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }

@@ -139,14 +171,24 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }

@@ -159,7 +201,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;

@@ -172,7 +218,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;

@@ -190,7 +240,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;

@@ -203,7 +257,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;

@@ -221,7 +279,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;

@@ -233,7 +295,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;

@@ -245,7 +311,12 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
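Every hunk in this file wraps the kernel launch the same way: under HIP the triple-chevron `<<<grid, block, shared, stream>>>` syntax is replaced by the `hipLaunchKernelGGL` macro, which takes the same launch configuration as ordinary arguments. A stripped-down sketch of the pattern (default stream used for brevity; the tests above pass `cuda_ctx->stream()` instead):

```cpp
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
#include <cmath>

__global__ void FillNAN(float* buf) { buf[threadIdx.x] = NAN; }

// One launch site, two spellings of the same <1, 1, 0, stream> configuration.
void LaunchFillNAN(float* buf) {
#ifdef PADDLE_WITH_HIP
  // HIP passes grid, block, dynamic shared memory, and stream as arguments.
  hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, 0, buf);
#else
  // CUDA keeps the <<<grid, block, shared_mem, stream>>> chevrons.
  FillNAN<<<1, 1, 0, 0>>>(buf);
#endif
}
```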
paddle/fluid/framework/trainer.h

@@ -141,7 +141,8 @@ class DistMultiTrainer : public MultiTrainer {
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
 };

-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 class HeterServiceContext {
  public:

@@ -155,8 +156,9 @@ class HeterServiceContext {
   void Reset() { push_dense_status_.clear(); }
   int place_num_;
   Scope* scope_{nullptr};
-#ifdef PADDLE_WITH_CUDA
-  cudaEvent_t event_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  gpuEvent_t event_;
 #endif
   std::vector<OperatorBase*> ops_;
   std::vector<::std::future<int32_t>> push_dense_status_;

@@ -187,10 +189,10 @@ class HeterXpuTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
 #ifdef PADDLE_WITH_XPU
   void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,

@@ -222,9 +224,9 @@ class HeterXpuTrainer : public TrainerBase {
   std::vector<Scope*> place_scopes_;
   BtObjectPool<HeterServiceContext> object_pool_;
   std::vector<platform::Place> places_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };

@@ -247,10 +249,10 @@ class HeterBoxTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
   void CreateThreadParam(const ProgramDesc& program, int num);
   template <typename T>

@@ -272,14 +274,15 @@ class HeterBoxTrainer : public TrainerBase {
   std::vector<std::thread> threads_;
   int use_ps_gpu_;
   int thread_num_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };
 #endif

-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 class PSGPUTrainer : public TrainerBase {
  public:
  PSGPUTrainer() {}

@@ -321,7 +324,7 @@ class PSGPUTrainer : public TrainerBase {
 };
 #endif

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class PipelineTrainer : public TrainerBase {
  public:
  PipelineTrainer() {}
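Besides widening the guards, trainer.h swaps the vendor types `cudaStream_t`/`cudaEvent_t` for the neutral `gpuStream_t`/`gpuEvent_t`. Those aliases live in Paddle platform headers not shown in this diff; a plausible minimal definition (a sketch, not Paddle's exact code):

```cpp
// Vendor-neutral stream/event handles (sketch; Paddle defines equivalents
// in its platform headers, which this diff does not show).
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif
```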
paddle/fluid/framework/trainer_factory.cc

@@ -66,15 +66,17 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(HeterXpuTrainer);
 REGISTER_TRAINER_CLASS(HeterBoxTrainer);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(PSGPUTrainer);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework
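`REGISTER_TRAINER_CLASS` is a factory-registration macro: each expansion installs a creator for the named trainer, and the surrounding `#if` blocks keep a trainer out of the registry when its backend was not compiled in. A generic sketch of the idiom (names and details are illustrative, not Paddle's implementation):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <string>

class TrainerBase {
 public:
  virtual ~TrainerBase() = default;
};

// Name -> creator map (illustrative stand-in for Paddle's factory).
inline std::map<std::string, std::function<std::shared_ptr<TrainerBase>()>>&
TrainerRegistry() {
  static std::map<std::string, std::function<std::shared_ptr<TrainerBase>()>> r;
  return r;
}

// Registers a creator at static-initialization time.
#define REGISTER_TRAINER_CLASS(cls)                                    \
  static const bool registered_##cls = [] {                            \
    TrainerRegistry()[#cls] = [] { return std::make_shared<cls>(); };  \
    return true;                                                       \
  }()

class MultiTrainer : public TrainerBase {};
REGISTER_TRAINER_CLASS(MultiTrainer);
```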
paddle/fluid/framework/var_type_traits.cc

@@ -28,6 +28,14 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"  // NOLINT
+#include "paddle/fluid/platform/nccl_helper.h"            // NOLINT
+#endif
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"  // NOLINT
+#include "paddle/fluid/operators/miopen_rnn_cache.h"
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
paddle/fluid/framework/var_type_traits.h

@@ -30,6 +30,12 @@
 #include <nccl.h>
 #endif
 #endif
+#ifdef PADDLE_WITH_HIP
+#include <miopen/miopen.h>
+#ifdef PADDLE_WITH_RCCL
+#include <rccl.h>
+#endif
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"

@@ -39,8 +45,8 @@
 namespace paddle {
 namespace platform {
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class Communicator;
 class NCCLCommunicator;
 #endif

@@ -151,8 +157,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope*,
     operators::reader::LoDTensorBlockingQueueHolder, FetchList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
 #endif
     operators::CudnnRNNCache,
paddle/fluid/framework/var_type_traits_test.cc

@@ -28,6 +28,14 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"  // NOLINT
+#include "paddle/fluid/platform/nccl_helper.h"            // NOLINT
+#endif
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"  // NOLINT
+#include "paddle/fluid/operators/miopen_rnn_cache.h"
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
 #endif
paddle/fluid/operators/nccl/CMakeLists.txt

-if(NOT WITH_NCCL)
+if(NOT (WITH_NCCL OR WITH_RCCL))
   return()
 endif()

@@ -6,12 +6,20 @@ if(WITH_GPU AND NOT WIN32)
   nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
 endif()

-if(WITH_GPU)
+if(WITH_ROCM AND NOT WIN32)
+  hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
+endif()
+
+if(WITH_GPU OR WITH_ROCM)
   op_library(nccl_op DEPS nccl_common)
   file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
   set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
 endif()

-if(NOT WIN32)
+if(WITH_GPU AND NOT WIN32)
   nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
+
+if(WITH_ROCM AND NOT WIN32)
+  hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+endif()
paddle/fluid/operators/nccl/nccl_gpu_common.h

@@ -23,7 +23,11 @@ limitations under the License. */
 #include <vector>

 #include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#else
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
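The header swap works because RCCL is AMD's drop-in implementation of the NCCL API: types and entry points keep their `nccl` names, which is also why class names like `NCCLCommunicator` stay unchanged throughout this patch. A sketch of backend-agnostic collective code under that assumption:

```cpp
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>  // AMD's drop-in NCCL implementation; nccl* symbols intact
using gpuStream_t = hipStream_t;
#else
#include <nccl.h>
using gpuStream_t = cudaStream_t;
#endif

// Sum-reduce n floats across the communicator; the call is identical on
// either backend because RCCL mirrors the NCCL API. Sketch only; error
// handling and communicator setup omitted.
ncclResult_t AllReduceSum(const float* send, float* recv, size_t n,
                          ncclComm_t comm, gpuStream_t stream) {
  return ncclAllReduce(send, recv, n, ncclFloat, ncclSum, comm, stream);
}
```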
paddle/fluid/pybind/CMakeLists.txt

@@ -3,12 +3,12 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper
     analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
     gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator)

-if (WITH_GPU)
+if (WITH_GPU OR WITH_ROCM)
   set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
   set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard)
 endif()

-if (WITH_NCCL)
+if (WITH_NCCL OR WITH_RCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
 endif()

@@ -21,7 +21,7 @@ endif()
 if(NOT WIN32)
   set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
   set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
-  if (WITH_NCCL)
+  if (WITH_NCCL OR WITH_RCCL)
     set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
   endif()
 endif(NOT WIN32)

@@ -71,7 +71,7 @@ if (WITH_PSCORE)
   list(APPEND PYBIND_SRCS fleet_py.cc)
 endif()

-if (WITH_NCCL)
+if (WITH_NCCL OR WITH_RCCL)
   list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
 endif()

@@ -81,9 +81,9 @@ if(WITH_PYTHON)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
     list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})

-    if(WITH_NCCL)
+    if(WITH_NCCL OR WITH_RCCL)
       list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
-    endif(WITH_NCCL)
+    endif()

     if(WITH_XPU_BKCL)
       list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context)

@@ -93,6 +93,9 @@ if(WITH_PYTHON)
     target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(op_function_generator ${os_dependency_modules})
+    if(WITH_ROCM)
+      target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
+    endif()

     set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
     set(tmp_impl_file ${impl_file}.tmp)

@@ -164,20 +167,17 @@ if(WITH_PYTHON)
     endif(WITH_MKLDNN)
   endif(WIN32)

-  if(WITH_ROCM_PLATFORM)
-    cc_library(paddle_pybind SHARED
-      SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-  else()
-    cc_library(paddle_pybind SHARED
-      SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-    if(NOT APPLE AND NOT WIN32)
-      target_link_libraries(paddle_pybind rt)
-    endif(NOT APPLE AND NOT WIN32)
-  endif(WITH_ROCM_PLATFORM)
+  cc_library(paddle_pybind SHARED
+    SRCS ${PYBIND_SRCS}
+    DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+  if(NOT APPLE AND NOT WIN32)
+    target_link_libraries(paddle_pybind rt)
+  endif(NOT APPLE AND NOT WIN32)
+  if(WITH_ROCM)
+    target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
+  endif()

   get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(paddle_pybind ${os_dependency_modules})
paddle/fluid/pybind/global_value_getter_setter.cc

@@ -66,7 +66,7 @@ DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
 DECLARE_int32(max_inplace_grad_add);
 DECLARE_string(tracer_profile_fname);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // cudnn
 DECLARE_uint64(conv_workspace_size_limit);
 DECLARE_bool(cudnn_batchnorm_spatial_persistent);

@@ -354,7 +354,7 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
       FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off);

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
      FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
paddle/fluid/pybind/imperative.cc

@@ -966,7 +966,7 @@ void BindImperative(py::module *m_ptr) {
           [](imperative::VarBase &self,
              const imperative::ParallelStrategy &strategy) {
             if (strategy.nranks_ > 1) {
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #if NCCL_VERSION_CODE >= 2212
               imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
 #else

@@ -1016,7 +1016,7 @@ void BindImperative(py::module *m_ptr) {
        )DOC")
       .def("pin_memory",
            [](const std::shared_ptr<imperative::VarBase> &self) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
              PADDLE_THROW(platform::errors::PermissionDenied(
                  "Cannot copy this Tensor to pinned memory in CPU version "
                  "Paddle, "

@@ -1050,7 +1050,7 @@ void BindImperative(py::module *m_ptr) {
       .def("cuda",
            [](const std::shared_ptr<imperative::VarBase> &self, int device_id,
               bool blocking) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
              PADDLE_THROW(platform::errors::PermissionDenied(
                  "Cannot copy this Tensor to GPU in CPU version Paddle, "
                  "Please recompile or reinstall Paddle with CUDA support."));

@@ -1412,7 +1412,8 @@ void BindImperative(py::module *m_ptr) {
       },
       py::call_guard<py::gil_scoped_release>());

-#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL)
   py::class_<imperative::ParallelContext,
              std::shared_ptr<imperative::ParallelContext>>(m,
                                                            "ParallelContext");
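The `pin_memory` and `cuda` bindings rewrite negative guards, and that rewrite is the logical dual of the positive one: by De Morgan, the complement of "CUDA or HIP" is "neither CUDA nor HIP", so `#ifndef PADDLE_WITH_CUDA` (which would also fire on ROCm builds) becomes a conjunction. A compile-time illustration (the flag define is illustrative, normally supplied by the build system):

```cpp
#include <cstdio>

// Illustrative; normally supplied by the build system.
#define PADDLE_WITH_HIP

int main() {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
  // CPU-only branch: reached only when no GPU backend is enabled. A plain
  // `#ifndef PADDLE_WITH_CUDA` would wrongly take this branch on HIP builds.
  std::printf("CPU-only build\n");
#else
  std::printf("GPU build\n");
#endif
  return 0;
}
```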
paddle/fluid/pybind/ps_gpu_wrapper_py.cc

@@ -32,7 +32,8 @@ namespace py = pybind11;

 namespace paddle {
 namespace pybind {
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 void BindPSGPUWrapper(py::module* m) {
   py::class_<framework::PSGPUWrapper,
              std::shared_ptr<framework::PSGPUWrapper>>(*m, "PSGPU")
paddle/fluid/pybind/ps_gpu_wrapper_py.h

@@ -22,7 +22,8 @@ namespace py = pybind11;

 namespace paddle {
 namespace pybind {
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 void BindPSGPUWrapper(py::module* m);
 #endif
 }  // namespace pybind
paddle/fluid/pybind/pybind.cc

@@ -86,7 +86,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"

-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/pybind/nccl_wrapper_py.h"
 #endif
 #include "paddle/fluid/framework/data_type.h"

@@ -95,11 +95,13 @@ limitations under the License. */
 #include "paddle/fluid/pybind/reader_py.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/fluid/string/to_string.h"
-#ifdef PADDLE_WITH_CUDA
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #endif
+#ifndef PADDLE_WITH_HIP
 #include "paddle/fluid/platform/cuda_profiler.h"
+#endif
 #include "paddle/fluid/platform/gpu_info.h"
 #endif

@@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
 namespace paddle {
 namespace pybind {
 bool IsCompiledWithCUDA() {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
   return false;
 #else
   return true;
 #endif
 }

+bool IsCompiledWithROCM() {
+#ifndef PADDLE_WITH_HIP
+  return false;
+#else
+  return true;
+#endif
+}

@@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) {
   m.def("set_num_threads", &platform::SetNumThreads);

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("cudnn_version", &platform::CudnnVersion);
 #endif

@@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) {
     if (dl.ctx.device_type == kDLCPU) {
       paddle::framework::TensorFromDLPack(dl, &tensor);
     }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (dl.ctx.device_type == kDLGPU) {
       paddle::framework::TensorFromDLPack(dl, &tensor);
     }

@@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) {
       .def("height", &SelectedRows::height)
       .def("set_rows",
            [](SelectedRows &self, std::vector<int64_t> rows) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
              self.set_rows(rows);
 #else
              Vector<int64_t> new_rows(rows);

@@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::CUDAPlace &place)
                       -> paddle::platform::DeviceContext * {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
                     PADDLE_THROW(platform::errors::PermissionDenied(
                         "Cannot use CUDAPlace in CPU only version, "

@@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::CUDAPinnedPlace &place)
                       -> paddle::platform::DeviceContext * {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
                     PADDLE_THROW(platform::errors::PermissionDenied(
                         "Cannot use CUDAPinnedPlace in CPU only version, "

@@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
                   });;
 // clang-format on
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
   py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(

@@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle.
        )DOC")
       .def("__init__",
           [](platform::CUDAPlace &self, int dev_id) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
              if (UNLIKELY(dev_id < 0)) {
                LOG(ERROR) << string::Sprintf(
                    "Invalid CUDAPlace(%d), device id must be 0 or "

@@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle.
              std::exit(-1);
 #endif
            })
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       .def("get_device_id",
            [](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
       .def("_type", &PlaceIndex<platform::CUDAPlace>)

@@ -1559,7 +1569,7 @@ All parameter, weight, gradient are variables in Paddle.
        )DOC")
       .def("__init__",
            [](platform::CUDAPinnedPlace &self) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
              PADDLE_THROW(platform::errors::PermissionDenied(
                  "Cannot use CUDAPinnedPlace in CPU only version, "
                  "Please recompile or reinstall Paddle with CUDA support."));

@@ -1749,6 +1759,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_devices", []() { framework::InitDevices(); });

   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_rocm", IsCompiledWithROCM);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
   m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
   m.def("supports_bfloat16", SupportsBfloat16);

@@ -1793,7 +1804,7 @@ All parameter, weight, gradient are variables in Paddle.
         py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
         py::arg("redirect_stderr") = false);

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
     // Only GPUs with Compute Capability >= 53 support float16
     return platform::GetCUDAComputeCapability(place.device) >= 53;

@@ -1967,10 +1978,10 @@ All parameter, weight, gradient are variables in Paddle.
         py::return_value_policy::take_ownership);

   m.def("op_support_gpu", OpSupportGPU);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("get_cuda_device_count", platform::GetCUDADeviceCount);

-#ifndef _WIN32
+#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32)
   m.def("nvprof_init", platform::CudaProfilerInit);
   m.def("nvprof_start", platform::CudaProfilerStart);
   m.def("nvprof_stop", platform::CudaProfilerStop);

@@ -2015,7 +2026,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("size_of_dtype", framework::SizeOfType);

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
   m.def("get_cublas_switch", platform::AllowTF32Cublas);
   m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);

@@ -2847,7 +2858,8 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_PSLIB
   BindHeterWrapper(&m);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
   BindPSGPUWrapper(&m);
 #endif
   BindGlooWrapper(&m);

@@ -2855,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_BOX_PS
   BindBoxWrapper(&m);
 #endif
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   BindNCCLWrapper(&m);
 #endif
 #ifdef PADDLE_WITH_GLOO
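Note the resulting semantics: after this change `IsCompiledWithCUDA()` — and thus Python's `core.is_compiled_with_cuda()` — reports true on ROCm builds as well, effectively meaning "some GPU backend is present", while the new `is_compiled_with_rocm` binding is what distinguishes the AMD backend. The `nvprof_*` profiler bindings, by contrast, gain an explicit `!defined(PADDLE_WITH_HIP)` exclusion because CUDA's profiler API has no HIP counterpart here.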
paddle/fluid/pybind/tensor_py.h

@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/bfloat16.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "paddle/fluid/platform/device_context.h"

@@ -226,7 +226,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
 #endif
   } else if (platform::is_gpu_place(self.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     const T *a = self.data<T>();
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),

@@ -250,7 +250,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
 #endif
   } else if (platform::is_gpu_place(self->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
     T *a = self->mutable_data<T>(p);
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),

@@ -296,7 +296,7 @@ void SetTensorFromPyArrayT(
         "Please recompile or reinstall Paddle with XPU support."));
 #endif
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (paddle::platform::is_gpu_place(place)) {
       // NOTE(wangxi): When copying data to the accelerator card,
       // we need set_device(dev_id) first.

@@ -304,8 +304,13 @@ void SetTensorFromPyArrayT(
       platform::CUDADeviceGuard guard(
           BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device);
       auto dst = self->mutable_data<T>(place);
+#ifdef PADDLE_WITH_HIP
+      paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
+                                      hipMemcpyHostToDevice);
+#else
       paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
                                       cudaMemcpyHostToDevice);
+#endif
     } else if (paddle::platform::is_cuda_pinned_place(place)) {
       auto dst = self->mutable_data<T>(place);

@@ -474,7 +479,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
                          self.type());
 #endif
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (platform::is_cuda_pinned_place(place)) {
       output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
                            self.type());

@@ -707,7 +712,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
         "Please recompile or reinstall Paddle with XPU support."));
 #endif
   } else if (is_gpu_tensor) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
     PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
                       platform::errors::InvalidArgument(
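Within the GPU branch of `SetTensorFromPyArrayT`, the only vendor-specific detail left is the memcpy-kind enum: HIP and CUDA use distinct enum types (`hipMemcpyHostToDevice` vs `cudaMemcpyHostToDevice`) even though the call shape is identical, hence the small `#ifdef` around `GpuMemcpySync`. The same fork, as a self-contained wrapper (a sketch; Paddle routes this through `paddle::platform::GpuMemcpySync`):

```cpp
#include <cstddef>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Synchronous host-to-device copy with the vendor's memcpy-kind enum.
// Error handling omitted for brevity.
inline void MemcpyHostToDeviceSync(void* dst, const void* src, size_t bytes) {
#ifdef PADDLE_WITH_HIP
  hipMemcpy(dst, src, bytes, hipMemcpyHostToDevice);
#else
  cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
#endif
}
```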