PaddlePaddle / Paddle · commit 28b356b9
Unverified commit 28b356b9, authored Feb 26, 2021 by Qi Li, committed by GitHub on Feb 26, 2021.
[ROCM] update fluid framework for rocm (part6), test=develop (#31015)
Parent: c8fac5ee

Showing 19 changed files with 229 additions and 97 deletions (+229 −97)
paddle/fluid/framework/tensor_test.cc                 +3   −3
paddle/fluid/framework/tensor_util.cc                 +10  −8
paddle/fluid/framework/tensor_util.h                  +3   −3
paddle/fluid/framework/tensor_util_test.cc            +5   −5
paddle/fluid/framework/tensor_util_test.cu            +71  −0
paddle/fluid/framework/trainer.h                      +18  −15
paddle/fluid/framework/trainer_factory.cc             +5   −3
paddle/fluid/framework/var_type_traits.cc             +8   −0
paddle/fluid/framework/var_type_traits.h              +10  −4
paddle/fluid/framework/var_type_traits_test.cc        +8   −0
paddle/fluid/operators/nccl/CMakeLists.txt            +11  −3
paddle/fluid/operators/nccl/nccl_gpu_common.h         +4   −0
paddle/fluid/pybind/CMakeLists.txt                    +20  −20
paddle/fluid/pybind/global_value_getter_setter.cc     +2   −2
paddle/fluid/pybind/imperative.cc                     +5   −4
paddle/fluid/pybind/ps_gpu_wrapper_py.cc              +2   −1
paddle/fluid/pybind/ps_gpu_wrapper_py.h               +2   −1
paddle/fluid/pybind/pybind.cc                         +31  −19
paddle/fluid/pybind/tensor_py.h                       +11  −6
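Nearly every hunk that follows applies one of a handful of mechanical substitutions: CUDA-only preprocessor guards are widened to also accept the ROCm/HIP backend, NCCL-only guards gain RCCL, and CUDA runtime handle types give way to backend-neutral gpu* aliases. A minimal before/after sketch of the guard rewriting (DoGpuWork and CpuOnlyFallback are hypothetical placeholders, not names from the patch):

// Hypothetical sketch of the guard widening this commit applies everywhere.
// Before: the declaration compiled only under the CUDA backend.
#ifdef PADDLE_WITH_CUDA
void DoGpuWork();  // CUDA-only
#endif

// After: the same declaration also compiles under the ROCm/HIP backend.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void DoGpuWork();  // CUDA or HIP
#endif

// Negative guards become a conjunction of negations:
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
void CpuOnlyFallback();
#endif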
paddle/fluid/framework/tensor_test.cc
@@ -118,7 +118,7 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(static_cast<int>(p2[0]), 1);
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     float* p1 = nullptr;
@@ -174,7 +174,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     framework::Tensor dst_tensor;
@@ -212,7 +212,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
paddle/fluid/framework/tensor_util.cc
@@ -97,7 +97,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                                           "Copy from %s to %s is not supported.",
                                           src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
@@ -304,7 +304,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                                           "Copy from %s to %s is not supported.",
                                           src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
@@ -595,7 +595,7 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
   return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
 }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <typename T>
 static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
   CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
@@ -618,7 +618,7 @@ struct BothFalseVisitor : public boost::static_visitor<> {
   }
 
   void VisitorImpl(const platform::CUDAPlace& gpu) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
     constexpr int MAX_BLOCK_DIM = 512;
     const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
@@ -703,7 +703,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
                     platform::errors::ResourceExhausted(
                         "tensor size %d overflow when writing tensor", size));
   if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
     std::unique_ptr<char[]> buf(new char[kBufSize]);
     auto& gpu_dev_ctx =
@@ -802,7 +802,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(
@@ -859,7 +860,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(
@@ -954,7 +956,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
   if (dl_tensor.ctx.device_type == kDLCPU) {
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (dl_tensor.ctx.device_type == kDLGPU) {
     platform::CUDAPlace dst_place =
         platform::CUDAPlace(dl_tensor.ctx.device_id);
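The BothFalse kernel above iterates with CUDA_KERNEL_LOOP, a conventional grid-stride loop macro. Its customary definition is shown below as an assumption; Paddle's own definition lives in its platform headers, not in this diff:

// Conventional grid-stride loop macro (assumed definition, for reference):
// each thread starts at its global index and strides by the total thread
// count, so any grid size covers all n elements.
#define CUDA_KERNEL_LOOP(i, n)                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)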
paddle/fluid/framework/tensor_util.h
@@ -127,7 +127,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(
         BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
@@ -150,7 +150,7 @@ void TensorFromVector(const std::vector<T>& src,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(
         BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
@@ -187,7 +187,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
                  BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
                  size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(src.place())) {  // NOLINT
     memory::Copy(
         dst_place, dst_ptr,
         BOOST_GET_CONST(platform::CUDAPlace, src.place()),
paddle/fluid/framework/tensor_util_test.cc
@@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) {
   }
   EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor src_tensor;
     Tensor gpu_tensor;
@@ -149,7 +149,7 @@ TEST(TensorFromVector, Tensor) {
     delete cpu_place;
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor cpu_tensor;
@@ -224,7 +224,7 @@ TEST(TensorToVector, Tensor) {
       EXPECT_EQ(src_ptr[i], dst[i]);
     }
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor gpu_tensor;
@@ -264,7 +264,7 @@ TEST(TensorFromDLPack, Tensor) {
     }
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor cpu_tensor;
@@ -430,7 +430,7 @@ TEST(Tensor, FromAndToStream) {
     EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
     delete place;
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor gpu_tensor;
     gpu_tensor.Resize({2, 3});
paddle/fluid/framework/tensor_util_test.cu
@@ -63,7 +63,11 @@ TEST(TensorContainsNAN, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }
@@ -71,7 +75,11 @@ TEST(TensorContainsNAN, GPU) {
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }
@@ -84,7 +92,11 @@ TEST(TensorContainsInf, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }
@@ -92,7 +104,11 @@ TEST(TensorContainsInf, GPU) {
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }
@@ -107,14 +123,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
@@ -123,14 +147,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
@@ -139,14 +171,24 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }
@@ -159,7 +201,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;
@@ -172,7 +218,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;
@@ -190,7 +240,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;
@@ -203,7 +257,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;
@@ -221,7 +279,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
@@ -233,7 +295,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
@@ -245,7 +311,12 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
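Every hunk in this file wraps a triple-chevron kernel launch the same way, so the pattern is worth isolating. A minimal sketch (FillValue is a hypothetical stand-in for the FillNAN/FillInf/FillFinite test kernels; stream_t is a local alias for the sketch only):

// A minimal sketch of the launch-guard pattern applied throughout this file.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using stream_t = hipStream_t;   // local alias, for the sketch only
#else
#include <cuda_runtime.h>
using stream_t = cudaStream_t;
#endif

// Hypothetical stand-in for the FillNAN / FillInf / FillFinite kernels.
__global__ void FillValue(float* buf) { buf[0] = 1.0f; }

void Launch(float* buf, stream_t stream) {
#ifdef PADDLE_WITH_HIP
  // HIP macro form: kernel, grid dim, block dim, dynamic shared-memory
  // bytes, stream, then the kernel arguments.
  hipLaunchKernelGGL(FillValue, dim3(1), dim3(1), 0, stream, buf);
#else
  // CUDA triple-chevron form carrying the same four launch parameters.
  FillValue<<<1, 1, 0, stream>>>(buf);
#endif
}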
paddle/fluid/framework/trainer.h
@@ -141,7 +141,8 @@ class DistMultiTrainer : public MultiTrainer {
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
 };
 
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 class HeterServiceContext {
  public:
@@ -155,8 +156,9 @@ class HeterServiceContext {
   void Reset() { push_dense_status_.clear(); }
   int place_num_;
   Scope* scope_{nullptr};
-#ifdef PADDLE_WITH_CUDA
-  cudaEvent_t event_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  gpuEvent_t event_;
 #endif
   std::vector<OperatorBase*> ops_;
   std::vector<::std::future<int32_t>> push_dense_status_;
@@ -187,10 +189,10 @@ class HeterXpuTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
 #ifdef PADDLE_WITH_XPU
   void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
@@ -222,9 +224,9 @@ class HeterXpuTrainer : public TrainerBase {
   std::vector<Scope*> place_scopes_;
   BtObjectPool<HeterServiceContext> object_pool_;
   std::vector<platform::Place> places_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };
 
@@ -247,10 +249,10 @@ class HeterBoxTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
   void CreateThreadParam(const ProgramDesc& program, int num);
   template <typename T>
@@ -272,14 +274,15 @@ class HeterBoxTrainer : public TrainerBase {
   std::vector<std::thread> threads_;
   int use_ps_gpu_;
   int thread_num_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };
 #endif
 
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 class PSGPUTrainer : public TrainerBase {
  public:
   PSGPUTrainer() {}
@@ -321,7 +324,7 @@ class PSGPUTrainer : public TrainerBase {
 };
 #endif
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class PipelineTrainer : public TrainerBase {
  public:
   PipelineTrainer() {}
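The cudaStream_t / cudaEvent_t members become gpuStream_t / gpuEvent_t, backend-neutral aliases. This diff does not show their definitions; a hypothetical sketch of how such aliases are typically declared:

// Hypothetical sketch of backend-neutral handle aliases; Paddle declares
// equivalents in its platform headers, which are not part of this diff.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#else  // PADDLE_WITH_CUDA
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif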
paddle/fluid/framework/trainer_factory.cc
@@ -66,15 +66,17 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(HeterXpuTrainer);
 REGISTER_TRAINER_CLASS(HeterBoxTrainer);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(PSGPUTrainer);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework
paddle/fluid/framework/var_type_traits.cc
@@ -28,6 +28,14 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"  // NOLINT
+#include "paddle/fluid/platform/nccl_helper.h"            // NOLINT
+#endif
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"  // NOLINT
+#include "paddle/fluid/operators/miopen_rnn_cache.h"
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
paddle/fluid/framework/var_type_traits.h
@@ -30,6 +30,12 @@
 #include <nccl.h>
 #endif
 #endif
+#ifdef PADDLE_WITH_HIP
+#include <miopen/miopen.h>
+#ifdef PADDLE_WITH_RCCL
+#include <rccl.h>
+#endif
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
@@ -39,8 +45,8 @@
 namespace paddle {
 namespace platform {
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class Communicator;
 class NCCLCommunicator;
 #endif
@@ -151,8 +157,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope*,
     operators::reader::LoDTensorBlockingQueueHolder, FetchList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
 #endif
     operators::CudnnRNNCache,
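The reason <rccl.h> can stand in for <nccl.h> is that RCCL re-exports NCCL's C API, so types such as ncclUniqueId in the registry above compile unchanged against either backend. An assumed standalone illustration, not taken from the patch:

// Assumed illustration: the same NCCL-style names resolve against RCCL,
// because RCCL provides NCCL-compatible symbols (ncclUniqueId, ncclComm_t, ...).
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>
#else
#include <nccl.h>
#endif

#include <cstdio>

int main() {
  ncclUniqueId id;                            // same type name in both libraries
  if (ncclGetUniqueId(&id) != ncclSuccess) {  // same entry point in both
    std::fprintf(stderr, "no collective backend available\n");
    return 1;
  }
  std::printf("unique id created\n");
  return 0;
}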
paddle/fluid/framework/var_type_traits_test.cc
@@ -28,6 +28,14 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"  // NOLINT
+#include "paddle/fluid/platform/nccl_helper.h"            // NOLINT
+#endif
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"  // NOLINT
+#include "paddle/fluid/operators/miopen_rnn_cache.h"
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
 #endif
paddle/fluid/operators/nccl/CMakeLists.txt
-if(NOT WITH_NCCL)
+if(NOT (WITH_NCCL OR WITH_RCCL))
   return()
 endif()
@@ -6,12 +6,20 @@ if(WITH_GPU AND NOT WIN32)
   nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
 endif()
 
-if(WITH_GPU)
+if(WITH_ROCM AND NOT WIN32)
+  hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
+endif()
+
+if(WITH_GPU OR WITH_ROCM)
   op_library(nccl_op DEPS nccl_common)
   file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
   set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
 endif()
 
-if(NOT WIN32)
+if(WITH_GPU AND NOT WIN32)
   nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
+
+if(WITH_ROCM AND NOT WIN32)
+  hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+endif()
paddle/fluid/operators/nccl/nccl_gpu_common.h
@@ -23,7 +23,11 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#else
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
paddle/fluid/pybind/CMakeLists.txt
@@ -3,12 +3,12 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
     analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
     gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator)
 
-if (WITH_GPU)
+if (WITH_GPU OR WITH_ROCM)
   set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
   set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard)
 endif()
 
-if (WITH_NCCL)
+if (WITH_NCCL OR WITH_RCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
 endif()
@@ -21,7 +21,7 @@ endif()
 if(NOT WIN32)
   set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
   set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
-  if (WITH_NCCL)
+  if (WITH_NCCL OR WITH_RCCL)
     set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
   endif()
 endif(NOT WIN32)
@@ -71,7 +71,7 @@ if (WITH_PSCORE)
   list(APPEND PYBIND_SRCS fleet_py.cc)
 endif()
 
-if (WITH_NCCL)
+if (WITH_NCCL OR WITH_RCCL)
   list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
 endif()
@@ -81,9 +81,9 @@ if(WITH_PYTHON)
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})
 
-  if(WITH_NCCL)
+  if(WITH_NCCL OR WITH_RCCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
-  endif(WITH_NCCL)
+  endif()
 
   if(WITH_XPU_BKCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context)
@@ -93,6 +93,9 @@ if(WITH_PYTHON)
   target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
   get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(op_function_generator ${os_dependency_modules})
+  if(WITH_ROCM)
+    target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
+  endif()
 
   set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
   set(tmp_impl_file ${impl_file}.tmp)
@@ -164,12 +167,6 @@ if(WITH_PYTHON)
     endif(WITH_MKLDNN)
   endif(WIN32)
 
-  if(WITH_ROCM_PLATFORM)
-    cc_library(paddle_pybind SHARED
-      SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-  else()
   cc_library(paddle_pybind SHARED
     SRCS ${PYBIND_SRCS}
     DEPS ${PYBIND_DEPS}
@@ -177,7 +174,10 @@ if(WITH_PYTHON)
   if(NOT APPLE AND NOT WIN32)
     target_link_libraries(paddle_pybind rt)
   endif(NOT APPLE AND NOT WIN32)
-  endif(WITH_ROCM_PLATFORM)
+
+  if(WITH_ROCM)
+    target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
+  endif()
 
   get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(paddle_pybind ${os_dependency_modules})
paddle/fluid/pybind/global_value_getter_setter.cc
@@ -66,7 +66,7 @@ DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
 DECLARE_int32(max_inplace_grad_add);
 DECLARE_string(tracer_profile_fname);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // cudnn
 DECLARE_uint64(conv_workspace_size_limit);
 DECLARE_bool(cudnn_batchnorm_spatial_persistent);
@@ -354,7 +354,7 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
       FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off);
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
      FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
paddle/fluid/pybind/imperative.cc
@@ -966,7 +966,7 @@ void BindImperative(py::module *m_ptr) {
            [](imperative::VarBase &self,
               const imperative::ParallelStrategy &strategy) {
              if (strategy.nranks_ > 1) {
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #if NCCL_VERSION_CODE >= 2212
                imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
 #else
@@ -1016,7 +1016,7 @@ void BindImperative(py::module *m_ptr) {
        )DOC")
       .def("pin_memory",
           [](const std::shared_ptr<imperative::VarBase> &self) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
             PADDLE_THROW(platform::errors::PermissionDenied(
                 "Cannot copy this Tensor to pinned memory in CPU version "
                 "Paddle, "
@@ -1050,7 +1050,7 @@ void BindImperative(py::module *m_ptr) {
      .def("cuda",
           [](const std::shared_ptr<imperative::VarBase> &self, int device_id,
              bool blocking) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
             PADDLE_THROW(platform::errors::PermissionDenied(
                 "Cannot copy this Tensor to GPU in CPU version Paddle, "
                 "Please recompile or reinstall Paddle with CUDA support."));
@@ -1412,7 +1412,8 @@ void BindImperative(py::module *m_ptr) {
       },
       py::call_guard<py::gil_scoped_release>());
 
-#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL)
   py::class_<imperative::ParallelContext,
              std::shared_ptr<imperative::ParallelContext>>(m,
                                                            "ParallelContext");
paddle/fluid/pybind/ps_gpu_wrapper_py.cc
@@ -32,7 +32,8 @@ namespace py = pybind11;
 
 namespace paddle {
 namespace pybind {
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 void BindPSGPUWrapper(py::module* m) {
   py::class_<framework::PSGPUWrapper, std::shared_ptr<framework::PSGPUWrapper>>(
       *m, "PSGPU")
paddle/fluid/pybind/ps_gpu_wrapper_py.h
@@ -22,7 +22,8 @@ namespace py = pybind11;
 
 namespace paddle {
 namespace pybind {
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 void BindPSGPUWrapper(py::module* m);
 #endif
 }  // namespace pybind
paddle/fluid/pybind/pybind.cc
@@ -86,7 +86,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
 
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/pybind/nccl_wrapper_py.h"
 #endif
 #include "paddle/fluid/framework/data_type.h"
@@ -95,11 +95,13 @@ limitations under the License. */
 #include "paddle/fluid/pybind/reader_py.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/fluid/string/to_string.h"
-#ifdef PADDLE_WITH_CUDA
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #endif
+#ifndef PADDLE_WITH_HIP
 #include "paddle/fluid/platform/cuda_profiler.h"
+#endif
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
@@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
 namespace paddle {
 namespace pybind {
 bool IsCompiledWithCUDA() {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
   return false;
 #else
   return true;
 #endif
 }
 
+bool IsCompiledWithROCM() {
+#ifndef PADDLE_WITH_HIP
+  return false;
+#else
+  return true;
+#endif
+}
+
@@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) {
   m.def("set_num_threads", &platform::SetNumThreads);
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("cudnn_version", &platform::CudnnVersion);
 #endif
@@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) {
     if (dl.ctx.device_type == kDLCPU) {
       paddle::framework::TensorFromDLPack(dl, &tensor);
     }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (dl.ctx.device_type == kDLGPU) {
       paddle::framework::TensorFromDLPack(dl, &tensor);
     }
@@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) {
       .def("height", &SelectedRows::height)
       .def("set_rows",
           [](SelectedRows &self, std::vector<int64_t> rows) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
             self.set_rows(rows);
 #else
             Vector<int64_t> new_rows(rows);
@@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::CUDAPlace& place)
                       -> paddle::platform::DeviceContext* {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
                     PADDLE_THROW(
                         platform::errors::PermissionDenied(
                             "Cannot use CUDAPlace in CPU only version, "
@@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::CUDAPinnedPlace& place)
                       -> paddle::platform::DeviceContext* {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
                     PADDLE_THROW(
                         platform::errors::PermissionDenied(
                             "Cannot use CUDAPinnedPlace in CPU only version, "
@@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
                   });;
 // clang-format on
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
   py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
@@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle.
        )DOC")
       .def("__init__",
           [](platform::CUDAPlace &self, int dev_id) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
             if (UNLIKELY(dev_id < 0)) {
               LOG(ERROR) << string::Sprintf(
                   "Invalid CUDAPlace(%d), device id must be 0 or "
@@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle.
             std::exit(-1);
 #endif
           })
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       .def("get_device_id",
           [](const platform::CUDAPlace& self) { return self.GetDeviceId(); })
      .def("_type", &PlaceIndex<platform::CUDAPlace>)
@@ -1559,7 +1569,7 @@ All parameter, weight, gradient are variables in Paddle.
        )DOC")
       .def("__init__",
           [](platform::CUDAPinnedPlace &self) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
             PADDLE_THROW(platform::errors::PermissionDenied(
                 "Cannot use CUDAPinnedPlace in CPU only version, "
                 "Please recompile or reinstall Paddle with CUDA support."));
@@ -1749,6 +1759,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_devices", []() { framework::InitDevices(); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_rocm", IsCompiledWithROCM);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
   m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
   m.def("supports_bfloat16", SupportsBfloat16);
@@ -1793,7 +1804,7 @@ All parameter, weight, gradient are variables in Paddle.
         py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
         py::arg("redirect_stderr") = false);
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
     // Only GPUs with Compute Capability >= 53 support float16
     return platform::GetCUDAComputeCapability(place.device) >= 53;
@@ -1967,10 +1978,10 @@ All parameter, weight, gradient are variables in Paddle.
         py::return_value_policy::take_ownership);
 
   m.def("op_support_gpu", OpSupportGPU);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
 
-#ifndef _WIN32
+#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32)
   m.def("nvprof_init", platform::CudaProfilerInit);
   m.def("nvprof_start", platform::CudaProfilerStart);
   m.def("nvprof_stop", platform::CudaProfilerStop);
@@ -2015,7 +2026,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("size_of_dtype", framework::SizeOfType);
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
   m.def("get_cublas_switch", platform::AllowTF32Cublas);
   m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
@@ -2847,7 +2858,8 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_PSLIB
   BindHeterWrapper(&m);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
  BindPSGPUWrapper(&m);
 #endif
   BindGlooWrapper(&m);
@@ -2855,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_BOX_PS
   BindBoxWrapper(&m);
 #endif
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   BindNCCLWrapper(&m);
 #endif
 #ifdef PADDLE_WITH_GLOO
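The new IsCompiledWithROCM follows the same compile-time feature-flag idiom as IsCompiledWithCUDA, and note that after this change is_compiled_with_cuda also reports true on HIP builds. A self-contained sketch of the idiom (the main function is an assumed harness for illustration):

// The answer is baked in at build time by the same preprocessor guards
// the rest of the diff widens; no runtime probing is involved.
#include <cstdio>

bool IsCompiledWithCUDA() {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
  return false;  // CPU-only build: neither GPU backend was compiled in
#else
  return true;   // either the CUDA or the HIP backend is present
#endif
}

bool IsCompiledWithROCM() {
#ifndef PADDLE_WITH_HIP
  return false;
#else
  return true;
#endif
}

int main() {
  std::printf("cuda-or-hip: %d, rocm: %d\n",
              static_cast<int>(IsCompiledWithCUDA()),
              static_cast<int>(IsCompiledWithROCM()));
  return 0;
}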
paddle/fluid/pybind/tensor_py.h
@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/bfloat16.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "paddle/fluid/platform/device_context.h"
@@ -226,7 +226,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
 #endif
   } else if (platform::is_gpu_place(self.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     const T *a = self.data<T>();
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
@@ -250,7 +250,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
 #endif
   } else if (platform::is_gpu_place(self->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
     T *a = self->mutable_data<T>(p);
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
@@ -296,7 +296,7 @@ void SetTensorFromPyArrayT(
         "Please recompile or reinstall Paddle with XPU support."));
 #endif
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (paddle::platform::is_gpu_place(place)) {
       // NOTE(wangxi): When copying data to the accelerator card,
       // we need set_device(dev_id) first.
@@ -304,8 +304,13 @@ void SetTensorFromPyArrayT(
       platform::CUDADeviceGuard guard(
          BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device);
       auto dst = self->mutable_data<T>(place);
+#ifdef PADDLE_WITH_HIP
+      paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
+                                      hipMemcpyHostToDevice);
+#else
       paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
                                       cudaMemcpyHostToDevice);
+#endif
     } else if (paddle::platform::is_cuda_pinned_place(place)) {
       auto dst = self->mutable_data<T>(place);
@@ -474,7 +479,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
                           self.type());
 #endif
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (platform::is_cuda_pinned_place(place)) {
       output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
                            self.type());
@@ -707,7 +712,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
         "Please recompile or reinstall Paddle with XPU support."));
 #endif
   } else if (is_gpu_tensor) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
     PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
                       platform::errors::InvalidArgument(
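The one non-mechanical edit in this header is the host-to-device copy, where the direction enumerator differs between runtimes (hipMemcpyHostToDevice vs cudaMemcpyHostToDevice). A standalone illustration of the same selection using the raw runtime APIs (an assumed example; Paddle's GpuMemcpySync wrapper shown in the hunk above is not reproduced here):

// Assumed illustration: HIP and CUDA use distinct enumerators for the copy
// direction, so portable code selects the call and its kind argument together.
#include <cstddef>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

void CopyHostToDevice(void* dst, const void* src, size_t nbytes) {
#ifdef PADDLE_WITH_HIP
  hipMemcpy(dst, src, nbytes, hipMemcpyHostToDevice);
#else
  cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
#endif
}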