Commit 28b356b9 (unverified)
Authored by Qi Li on Feb 26, 2021; committed via GitHub on Feb 26, 2021
[ROCM] update fluid framework for rocm (part6), test=develop (#31015)
Parent: c8fac5ee

Showing 19 changed files with 229 additions and 97 deletions (+229, -97)
paddle/fluid/framework/tensor_test.cc              +3   -3
paddle/fluid/framework/tensor_util.cc              +10  -8
paddle/fluid/framework/tensor_util.h               +3   -3
paddle/fluid/framework/tensor_util_test.cc         +5   -5
paddle/fluid/framework/tensor_util_test.cu         +71  -0
paddle/fluid/framework/trainer.h                   +18  -15
paddle/fluid/framework/trainer_factory.cc          +5   -3
paddle/fluid/framework/var_type_traits.cc          +8   -0
paddle/fluid/framework/var_type_traits.h           +10  -4
paddle/fluid/framework/var_type_traits_test.cc     +8   -0
paddle/fluid/operators/nccl/CMakeLists.txt         +11  -3
paddle/fluid/operators/nccl/nccl_gpu_common.h      +4   -0
paddle/fluid/pybind/CMakeLists.txt                 +20  -20
paddle/fluid/pybind/global_value_getter_setter.cc  +2   -2
paddle/fluid/pybind/imperative.cc                  +5   -4
paddle/fluid/pybind/ps_gpu_wrapper_py.cc           +2   -1
paddle/fluid/pybind/ps_gpu_wrapper_py.h            +2   -1
paddle/fluid/pybind/pybind.cc                      +31  -19
paddle/fluid/pybind/tensor_py.h                    +11  -6
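Nearly every hunk below applies the same rewrite: a CUDA-only preprocessor guard is widened so the identical code path also compiles for AMD GPUs under ROCm/HIP (with RCCL standing in for NCCL, and MIOpen for the cuDNN-adjacent pieces). A minimal sketch of the recurring pattern, taken from the guards in this commit:

    // Before: NVIDIA-only.
    #ifdef PADDLE_WITH_CUDA
      // ... GPU code path ...
    #endif

    // After: NVIDIA (CUDA) and AMD (HIP/ROCm) builds share the same path.
    #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      // ... GPU code path ...
    #endif

Negated guards change accordingly: `#ifndef PADDLE_WITH_CUDA` becomes `#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)`.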
paddle/fluid/framework/tensor_test.cc

@@ -118,7 +118,7 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(static_cast<int>(p2[0]), 1);
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     float* p1 = nullptr;
@@ -174,7 +174,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     framework::Tensor dst_tensor;
@@ -212,7 +212,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
paddle/fluid/framework/tensor_util.cc

@@ -97,7 +97,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                       "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
@@ -304,7 +304,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                       "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
@@ -595,7 +595,7 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
   return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
 }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <typename T>
 static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
   CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
@@ -618,7 +618,7 @@ struct BothFalseVisitor : public boost::static_visitor<> {
   }

   void VisitorImpl(const platform::CUDAPlace& gpu) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
     constexpr int MAX_BLOCK_DIM = 512;
     const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
@@ -703,7 +703,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
                     platform::errors::ResourceExhausted(
                         "tensor size %d overflow when writing tensor", size));
   if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
     std::unique_ptr<char[]> buf(new char[kBufSize]);
     auto& gpu_dev_ctx =
@@ -802,7 +802,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(
@@ -859,7 +860,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(
@@ -954,7 +956,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
   if (dl_tensor.ctx.device_type == kDLCPU) {
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (dl_tensor.ctx.device_type == kDLGPU) {
     platform::CUDAPlace dst_place =
         platform::CUDAPlace(dl_tensor.ctx.device_id);
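The `BothFalse` kernel above uses `CUDA_KERNEL_LOOP`, Paddle's grid-stride loop macro, which is portable to HIP as-is because it only touches the built-in `blockIdx`/`blockDim`/`threadIdx`/`gridDim` variables that HIP also provides. A sketch of the usual definition (assumed for illustration; the real macro lives in Paddle's platform headers, not in this diff):

    // Grid-stride loop: thread (blockIdx.x * blockDim.x + threadIdx.x) handles
    // element i, then jumps ahead by the total thread count until n is covered,
    // so any launch configuration processes all n elements.
    #define CUDA_KERNEL_LOOP(i, n)                                  \
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
           i += blockDim.x * gridDim.x)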
paddle/fluid/framework/tensor_util.h

@@ -127,7 +127,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(
         BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
@@ -150,7 +150,7 @@ void TensorFromVector(const std::vector<T>& src,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(
         BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
@@ -187,7 +187,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
                  BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
                  size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(src.place())) {  // NOLINT
     memory::Copy(
         dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
paddle/fluid/framework/tensor_util_test.cc

@@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) {
   }
   EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor src_tensor;
     Tensor gpu_tensor;
@@ -149,7 +149,7 @@ TEST(TensorFromVector, Tensor) {
     delete cpu_place;
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor cpu_tensor;
@@ -224,7 +224,7 @@ TEST(TensorToVector, Tensor) {
       EXPECT_EQ(src_ptr[i], dst[i]);
     }
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor gpu_tensor;
@@ -264,7 +264,7 @@ TEST(TensorFromDLPack, Tensor) {
     }
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor cpu_tensor;
@@ -430,7 +430,7 @@ TEST(Tensor, FromAndToStream) {
     EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
     delete place;
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor gpu_tensor;
     gpu_tensor.Resize({2, 3});
paddle/fluid/framework/tensor_util_test.cu

@@ -63,7 +63,11 @@ TEST(TensorContainsNAN, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }
@@ -71,7 +75,11 @@ TEST(TensorContainsNAN, GPU) {
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }
@@ -84,7 +92,11 @@ TEST(TensorContainsInf, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }
@@ -92,7 +104,11 @@ TEST(TensorContainsInf, GPU) {
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }
@@ -107,14 +123,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
@@ -123,14 +147,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
@@ -139,14 +171,24 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }
@@ -159,7 +201,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;
@@ -172,7 +218,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;
@@ -190,7 +240,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;
@@ -203,7 +257,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;
@@ -221,7 +279,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
@@ -233,7 +295,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
@@ -245,7 +311,12 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
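Every hunk in this test file wraps the kernel launch the same way: the triple-chevron `<<<grid, block, sharedMem, stream>>>` syntax is CUDA-specific, so ROCm builds route through HIP's launch macro, whose argument order is `(kernel, gridDim, blockDim, sharedMemBytes, stream, kernel args...)`. A self-contained sketch of the pattern with a hypothetical kernel (`FillOnes` is illustrative, not from the commit; `gpuStream_t` is assumed to alias `cudaStream_t`/`hipStream_t`):

    // Fills buf[0..n) with 1.0f from a single device thread.
    __global__ void FillOnes(float* buf, int n) {
      for (int i = 0; i < n; ++i) buf[i] = 1.0f;
    }

    void LaunchFillOnes(float* buf, int n, gpuStream_t stream) {
    #ifdef PADDLE_WITH_HIP
      // HIP: macro-based launch with the same grid/block/shared/stream arguments.
      hipLaunchKernelGGL(FillOnes, dim3(1), dim3(1), 0, stream, buf, n);
    #else
      // CUDA: native triple-chevron launch.
      FillOnes<<<1, 1, 0, stream>>>(buf, n);
    #endif
    }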
paddle/fluid/framework/trainer.h

@@ -141,7 +141,8 @@ class DistMultiTrainer : public MultiTrainer {
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
 };

-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 class HeterServiceContext {
  public:
@@ -155,8 +156,9 @@ class HeterServiceContext {
   void Reset() { push_dense_status_.clear(); }
   int place_num_;
   Scope* scope_{nullptr};
-#ifdef PADDLE_WITH_CUDA
-  cudaEvent_t event_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  gpuEvent_t event_;
 #endif
   std::vector<OperatorBase*> ops_;
   std::vector<::std::future<int32_t>> push_dense_status_;
@@ -187,10 +189,10 @@ class HeterXpuTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
 #ifdef PADDLE_WITH_XPU
   void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
@@ -222,9 +224,9 @@ class HeterXpuTrainer : public TrainerBase {
   std::vector<Scope*> place_scopes_;
   BtObjectPool<HeterServiceContext> object_pool_;
   std::vector<platform::Place> places_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };
@@ -247,10 +249,10 @@ class HeterBoxTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
   void CreateThreadParam(const ProgramDesc& program, int num);
   template <typename T>
@@ -272,14 +274,15 @@ class HeterBoxTrainer : public TrainerBase {
   std::vector<std::thread> threads_;
   int use_ps_gpu_;
   int thread_num_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };
 #endif

-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 class PSGPUTrainer : public TrainerBase {
  public:
   PSGPUTrainer() {}
@@ -321,7 +324,7 @@ class PSGPUTrainer : public TrainerBase {
 };
 #endif

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class PipelineTrainer : public TrainerBase {
  public:
   PipelineTrainer() {}
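Alongside the guard changes, `trainer.h` replaces `cudaStream_t`/`cudaEvent_t` with the vendor-neutral `gpuStream_t`/`gpuEvent_t`. A plausible definition of that alias layer (assumed here; the real typedefs live in Paddle's platform headers, not in this diff):

    #ifdef PADDLE_WITH_HIP
    using gpuStream_t = hipStream_t;   // assumed mapping for ROCm builds
    using gpuEvent_t = hipEvent_t;
    #else                              // PADDLE_WITH_CUDA
    using gpuStream_t = cudaStream_t;
    using gpuEvent_t = cudaEvent_t;
    #endif

With such aliases, declarations like `std::vector<gpuStream_t> copy_streams_;` compile unchanged for either backend.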
paddle/fluid/framework/trainer_factory.cc

@@ -66,15 +66,17 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(HeterXpuTrainer);
 REGISTER_TRAINER_CLASS(HeterBoxTrainer);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(PSGPUTrainer);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework
paddle/fluid/framework/var_type_traits.cc

@@ -28,6 +28,14 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"  // NOLINT
+#include "paddle/fluid/platform/nccl_helper.h"            // NOLINT
+#endif
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"  // NOLINT
+#include "paddle/fluid/operators/miopen_rnn_cache.h"
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
paddle/fluid/framework/var_type_traits.h

@@ -30,6 +30,12 @@
 #include <nccl.h>
 #endif
 #endif
+#ifdef PADDLE_WITH_HIP
+#include <miopen/miopen.h>
+#ifdef PADDLE_WITH_RCCL
+#include <rccl.h>
+#endif
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
@@ -39,8 +45,8 @@
 namespace paddle {
 namespace platform {
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class Communicator;
 class NCCLCommunicator;
 #endif
@@ -151,8 +157,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope*,
     operators::reader::LoDTensorBlockingQueueHolder, FetchList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
 #endif
     operators::CudnnRNNCache,
paddle/fluid/framework/var_type_traits_test.cc

@@ -28,6 +28,14 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"  // NOLINT
+#include "paddle/fluid/platform/nccl_helper.h"            // NOLINT
+#endif
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"  // NOLINT
+#include "paddle/fluid/operators/miopen_rnn_cache.h"
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
 #endif
paddle/fluid/operators/nccl/CMakeLists.txt

-if(NOT WITH_NCCL)
+if(NOT (WITH_NCCL OR WITH_RCCL))
   return()
 endif()
@@ -6,12 +6,20 @@ if(WITH_GPU AND NOT WIN32)
   nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
 endif()

-if(WITH_GPU)
+if(WITH_ROCM AND NOT WIN32)
+  hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
+endif()
+
+if(WITH_GPU OR WITH_ROCM)
   op_library(nccl_op DEPS nccl_common)
   file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
   set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
 endif()

-if(NOT WIN32)
+if(WITH_GPU AND NOT WIN32)
   nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
+
+if(WITH_ROCM AND NOT WIN32)
+  hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+endif()
paddle/fluid/operators/nccl/nccl_gpu_common.h

@@ -23,7 +23,11 @@ limitations under the License. */
 #include <vector>

 #include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#else
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
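This header switch is enough because RCCL deliberately mirrors NCCL's C API: `rccl.h` exposes the same `nccl*` types and functions, so call sites compile unchanged. A hedged sketch (the signature follows the public NCCL API; `gpuStream_t` is the assumed stream alias discussed above):

    // Sums `count` floats across all ranks in `comm`; identical source for
    // NCCL (CUDA) and RCCL (ROCm) once the matching header is included.
    ncclResult_t AllReduceSum(const float* sendbuf, float* recvbuf, size_t count,
                              ncclComm_t comm, gpuStream_t stream) {
      return ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm,
                           stream);
    }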
paddle/fluid/pybind/CMakeLists.txt

@@ -3,12 +3,12 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper
     analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
     gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator)

-if (WITH_GPU)
+if (WITH_GPU OR WITH_ROCM)
   set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
   set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard)
 endif()

-if (WITH_NCCL)
+if (WITH_NCCL OR WITH_RCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
 endif()
@@ -21,7 +21,7 @@ endif()
 if(NOT WIN32)
   set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
   set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
-  if (WITH_NCCL)
+  if (WITH_NCCL OR WITH_RCCL)
     set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
   endif()
 endif(NOT WIN32)
@@ -71,7 +71,7 @@ if (WITH_PSCORE)
   list(APPEND PYBIND_SRCS fleet_py.cc)
 endif()

-if (WITH_NCCL)
+if (WITH_NCCL OR WITH_RCCL)
   list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
 endif()
@@ -81,9 +81,9 @@ if(WITH_PYTHON)
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})

-  if(WITH_NCCL)
+  if(WITH_NCCL OR WITH_RCCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
-  endif(WITH_NCCL)
+  endif()

   if(WITH_XPU_BKCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context)
@@ -93,6 +93,9 @@ if(WITH_PYTHON)
   target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
   get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(op_function_generator ${os_dependency_modules})
+  if(WITH_ROCM)
+    target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
+  endif()

   set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
   set(tmp_impl_file ${impl_file}.tmp)
@@ -164,12 +167,6 @@ if(WITH_PYTHON)
       endif(WITH_MKLDNN)
     endif(WIN32)

-    if(WITH_ROCM_PLATFORM)
-      cc_library(paddle_pybind SHARED
-        SRCS ${PYBIND_SRCS}
-        DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-    else()
-      cc_library(paddle_pybind SHARED
+    cc_library(paddle_pybind SHARED
       SRCS ${PYBIND_SRCS}
       DEPS ${PYBIND_DEPS}
@@ -177,7 +174,10 @@ if(WITH_PYTHON)
     if(NOT APPLE AND NOT WIN32)
       target_link_libraries(paddle_pybind rt)
     endif(NOT APPLE AND NOT WIN32)
-    endif(WITH_ROCM_PLATFORM)
+
+    if(WITH_ROCM)
+      target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
+    endif()

     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(paddle_pybind ${os_dependency_modules})
paddle/fluid/pybind/global_value_getter_setter.cc

@@ -66,7 +66,7 @@ DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
 DECLARE_int32(max_inplace_grad_add);
 DECLARE_string(tracer_profile_fname);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // cudnn
 DECLARE_uint64(conv_workspace_size_limit);
 DECLARE_bool(cudnn_batchnorm_spatial_persistent);
@@ -354,7 +354,7 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
       FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off);

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
       FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
paddle/fluid/pybind/imperative.cc

@@ -966,7 +966,7 @@ void BindImperative(py::module *m_ptr) {
           [](imperative::VarBase &self,
              const imperative::ParallelStrategy &strategy) {
             if (strategy.nranks_ > 1) {
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #if NCCL_VERSION_CODE >= 2212
               imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
 #else
@@ -1016,7 +1016,7 @@ void BindImperative(py::module *m_ptr) {
        )DOC")
       .def("pin_memory",
            [](const std::shared_ptr<imperative::VarBase> &self) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
              PADDLE_THROW(platform::errors::PermissionDenied(
                  "Cannot copy this Tensor to pinned memory in CPU version "
                  "Paddle, "
@@ -1050,7 +1050,7 @@ void BindImperative(py::module *m_ptr) {
       .def("cuda",
            [](const std::shared_ptr<imperative::VarBase> &self, int device_id,
               bool blocking) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
              PADDLE_THROW(platform::errors::PermissionDenied(
                  "Cannot copy this Tensor to GPU in CPU version Paddle, "
                  "Please recompile or reinstall Paddle with CUDA support."));
@@ -1412,7 +1412,8 @@ void BindImperative(py::module *m_ptr) {
       },
       py::call_guard<py::gil_scoped_release>());

-#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL)
   py::class_<imperative::ParallelContext,
              std::shared_ptr<imperative::ParallelContext>>(m,
                                                            "ParallelContext");
paddle/fluid/pybind/ps_gpu_wrapper_py.cc

@@ -32,7 +32,8 @@ namespace py = pybind11;
 namespace paddle {
 namespace pybind {
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 void BindPSGPUWrapper(py::module* m) {
   py::class_<framework::PSGPUWrapper, std::shared_ptr<framework::PSGPUWrapper>>(
       *m, "PSGPU")
paddle/fluid/pybind/ps_gpu_wrapper_py.h

@@ -22,7 +22,8 @@ namespace py = pybind11;
 namespace paddle {
 namespace pybind {
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 void BindPSGPUWrapper(py::module* m);
 #endif
 }  // namespace pybind
paddle/fluid/pybind/pybind.cc
浏览文件 @
28b356b9
...
@@ -86,7 +86,7 @@ limitations under the License. */
...
@@ -86,7 +86,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#if
def PADDLE_WITH_NCCL
#if
defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type.h"
...
@@ -95,11 +95,13 @@ limitations under the License. */
...
@@ -95,11 +95,13 @@ limitations under the License. */
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#include "paddle/fluid/string/to_string.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if
def PADDLE_WITH_NCCL
#if
defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/cuda_profiler.h"
#include "paddle/fluid/platform/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#endif
...
@@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
...
@@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace
paddle
{
namespace
paddle
{
namespace
pybind
{
namespace
pybind
{
bool
IsCompiledWithCUDA
()
{
bool
IsCompiledWithCUDA
()
{
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
return
false
;
#else
return
true
;
#endif
}
bool
IsCompiledWithROCM
()
{
#ifndef PADDLE_WITH_HIP
return
false
;
return
false
;
#else
#else
return
true
;
return
true
;
...
@@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) {
...
@@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) {
m
.
def
(
"set_num_threads"
,
&
platform
::
SetNumThreads
);
m
.
def
(
"set_num_threads"
,
&
platform
::
SetNumThreads
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m
.
def
(
"cudnn_version"
,
&
platform
::
CudnnVersion
);
m
.
def
(
"cudnn_version"
,
&
platform
::
CudnnVersion
);
#endif
#endif
...
@@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) {
...
@@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) {
if
(
dl
.
ctx
.
device_type
==
kDLCPU
)
{
if
(
dl
.
ctx
.
device_type
==
kDLCPU
)
{
paddle
::
framework
::
TensorFromDLPack
(
dl
,
&
tensor
);
paddle
::
framework
::
TensorFromDLPack
(
dl
,
&
tensor
);
}
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
dl
.
ctx
.
device_type
==
kDLGPU
)
{
if
(
dl
.
ctx
.
device_type
==
kDLGPU
)
{
paddle
::
framework
::
TensorFromDLPack
(
dl
,
&
tensor
);
paddle
::
framework
::
TensorFromDLPack
(
dl
,
&
tensor
);
}
}
...
@@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) {
...
@@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) {
.
def
(
"height"
,
&
SelectedRows
::
height
)
.
def
(
"height"
,
&
SelectedRows
::
height
)
.
def
(
"set_rows"
,
.
def
(
"set_rows"
,
[](
SelectedRows
&
self
,
std
::
vector
<
int64_t
>
rows
)
{
[](
SelectedRows
&
self
,
std
::
vector
<
int64_t
>
rows
)
{
#if
ndef PADDLE_WITH_CUDA
#if
!defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
self
.
set_rows
(
rows
);
self
.
set_rows
(
rows
);
#else
#else
Vector
<
int64_t
>
new_rows
(
rows
);
Vector
<
int64_t
>
new_rows
(
rows
);
...
@@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle.
.
def_static
(
"create"
,
.
def_static
(
"create"
,
[](
paddle
::
platform
::
CUDAPlace
&
place
)
[](
paddle
::
platform
::
CUDAPlace
&
place
)
->
paddle
::
platform
::
DeviceContext
*
{
->
paddle
::
platform
::
DeviceContext
*
{
#if
ndef PADDLE_WITH_CUDA
#if
!defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW
(
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
platform
::
errors
::
PermissionDenied
(
"Cannot use CUDAPlace in CPU only version, "
"Cannot use CUDAPlace in CPU only version, "
...
@@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle.
.
def_static
(
"create"
,
.
def_static
(
"create"
,
[](
paddle
::
platform
::
CUDAPinnedPlace
&
place
)
[](
paddle
::
platform
::
CUDAPinnedPlace
&
place
)
->
paddle
::
platform
::
DeviceContext
*
{
->
paddle
::
platform
::
DeviceContext
*
{
#if
ndef PADDLE_WITH_CUDA
#if
!defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW
(
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
platform
::
errors
::
PermissionDenied
(
"Cannot use CUDAPinnedPlace in CPU only version, "
"Cannot use CUDAPinnedPlace in CPU only version, "
...
@@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle.
#endif
#endif
});;
});;
// clang-format on
// clang-format on
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
py
::
class_
<
platform
::
Communicator
>
(
m
,
"Communicator"
).
def
(
py
::
init
<>
());
py
::
class_
<
platform
::
Communicator
>
(
m
,
"Communicator"
).
def
(
py
::
init
<>
());
#endif
#endif
py
::
class_
<
platform
::
CUDAPlace
>
(
m
,
"CUDAPlace"
,
R"DOC(
py
::
class_
<
platform
::
CUDAPlace
>
(
m
,
"CUDAPlace"
,
R"DOC(
...
@@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle.
)DOC"
)
)DOC"
)
.
def
(
"__init__"
,
.
def
(
"__init__"
,
[](
platform
::
CUDAPlace
&
self
,
int
dev_id
)
{
[](
platform
::
CUDAPlace
&
self
,
int
dev_id
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
UNLIKELY
(
dev_id
<
0
))
{
if
(
UNLIKELY
(
dev_id
<
0
))
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"Invalid CUDAPlace(%d), device id must be 0 or "
"Invalid CUDAPlace(%d), device id must be 0 or "
...
@@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle.
std
::
exit
(
-
1
);
std
::
exit
(
-
1
);
#endif
#endif
})
})
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.
def
(
"get_device_id"
,
.
def
(
"get_device_id"
,
[](
const
platform
::
CUDAPlace
&
self
)
{
return
self
.
GetDeviceId
();
})
[](
const
platform
::
CUDAPlace
&
self
)
{
return
self
.
GetDeviceId
();
})
.
def
(
"_type"
,
&
PlaceIndex
<
platform
::
CUDAPlace
>
)
.
def
(
"_type"
,
&
PlaceIndex
<
platform
::
CUDAPlace
>
)
...
@@ -1559,7 +1569,7 @@ All parameter, weight, gradient are variables in Paddle.
        )DOC")
      .def("__init__",
           [](platform::CUDAPinnedPlace &self) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
             PADDLE_THROW(platform::errors::PermissionDenied(
                 "Cannot use CUDAPinnedPlace in CPU only version, "
                 "Please recompile or reinstall Paddle with CUDA support."));
...
@@ -1749,6 +1759,7 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("init_devices", []() { framework::InitDevices(); });
  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+ m.def("is_compiled_with_rocm", IsCompiledWithROCM);
  m.def("is_compiled_with_xpu", IsCompiledWithXPU);
  m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
  m.def("supports_bfloat16", SupportsBfloat16);
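The added binding follows its is_compiled_with_cuda/xpu siblings: a C++ predicate resolved from build flags, exposed on the core module so Python code can call core.is_compiled_with_rocm(). A self-contained pybind11 sketch of that pattern (module and function names here are illustrative, not Paddle's):

    // sketch.cc -- mirrors the feature-flag bindings above.
    // Assumed build line: c++ -shared -std=c++14 -fPIC \
    //   $(python3 -m pybind11 --includes) sketch.cc -o sketch.so
    #include <pybind11/pybind11.h>

    // Resolved at compile time, like IsCompiledWithROCM over PADDLE_WITH_HIP.
    bool IsCompiledWithRocmSketch() {
    #ifdef PADDLE_WITH_HIP
      return true;
    #else
      return false;
    #endif
    }

    PYBIND11_MODULE(sketch, m) {
      m.def("is_compiled_with_rocm", IsCompiledWithRocmSketch);
    }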
...
@@ -1793,7 +1804,7 @@ All parameter, weight, gradient are variables in Paddle.
        py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
        py::arg("redirect_stderr") = false);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
    // Only GPUs with Compute Capability >= 53 support float16
    return platform::GetCUDAComputeCapability(place.device) >= 53;
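Only the guard changes here; the predicate still keys on compute capability 53, the earliest architecture with full float16 support. The capability query can be approximated with the raw CUDA runtime API (an illustrative sketch, not platform::GetCUDAComputeCapability itself; HIP builds expose the analogous hipGetDeviceProperties):

    #include <cuda_runtime.h>

    // Sketch: returns major*10 + minor, e.g. 70 for Volta, 53 for the
    // earliest float16-capable architecture checked above.
    int ComputeCapabilitySketch(int device_id) {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, device_id);
      return prop.major * 10 + prop.minor;
    }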
...
@@ -1967,10 +1978,10 @@ All parameter, weight, gradient are variables in Paddle.
        py::return_value_policy::take_ownership);
  m.def("op_support_gpu", OpSupportGPU);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
-#ifndef _WIN32
+#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32)
  m.def("nvprof_init", platform::CudaProfilerInit);
  m.def("nvprof_start", platform::CudaProfilerStart);
  m.def("nvprof_stop", platform::CudaProfilerStop);
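Note the asymmetry in this hunk: the device-count binding now covers both vendors, while the nvprof_* hooks gain an explicit !defined(PADDLE_WITH_HIP) clause because they wrap NVIDIA's profiler control API, for which this code path has no ROCm counterpart. Roughly what such wrappers delegate to (illustrative sketch, not the Paddle source):

    #include <cuda_profiler_api.h>

    // Sketch: bracket a region so that profiling started with
    // `nvprof --profile-from-start off` records only the code in between.
    void ProfiledRegionSketch() {
      cudaProfilerStart();
      // ... launch the kernels to be profiled ...
      cudaProfilerStop();
    }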
...
@@ -2015,7 +2026,7 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("size_of_dtype", framework::SizeOfType);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
  m.def("get_cublas_switch", platform::AllowTF32Cublas);
  m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
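These switches toggle TF32 math for cuBLAS and cuDNN under the widened guard. On the cuBLAS side the effect is approximately a math-mode change on the handle; a sketch against the CUDA 11 API, assuming this is what SetAllowTF32Cublas ultimately drives (not Paddle's implementation):

    #include <cublas_v2.h>

    // Sketch: with TF32 allowed, FP32 GEMMs may run on tensor cores
    // (Ampere and newer); disabling it restores full FP32 arithmetic.
    void ApplyTf32SwitchSketch(cublasHandle_t handle, bool allow_tf32) {
      cublasSetMathMode(handle, allow_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH
                                           : CUBLAS_DEFAULT_MATH);
    }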
...
@@ -2847,7 +2858,8 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_PSLIB
  BindHeterWrapper(&m);
#endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
  BindPSGPUWrapper(&m);
#endif
  BindGlooWrapper(&m);
...
@@ -2855,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_BOX_PS
  BindBoxWrapper(&m);
#endif
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  BindNCCLWrapper(&m);
#endif
#ifdef PADDLE_WITH_GLOO
...
paddle/fluid/pybind/tensor_py.h
View file @ 28b356b9
...
@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/bfloat16.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device_context.h"
...
@@ -226,7 +226,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
    paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
#endif
  } else if (platform::is_gpu_place(self.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    const T *a = self.data<T>();
    auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
    paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
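TensorGetElement reads one element by staging a single sizeof(T) device-to-host copy into a stack variable; with the widened guard the same code now compiles under HIP, where memory::Copy dispatches to the ROCm runtime. The GPU branch boils down to something like this, written against the raw CUDA runtime for illustration (not the Paddle source):

    #include <cuda_runtime.h>

    // Sketch: read element `offset` of a device array into host memory.
    template <typename T>
    T GetElementSketch(const T *device_data, size_t offset) {
      T value;
      cudaMemcpy(&value, device_data + offset, sizeof(T),
                 cudaMemcpyDeviceToHost);
      return value;
    }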
...
@@ -250,7 +250,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
    paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
#endif
  } else if (platform::is_gpu_place(self->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
    T *a = self->mutable_data<T>(p);
    paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
...
@@ -296,7 +296,7 @@ void SetTensorFromPyArrayT(
        "Please recompile or reinstall Paddle with XPU support."));
#endif
  } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (paddle::platform::is_gpu_place(place)) {
      // NOTE(wangxi): When copying data to the accelerator card,
      // we need set_device(dev_id) first.
...
@@ -304,8 +304,13 @@ void SetTensorFromPyArrayT(
      platform::CUDADeviceGuard guard(
          BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device);
      auto dst = self->mutable_data<T>(place);
+#ifdef PADDLE_WITH_HIP
+      paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
+                                      hipMemcpyHostToDevice);
+#else
      paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
                                      cudaMemcpyHostToDevice);
+#endif
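This is the one hunk where widening the guard is not enough: the copy direction is named by a vendor-specific enum (hipMemcpyHostToDevice vs cudaMemcpyHostToDevice), so the GpuMemcpySync call is duplicated under an inner #ifdef. A condensed sketch of the idiom (illustrative, not the Paddle source):

    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    #else
    #include <cuda_runtime.h>
    #endif

    // Sketch: same host-to-device copy, differing only in runtime and enum.
    void CopyHostToDeviceSketch(void *dst, const void *src, size_t nbytes) {
    #ifdef PADDLE_WITH_HIP
      hipMemcpy(dst, src, nbytes, hipMemcpyHostToDevice);
    #else
      cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
    #endif
    }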
    } else if (paddle::platform::is_cuda_pinned_place(place)) {
      auto dst = self->mutable_data<T>(place);
...
@@ -474,7 +479,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
                         self.type());
#endif
  } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::is_cuda_pinned_place(place)) {
      output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
                           self.type());
...
@@ -707,7 +712,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
        "Please recompile or reinstall Paddle with XPU support."));
#endif
  } else if (is_gpu_tensor) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
    PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
                      platform::errors::InvalidArgument(
...