Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
c8fac5ee
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
c8fac5ee
编写于
2月 26, 2021
作者:
Q
Qi Li
提交者:
GitHub
2月 26, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[ROCM] update fluid framework for rocm (part5), test=develop (#31014)
上级
580447d0
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
135 addition
and
48 deletion
+135
-48
paddle/fluid/framework/garbage_collector.cc
paddle/fluid/framework/garbage_collector.cc
+12
-3
paddle/fluid/framework/garbage_collector.h
paddle/fluid/framework/garbage_collector.h
+3
-3
paddle/fluid/framework/heter_service.h
paddle/fluid/framework/heter_service.h
+1
-1
paddle/fluid/framework/heterbox_trainer.cc
paddle/fluid/framework/heterbox_trainer.cc
+22
-7
paddle/fluid/framework/lod_tensor.h
paddle/fluid/framework/lod_tensor.h
+1
-1
paddle/fluid/framework/lod_tensor_test.cu
paddle/fluid/framework/lod_tensor_test.cu
+12
-2
paddle/fluid/framework/mixed_vector.h
paddle/fluid/framework/mixed_vector.h
+1
-1
paddle/fluid/framework/mixed_vector_test.cu
paddle/fluid/framework/mixed_vector_test.cu
+25
-1
paddle/fluid/framework/op_registry.h
paddle/fluid/framework/op_registry.h
+1
-1
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+5
-1
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+1
-1
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+23
-12
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+1
-1
paddle/fluid/framework/pipeline_trainer.cc
paddle/fluid/framework/pipeline_trainer.cc
+1
-1
paddle/fluid/framework/ps_gpu_trainer.cc
paddle/fluid/framework/ps_gpu_trainer.cc
+2
-1
paddle/fluid/framework/ps_gpu_worker.cc
paddle/fluid/framework/ps_gpu_worker.cc
+2
-1
paddle/fluid/framework/pull_dense_worker.cc
paddle/fluid/framework/pull_dense_worker.cc
+11
-7
paddle/fluid/framework/save_load_util.cc
paddle/fluid/framework/save_load_util.cc
+1
-1
paddle/fluid/framework/section_worker.cc
paddle/fluid/framework/section_worker.cc
+10
-2
未找到文件。
paddle/fluid/framework/garbage_collector.cc
浏览文件 @
c8fac5ee
...
...
@@ -13,7 +13,7 @@
// limitations under the License.
#include <functional>
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "gflags/gflags.h"
...
...
@@ -53,7 +53,7 @@ void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
}
#endif
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
UnsafeFastGPUGarbageCollector
::
UnsafeFastGPUGarbageCollector
(
const
platform
::
CUDAPlace
&
place
,
size_t
max_memory_size
)
:
GarbageCollector
(
place
,
max_memory_size
)
{}
...
...
@@ -82,18 +82,27 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
size_t
max_memory_size
)
:
GarbageCollector
(
place
,
max_memory_size
)
{
platform
::
CUDADeviceGuard
guard
(
place
.
device
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamCreate
(
&
stream_
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamCreate
(
&
stream_
));
#endif
callback_manager_
.
reset
(
new
platform
::
StreamCallbackManager
(
stream_
));
}
StreamGarbageCollector
::~
StreamGarbageCollector
()
{
auto
place
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
this
->
dev_ctx_
->
GetPlace
());
platform
::
CUDADeviceGuard
guard
(
place
.
device
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamSynchronize
(
stream_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamDestroy
(
stream_
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamDestroy
(
stream_
));
#endif
}
cuda
Stream_t
StreamGarbageCollector
::
stream
()
const
{
return
stream_
;
}
gpu
Stream_t
StreamGarbageCollector
::
stream
()
const
{
return
stream_
;
}
void
StreamGarbageCollector
::
Wait
()
const
{
callback_manager_
->
Wait
();
}
...
...
paddle/fluid/framework/garbage_collector.h
浏览文件 @
c8fac5ee
...
...
@@ -80,7 +80,7 @@ class XPUGarbageCollector : public GarbageCollector {
};
#endif
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class
UnsafeFastGPUGarbageCollector
:
public
GarbageCollector
{
public:
UnsafeFastGPUGarbageCollector
(
const
platform
::
CUDAPlace
&
place
,
...
...
@@ -110,13 +110,13 @@ class StreamGarbageCollector : public GarbageCollector {
void
Wait
()
const
override
;
cuda
Stream_t
stream
()
const
;
gpu
Stream_t
stream
()
const
;
protected:
void
ClearCallback
(
const
std
::
function
<
void
()
>
&
callback
)
override
;
private:
cuda
Stream_t
stream_
;
gpu
Stream_t
stream_
;
std
::
unique_ptr
<
platform
::
StreamCallbackManager
>
callback_manager_
;
};
...
...
paddle/fluid/framework/heter_service.h
浏览文件 @
c8fac5ee
...
...
@@ -152,7 +152,7 @@ class HeterObjectPool {
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
if
(
pool_
.
empty
())
{
num_
+=
1
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
VLOG
(
0
)
<<
"pool construct size: "
<<
num_
;
#endif
return
std
::
make_shared
<
T
>
();
...
...
paddle/fluid/framework/heterbox_trainer.cc
浏览文件 @
c8fac5ee
...
...
@@ -21,9 +21,10 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
namespace
paddle
{
...
...
@@ -48,16 +49,25 @@ void HeterBoxTrainer::Initialize(const TrainerDesc& trainer_desc,
dataset
->
GetReaders
();
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
int
num
=
trainer_desc
.
worker_places
(
i
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
num
);
platform
::
CUDADeviceGuard
guard
(
place
.
device
);
cudaStream_t
stream
;
gpuStream_t
stream
;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamCreate
(
&
stream
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamCreate
(
&
stream
));
#endif
copy_streams_
.
push_back
(
stream
);
places_
.
push_back
(
place
);
cudaEvent_t
event
;
gpuEvent_t
event
;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
event
,
hipEventDisableTiming
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
event
,
cudaEventDisableTiming
));
#endif
events_
.
push_back
(
event
);
#endif
#ifdef PADDLE_WITH_XPU
...
...
@@ -140,8 +150,13 @@ void HeterBoxTrainer::InitTrainerEnv(const ProgramDesc& main_program,
_ForEachDataType_
(
HeterMemcpyFunc
);
}
}
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventRecord
(
event
,
stream
));
hipEventSynchronize
(
event
);
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
event
,
stream
));
cudaEventSynchronize
(
event
);
#endif
}
place_
=
place
;
}
...
...
@@ -150,7 +165,7 @@ template <typename T>
void
HeterBoxTrainer
::
HeterMemCpy
(
LoDTensor
*
thread_tensor
,
LoDTensor
*
root_tensor
,
const
paddle
::
platform
::
Place
&
thread_place
,
cuda
Stream_t
stream
)
{
gpu
Stream_t
stream
)
{
T
*
thread_ptr
=
thread_tensor
->
mutable_data
<
T
>
(
root_tensor
->
dims
(),
thread_place
);
T
*
root_ptr
=
root_tensor
->
data
<
T
>
();
...
...
@@ -171,7 +186,7 @@ void HeterBoxTrainer::InitOtherEnv(const ProgramDesc& main_program) {
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
pull_dense_worker_
->
AddThreadScope
(
workers_
[
i
]
->
GetThreadScope
());
pull_dense_worker_
->
AddPlace
(
places_
[
i
]);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
pull_dense_worker_
->
AddStream
(
copy_streams_
[
i
]);
#endif
}
...
...
paddle/fluid/framework/lod_tensor.h
浏览文件 @
c8fac5ee
...
...
@@ -18,7 +18,7 @@ limitations under the License. */
#include <string>
#include <utility>
#include <vector>
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#endif
...
...
paddle/fluid/framework/lod_tensor_test.cu
浏览文件 @
c8fac5ee
...
...
@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include "gtest/gtest.h"
...
...
@@ -34,8 +32,14 @@ TEST(LoD, data) {
auto
&
v
=
lod
[
0
];
paddle
::
platform
::
CUDAPlace
gpu
(
0
);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
test
,
dim3
(
1
),
dim3
(
1
),
0
,
0
,
v
.
CUDAMutableData
(
gpu
),
v
.
size
());
hipDeviceSynchronize
();
#else
test
<<<
1
,
1
>>>
(
v
.
CUDAMutableData
(
gpu
),
v
.
size
());
cudaDeviceSynchronize
();
#endif
for
(
size_t
i
=
0
;
i
<
v
.
size
();
++
i
)
{
EXPECT_EQ
(
v
[
i
],
i
*
2
);
}
...
...
@@ -59,8 +63,14 @@ TEST(LoDTensor, LoDInGPU) {
auto
lod
=
lod_tensor
.
lod
();
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
test
,
dim3
(
1
),
dim3
(
8
),
0
,
0
,
lod
[
0
].
CUDAMutableData
(
place
),
lod
[
0
].
size
());
hipDeviceSynchronize
();
#else
test
<<<
1
,
8
>>>
(
lod
[
0
].
CUDAMutableData
(
place
),
lod
[
0
].
size
());
cudaDeviceSynchronize
();
#endif
for
(
size_t
i
=
0
;
i
<
src_lod
[
0
].
size
();
++
i
)
{
EXPECT_EQ
(
lod
[
0
].
data
()[
i
],
src_lod
[
0
].
data
()[
i
]
*
2
);
...
...
paddle/fluid/framework/mixed_vector.h
浏览文件 @
c8fac5ee
...
...
@@ -31,7 +31,7 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA)
|| defined(PADDLE_WITH_HIP)
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template
<
typename
T
>
...
...
paddle/fluid/framework/mixed_vector_test.cu
浏览文件 @
c8fac5ee
...
...
@@ -12,7 +12,13 @@
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <memory>
#include "glog/logging.h"
...
...
@@ -22,6 +28,7 @@
template
<
typename
T
>
using
vec
=
paddle
::
framework
::
Vector
<
T
>
;
using
gpuStream_t
=
paddle
::
gpuStream_t
;
static
__global__
void
multiply_10
(
int
*
ptr
)
{
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
...
...
@@ -29,7 +36,7 @@ static __global__ void multiply_10(int* ptr) {
}
}
cuda
Stream_t
GetCUDAStream
(
paddle
::
platform
::
CUDAPlace
place
)
{
gpu
Stream_t
GetCUDAStream
(
paddle
::
platform
::
CUDAPlace
place
)
{
return
reinterpret_cast
<
const
paddle
::
platform
::
CUDADeviceContext
*>
(
paddle
::
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
...
...
@@ -43,7 +50,12 @@ TEST(mixed_vector, GPU_VECTOR) {
ASSERT_EQ
(
tmp
.
size
(),
10UL
);
paddle
::
platform
::
CUDAPlace
gpu
(
0
);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
multiply_10
,
dim3
(
1
),
dim3
(
1
),
0
,
GetCUDAStream
(
gpu
),
tmp
.
MutableData
(
gpu
));
#else
multiply_10
<<<
1
,
1
,
0
,
GetCUDAStream
(
gpu
)
>>>
(
tmp
.
MutableData
(
gpu
));
#endif
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
ASSERT_EQ
(
tmp
[
i
],
i
*
10
);
...
...
@@ -64,11 +76,23 @@ TEST(mixed_vector, MultiGPU) {
ASSERT_EQ
(
tmp
.
size
(),
10UL
);
paddle
::
platform
::
CUDAPlace
gpu0
(
0
);
paddle
::
platform
::
SetDeviceId
(
0
);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
multiply_10
,
dim3
(
1
),
dim3
(
1
),
0
,
GetCUDAStream
(
gpu0
),
tmp
.
MutableData
(
gpu0
));
#else
multiply_10
<<<
1
,
1
,
0
,
GetCUDAStream
(
gpu0
)
>>>
(
tmp
.
MutableData
(
gpu0
));
#endif
paddle
::
platform
::
CUDAPlace
gpu1
(
1
);
auto
*
gpu1_ptr
=
tmp
.
MutableData
(
gpu1
);
paddle
::
platform
::
SetDeviceId
(
1
);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
multiply_10
,
dim3
(
1
),
dim3
(
1
),
0
,
GetCUDAStream
(
gpu1
),
gpu1_ptr
);
#else
multiply_10
<<<
1
,
1
,
0
,
GetCUDAStream
(
gpu1
)
>>>
(
gpu1_ptr
);
#endif
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
ASSERT_EQ
(
tmp
[
i
],
i
*
100
);
}
...
...
paddle/fluid/framework/op_registry.h
浏览文件 @
c8fac5ee
...
...
@@ -369,7 +369,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
// TODO(fengjiayi): The following macros
// seems ugly, do we have better method?
#if
ndef PADDLE_WITH_CUDA
#if
!defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
#else
#define USE_OP_KERNEL(op_type) \
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
c8fac5ee
...
...
@@ -193,7 +193,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
try
{
VLOG
(
4
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
if
(
platform
::
is_gpu_place
(
place
))
{
#if
ndef PADDLE_WITH_CUDA
#if
!defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with CUDA support."
,
...
...
@@ -1166,6 +1166,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaGetLastError
());
VLOG
(
4
)
<<
"Operator("
<<
Type
()
<<
"): context wait and get last error"
;
#endif
#if defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipGetLastError
());
VLOG
(
4
)
<<
"Operator("
<<
Type
()
<<
"): context wait and get last error"
;
#endif
}
...
...
paddle/fluid/framework/operator.h
浏览文件 @
c8fac5ee
...
...
@@ -384,7 +384,7 @@ class ExecutionContext {
return
device_context_
;
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const
inline
platform
::
CUDADeviceContext
&
cuda_device_context
()
const
{
PADDLE_ENFORCE_EQ
(
platform
::
is_gpu_place
(
device_context_
.
GetPlace
()),
true
,
platform
::
errors
::
PreconditionNotMet
(
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
c8fac5ee
...
...
@@ -37,7 +37,7 @@ limitations under the License. */
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/profiler.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
...
...
@@ -60,7 +60,7 @@ static std::once_flag gProfileOnce;
static
bool
gProfileStarted
=
false
;
#endif
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std
::
once_flag
p2p_init_flag
;
#endif
...
...
@@ -132,7 +132,7 @@ class ParallelExecutorPrivate {
}
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
void
InitNCCLCtxs
(
framework
::
Scope
*
scope
,
const
BuildStrategy
&
bst
)
{
VLOG
(
1
)
<<
"nccl comm num:"
<<
bst
.
nccl_comm_num_
<<
", nranks:"
<<
nranks_
<<
", num_trainers:"
<<
bst
.
num_trainers_
...
...
@@ -371,7 +371,7 @@ class ParallelExecutorPrivate {
std
::
unordered_map
<
std
::
string
,
bool
>
is_persistable_
;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
platform
::
NCCLCommunicator
*
nccl_ctxs_
{
nullptr
};
#elif defined(PADDLE_WITH_XPU_BKCL)
platform
::
BKCLCommunicator
*
bkcl_ctxs_
{
nullptr
};
...
...
@@ -483,7 +483,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
}
std
::
unique_ptr
<
GarbageCollector
>
gc
;
if
(
platform
::
is_gpu_place
(
place
))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
IsFastEagerDeletionModeEnabled
())
{
gc
.
reset
(
new
UnsafeFastGPUGarbageCollector
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
),
max_memory_size
));
...
...
@@ -572,7 +572,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() {
}
void
InitP2P
(
const
std
::
vector
<
platform
::
Place
>
&
places
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std
::
call_once
(
p2p_init_flag
,
[
&
]()
{
int
count
=
places
.
size
();
if
(
count
<=
1
)
return
;
...
...
@@ -590,14 +590,24 @@ void InitP2P(const std::vector<platform::Place> &places) {
for
(
int
j
=
0
;
j
<
count
;
++
j
)
{
if
(
devices
[
i
]
==
devices
[
j
])
continue
;
int
can_acess
=
-
1
;
#ifdef PADDLE_WITH_HIP
hipError_t
ret
=
hipDeviceCanAccessPeer
(
&
can_acess
,
devices
[
i
],
devices
[
j
]);
if
(
ret
!=
hipSuccess
||
can_acess
!=
1
)
{
#else
cudaError_t
ret
=
cudaDeviceCanAccessPeer
(
&
can_acess
,
devices
[
i
],
devices
[
j
]);
if
(
ret
!=
cudaSuccess
||
can_acess
!=
1
)
{
#endif
LOG
(
WARNING
)
<<
"Cannot enable P2P access from "
<<
devices
[
i
]
<<
" to "
<<
devices
[
j
];
}
else
{
platform
::
CUDADeviceGuard
guard
(
devices
[
i
]);
#ifdef PADDLE_WITH_HIP
hipDeviceEnablePeerAccess
(
devices
[
j
],
0
);
#else
cudaDeviceEnablePeerAccess
(
devices
[
j
],
0
);
#endif
}
}
}
...
...
@@ -630,7 +640,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
BuildStrategy
::
ReduceStrategy
::
kAllReduce
;
member_
->
use_all_reduce_
=
true
;
}
#if
defined(PADDLE_WITH_CUDA
) && defined(_WIN32)
#if
(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
) && defined(_WIN32)
if
(
member_
->
IsUseCUDA
(
member_
->
use_device_
))
{
PADDLE_ENFORCE_EQ
(
places
.
size
(),
1
,
...
...
@@ -638,7 +648,8 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
}
#endif
#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
(!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL))
if
(
member_
->
IsUseCUDA
(
member_
->
use_device_
))
{
PADDLE_ENFORCE_EQ
(
places
.
size
(),
1
,
...
...
@@ -710,7 +721,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
}
if
(
member_
->
IsUseCUDA
(
member_
->
use_device_
)
&&
member_
->
nranks_
>
1
)
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
member_
->
InitOrGetNCCLCommunicator
(
scope
,
&
member_
->
build_strategy_
);
// Initialize device context's nccl comm, will be used by normal
...
...
@@ -774,7 +785,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
std
::
vector
<
ir
::
Graph
*>
async_graphs
(
places
.
size
());
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
if
(
member_
->
build_strategy_
.
async_mode_
)
{
VLOG
(
3
)
<<
"use local async mode"
;
graph
=
member_
->
build_strategy_
.
Apply
(
...
...
@@ -885,7 +896,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
final_graphs
=
async_graphs
;
}
else
if
(
member_
->
build_strategy_
.
enable_parallel_graph_
)
{
VLOG
(
3
)
<<
"use ParallelSSAGraphExecutor"
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// TODO(Yancey1989): Remove passing in the main_program when
// allreduce_seq_pass doesn't need it as the attr.
bool
is_inference
=
details
::
IsDataParallelInferenceGraph
(
*
graph
);
...
...
@@ -996,7 +1007,7 @@ void ParallelExecutor::BCastParamsToDevices(
}
auto
&
dims
=
main_tensor
.
dims
();
if
(
paddle
::
platform
::
is_gpu_place
(
main_tensor
.
place
()))
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
std
::
vector
<
void
*>
buffers
;
buffers
.
reserve
(
member_
->
places_
.
size
());
size_t
numel
=
main_tensor
.
numel
();
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
c8fac5ee
...
...
@@ -32,7 +32,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
...
...
paddle/fluid/framework/pipeline_trainer.cc
浏览文件 @
c8fac5ee
...
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
...
...
paddle/fluid/framework/ps_gpu_trainer.cc
浏览文件 @
c8fac5ee
...
...
@@ -24,7 +24,8 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
#include "paddle/fluid/platform/cuda_device_guard.h"
namespace
paddle
{
...
...
paddle/fluid/framework/ps_gpu_worker.cc
浏览文件 @
c8fac5ee
...
...
@@ -19,7 +19,8 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/string/string_helper.h"
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
#include "paddle/fluid/platform/cuda_device_guard.h"
#if defined _WIN32 || defined __APPLE__
...
...
paddle/fluid/framework/pull_dense_worker.cc
浏览文件 @
c8fac5ee
...
...
@@ -59,17 +59,19 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
current_version_
[
tid
]
=
0
;
}
fleet_ptr_
=
FleetWrapper
::
GetInstance
();
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
copy_streams_
.
clear
();
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
places_
.
clear
();
thread_scopes_
.
clear
();
#endif
}
void
PullDenseWorker
::
CreatePinVar
()
{
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
// for (auto& v : dense_value_names_) {
// for (auto& name : v.second) {
for
(
int
i
=
0
;
i
<
dwp_param_
.
program_config
(
0
).
pull_dense_table_id_size
();
...
...
@@ -84,7 +86,7 @@ void PullDenseWorker::CreatePinVar() {
auto
*
ptr
=
root_scope_
->
Var
(
name
+
"pin"
);
InitializeVariable
(
ptr
,
proto
::
VarType
::
LOD_TENSOR
);
LoDTensor
*
pin_tensor
=
ptr
->
GetMutable
<
LoDTensor
>
();
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
pin_tensor
->
mutable_data
<
float
>
(
tensor
->
dims
(),
platform
::
CUDAPinnedPlace
());
#endif
...
...
@@ -113,7 +115,8 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
exit
(
-
1
);
}
status_vec
->
resize
(
0
);
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
// for (auto& v : dense_value_names_) {
...
...
@@ -131,7 +134,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
Variable
*
var
=
thread_scopes_
[
i
]
->
FindVar
(
name
);
LoDTensor
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
float
*
w
=
tensor
->
data
<
float
>
();
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
places_
[
i
]),
w
,
platform
::
CUDAPinnedPlace
(),
pin_w
,
sizeof
(
float
)
*
tensor
->
numel
(),
copy_streams_
[
i
]);
...
...
@@ -161,7 +164,8 @@ void PullDenseWorker::PullDense(bool force_update) {
uint64_t
tid
=
static_cast
<
uint64_t
>
(
dwp_param_
.
program_config
(
0
).
pull_dense_table_id
(
i
));
if
(
force_update
||
CheckUpdateParam
(
tid
))
{
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
VLOG
(
3
)
<<
"pull dense "
<<
force_update
<<
" "
<<
tid
;
fleet_ptr_
->
PullDenseVarsAsync
(
*
root_scope_
,
tid
,
dense_value_names_
[
tid
],
&
pull_dense_status_
,
false
);
...
...
paddle/fluid/framework/save_load_util.cc
浏览文件 @
c8fac5ee
...
...
@@ -297,7 +297,7 @@ bool SaveTensorToDisk(const std::string& file_name,
tensor
->
numel
()
*
framework
::
SizeOfType
(
tensor
->
type
());
auto
*
data_ptr
=
tensor
->
data
<
void
>
();
if
(
platform
::
is_gpu_place
(
tensor
->
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
framework
::
Tensor
temp
;
TensorCopySync
(
*
tensor
,
platform
::
CPUPlace
(),
&
temp
);
data_ptr
=
temp
.
data
<
void
>
();
...
...
paddle/fluid/framework/section_worker.cc
浏览文件 @
c8fac5ee
...
...
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include <float.h>
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
...
...
@@ -38,7 +38,7 @@ void SectionWorker::TrainFiles() {
std
::
unique_ptr
<
GarbageCollector
>
gc
;
auto
unused_vars_
=
GetUnusedVars
(
program_
->
Block
(
0
),
ops_
,
skip_vars_
);
if
(
max_memory_size
>=
0
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
is_gpu_place
(
place_
))
{
if
(
IsFastEagerDeletionModeEnabled
())
{
gc
.
reset
(
new
UnsafeFastGPUGarbageCollector
(
...
...
@@ -70,7 +70,11 @@ void SectionWorker::TrainFiles() {
}
}
}
#ifdef PADDLE_WITH_RCCL
hipDeviceSynchronize
();
#else
cudaDeviceSynchronize
();
#endif
}
// backward pass
...
...
@@ -89,7 +93,11 @@ void SectionWorker::TrainFiles() {
}
}
}
#ifdef PADDLE_WITH_RCCL
hipDeviceSynchronize
();
#else
cudaDeviceSynchronize
();
#endif
}
// update pass
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录