Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
f13c3a9c
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f13c3a9c
编写于
12月 16, 2020
作者:
L
liuyuhui
提交者:
GitHub
12月 16, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Kunlun] PR1:Support one Kunlun card training in parallel executor (#29337)
上级
76738504
变更
20
显示空白变更内容
内联
并排
Showing
20 changed file
with
282 addition
and
87 deletion
+282
-87
paddle/fluid/framework/details/broadcast_op_handle_test.h
paddle/fluid/framework/details/broadcast_op_handle_test.h
+6
-2
paddle/fluid/framework/details/execution_strategy.h
paddle/fluid/framework/details/execution_strategy.h
+6
-1
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
...uid/framework/details/fast_threaded_ssa_graph_executor.cc
+1
-1
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
...fluid/framework/details/fused_broadcast_op_handle_test.cc
+5
-2
paddle/fluid/framework/details/gather_op_handle_test.cc
paddle/fluid/framework/details/gather_op_handle_test.cc
+4
-1
paddle/fluid/framework/details/op_handle_base.cc
paddle/fluid/framework/details/op_handle_base.cc
+4
-3
paddle/fluid/framework/details/op_handle_base.h
paddle/fluid/framework/details/op_handle_base.h
+2
-1
paddle/fluid/framework/details/reduce_op_handle_test.cc
paddle/fluid/framework/details/reduce_op_handle_test.cc
+6
-2
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+15
-1
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+1
-1
paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
...optimize_pass/test_reference_count_pass_last_lived_ops.cc
+2
-1
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+64
-36
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+28
-12
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+14
-9
python/paddle/fluid/compiler.py
python/paddle/fluid/compiler.py
+25
-11
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+47
-0
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
...luid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
+1
-1
python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py
python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py
+47
-0
python/paddle/static/__init__.py
python/paddle/static/__init__.py
+2
-1
tools/wlist.json
tools/wlist.json
+2
-1
未找到文件。
paddle/fluid/framework/details/broadcast_op_handle_test.h
浏览文件 @
f13c3a9c
...
...
@@ -33,6 +33,8 @@ struct VarHandle;
namespace
f
=
paddle
::
framework
;
namespace
p
=
paddle
::
platform
;
using
UseDevice
=
paddle
::
framework
::
details
::
ExecutionStrategy
::
UseDevice
;
// test data amount
const
f
::
DDim
kDims
=
{
20
,
20
};
...
...
@@ -273,7 +275,8 @@ struct TestBroadcastOpHandle {
f
::
LoD
lod
{{
0
,
10
,
20
}};
auto
send_vector
=
InitLoDTensor
(
"input"
,
input_scope_idx
,
lod
);
op_handle_
->
Run
(
false
);
UseDevice
use_device
=
UseDevice
::
kCPU
;
op_handle_
->
Run
(
use_device
);
WaitAll
();
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
...
...
@@ -287,7 +290,8 @@ struct TestBroadcastOpHandle {
int
height
=
static_cast
<
int
>
(
kDims
[
0
]
*
2
);
auto
send_vector
=
InitSelectedRows
(
"input"
,
input_scope_idx
,
rows
,
height
);
op_handle_
->
Run
(
false
);
UseDevice
use_device
=
UseDevice
::
kCPU
;
op_handle_
->
Run
(
use_device
);
WaitAll
();
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
...
...
paddle/fluid/framework/details/execution_strategy.h
浏览文件 @
f13c3a9c
...
...
@@ -21,10 +21,15 @@ namespace details {
struct
ExecutionStrategy
{
enum
ExecutorType
{
kDefault
=
0
,
kExperimental
=
1
};
enum
UseDevice
{
kCPU
=
0
,
kCUDA
=
1
,
kXPU
=
2
,
};
// num_threads indicates the size of thread pool.
size_t
num_threads_
{
0
};
bool
use_cuda_
{
true
};
UseDevice
use_device_
{
kCUDA
};
// Note that allow_op_delay is invalid now.
bool
allow_op_delay_
{
false
};
// num_iteration_per_drop_scope indicates how many
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
浏览文件 @
f13c3a9c
...
...
@@ -330,7 +330,7 @@ bool FastThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) {
try
{
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
if
(
LIKELY
(
!
strategy_
.
dry_run_
))
{
op
->
Run
(
strategy_
.
use_
cuda
_
);
op
->
Run
(
strategy_
.
use_
device
_
);
}
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
return
true
;
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
浏览文件 @
f13c3a9c
...
...
@@ -32,6 +32,7 @@ namespace framework {
namespace
details
{
struct
VarHandle
;
using
UseDevice
=
paddle
::
framework
::
details
::
ExecutionStrategy
::
UseDevice
;
struct
TestFusedBroadcastOpHandle
:
TestBroadcastOpHandle
{
std
::
vector
<
std
::
string
>
out_varnames_
;
...
...
@@ -108,7 +109,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
InitLoDTensor
(
varname
,
input_scope_idxes
[
i
],
lod
,
val_scalar
));
}
op_handle_
->
Run
(
false
);
UseDevice
use_device
=
UseDevice
::
kCPU
;
op_handle_
->
Run
(
use_device
);
WaitAll
();
for
(
size_t
i
=
0
;
i
<
input_scope_idxes
.
size
();
++
i
)
{
...
...
@@ -131,7 +133,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
rows
,
height
,
val_scalar
));
}
op_handle_
->
Run
(
false
);
UseDevice
use_device
=
UseDevice
::
kCPU
;
op_handle_
->
Run
(
use_device
);
WaitAll
();
for
(
size_t
i
=
0
;
i
<
input_scope_idxes
.
size
();
++
i
)
{
...
...
paddle/fluid/framework/details/gather_op_handle_test.cc
浏览文件 @
f13c3a9c
...
...
@@ -27,6 +27,8 @@ struct DummyVarHandle;
namespace
f
=
paddle
::
framework
;
namespace
p
=
paddle
::
platform
;
using
UseDevice
=
paddle
::
framework
::
details
::
ExecutionStrategy
::
UseDevice
;
// test data amount
const
f
::
DDim
kDims
=
{
20
,
20
};
...
...
@@ -171,7 +173,8 @@ struct TestGatherOpHandle {
out_selected_rows
->
mutable_value
()
->
ShareDataWith
(
in_selected_rows
->
value
());
op_handle_
->
Run
(
false
);
UseDevice
use_device
=
UseDevice
::
kCPU
;
op_handle_
->
Run
(
use_device
);
WaitAll
();
...
...
paddle/fluid/framework/details/op_handle_base.cc
浏览文件 @
f13c3a9c
...
...
@@ -85,13 +85,14 @@ void OpHandleBase::InitCUDA() {
#endif
}
void
OpHandleBase
::
Run
(
bool
use_cuda
)
{
void
OpHandleBase
::
Run
(
ExecutionStrategy
::
UseDevice
use_device
)
{
#ifdef PADDLE_WITH_CUDA
if
(
events_
.
empty
()
&&
use_cuda
&&
dev_ctxes_
.
size
()
>
0
)
{
if
(
events_
.
empty
()
&&
use_device
==
ExecutionStrategy
::
UseDevice
::
kCUDA
&&
dev_ctxes_
.
size
()
>
0
)
{
InitCUDA
();
}
#else
PADDLE_ENFORCE_
EQ
(
use_cuda
,
false
,
PADDLE_ENFORCE_
NE
(
use_device
,
ExecutionStrategy
::
UseDevice
::
kCUDA
,
platform
::
errors
::
InvalidArgument
(
"Argument use_cuda should be false when Paddle is not "
"compiled with CUDA."
));
...
...
paddle/fluid/framework/details/op_handle_base.h
浏览文件 @
f13c3a9c
...
...
@@ -19,6 +19,7 @@
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/platform/device_context.h"
...
...
@@ -71,7 +72,7 @@ class OpHandleBase {
virtual
std
::
string
Name
()
const
=
0
;
void
Run
(
bool
use_cuda
);
void
Run
(
ExecutionStrategy
::
UseDevice
use_device
);
virtual
void
RecordWaitEventOnCtx
(
platform
::
DeviceContext
*
waited_ctx
);
...
...
paddle/fluid/framework/details/reduce_op_handle_test.cc
浏览文件 @
f13c3a9c
...
...
@@ -25,6 +25,8 @@ namespace details {
namespace
f
=
paddle
::
framework
;
namespace
p
=
paddle
::
platform
;
using
UseDevice
=
paddle
::
framework
::
details
::
ExecutionStrategy
::
UseDevice
;
// test data amount
const
f
::
DDim
kDims
=
{
20
,
20
};
...
...
@@ -196,7 +198,8 @@ struct TestReduceOpHandle {
out_selected_rows
->
mutable_value
()
->
ShareDataWith
(
in_selected_rows
->
value
());
op_handle_
->
Run
(
false
);
UseDevice
use_device
=
UseDevice
::
kCPU
;
op_handle_
->
Run
(
use_device
);
WaitAll
();
...
...
@@ -260,7 +263,8 @@ struct TestReduceOpHandle {
out_lodtensor
->
ShareDataWith
(
in_lodtensor
);
op_handle_
->
Run
(
false
);
UseDevice
use_device
=
UseDevice
::
kCPU
;
op_handle_
->
Run
(
use_device
);
WaitAll
();
...
...
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
浏览文件 @
f13c3a9c
...
...
@@ -58,6 +58,17 @@ struct ScaleLossGradFunctor {
auto
*
out_data
=
out_
->
mutable_data
<
OutT
>
(
place_
);
if
(
platform
::
is_cpu_place
(
place_
))
{
*
out_data
=
static_cast
<
OutT
>
(
coeff_
);
}
else
if
(
platform
::
is_xpu_place
(
place_
))
{
#if defined(PADDLE_WITH_XPU)
OutT
cast_coeff
=
static_cast
<
OutT
>
(
coeff_
);
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place_
),
out_data
,
platform
::
CPUPlace
(),
&
cast_coeff
,
SizeOfType
(
out_dtype_
));
VLOG
(
10
)
<<
place_
<<
"RUN Scale loss grad op"
;
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't use XPU device since it's not compiled with XPU,"
"Please recompile or reinstall Paddle with XPU support."
));
#endif
}
else
{
#ifdef PADDLE_WITH_CUDA
OutT
cast_coeff
=
static_cast
<
OutT
>
(
coeff_
);
...
...
@@ -66,7 +77,10 @@ struct ScaleLossGradFunctor {
platform
::
CPUPlace
(),
&
cast_coeff
,
SizeOfType
(
out_dtype_
),
stream
);
VLOG
(
10
)
<<
place_
<<
"RUN Scale loss grad op"
;
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't use CUDA device since it's not compiled with CUDA,"
"Please recompile or reinstall Paddle with GPU support."
));
#endif
}
}
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
f13c3a9c
...
...
@@ -348,7 +348,7 @@ bool ThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) {
try
{
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
if
(
LIKELY
(
!
strategy_
.
dry_run_
))
{
op
->
Run
(
strategy_
.
use_
cuda
_
);
op
->
Run
(
strategy_
.
use_
device
_
);
}
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
return
true
;
...
...
paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
浏览文件 @
f13c3a9c
...
...
@@ -88,7 +88,8 @@ class ReferenceCountPassTestHelper {
FLAGS_eager_delete_tensor_gb
=
-
1
;
details
::
ExecutionStrategy
exec_strategy
;
exec_strategy
.
use_cuda_
=
use_cuda
;
exec_strategy
.
use_device_
=
use_cuda
?
(
ExecutionStrategy
::
kCUDA
)
:
(
ExecutionStrategy
::
kCPU
);
executor_
.
reset
(
new
ParallelExecutor
(
CreatePlaces
(
1
,
use_cuda
),
{},
""
,
&
scope_
,
{},
exec_strategy
,
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
f13c3a9c
...
...
@@ -63,6 +63,8 @@ static bool gProfileStarted = false;
std
::
once_flag
p2p_init_flag
;
#endif
using
UseDevice
=
paddle
::
framework
::
details
::
ExecutionStrategy
::
UseDevice
;
class
ParallelExecutorPrivate
{
public:
ParallelExecutorPrivate
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
...
@@ -93,6 +95,8 @@ class ParallelExecutorPrivate {
}
}
bool
IsUseCUDA
(
UseDevice
use_device
);
void
SetHasFeed
(
size_t
dev_idx
,
bool
has_feed
=
true
);
bool
AllowPartialFeed
()
const
;
...
...
@@ -286,7 +290,7 @@ class ParallelExecutorPrivate {
platform
::
NCCLCommunicator
*
nccl_ctxs_
{
nullptr
};
#endif
bool
own_local_scope_
;
bool
use_cuda
_
;
UseDevice
use_device
_
;
bool
use_all_reduce_
;
size_t
nranks_
;
...
...
@@ -296,6 +300,10 @@ class ParallelExecutorPrivate {
details
::
ParallelSSAGraphExecutor
*
inference_executor_
{
nullptr
};
};
bool
ParallelExecutorPrivate
::
IsUseCUDA
(
UseDevice
use_device
)
{
return
use_device
==
UseDevice
::
kCUDA
;
}
void
ParallelExecutorPrivate
::
SetHasFeed
(
size_t
dev_idx
,
bool
has_feed
)
{
if
(
inference_executor_
)
{
inference_executor_
->
SetHasFeed
(
dev_idx
,
has_feed
);
...
...
@@ -340,7 +348,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
auto
addto_pass
=
ir
::
PassRegistry
::
Instance
().
Get
(
"inplace_addto_op_pass"
);
addto_pass
->
SetNotOwned
(
ir
::
kMemOptVarInfoMapList
,
&
mem_opt_var_infos_
);
addto_pass
->
SetNotOwned
(
ir
::
kLastLiveOpsOfVars
,
&
last_live_ops_of_vars
);
addto_pass
->
Set
NotOwned
(
ir
::
kUseCuda
,
&
use_cuda_
);
addto_pass
->
Set
(
ir
::
kUseCuda
,
new
bool
(
use_device_
==
UseDevice
::
kCUDA
)
);
VLOG
(
10
)
<<
"Start to apply inplace_addto_op_pass"
;
graph
=
addto_pass
->
Apply
(
graph
);
VLOG
(
10
)
<<
"inplace_addto_op_pass Applied"
;
...
...
@@ -351,7 +359,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
ir
::
PassRegistry
::
Instance
().
Get
(
"buffer_shared_inplace_pass"
);
inplace_pass
->
SetNotOwned
(
ir
::
kMemOptVarInfoMapList
,
&
mem_opt_var_infos_
);
inplace_pass
->
SetNotOwned
(
ir
::
kLastLiveOpsOfVars
,
&
last_live_ops_of_vars
);
inplace_pass
->
Set
NotOwned
(
ir
::
kUseCuda
,
&
use_cuda_
);
inplace_pass
->
Set
(
ir
::
kUseCuda
,
new
bool
(
use_device_
==
UseDevice
::
kCUDA
)
);
VLOG
(
10
)
<<
"Start to apply buffer_shared_inplace_pass"
;
graph
=
inplace_pass
->
Apply
(
graph
);
VLOG
(
10
)
<<
"buffer_shared_inplace_pass Applied"
;
...
...
@@ -366,7 +374,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
&
mem_opt_var_infos_
);
cross_op_memory_reuse_pass
->
SetNotOwned
(
ir
::
kLastLiveOpsOfVars
,
&
last_live_ops_of_vars
);
cross_op_memory_reuse_pass
->
SetNotOwned
(
ir
::
kUseCuda
,
&
use_cuda_
);
cross_op_memory_reuse_pass
->
Set
(
ir
::
kUseCuda
,
new
bool
(
use_device_
==
UseDevice
::
kCUDA
));
VLOG
(
10
)
<<
"Start to apply buffer_shared_cross_op_memory_reuse_pass"
;
graph
=
cross_op_memory_reuse_pass
->
Apply
(
graph
);
VLOG
(
10
)
<<
"buffer_shared_cross_op_memory_reuse_pass Applied"
;
...
...
@@ -386,8 +395,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
continue
;
}
std
::
unique_ptr
<
GarbageCollector
>
gc
;
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
place
))
{
#ifdef PADDLE_WITH_CUDA
if
(
IsFastEagerDeletionModeEnabled
())
{
gc
.
reset
(
new
UnsafeFastGPUGarbageCollector
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
),
max_memory_size
));
...
...
@@ -396,9 +405,22 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
),
max_memory_size
));
}
VLOG
(
10
)
<<
"Created "
<<
i
<<
"-th GarbageCollector at "
<<
place
;
}
else
{
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't use CUDA device since it's not compiled with CUDA,"
"Please recompile or reinstall Paddle with GPU support."
));
#endif
}
else
if
(
platform
::
is_xpu_place
(
place
))
{
#if defined(PADDLE_WITH_XPU)
gc
.
reset
(
new
XPUGarbageCollector
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place
),
max_memory_size
));
VLOG
(
10
)
<<
"Created "
<<
i
<<
"-th GarbageCollector at "
<<
place
;
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't use XPU device since it's not compiled with XPU,"
"Please recompile or reinstall Paddle with XPU support."
));
#endif
if
(
platform
::
is_cpu_place
(
place
))
{
}
else
if
(
platform
::
is_cpu_place
(
place
))
{
gc
.
reset
(
new
CPUGarbageCollector
(
BOOST_GET_CONST
(
platform
::
CPUPlace
,
place
),
max_memory_size
));
VLOG
(
10
)
<<
"Created GarbageCollector at "
<<
place
;
...
...
@@ -406,10 +428,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Unsupported place for garbage collection"
));
}
#ifdef PADDLE_WITH_CUDA
}
#endif
gcs_
.
emplace
(
place
,
std
::
move
(
gc
));
}
...
...
@@ -510,13 +528,10 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const
BuildStrategy
&
build_strategy
,
ir
::
Graph
*
graph
)
:
member_
(
new
ParallelExecutorPrivate
(
places
,
scope
))
{
PADDLE_ENFORCE
(
places
.
size
()
>
0
&&
!
is_xpu_place
(
places
[
0
]),
platform
::
errors
::
Unavailable
(
"XPU is not supported in ParallelExecutor"
));
InitP2P
(
places
);
ir
::
InitReaderQueueDeviceCount
(
graph
,
*
(
member_
->
global_scope_
),
member_
->
places_
.
size
());
member_
->
use_
cuda_
=
exec_strategy
.
use_cuda
_
;
member_
->
use_
device_
=
exec_strategy
.
use_device
_
;
member_
->
build_strategy_
=
build_strategy
;
member_
->
use_all_reduce_
=
member_
->
build_strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
;
...
...
@@ -529,7 +544,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
member_
->
use_all_reduce_
=
true
;
}
#if defined(PADDLE_WITH_CUDA) && defined(_WIN32)
if
(
member_
->
use_cuda_
)
{
if
(
member_
->
IsUseCUDA
(
member_
->
use_device_
)
)
{
PADDLE_ENFORCE_EQ
(
places
.
size
(),
1
,
platform
::
errors
::
Unavailable
(
"Windows can support Single GPU only."
));
...
...
@@ -537,7 +552,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
#endif
#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
if
(
member_
->
use_cuda_
)
{
if
(
member_
->
IsUseCUDA
(
member_
->
use_device_
)
)
{
PADDLE_ENFORCE_EQ
(
places
.
size
(),
1
,
platform
::
errors
::
PermissionDenied
(
...
...
@@ -548,10 +563,19 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
}
#endif
std
::
string
device_name
;
if
(
member_
->
use_device_
==
UseDevice
::
kCPU
)
{
device_name
=
"CPU"
;
}
else
if
(
member_
->
use_device_
==
UseDevice
::
kCUDA
)
{
device_name
=
"CUDA"
;
}
else
{
device_name
=
"XPU"
;
}
VLOG
(
1
)
<<
string
::
Sprintf
(
"The Program will be executed on %s using ParallelExecutor, %lu "
"cards are used, so %lu programs are executed in parallel."
,
(
member_
->
use_cuda_
?
"CUDA"
:
"CPU"
)
,
places
.
size
(),
places
.
size
());
device_name
,
places
.
size
(),
places
.
size
());
// Step 1. Bcast the bcast_vars to devs.
// Create local scopes
...
...
@@ -575,7 +599,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
std
::
vector
<
ir
::
Graph
*>
graphs
;
if
(
member_
->
build_strategy_
.
async_mode_
)
{
PADDLE_ENFORCE_EQ
(
member_
->
use_cuda_
,
false
,
PADDLE_ENFORCE_EQ
(
member_
->
IsUseCUDA
(
member_
->
use_device_
)
,
false
,
platform
::
errors
::
Unavailable
(
"gpu mode does not support async_mode_ now!"
));
graphs
.
push_back
(
graph
);
...
...
@@ -598,7 +622,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
<<
"you can force it off by env FLAGS_enable_parallel_graph=0"
;
}
if
(
member_
->
use_cuda_
&&
member_
->
nranks_
>
1
)
{
if
(
member_
->
IsUseCUDA
(
member_
->
use_device_
)
&&
member_
->
nranks_
>
1
)
{
#if defined(PADDLE_WITH_NCCL)
member_
->
InitOrGetNCCLCommunicator
(
scope
,
&
member_
->
build_strategy_
);
...
...
@@ -647,36 +671,39 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
VLOG
(
3
)
<<
"use local async mode"
;
graph
=
member_
->
build_strategy_
.
Apply
(
graph
,
{
member_
->
places_
[
0
]},
loss_var_name
,
{
member_
->
local_scopes_
[
0
]},
1
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
);
{
member_
->
local_scopes_
[
0
]},
1
,
member_
->
IsUseCUDA
(
member_
->
use_device_
),
member_
->
nccl_ctxs_
);
for
(
size_t
i
=
1
;
i
<
member_
->
places_
.
size
();
++
i
)
{
graphs
[
i
]
=
member_
->
build_strategy_
.
Apply
(
graphs
[
i
],
{
member_
->
places_
[
i
]},
loss_var_name
,
{
member_
->
local_scopes_
[
i
]},
1
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
);
{
member_
->
local_scopes_
[
i
]},
1
,
member_
->
IsUseCUDA
(
member_
->
use_device_
),
member_
->
nccl_ctxs_
);
async_graphs
[
i
]
=
graphs
[
i
];
}
}
else
{
graph
=
member_
->
build_strategy_
.
Apply
(
graph
,
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
);
member_
->
nranks_
,
member_
->
IsUseCUDA
(
member_
->
use_device_
),
member_
->
nccl_ctxs_
);
}
#else
if
(
member_
->
build_strategy_
.
async_mode_
)
{
VLOG
(
3
)
<<
"use local async mode"
;
graph
=
member_
->
build_strategy_
.
Apply
(
graph
,
{
member_
->
places_
[
0
]},
loss_var_name
,
{
member_
->
local_scopes_
[
0
]},
1
,
member_
->
use_cuda_
);
{
member_
->
local_scopes_
[
0
]},
1
,
member_
->
IsUseCUDA
(
member_
->
use_device_
));
for
(
size_t
i
=
1
;
i
<
member_
->
places_
.
size
();
++
i
)
{
graphs
[
i
]
=
member_
->
build_strategy_
.
Apply
(
graphs
[
i
],
{
member_
->
places_
[
i
]},
loss_var_name
,
{
member_
->
local_scopes_
[
i
]},
1
,
member_
->
use_cuda_
);
{
member_
->
local_scopes_
[
i
]},
1
,
member_
->
IsUseCUDA
(
member_
->
use_device_
));
async_graphs
[
i
]
=
graphs
[
i
];
}
}
else
{
graph
=
member_
->
build_strategy_
.
Apply
(
graph
,
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
);
member_
->
nranks_
,
member_
->
IsUseCUDA
(
member_
->
use_device_
)
);
}
#endif
...
...
@@ -874,7 +901,8 @@ void ParallelExecutor::BCastParamsToDevices(
// FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
if
(
member_
->
build_strategy_
.
async_mode_
)
{
share_memory
();
}
else
if
(
member_
->
use_all_reduce_
||
member_
->
use_cuda_
||
}
else
if
(
member_
->
use_all_reduce_
||
member_
->
IsUseCUDA
(
member_
->
use_device_
)
||
var
==
"@LR_DECAY_COUNTER@"
)
{
copy_memory
();
}
else
{
...
...
@@ -1105,7 +1133,7 @@ bool ParallelExecutor::EnableParallelGraphExecution(
}
}
if
(
!
member_
->
use_all_reduce_
||
!
member_
->
use_cuda_
)
{
if
(
!
member_
->
use_all_reduce_
||
!
member_
->
IsUseCUDA
(
member_
->
use_device_
)
)
{
if
(
build_strategy
.
enable_sequential_execution_
||
exec_strategy
.
type_
==
ExecutionStrategy
::
ExecutorType
::
kExperimental
)
{
enable_parallel_graph
=
false
;
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
f13c3a9c
...
...
@@ -29,10 +29,12 @@ namespace memory {
AllocationPtr
Alloc
(
const
platform
::
DeviceContext
&
dev_ctx
,
size_t
size
)
{
auto
place
=
dev_ctx
.
GetPlace
();
#ifdef PADDLE_WITH_CUDA
if
(
size
==
0
||
!
platform
::
is_gpu_place
(
place
))
{
if
(
size
==
0
)
{
return
Alloc
(
place
,
size
);
}
if
(
platform
::
is_gpu_place
(
place
))
{
#ifdef PADDLE_WITH_CUDA
auto
*
default_dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
auto
&
desired_dev_ctx
=
...
...
@@ -44,8 +46,22 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
desired_dev_ctx
,
size
);
}
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't use CUDA device since it's not compiled with CUDA,"
"Please recompile or reinstall Paddle with GPU support."
));
#endif
}
else
if
(
platform
::
is_xpu_place
(
place
))
{
#ifdef PADDLE_WITH_XPU
// TODO(liuyuhui): Consider xpu stream later
return
Alloc
(
place
,
size
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't use XPU device since it's not compiled with XPU,"
"Please recompile or reinstall Paddle with XPU support."
));
#endif
}
else
{
return
Alloc
(
place
,
size
);
}
}
}
// namespace memory
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
f13c3a9c
...
...
@@ -1492,7 +1492,9 @@ All parameter, weight, gradient are variables in Paddle.
#endif
.
def
(
"__repr__"
,
string
::
to_string
<
const
platform
::
XPUPlace
&>
)
.
def
(
"__str__"
,
string
::
to_string
<
const
platform
::
XPUPlace
&>
);
#ifdef PADDLE_WITH_XPU
m
.
def
(
"get_xpu_device_count"
,
platform
::
GetXPUDeviceCount
);
#endif
py
::
class_
<
paddle
::
platform
::
CPUPlace
>
(
m
,
"CPUPlace"
,
R"DOC(
CPUPlace is a descriptor of a device.
It represents a CPU device on which a tensor will be allocated and a model will run.
...
...
@@ -2077,6 +2079,11 @@ All parameter, weight, gradient are variables in Paddle.
exec_strategy=exec_strategy)
)DOC"
);
py
::
enum_
<
ExecutionStrategy
::
UseDevice
>
(
exec_strategy
,
"UseDevice"
)
.
value
(
"CPU"
,
ExecutionStrategy
::
UseDevice
::
kCPU
)
.
value
(
"CUDA"
,
ExecutionStrategy
::
UseDevice
::
kCUDA
)
.
value
(
"XPU"
,
ExecutionStrategy
::
UseDevice
::
kXPU
);
exec_strategy
.
def
(
py
::
init
())
.
def_property
(
"num_threads"
,
...
...
@@ -2107,14 +2114,12 @@ All parameter, weight, gradient are variables in Paddle.
exec_strategy.num_threads = 4
)DOC"
)
.
def_property
(
"use_cuda"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
use_cuda_
;
},
[](
ExecutionStrategy
&
self
,
bool
use_cuda
)
{
self
.
use_cuda_
=
use_cuda
;
})
// FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may
// make user confuse, because ParallelExecutor has a parameter named
// 'use_cuda' too, in current implementation, ParallelExecutor's
// 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'.
"_use_device"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
use_device_
;
},
[](
ExecutionStrategy
&
self
,
ExecutionStrategy
::
UseDevice
use_device
)
{
self
.
use_device_
=
use_device
;
})
// NOTE(liuyuhui): Doesn't add doc for 'use_device', because
// use_device isn‘t exposed to users.
.
def_property
(
"allow_op_delay"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
allow_op_delay_
;
},
...
...
python/paddle/fluid/compiler.py
浏览文件 @
f13c3a9c
...
...
@@ -18,7 +18,7 @@ import six
import
sys
from
..
import
compat
as
cpt
from
.
import
framework
from
.framework
import
cuda_places
,
cpu_places
from
.framework
import
cuda_places
,
cpu_places
,
xpu_places
from
.
import
core
...
...
@@ -316,7 +316,7 @@ class CompiledProgram(object):
"Subclass of CompiledProgram should implement _with_distributed method."
)
def
_compile_data_parallel
(
self
,
places
,
use_
cuda
=
Fals
e
,
scope
=
None
):
def
_compile_data_parallel
(
self
,
places
,
use_
devic
e
,
scope
=
None
):
if
self
.
_share_vars_from
:
if
scope
:
sys
.
stderr
.
write
(
"share_vars_from is set, scope is ignored.
\n
"
)
...
...
@@ -342,16 +342,23 @@ class CompiledProgram(object):
if
self
.
_exec_strategy
is
None
:
self
.
_exec_strategy
=
ExecutionStrategy
()
self
.
_exec_strategy
.
use_cuda
=
use_cuda
self
.
_exec_strategy
.
_use_device
=
use_device
if
self
.
_exec_strategy
.
num_threads
==
0
:
if
self
.
_exec_strategy
.
use_cuda
:
if
self
.
_exec_strategy
.
_use_device
==
ExecutionStrategy
.
UseDevice
.
CUDA
:
# Experiments on se-resnext shows that too many threads hurt
# performance. Worth tunning for other models in the future.
self
.
_exec_strategy
.
num_threads
=
len
(
places
)
*
4
elif
self
.
_exec_strategy
.
_use_device
==
ExecutionStrategy
.
UseDevice
.
XPU
:
# Currently only single thread is supported in Kunlun XPU.
self
.
_exec_strategy
.
num_threads
=
1
else
:
self
.
_exec_strategy
.
num_threads
=
len
(
places
)
*
2
if
self
.
_exec_strategy
.
_use_device
==
ExecutionStrategy
.
UseDevice
.
XPU
:
assert
self
.
_exec_strategy
.
num_threads
==
1
,
\
"Currently only single thread is supported in Kunlun XPU."
if
self
.
_build_strategy
.
num_trainers
>
1
:
assert
self
.
_is_data_parallel
,
\
"If you use multi-trainer to train the model, you should use "
\
...
...
@@ -377,7 +384,7 @@ class CompiledProgram(object):
self
.
_build_strategy
.
enable_sequential_execution
=
True
if
self
.
_program
is
not
None
and
self
.
_program
.
_enable_dgc
:
assert
use_cuda
,
"DGC only used under CUDA environment."
assert
self
.
_exec_strategy
.
_use_device
==
ExecutionStrategy
.
UseDevice
.
CUDA
,
"DGC only used under CUDA environment."
assert
self
.
_build_strategy
.
num_trainers
*
len
(
places
)
>
1
,
"DGC is not avaliable for single card training."
assert
self
.
_build_strategy
.
reduce_strategy
==
BuildStrategy
.
ReduceStrategy
.
AllReduce
,
"DGC
\
...
...
@@ -447,11 +454,14 @@ class CompiledProgram(object):
raise
NotImplementedError
(
"If optimizer is used in control flow, "
"training on multi-places is not supported now."
)
if
isinstance
(
self
.
_place
,
core
.
CUDAPlace
):
use_device
=
ExecutionStrategy
.
UseDevice
.
CUDA
elif
isinstance
(
self
.
_place
,
core
.
XPUPlace
):
use_device
=
ExecutionStrategy
.
UseDevice
.
XPU
else
:
use_device
=
ExecutionStrategy
.
UseDevice
.
CPU
self
.
_executor
=
self
.
_compile_data_parallel
(
use_cuda
=
isinstance
(
self
.
_place
,
core
.
CUDAPlace
),
scope
=
self
.
_scope
,
places
=
self
.
_places
)
use_device
=
use_device
,
scope
=
self
.
_scope
,
places
=
self
.
_places
)
return
self
def
_get_places
(
self
,
place
,
place_list
):
...
...
@@ -461,7 +471,11 @@ class CompiledProgram(object):
assert
p
.
_type
()
==
place
.
_type
(),
\
"Place type not match. You may set wrong type of places."
else
:
place_list
=
cuda_places
()
if
isinstance
(
place
,
core
.
CUDAPlace
)
else
cpu_places
()
if
isinstance
(
place
,
core
.
CUDAPlace
):
place_list
=
cuda_places
()
elif
isinstance
(
place
,
core
.
XPUPlace
):
place_list
=
xpu_places
()
else
:
place_list
=
cpu_places
()
assert
place_list
,
"No places for execution."
return
place_list
python/paddle/fluid/framework.py
浏览文件 @
f13c3a9c
...
...
@@ -47,6 +47,7 @@ __all__ = [
'name_scope'
,
'cuda_places'
,
'cpu_places'
,
'xpu_places'
,
'cuda_pinned_places'
,
'in_dygraph_mode'
,
'is_compiled_with_cuda'
,
...
...
@@ -354,6 +355,15 @@ def _cuda_ids():
return
device_ids
def
_xpu_ids
():
xpus_env
=
os
.
getenv
(
"FLAGS_selected_xpus"
)
if
xpus_env
:
device_ids
=
[
int
(
s
)
for
s
in
xpus_env
.
split
(
","
)]
else
:
device_ids
=
six
.
moves
.
range
(
core
.
get_xpu_device_count
())
return
device_ids
def
is_compiled_with_xpu
():
"""
Whether this whl package can be used to run the model on XPU.
...
...
@@ -430,6 +440,43 @@ def cuda_places(device_ids=None):
return
[
core
.
CUDAPlace
(
dev_id
)
for
dev_id
in
device_ids
]
def
xpu_places
(
device_ids
=
None
):
"""
**Note**:
For multi-card tasks, please use `FLAGS_selected_xpus` environment variable to set the visible XPU device.
This function creates a list of :code:`paddle.XPUPlace` objects.
If :code:`device_ids` is None, environment variable of
:code:`FLAGS_selected_xpus` would be checked first. For example, if
:code:`FLAGS_selected_xpus=0,1,2`, the returned list would
be [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)].
If :code:`FLAGS_selected_xpus` is not set, all visible
xpu places would be returned.
If :code:`device_ids` is not None, it should be the device
ids of XPUs. For example, if :code:`device_ids=[0,1,2]`,
the returned list would be
[paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)].
Parameters:
device_ids (list or tuple of int, optional): list of XPU device ids.
Returns:
list of paddle.XPUPlace: Created XPU place list.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
xpu_places = static.xpu_places()
"""
assert
core
.
is_compiled_with_xpu
(),
\
"Not compiled with XPU"
if
device_ids
is
None
:
device_ids
=
_xpu_ids
()
elif
not
isinstance
(
device_ids
,
(
list
,
tuple
)):
device_ids
=
[
device_ids
]
return
[
core
.
XPUPlace
(
dev_id
)
for
dev_id
in
device_ids
]
def
cpu_places
(
device_count
=
None
):
"""
This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
...
...
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
浏览文件 @
f13c3a9c
...
...
@@ -75,7 +75,7 @@ class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
exe
=
Executor
(
place
)
exec_strategy
=
fluid
.
ExecutionStrategy
()
exec_strategy
.
use_cuda
=
use_cuda
exec_strategy
.
_use_device
=
fluid
.
ExecutionStrategy
.
UseDevice
.
CUDA
if
use_cuda
else
fluid
.
ExecutionStrategy
.
UseDevice
.
CPU
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
memory_optimize
=
use_mem_opt
...
...
python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py
0 → 100644
浏览文件 @
f13c3a9c
# copyright (c) 2020 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
from
__future__
import
print_function
import
unittest
import
os
import
paddle
import
numpy
as
np
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
import
paddle.static
as
static
class
Test_XPU_Places
(
unittest
.
TestCase
):
def
assert_places_equal
(
self
,
places0
,
places1
):
self
.
assertEqual
(
len
(
places0
),
len
(
places1
))
for
place0
,
place1
in
zip
(
places0
,
places1
):
self
.
assertEqual
(
type
(
place0
),
type
(
place1
))
self
.
assertEqual
(
place0
.
get_device_id
(),
place1
.
get_device_id
())
def
test_check_preset_envs
(
self
):
if
core
.
is_compiled_with_xpu
():
os
.
environ
[
"FLAGS_selected_xpus"
]
=
"0"
place_list
=
static
.
xpu_places
()
self
.
assert_places_equal
([
fluid
.
XPUPlace
(
0
)],
place_list
)
def
test_check_no_preset_envs
(
self
):
if
core
.
is_compiled_with_xpu
():
place_list
=
static
.
xpu_places
(
0
)
self
.
assert_places_equal
([
fluid
.
XPUPlace
(
0
)],
place_list
)
if
__name__
==
'__main__'
:
paddle
.
enable_static
()
unittest
.
main
()
python/paddle/static/__init__.py
浏览文件 @
f13c3a9c
...
...
@@ -20,7 +20,7 @@ __all__ = [
'default_main_program'
,
'default_startup_program'
,
'Program'
,
'data'
,
'InputSpec'
,
'save'
,
'load'
,
'save_inference_model'
,
'load_inference_model'
,
'load_program_state'
,
'set_program_state'
,
'cpu_places'
,
'cuda_places'
,
'Variable'
'
xpu_places'
,
'
Variable'
]
from
.
import
nn
...
...
@@ -45,6 +45,7 @@ from ..fluid.framework import name_scope #DEFINE_ALIAS
from
..fluid.framework
import
program_guard
#DEFINE_ALIAS
from
..fluid.framework
import
cpu_places
#DEFINE_ALIAS
from
..fluid.framework
import
cuda_places
#DEFINE_ALIAS
from
..fluid.framework
import
xpu_places
#DEFINE_ALIAS
from
..fluid.framework
import
Variable
#DEFINE_ALIAS
from
..fluid.layers.control_flow
import
Print
#DEFINE_ALIAS
from
..fluid.layers.nn
import
py_func
#DEFINE_ALIAS
...
...
tools/wlist.json
浏览文件 @
f13c3a9c
...
...
@@ -413,7 +413,8 @@
"CRFDecoding.forward"
,
"SequenceTagging.forward"
,
"XPUPlace"
,
"is_compiled_with_xpu"
"is_compiled_with_xpu"
,
"xpu_places"
],
"gpu_not_white"
:[
"deformable_conv"
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录