Commit f13c3a9c (unverified)
Authored Dec 16, 2020 by liuyuhui; committed via GitHub on Dec 16, 2020
[Kunlun] PR1:Support one Kunlun card training in parallel executor (#29337)
Parent: 76738504

Showing 20 changed files with 282 additions and 87 deletions (+282 −87)
paddle/fluid/framework/details/broadcast_op_handle_test.h  (+6 −2)
paddle/fluid/framework/details/execution_strategy.h  (+6 −1)
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc  (+1 −1)
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc  (+5 −2)
paddle/fluid/framework/details/gather_op_handle_test.cc  (+4 −1)
paddle/fluid/framework/details/op_handle_base.cc  (+4 −3)
paddle/fluid/framework/details/op_handle_base.h  (+2 −1)
paddle/fluid/framework/details/reduce_op_handle_test.cc  (+6 −2)
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc  (+15 −1)
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc  (+1 −1)
paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc  (+2 −1)
paddle/fluid/framework/parallel_executor.cc  (+64 −36)
paddle/fluid/platform/device_context.cc  (+28 −12)
paddle/fluid/pybind/pybind.cc  (+14 −9)
python/paddle/fluid/compiler.py  (+25 −11)
python/paddle/fluid/framework.py  (+47 −0)
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py  (+1 −1)
python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py  (+47 −0)
python/paddle/static/__init__.py  (+2 −1)
tools/wlist.json  (+2 −1)
paddle/fluid/framework/details/broadcast_op_handle_test.h

@@ -33,6 +33,8 @@ struct VarHandle;
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
+using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+
 // test data amount
 const f::DDim kDims = {20, 20};

@@ -273,7 +275,8 @@ struct TestBroadcastOpHandle {
     f::LoD lod{{0, 10, 20}};
     auto send_vector = InitLoDTensor("input", input_scope_idx, lod);
 
-    op_handle_->Run(false);
+    UseDevice use_device = UseDevice::kCPU;
+    op_handle_->Run(use_device);
 
     WaitAll();
     for (size_t j = 0; j < place_list_.size(); ++j) {

@@ -287,7 +290,8 @@ struct TestBroadcastOpHandle {
     int height = static_cast<int>(kDims[0] * 2);
     auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height);
 
-    op_handle_->Run(false);
+    UseDevice use_device = UseDevice::kCPU;
+    op_handle_->Run(use_device);
 
     WaitAll();
     for (size_t j = 0; j < place_list_.size(); ++j) {
paddle/fluid/framework/details/execution_strategy.h

@@ -21,10 +21,15 @@ namespace details {
 struct ExecutionStrategy {
   enum ExecutorType { kDefault = 0, kExperimental = 1 };
+  enum UseDevice {
+    kCPU = 0,
+    kCUDA = 1,
+    kXPU = 2,
+  };
 
   // num_threads indicates the size of thread pool.
   size_t num_threads_{0};
-  bool use_cuda_{true};
+  UseDevice use_device_{kCUDA};
   // Note that allow_op_delay is invalid now.
   bool allow_op_delay_{false};
   // num_iteration_per_drop_scope indicates how many
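The enum above is the core of the change: the former boolean `use_cuda_` becomes a three-way device selector, with `kCUDA` kept as the default so existing GPU behaviour is unchanged. As a quick orientation, the sketch below pokes at the same enum from Python through the binding added later in this commit (`ExecutionStrategy.UseDevice` and the `_use_device` property); it is only an illustration and assumes a Paddle build that already contains this patch.

import paddle.fluid as fluid

# UseDevice is attached to ExecutionStrategy by the pybind change in this commit.
UseDevice = fluid.ExecutionStrategy.UseDevice
print(int(UseDevice.CPU), int(UseDevice.CUDA), int(UseDevice.XPU))  # 0 1 2

strategy = fluid.ExecutionStrategy()
# Overrides the kCUDA default declared in execution_strategy.h above.
strategy._use_device = UseDevice.CPU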
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc

@@ -330,7 +330,7 @@ bool FastThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) {
   try {
     VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
     if (LIKELY(!strategy_.dry_run_)) {
-      op->Run(strategy_.use_cuda_);
+      op->Run(strategy_.use_device_);
     }
     VLOG(10) << op << " " << op->Name() << " Done ";
     return true;
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc

@@ -32,6 +32,7 @@ namespace framework {
 namespace details {
 
 struct VarHandle;
+using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
 
 struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
   std::vector<std::string> out_varnames_;

@@ -108,7 +109,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
           InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
     }
 
-    op_handle_->Run(false);
+    UseDevice use_device = UseDevice::kCPU;
+    op_handle_->Run(use_device);
 
     WaitAll();
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {

@@ -131,7 +133,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
           rows, height, val_scalar));
     }
 
-    op_handle_->Run(false);
+    UseDevice use_device = UseDevice::kCPU;
+    op_handle_->Run(use_device);
 
     WaitAll();
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
paddle/fluid/framework/details/gather_op_handle_test.cc

@@ -27,6 +27,8 @@ struct DummyVarHandle;
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
+using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+
 // test data amount
 const f::DDim kDims = {20, 20};

@@ -171,7 +173,8 @@ struct TestGatherOpHandle {
     out_selected_rows->mutable_value()->ShareDataWith(
         in_selected_rows->value());
 
-    op_handle_->Run(false);
+    UseDevice use_device = UseDevice::kCPU;
+    op_handle_->Run(use_device);
 
     WaitAll();
paddle/fluid/framework/details/op_handle_base.cc

@@ -85,13 +85,14 @@ void OpHandleBase::InitCUDA() {
 #endif
 }
 
-void OpHandleBase::Run(bool use_cuda) {
+void OpHandleBase::Run(ExecutionStrategy::UseDevice use_device) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) {
+  if (events_.empty() && use_device == ExecutionStrategy::UseDevice::kCUDA &&
+      dev_ctxes_.size() > 0) {
     InitCUDA();
   }
 #else
-  PADDLE_ENFORCE_EQ(use_cuda, false,
+  PADDLE_ENFORCE_NE(use_device, ExecutionStrategy::UseDevice::kCUDA,
                     platform::errors::InvalidArgument(
                         "Argument use_cuda should be false when Paddle is not "
                         "compiled with CUDA."));
paddle/fluid/framework/details/op_handle_base.h

@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <vector>
 
+#include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/var_handle.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/device_context.h"

@@ -71,7 +72,7 @@ class OpHandleBase {
   virtual std::string Name() const = 0;
 
-  void Run(bool use_cuda);
+  void Run(ExecutionStrategy::UseDevice use_device);
 
   virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);
paddle/fluid/framework/details/reduce_op_handle_test.cc

@@ -25,6 +25,8 @@ namespace details {
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
+using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+
 // test data amount
 const f::DDim kDims = {20, 20};

@@ -196,7 +198,8 @@ struct TestReduceOpHandle {
     out_selected_rows->mutable_value()->ShareDataWith(
         in_selected_rows->value());
 
-    op_handle_->Run(false);
+    UseDevice use_device = UseDevice::kCPU;
+    op_handle_->Run(use_device);
 
     WaitAll();

@@ -260,7 +263,8 @@ struct TestReduceOpHandle {
     out_lodtensor->ShareDataWith(in_lodtensor);
 
-    op_handle_->Run(false);
+    UseDevice use_device = UseDevice::kCPU;
+    op_handle_->Run(use_device);
 
     WaitAll();
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc

@@ -58,6 +58,17 @@ struct ScaleLossGradFunctor {
     auto *out_data = out_->mutable_data<OutT>(place_);
 
     if (platform::is_cpu_place(place_)) {
       *out_data = static_cast<OutT>(coeff_);
+    } else if (platform::is_xpu_place(place_)) {
+#if defined(PADDLE_WITH_XPU)
+      OutT cast_coeff = static_cast<OutT>(coeff_);
+      memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place_), out_data,
+                   platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_));
+      VLOG(10) << place_ << "RUN Scale loss grad op";
+#else
+      PADDLE_THROW(platform::errors::PermissionDenied(
+          "Paddle can't use XPU device since it's not compiled with XPU,"
+          "Please recompile or reinstall Paddle with XPU support."));
+#endif
     } else {
 #ifdef PADDLE_WITH_CUDA
       OutT cast_coeff = static_cast<OutT>(coeff_);

@@ -66,7 +77,10 @@ struct ScaleLossGradFunctor {
                    platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
                    stream);
       VLOG(10) << place_ << "RUN Scale loss grad op";
+#else
+      PADDLE_THROW(platform::errors::PermissionDenied(
+          "Paddle can't use CUDA device since it's not compiled with CUDA,"
+          "Please recompile or reinstall Paddle with GPU support."));
 #endif
     }
   }
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc

@@ -348,7 +348,7 @@ bool ThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) {
   try {
     VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
     if (LIKELY(!strategy_.dry_run_)) {
-      op->Run(strategy_.use_cuda_);
+      op->Run(strategy_.use_device_);
     }
     VLOG(10) << op << " " << op->Name() << " Done ";
     return true;
paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc

@@ -88,7 +88,8 @@ class ReferenceCountPassTestHelper {
     FLAGS_eager_delete_tensor_gb = -1;
 
     details::ExecutionStrategy exec_strategy;
-    exec_strategy.use_cuda_ = use_cuda;
+    exec_strategy.use_device_ =
+        use_cuda ? (ExecutionStrategy::kCUDA) : (ExecutionStrategy::kCPU);
 
     executor_.reset(new ParallelExecutor(CreatePlaces(1, use_cuda), {}, "",
                                          &scope_, {}, exec_strategy,
paddle/fluid/framework/parallel_executor.cc

@@ -63,6 +63,8 @@ static bool gProfileStarted = false;
 std::once_flag p2p_init_flag;
 #endif
 
+using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+
 class ParallelExecutorPrivate {
  public:
   ParallelExecutorPrivate(const std::vector<platform::Place> &places,

@@ -93,6 +95,8 @@ class ParallelExecutorPrivate {
     }
   }
 
+  bool IsUseCUDA(UseDevice use_device);
+
   void SetHasFeed(size_t dev_idx, bool has_feed = true);
 
   bool AllowPartialFeed() const;

@@ -286,7 +290,7 @@ class ParallelExecutorPrivate {
   platform::NCCLCommunicator *nccl_ctxs_{nullptr};
 #endif
   bool own_local_scope_;
-  bool use_cuda_;
+  UseDevice use_device_;
   bool use_all_reduce_;
   size_t nranks_;

@@ -296,6 +300,10 @@ class ParallelExecutorPrivate {
   details::ParallelSSAGraphExecutor *inference_executor_{nullptr};
 };
 
+bool ParallelExecutorPrivate::IsUseCUDA(UseDevice use_device) {
+  return use_device == UseDevice::kCUDA;
+}
+
 void ParallelExecutorPrivate::SetHasFeed(size_t dev_idx, bool has_feed) {
   if (inference_executor_) {
     inference_executor_->SetHasFeed(dev_idx, has_feed);

@@ -340,7 +348,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
     auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass");
     addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
     addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
-    addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
+    addto_pass->Set(ir::kUseCuda, new bool(use_device_ == UseDevice::kCUDA));
     VLOG(10) << "Start to apply inplace_addto_op_pass";
     graph = addto_pass->Apply(graph);
     VLOG(10) << "inplace_addto_op_pass Applied";

@@ -351,7 +359,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
         ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
     inplace_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
     inplace_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
-    inplace_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
+    inplace_pass->Set(ir::kUseCuda, new bool(use_device_ == UseDevice::kCUDA));
     VLOG(10) << "Start to apply buffer_shared_inplace_pass";
     graph = inplace_pass->Apply(graph);
     VLOG(10) << "buffer_shared_inplace_pass Applied";

@@ -366,7 +374,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
                                             &mem_opt_var_infos_);
     cross_op_memory_reuse_pass->SetNotOwned(ir::kLastLiveOpsOfVars,
                                             &last_live_ops_of_vars);
-    cross_op_memory_reuse_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
+    cross_op_memory_reuse_pass->Set(ir::kUseCuda,
+                                    new bool(use_device_ == UseDevice::kCUDA));
     VLOG(10) << "Start to apply buffer_shared_cross_op_memory_reuse_pass";
     graph = cross_op_memory_reuse_pass->Apply(graph);
     VLOG(10) << "buffer_shared_cross_op_memory_reuse_pass Applied";

@@ -386,8 +395,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
       continue;
     }
     std::unique_ptr<GarbageCollector> gc;
-#ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));

@@ -396,20 +405,29 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
             BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
       }
       VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
-    } else {
+#else
+      PADDLE_THROW(platform::errors::PermissionDenied(
+          "Paddle can't use CUDA device since it's not compiled with CUDA,"
+          "Please recompile or reinstall Paddle with GPU support."));
 #endif
-      if (platform::is_cpu_place(place)) {
-        gc.reset(new CPUGarbageCollector(
-            BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size));
-        VLOG(10) << "Created GarbageCollector at " << place;
-      } else {
-        PADDLE_THROW(platform::errors::PreconditionNotMet(
-            "Unsupported place for garbage collection"));
-      }
-#ifdef PADDLE_WITH_CUDA
-    }
-#endif
+    } else if (platform::is_xpu_place(place)) {
+#if defined(PADDLE_WITH_XPU)
+      gc.reset(new XPUGarbageCollector(
+          BOOST_GET_CONST(platform::XPUPlace, place), max_memory_size));
+      VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
+#else
+      PADDLE_THROW(platform::errors::PermissionDenied(
+          "Paddle can't use XPU device since it's not compiled with XPU,"
+          "Please recompile or reinstall Paddle with XPU support."));
+#endif
+    } else if (platform::is_cpu_place(place)) {
+      gc.reset(new CPUGarbageCollector(
+          BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size));
+      VLOG(10) << "Created GarbageCollector at " << place;
+    } else {
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "Unsupported place for garbage collection"));
+    }
     gcs_.emplace(place, std::move(gc));
   }

@@ -510,13 +528,10 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    const BuildStrategy &build_strategy,
                                    ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places, scope)) {
-  PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
-                 platform::errors::Unavailable(
-                     "XPU is not supported in ParallelExecutor"));
   InitP2P(places);
   ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
                                  member_->places_.size());
-  member_->use_cuda_ = exec_strategy.use_cuda_;
+  member_->use_device_ = exec_strategy.use_device_;
   member_->build_strategy_ = build_strategy;
   member_->use_all_reduce_ = member_->build_strategy_.reduce_ ==
                              BuildStrategy::ReduceStrategy::kAllReduce;

@@ -529,7 +544,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     member_->use_all_reduce_ = true;
   }
 #if defined(PADDLE_WITH_CUDA) && defined(_WIN32)
-  if (member_->use_cuda_) {
+  if (member_->IsUseCUDA(member_->use_device_)) {
     PADDLE_ENFORCE_EQ(
         places.size(), 1,
         platform::errors::Unavailable("Windows can support Single GPU only."));

@@ -537,7 +552,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
 #endif
 
 #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
-  if (member_->use_cuda_) {
+  if (member_->IsUseCUDA(member_->use_device_)) {
     PADDLE_ENFORCE_EQ(
         places.size(), 1,
         platform::errors::PermissionDenied(

@@ -548,10 +563,19 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
 #endif
 
+  std::string device_name;
+  if (member_->use_device_ == UseDevice::kCPU) {
+    device_name = "CPU";
+  } else if (member_->use_device_ == UseDevice::kCUDA) {
+    device_name = "CUDA";
+  } else {
+    device_name = "XPU";
+  }
+
   VLOG(1) << string::Sprintf(
       "The Program will be executed on %s using ParallelExecutor, %lu "
       "cards are used, so %lu programs are executed in parallel.",
-      (member_->use_cuda_ ? "CUDA" : "CPU"), places.size(), places.size());
+      device_name, places.size(), places.size());
 
   // Step 1. Bcast the bcast_vars to devs.
   // Create local scopes

@@ -575,7 +599,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   std::vector<ir::Graph *> graphs;
   if (member_->build_strategy_.async_mode_) {
-    PADDLE_ENFORCE_EQ(member_->use_cuda_, false,
+    PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false,
                       platform::errors::Unavailable(
                           "gpu mode does not support async_mode_ now!"));
     graphs.push_back(graph);

@@ -598,7 +622,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
             << "you can force it off by env FLAGS_enable_parallel_graph=0";
   }
 
-  if (member_->use_cuda_ && member_->nranks_ > 1) {
+  if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) {
 #if defined(PADDLE_WITH_NCCL)
     member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);

@@ -647,36 +671,39 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     VLOG(3) << "use local async mode";
     graph = member_->build_strategy_.Apply(
         graph, {member_->places_[0]}, loss_var_name,
-        {member_->local_scopes_[0]}, 1, member_->use_cuda_,
-        member_->nccl_ctxs_);
+        {member_->local_scopes_[0]}, 1,
+        member_->IsUseCUDA(member_->use_device_), member_->nccl_ctxs_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
       graphs[i] = member_->build_strategy_.Apply(
           graphs[i], {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, 1, member_->use_cuda_,
-          member_->nccl_ctxs_);
+          {member_->local_scopes_[i]}, 1,
+          member_->IsUseCUDA(member_->use_device_), member_->nccl_ctxs_);
       async_graphs[i] = graphs[i];
     }
   } else {
     graph = member_->build_strategy_.Apply(
         graph, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_);
+        member_->nranks_, member_->IsUseCUDA(member_->use_device_),
+        member_->nccl_ctxs_);
   }
 #else
   if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
     graph = member_->build_strategy_.Apply(
         graph, {member_->places_[0]}, loss_var_name,
-        {member_->local_scopes_[0]}, 1, member_->use_cuda_);
+        {member_->local_scopes_[0]}, 1,
+        member_->IsUseCUDA(member_->use_device_));
     for (size_t i = 1; i < member_->places_.size(); ++i) {
       graphs[i] = member_->build_strategy_.Apply(
           graphs[i], {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, 1, member_->use_cuda_);
+          {member_->local_scopes_[i]}, 1,
+          member_->IsUseCUDA(member_->use_device_));
       async_graphs[i] = graphs[i];
     }
   } else {
     graph = member_->build_strategy_.Apply(
         graph, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_);
+        member_->nranks_, member_->IsUseCUDA(member_->use_device_));
   }
 #endif

@@ -874,7 +901,8 @@ void ParallelExecutor::BCastParamsToDevices(
       // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
       if (member_->build_strategy_.async_mode_) {
         share_memory();
-      } else if (member_->use_all_reduce_ || member_->use_cuda_ ||
+      } else if (member_->use_all_reduce_ ||
+                 member_->IsUseCUDA(member_->use_device_) ||
                  var == "@LR_DECAY_COUNTER@") {
         copy_memory();
       } else {

@@ -1105,7 +1133,7 @@ bool ParallelExecutor::EnableParallelGraphExecution(
     }
   }
 
-  if (!member_->use_all_reduce_ || !member_->use_cuda_) {
+  if (!member_->use_all_reduce_ || !member_->IsUseCUDA(member_->use_device_)) {
     if (build_strategy.enable_sequential_execution_ ||
         exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) {
       enable_parallel_graph = false;
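Taken together, the parallel_executor.cc changes make a single Kunlun card a legal target for the ParallelExecutor path: the old PADDLE_ENFORCE that rejected XPU places is gone, and all CUDA-only branches now go through IsUseCUDA. A minimal end-to-end sketch of what that enables is below; it assumes a Paddle wheel built with XPU support that contains this commit, and the tiny regression network and random data are placeholders only.

import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.static as static

paddle.enable_static()
if fluid.core.is_compiled_with_xpu():
    # Build a throwaway linear-regression program.
    x = static.data(name='x', shape=[None, 13], dtype='float32')
    y = static.data(name='y', shape=[None, 1], dtype='float32')
    pred = static.nn.fc(x, size=1)
    loss = paddle.mean(fluid.layers.square_error_cost(input=pred, label=y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

    place = fluid.XPUPlace(0)
    exe = static.Executor(place)
    exe.run(static.default_startup_program())

    # The ParallelExecutor path: compile for data parallelism on the one XPU place.
    compiled = static.CompiledProgram(
        static.default_main_program()).with_data_parallel(
            loss_name=loss.name, places=static.xpu_places())

    feed = {
        'x': np.random.rand(4, 13).astype('float32'),
        'y': np.random.rand(4, 1).astype('float32'),
    }
    print(exe.run(compiled, feed=feed, fetch_list=[loss.name]))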
paddle/fluid/platform/device_context.cc

@@ -29,23 +29,39 @@ namespace memory {
 AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
   auto place = dev_ctx.GetPlace();
-#ifdef PADDLE_WITH_CUDA
-  if (size == 0 || !platform::is_gpu_place(place)) {
+  if (size == 0) {
     return Alloc(place, size);
   }
-  auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
-      platform::DeviceContextPool::Instance().Get(place));
-  auto& desired_dev_ctx =
-      static_cast<const platform::CUDADeviceContext&>(dev_ctx);
-  if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-    return Alloc(place, size);
-  } else {
-    return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
-        desired_dev_ctx, size);
-  }
+
+  if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+    auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(place));
+    auto& desired_dev_ctx =
+        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+    if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
+      return Alloc(place, size);
+    } else {
+      return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
+          desired_dev_ctx, size);
+    }
 #else
-  return Alloc(place, size);
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Paddle can't use CUDA device since it's not compiled with CUDA,"
+        "Please recompile or reinstall Paddle with GPU support."));
 #endif
+  } else if (platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+    // TODO(liuyuhui): Consider xpu stream later
+    return Alloc(place, size);
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Paddle can't use XPU device since it's not compiled with XPU,"
+        "Please recompile or reinstall Paddle with XPU support."));
+#endif
+  } else {
+    return Alloc(place, size);
+  }
 }
 }  // namespace memory
paddle/fluid/pybind/pybind.cc

@@ -1492,7 +1492,9 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
       .def("__repr__", string::to_string<const platform::XPUPlace &>)
       .def("__str__", string::to_string<const platform::XPUPlace &>);
+#ifdef PADDLE_WITH_XPU
+  m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
+#endif
 
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
     CPUPlace is a descriptor of a device.
     It represents a CPU device on which a tensor will be allocated and a model will run.

@@ -2077,6 +2079,11 @@ All parameter, weight, gradient are variables in Paddle.
             exec_strategy=exec_strategy)
       )DOC");
 
+  py::enum_<ExecutionStrategy::UseDevice>(exec_strategy, "UseDevice")
+      .value("CPU", ExecutionStrategy::UseDevice::kCPU)
+      .value("CUDA", ExecutionStrategy::UseDevice::kCUDA)
+      .value("XPU", ExecutionStrategy::UseDevice::kXPU);
+
   exec_strategy.def(py::init())
       .def_property(
           "num_threads",

@@ -2107,14 +2114,12 @@ All parameter, weight, gradient are variables in Paddle.
             exec_strategy.num_threads = 4
         )DOC")
       .def_property(
-          "use_cuda",
-          [](const ExecutionStrategy &self) { return self.use_cuda_; },
-          [](ExecutionStrategy &self, bool use_cuda) {
-            self.use_cuda_ = use_cuda;
-          })  // FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may
-              // make user confuse, because ParallelExecutor has a parameter named
-              // 'use_cuda' too, in current implementation, ParallelExecutor's
-              // 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'.
+          "_use_device",
+          [](const ExecutionStrategy &self) { return self.use_device_; },
+          [](ExecutionStrategy &self, ExecutionStrategy::UseDevice use_device) {
+            self.use_device_ = use_device;
+          })  // NOTE(liuyuhui): Doesn't add doc for 'use_device', because
+              // use_device isn‘t exposed to users.
       .def_property(
           "allow_op_delay",
           [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
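Two Python-visible pieces come out of the pybind changes: core.get_xpu_device_count() (registered only when Paddle is compiled with XPU) and the ExecutionStrategy.UseDevice enum backing the private _use_device property. A small sketch, assuming an XPU-enabled build that includes this commit:

import paddle.fluid as fluid
from paddle.fluid import core

if core.is_compiled_with_xpu():
    # get_xpu_device_count is only registered when built with PADDLE_WITH_XPU.
    print("visible Kunlun cards:", core.get_xpu_device_count())
    # __repr__/__str__ for XPUPlace come from the bindings shown above.
    print(fluid.XPUPlace(0))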
python/paddle/fluid/compiler.py

@@ -18,7 +18,7 @@ import six
 import sys
 from .. import compat as cpt
 from . import framework
-from .framework import cuda_places, cpu_places
+from .framework import cuda_places, cpu_places, xpu_places
 
 from . import core

@@ -316,7 +316,7 @@ class CompiledProgram(object):
             "Subclass of CompiledProgram should implement _with_distributed method."
         )
 
-    def _compile_data_parallel(self, places, use_cuda=False, scope=None):
+    def _compile_data_parallel(self, places, use_device, scope=None):
         if self._share_vars_from:
             if scope:
                 sys.stderr.write("share_vars_from is set, scope is ignored.\n")

@@ -342,16 +342,23 @@ class CompiledProgram(object):
         if self._exec_strategy is None:
             self._exec_strategy = ExecutionStrategy()
-        self._exec_strategy.use_cuda = use_cuda
+        self._exec_strategy._use_device = use_device
 
         if self._exec_strategy.num_threads == 0:
-            if self._exec_strategy.use_cuda:
+            if self._exec_strategy._use_device == ExecutionStrategy.UseDevice.CUDA:
                 # Experiments on se-resnext shows that too many threads hurt
                 # performance. Worth tunning for other models in the future.
                 self._exec_strategy.num_threads = len(places) * 4
+            elif self._exec_strategy._use_device == ExecutionStrategy.UseDevice.XPU:
+                # Currently only single thread is supported in Kunlun XPU.
+                self._exec_strategy.num_threads = 1
             else:
                 self._exec_strategy.num_threads = len(places) * 2
 
+        if self._exec_strategy._use_device == ExecutionStrategy.UseDevice.XPU:
+            assert self._exec_strategy.num_threads == 1, \
+                "Currently only single thread is supported in Kunlun XPU."
+
         if self._build_strategy.num_trainers > 1:
             assert self._is_data_parallel, \
                 "If you use multi-trainer to train the model, you should use "\

@@ -377,7 +384,7 @@ class CompiledProgram(object):
             self._build_strategy.enable_sequential_execution = True
 
         if self._program is not None and self._program._enable_dgc:
-            assert use_cuda, "DGC only used under CUDA environment."
+            assert self._exec_strategy._use_device == ExecutionStrategy.UseDevice.CUDA, "DGC only used under CUDA environment."
             assert self._build_strategy.num_trainers * len(
                 places) > 1, "DGC is not avaliable for single card training."
             assert self._build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce, "DGC \

@@ -447,11 +454,14 @@ class CompiledProgram(object):
                 raise NotImplementedError(
                     "If optimizer is used in control flow, "
                     "training on multi-places is not supported now.")
+            if isinstance(self._place, core.CUDAPlace):
+                use_device = ExecutionStrategy.UseDevice.CUDA
+            elif isinstance(self._place, core.XPUPlace):
+                use_device = ExecutionStrategy.UseDevice.XPU
+            else:
+                use_device = ExecutionStrategy.UseDevice.CPU
 
             self._executor = self._compile_data_parallel(
-                use_cuda=isinstance(self._place, core.CUDAPlace),
+                use_device=use_device,
                 scope=self._scope,
                 places=self._places)
         return self

@@ -461,7 +471,11 @@ class CompiledProgram(object):
                 assert p._type() == place._type(), \
                     "Place type not match. You may set wrong type of places."
         else:
-            place_list = cuda_places() if isinstance(
-                place, core.CUDAPlace) else cpu_places()
+            if isinstance(place, core.CUDAPlace):
+                place_list = cuda_places()
+            elif isinstance(place, core.XPUPlace):
+                place_list = xpu_places()
+            else:
+                place_list = cpu_places()
         assert place_list, "No places for execution."
         return place_list
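The behavioural point in compiler.py is that the device enum is not something users pass in: CompiledProgram now derives it from the type of the place the Executor was created with, and XPU runs are pinned to a single thread. The standalone helper below is a hypothetical re-statement of that selection logic for illustration only; the real code lives inside _compile above.

import paddle.fluid as fluid
from paddle.fluid import core

def infer_use_device(place):
    # Hypothetical mirror of the place -> UseDevice mapping added in _compile.
    UseDevice = fluid.ExecutionStrategy.UseDevice
    if isinstance(place, core.CUDAPlace):
        return UseDevice.CUDA
    elif isinstance(place, core.XPUPlace):
        return UseDevice.XPU
    return UseDevice.CPU

print(infer_use_device(fluid.CPUPlace()))  # UseDevice.CPU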
python/paddle/fluid/framework.py

@@ -47,6 +47,7 @@ __all__ = [
     'name_scope',
     'cuda_places',
     'cpu_places',
+    'xpu_places',
     'cuda_pinned_places',
     'in_dygraph_mode',
     'is_compiled_with_cuda',

@@ -354,6 +355,15 @@ def _cuda_ids():
     return device_ids
 
 
+def _xpu_ids():
+    xpus_env = os.getenv("FLAGS_selected_xpus")
+    if xpus_env:
+        device_ids = [int(s) for s in xpus_env.split(",")]
+    else:
+        device_ids = six.moves.range(core.get_xpu_device_count())
+    return device_ids
+
+
 def is_compiled_with_xpu():
     """
     Whether this whl package can be used to run the model on XPU.

@@ -430,6 +440,43 @@ def cuda_places(device_ids=None):
     return [core.CUDAPlace(dev_id) for dev_id in device_ids]
 
 
+def xpu_places(device_ids=None):
+    """
+    **Note**:
+        For multi-card tasks, please use `FLAGS_selected_xpus` environment variable to set the visible XPU device.
+
+    This function creates a list of :code:`paddle.XPUPlace` objects.
+    If :code:`device_ids` is None, environment variable of
+    :code:`FLAGS_selected_xpus` would be checked first. For example, if
+    :code:`FLAGS_selected_xpus=0,1,2`, the returned list would
+    be [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)].
+    If :code:`FLAGS_selected_xpus` is not set, all visible
+    xpu places would be returned.
+    If :code:`device_ids` is not None, it should be the device
+    ids of XPUs. For example, if :code:`device_ids=[0,1,2]`,
+    the returned list would be
+    [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)].
+
+    Parameters:
+        device_ids (list or tuple of int, optional): list of XPU device ids.
+    Returns:
+        list of paddle.XPUPlace: Created XPU place list.
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.static as static
+
+            paddle.enable_static()
+            xpu_places = static.xpu_places()
+    """
+    assert core.is_compiled_with_xpu(), \
+        "Not compiled with XPU"
+    if device_ids is None:
+        device_ids = _xpu_ids()
+    elif not isinstance(device_ids, (list, tuple)):
+        device_ids = [device_ids]
+    return [core.XPUPlace(dev_id) for dev_id in device_ids]
+
+
 def cpu_places(device_count=None):
     """
     This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
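For completeness, a short usage sketch of the new xpu_places helper beyond the docstring example above; it assumes an XPU build of Paddle that includes this commit, and shows both the FLAGS_selected_xpus path and an explicit device id.

import os
import paddle
import paddle.static as static
from paddle.fluid import core

paddle.enable_static()
if core.is_compiled_with_xpu():
    os.environ["FLAGS_selected_xpus"] = "0"   # picked up by _xpu_ids()
    print(static.xpu_places())                # [XPUPlace(0)]
    print(static.xpu_places(device_ids=0))    # explicit id, same result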
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py

@@ -75,7 +75,7 @@ class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
             exe = Executor(place)
 
             exec_strategy = fluid.ExecutionStrategy()
-            exec_strategy.use_cuda = use_cuda
+            exec_strategy._use_device = fluid.ExecutionStrategy.UseDevice.CUDA if use_cuda else fluid.ExecutionStrategy.UseDevice.CPU
 
             build_strategy = fluid.BuildStrategy()
             build_strategy.memory_optimize = use_mem_opt
python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py (new file, mode 100644)

# copyright (c) 2020 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
#     http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.

from __future__ import print_function
import unittest
import os
import paddle
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core
import paddle.static as static


class Test_XPU_Places(unittest.TestCase):
    def assert_places_equal(self, places0, places1):
        self.assertEqual(len(places0), len(places1))
        for place0, place1 in zip(places0, places1):
            self.assertEqual(type(place0), type(place1))
            self.assertEqual(place0.get_device_id(), place1.get_device_id())

    def test_check_preset_envs(self):
        if core.is_compiled_with_xpu():
            os.environ["FLAGS_selected_xpus"] = "0"
            place_list = static.xpu_places()
            self.assert_places_equal([fluid.XPUPlace(0)], place_list)

    def test_check_no_preset_envs(self):
        if core.is_compiled_with_xpu():
            place_list = static.xpu_places(0)
            self.assert_places_equal([fluid.XPUPlace(0)], place_list)


if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()
python/paddle/static/__init__.py

@@ -20,7 +20,7 @@ __all__ = [
     'default_main_program', 'default_startup_program', 'Program', 'data',
     'InputSpec', 'save', 'load', 'save_inference_model', 'load_inference_model',
     'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places',
-    'Variable'
+    'xpu_places', 'Variable'
 ]
 
 from . import nn

@@ -45,6 +45,7 @@ from ..fluid.framework import name_scope  #DEFINE_ALIAS
 from ..fluid.framework import program_guard  #DEFINE_ALIAS
 from ..fluid.framework import cpu_places  #DEFINE_ALIAS
 from ..fluid.framework import cuda_places  #DEFINE_ALIAS
+from ..fluid.framework import xpu_places  #DEFINE_ALIAS
 from ..fluid.framework import Variable  #DEFINE_ALIAS
 from ..fluid.layers.control_flow import Print  #DEFINE_ALIAS
 from ..fluid.layers.nn import py_func  #DEFINE_ALIAS
tools/wlist.json

@@ -413,7 +413,8 @@
         "CRFDecoding.forward",
         "SequenceTagging.forward",
         "XPUPlace",
-        "is_compiled_with_xpu"
+        "is_compiled_with_xpu",
+        "xpu_places"
     ],
     "gpu_not_white":[
         "deformable_conv",