Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
b0675c81
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b0675c81
编写于
2月 12, 2020
作者:
T
tangwei12
提交者:
GitHub
2月 12, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix bug with compiledProgram (#22495)
* add thread barrier for the compiled program
上级
4cddb43c
变更
8
显示空白变更内容
内联
并排
Showing
8 changed files
with
48 additions
and
17 deletions
+48
-17
paddle/fluid/framework/details/async_ssa_graph_executor.cc
paddle/fluid/framework/details/async_ssa_graph_executor.cc
+6
-0
paddle/fluid/framework/details/execution_strategy.h
paddle/fluid/framework/details/execution_strategy.h
+1
-0
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+12
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+8
-0
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
...eter_server/distribute_transpiler/distributed_strategy.py
+2
-1
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+8
-1
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+4
-10
python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+7
-5
未找到文件。
paddle/fluid/framework/details/async_ssa_graph_executor.cc
浏览文件 @
b0675c81
...
...
@@ -168,6 +168,12 @@ FeedFetchList AsyncSSAGraphExecutor::Run(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
)
{
// init once
if
(
run_futures_
.
size
()
==
0
&&
places_
.
size
()
>
1
)
{
if
(
strategy_
.
thread_barrier_
)
{
#ifdef PADDLE_WITH_DISTRIBUTE
operators
::
distributed
::
Communicator
::
GetInstance
()
->
BarrierTriggerReset
(
places_
.
size
());
#endif
}
exception_holder_
.
Clear
();
StartOffPythonTrainLoop
();
}
...
...
paddle/fluid/framework/details/execution_strategy.h
浏览文件 @
b0675c81
...
...
@@ -36,6 +36,7 @@ struct ExecutionStrategy {
ExecutorType
type_
{
kExperimental
};
// This is a debug option.
bool
dry_run_
{
false
};
bool
thread_barrier_
{
false
};
// only use with async_ssa_graph_executor
// and pyreader with data queue
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
b0675c81
...
...
@@ -16,6 +16,10 @@
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/communicator.h"
#endif
namespace
paddle
{
namespace
framework
{
namespace
details
{
...
...
@@ -337,8 +341,16 @@ bool ThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) {
// Wraps up a run for which exception_holder_ holds an exception.
//
// In PADDLE_WITH_DISTRIBUTE builds, when strategy_.thread_barrier_ is set,
// BarrierTriggerDecrement() is first called on the Communicator instance
// (NOTE(review): presumably so that peers waiting on the barrier are not
// blocked by this failing executor -- confirm against Communicator).
// Afterwards the exception type is logged at VLOG level 3, the fetch ops are
// removed from graph_ via ClearFetchOp, and the held exception is rethrown.
//
// fetch_ops: fetch op handles forwarded to ClearFetchOp.
void ThreadedSSAGraphExecutor::ExecutionFinal(
    std::vector<OpHandleBase *> *fetch_ops) {
#ifdef PADDLE_WITH_DISTRIBUTE
  if (strategy_.thread_barrier_) {
    auto &&communicator = operators::distributed::Communicator::GetInstance();
    communicator->BarrierTriggerDecrement();
  }
#endif

  VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it";

  ClearFetchOp(graph_, fetch_ops);
  exception_holder_.ReThrow();
}
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
b0675c81
...
...
@@ -1732,6 +1732,14 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(This config that how many iteration the executor will run when
user call exe.run() in python
)DOC"
)
.
def_property
(
"use_thread_barrier"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
thread_barrier_
;
},
[](
ExecutionStrategy
&
self
,
bool
use_thread_barrier
)
{
self
.
thread_barrier_
=
use_thread_barrier
;
},
R"DOC(This config that the this is distributed training with parameter server
)DOC"
)
.
def_property
(
"_dry_run"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
dry_run_
;
},
[](
ExecutionStrategy
&
self
,
bool
dry_run
)
{
...
...
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
浏览文件 @
b0675c81
...
...
@@ -196,8 +196,9 @@ class HalfAsyncStrategy(DistributedStrategy):
super
(
HalfAsyncStrategy
,
self
).
__init__
()
self
.
_program_config
.
sync_mode
=
False
self
.
_program_config
.
runtime_split_send_recv
=
True
self
.
_build_strategy
.
async_mode
=
True
self
.
_program_config
.
half_async
=
True
self
.
_build_strategy
.
async_mode
=
True
self
.
_execute_strategy
.
use_thread_barrier
=
True
class
GeoStrategy
(
DistributedStrategy
):
...
...
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
浏览文件 @
b0675c81
...
...
@@ -39,7 +39,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
For test CTR model, using Fleet api
"""
def
net
(
self
,
batch_size
=
4
,
lr
=
0.01
):
def
net
(
self
,
args
,
batch_size
=
4
,
lr
=
0.01
):
"""
network definition
...
...
@@ -72,6 +72,13 @@ class TestDistCTR2x2(FleetDistRunnerBase):
datas
=
[
dnn_data
,
lr_data
,
label
]
if
args
.
reader
==
"pyreader"
:
self
.
reader
=
fluid
.
io
.
PyReader
(
feed_list
=
datas
,
capacity
=
64
,
iterable
=
False
,
use_double_buffer
=
False
)
# build dnn model
dnn_layer_dims
=
[
128
,
128
,
64
,
32
,
1
]
dnn_embedding
=
fluid
.
layers
.
embedding
(
...
...
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
浏览文件 @
b0675c81
...
...
@@ -102,7 +102,7 @@ class FleetDistRunnerBase(object):
def
run_pserver
(
self
,
args
):
fleet
.
init
(
self
.
build_role
(
args
))
strategy
=
self
.
build_strategy
(
args
)
avg_cost
=
self
.
net
()
avg_cost
=
self
.
net
(
args
)
self
.
build_optimizer
(
avg_cost
,
strategy
)
fleet
.
init_server
()
...
...
@@ -111,24 +111,18 @@ class FleetDistRunnerBase(object):
def
run_dataset_trainer
(
self
,
args
):
fleet
.
init
(
self
.
build_role
(
args
))
strategy
=
self
.
build_strategy
(
args
)
avg_cost
=
self
.
net
()
avg_cost
=
self
.
net
(
args
)
self
.
build_optimizer
(
avg_cost
,
strategy
)
out
=
self
.
do_dataset_training
(
fleet
)
def
run_pyreader_trainer
(
self
,
args
):
fleet
.
init
(
self
.
build_role
(
args
))
strategy
=
self
.
build_strategy
(
args
)
avg_cost
=
self
.
net
()
self
.
reader
=
fluid
.
io
.
PyReader
(
feed_list
=
self
.
feeds
,
capacity
=
64
,
iterable
=
False
,
use_double_buffer
=
False
)
avg_cost
=
self
.
net
(
args
)
self
.
build_optimizer
(
avg_cost
,
strategy
)
out
=
self
.
do_pyreader_training
(
fleet
)
def
net
(
self
,
batch_size
=
4
,
lr
=
0.01
):
def
net
(
self
,
args
,
batch_size
=
4
,
lr
=
0.01
):
raise
NotImplementedError
(
"get_model should be implemented by child classes."
)
...
...
python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
浏览文件 @
b0675c81
...
...
@@ -34,7 +34,8 @@ class TestDistMnistSync2x2(TestFleetBase):
"PYTHONPATH"
:
os
.
getenv
(
"PYTHONPATH"
,
""
),
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
,
""
),
"FLAGS_rpc_deadline"
:
"5000"
,
# 5sec to fail fast
"http_proxy"
:
""
"http_proxy"
:
""
,
"CPU_NUM"
:
"2"
}
required_envs
.
update
(
need_envs
)
...
...
@@ -65,7 +66,8 @@ class TestDistMnistAsync2x2(TestFleetBase):
"PYTHONPATH"
:
os
.
getenv
(
"PYTHONPATH"
,
""
),
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
,
""
),
"FLAGS_rpc_deadline"
:
"5000"
,
# 5sec to fail fast
"http_proxy"
:
""
"http_proxy"
:
""
,
"CPU_NUM"
:
"2"
}
required_envs
.
update
(
need_envs
)
...
...
@@ -129,9 +131,9 @@ class TestDistCtrHalfAsync2x2(TestFleetBase):
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
,
""
),
"FLAGS_rpc_deadline"
:
"30000"
,
# 30sec to fail fast
"http_proxy"
:
""
,
"FLAGS_communicator_send_queue_size"
:
"
1
"
,
"FLAGS_communicator_max_merge_var_num"
:
"
1
"
,
"CPU_NUM"
:
"
1
"
,
"FLAGS_communicator_send_queue_size"
:
"
2
"
,
"FLAGS_communicator_max_merge_var_num"
:
"
2
"
,
"CPU_NUM"
:
"
2
"
,
"SAVE_MODEL"
:
"0"
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录