Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
b123ce88
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b123ce88
编写于
4月 02, 2018
作者:
X
Xin Pan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add enable/disable for delayed ops
上级
be1373dc
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
41 addition
and
16 deletion
+41
-16
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+8
-4
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+3
-1
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+3
-3
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+4
-2
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+2
-2
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+7
-2
python/paddle/fluid/tests/unittests/test_parallel_executor.py
...on/paddle/fluid/tests/unittests/test_parallel_executor.py
+14
-2
未找到文件。
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
b123ce88
...
@@ -23,14 +23,15 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
...
@@ -23,14 +23,15 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
size_t
num_threads
,
bool
use_event
,
size_t
num_threads
,
bool
use_event
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
SSAGraph
>
&&
graph
)
std
::
unique_ptr
<
SSAGraph
>
&&
graph
,
bool
allow_op_delay
)
:
SSAGraphExecutor
(
std
::
move
(
graph
)),
:
SSAGraphExecutor
(
std
::
move
(
graph
)),
pool_
(
num_threads
>=
2
?
new
::
ThreadPool
(
num_threads
)
:
nullptr
),
pool_
(
num_threads
>=
2
?
new
::
ThreadPool
(
num_threads
)
:
nullptr
),
local_scopes_
(
local_scopes
),
local_scopes_
(
local_scopes
),
places_
(
places
),
places_
(
places
),
fetch_ctxs_
(
places
),
fetch_ctxs_
(
places
),
use_event_
(
use_event
),
use_event_
(
use_event
),
running_ops_
(
0
)
{}
running_ops_
(
0
),
allow_op_delay_
(
allow_op_delay
)
{}
void
ThreadedSSAGraphExecutor
::
RunDelayedOps
(
void
ThreadedSSAGraphExecutor
::
RunDelayedOps
(
const
std
::
unordered_set
<
OpHandleBase
*>
&
delayed_ops
)
{
const
std
::
unordered_set
<
OpHandleBase
*>
&
delayed_ops
)
{
...
@@ -119,7 +120,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
...
@@ -119,7 +120,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto
run_all_ready_ops
=
[
&
]
{
auto
run_all_ready_ops
=
[
&
]
{
for
(
auto
*
op
:
ready_ops
)
{
for
(
auto
*
op
:
ready_ops
)
{
if
(
op
->
IsMultiDeviceTransfer
())
{
if
(
op
->
IsMultiDeviceTransfer
()
&&
allow_op_delay_
)
{
delayed_ops
.
insert
(
op
);
delayed_ops
.
insert
(
op
);
delayed_vars
.
insert
(
op
->
outputs_
.
begin
(),
op
->
outputs_
.
end
());
delayed_vars
.
insert
(
op
->
outputs_
.
begin
(),
op
->
outputs_
.
end
());
ready_vars
.
Extend
(
op
->
outputs_
);
ready_vars
.
Extend
(
op
->
outputs_
);
...
@@ -138,7 +139,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
...
@@ -138,7 +139,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
}
// Step 3. Execution
// Step 3. Execution
while
(
!
pending_vars
.
empty
())
{
while
(
!
pending_vars
.
empty
()
||
!
ready_ops
.
empty
()
||
!
delayed_ops
.
empty
()
)
{
// 1. Run All Ready ops
// 1. Run All Ready ops
run_all_ready_ops
();
run_all_ready_ops
();
...
@@ -181,6 +182,9 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
...
@@ -181,6 +182,9 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
}
// Keep loop until all vars are ready.
// Keep loop until all vars are ready.
}
}
PADDLE_ENFORCE
(
ready_ops
.
empty
());
PADDLE_ENFORCE
(
delayed_ops
.
empty
());
PADDLE_ENFORCE
(
blocked_by_delayed_ops
.
empty
());
++
computation_count_
;
++
computation_count_
;
auto
sync_computation
=
[
&
]
{
auto
sync_computation
=
[
&
]
{
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
浏览文件 @
b123ce88
...
@@ -75,7 +75,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
...
@@ -75,7 +75,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor
(
size_t
num_threads
,
bool
use_event
,
ThreadedSSAGraphExecutor
(
size_t
num_threads
,
bool
use_event
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
SSAGraph
>
&&
graph
);
std
::
unique_ptr
<
SSAGraph
>
&&
graph
,
bool
allow_op_delay
);
// Run a SSAGraph by a thread pool
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
// Use topological sort algorithm
...
@@ -97,6 +98,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
...
@@ -97,6 +98,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
const
bool
use_event_
;
const
bool
use_event_
;
std
::
unique_ptr
<
platform
::
EnforceNotMet
>
exception_
;
std
::
unique_ptr
<
platform
::
EnforceNotMet
>
exception_
;
std
::
atomic
<
int
>
running_ops_
;
std
::
atomic
<
int
>
running_ops_
;
bool
allow_op_delay_
;
size_t
computation_count_
{
0
};
size_t
computation_count_
{
0
};
size_t
max_async_computation
{
100
};
size_t
max_async_computation
{
100
};
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
b123ce88
...
@@ -48,7 +48,7 @@ ParallelExecutor::ParallelExecutor(
...
@@ -48,7 +48,7 @@ ParallelExecutor::ParallelExecutor(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
unordered_set
<
std
::
string
>
&
params
,
const
std
::
unordered_set
<
std
::
string
>
&
params
,
const
ProgramDesc
&
startup_program
,
const
ProgramDesc
&
main_program
,
const
ProgramDesc
&
startup_program
,
const
ProgramDesc
&
main_program
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
)
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
bool
allow_op_delay
)
:
member_
(
new
ParallelExecutorPrivate
(
places
))
{
:
member_
(
new
ParallelExecutorPrivate
(
places
))
{
member_
->
global_scope_
=
scope
;
member_
->
global_scope_
=
scope
;
...
@@ -83,8 +83,8 @@ ParallelExecutor::ParallelExecutor(
...
@@ -83,8 +83,8 @@ ParallelExecutor::ParallelExecutor(
auto
graph
=
builder
.
Build
(
main_program
);
auto
graph
=
builder
.
Build
(
main_program
);
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
num_threads
,
use_event
,
member_
->
local_scopes_
,
places
,
num_threads
,
use_event
,
member_
->
local_scopes_
,
places
,
std
::
move
(
graph
),
std
::
move
(
graph
)
));
allow_op_delay
));
// Step 3. Create vars in each scope;
// Step 3. Create vars in each scope;
for
(
auto
*
scope
:
member_
->
local_scopes_
)
{
for
(
auto
*
scope
:
member_
->
local_scopes_
)
{
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
b123ce88
...
@@ -14,8 +14,9 @@ limitations under the License. */
...
@@ -14,8 +14,9 @@ limitations under the License. */
#pragma once
#pragma once
#include <
future
>
#include <
string
>
#include <unordered_set>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/program_desc.h"
...
@@ -37,7 +38,8 @@ class ParallelExecutor {
...
@@ -37,7 +38,8 @@ class ParallelExecutor {
const
std
::
unordered_set
<
std
::
string
>&
params
,
const
std
::
unordered_set
<
std
::
string
>&
params
,
const
ProgramDesc
&
startup_program
,
const
ProgramDesc
&
startup_program
,
const
ProgramDesc
&
main_program
,
const
ProgramDesc
&
main_program
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
);
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
bool
allow_op_delay
);
void
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
,
void
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
=
"fetched_var"
);
const
std
::
string
&
fetched_var_name
=
"fetched_var"
);
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
b123ce88
...
@@ -504,10 +504,10 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -504,10 +504,10 @@ All parameter, weight, gradient are variables in Paddle.
const
std
::
unordered_set
<
std
::
string
>
&
params
,
const
std
::
unordered_set
<
std
::
string
>
&
params
,
const
ProgramDesc
&
startup_program
,
const
ProgramDesc
&
startup_program
,
const
ProgramDesc
&
main_program
,
const
std
::
string
&
loss_var_name
,
const
ProgramDesc
&
main_program
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
)
{
Scope
*
scope
,
bool
allow_op_delay
)
{
new
(
&
self
)
ParallelExecutor
(
num_threads
,
use_event
,
places
,
new
(
&
self
)
ParallelExecutor
(
num_threads
,
use_event
,
places
,
params
,
startup_program
,
main_program
,
params
,
startup_program
,
main_program
,
loss_var_name
,
scope
);
loss_var_name
,
scope
,
allow_op_delay
);
})
})
.
def
(
"run"
,
&
ParallelExecutor
::
Run
);
.
def
(
"run"
,
&
ParallelExecutor
::
Run
);
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
b123ce88
...
@@ -21,7 +21,11 @@ __all__ = ['ParallelExecutor']
...
@@ -21,7 +21,11 @@ __all__ = ['ParallelExecutor']
class
ParallelExecutor
(
object
):
class
ParallelExecutor
(
object
):
def
__init__
(
self
,
loss_name
,
use_cuda
,
num_threads
=
None
):
def
__init__
(
self
,
loss_name
,
use_cuda
,
num_threads
=
None
,
allow_op_delay
=
False
):
places
=
[]
places
=
[]
if
use_cuda
:
if
use_cuda
:
for
i
in
xrange
(
core
.
get_cuda_device_count
()):
for
i
in
xrange
(
core
.
get_cuda_device_count
()):
...
@@ -57,7 +61,8 @@ class ParallelExecutor(object):
...
@@ -57,7 +61,8 @@ class ParallelExecutor(object):
startup
.
desc
,
startup
.
desc
,
main
.
desc
,
main
.
desc
,
loss_name
,
loss_name
,
scope
)
scope
,
allow_op_delay
)
self
.
scope
=
scope
self
.
scope
=
scope
def
run
(
self
,
fetch_list
):
def
run
(
self
,
fetch_list
):
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor.py
浏览文件 @
b123ce88
...
@@ -184,7 +184,8 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -184,7 +184,8 @@ class TestParallelExecutorBase(unittest.TestCase):
method
,
method
,
memory_opt
=
True
,
memory_opt
=
True
,
iter
=
10
,
iter
=
10
,
batch_size
=
None
):
batch_size
=
None
,
allow_op_delay
=
False
):
main
=
fluid
.
Program
()
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
,
startup
):
with
fluid
.
program_guard
(
main
,
startup
):
...
@@ -194,7 +195,10 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -194,7 +195,10 @@ class TestParallelExecutorBase(unittest.TestCase):
if
memory_opt
:
if
memory_opt
:
fluid
.
memory_optimize
(
main
)
fluid
.
memory_optimize
(
main
)
exe
=
fluid
.
ParallelExecutor
(
loss_name
=
loss
.
name
,
use_cuda
=
True
)
exe
=
fluid
.
ParallelExecutor
(
loss_name
=
loss
.
name
,
use_cuda
=
True
,
allow_op_delay
=
allow_op_delay
)
if
batch_size
is
not
None
:
if
batch_size
is
not
None
:
batch_size
*=
fluid
.
core
.
get_cuda_device_count
()
batch_size
*=
fluid
.
core
.
get_cuda_device_count
()
begin
=
time
.
time
()
begin
=
time
.
time
()
...
@@ -236,9 +240,11 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -236,9 +240,11 @@ class TestMNIST(TestParallelExecutorBase):
def
test_simple_fc
(
self
):
def
test_simple_fc
(
self
):
self
.
check_network_convergence
(
simple_fc_net
)
self
.
check_network_convergence
(
simple_fc_net
)
self
.
check_network_convergence
(
simple_fc_net
,
allow_op_delay
=
True
)
def
test_batchnorm_fc
(
self
):
def
test_batchnorm_fc
(
self
):
self
.
check_network_convergence
(
fc_with_batchnorm
)
self
.
check_network_convergence
(
fc_with_batchnorm
)
self
.
check_network_convergence
(
fc_with_batchnorm
,
allow_op_delay
=
True
)
class
TestResnet
(
TestParallelExecutorBase
):
class
TestResnet
(
TestParallelExecutorBase
):
...
@@ -268,6 +274,12 @@ class TestResnet(TestParallelExecutorBase):
...
@@ -268,6 +274,12 @@ class TestResnet(TestParallelExecutorBase):
SE_ResNeXt152
,
batch_size
=
batch_size
),
SE_ResNeXt152
,
batch_size
=
batch_size
),
iter
=
20
,
iter
=
20
,
batch_size
=
batch_size
)
batch_size
=
batch_size
)
self
.
check_network_convergence
(
functools
.
partial
(
SE_ResNeXt152
,
batch_size
=
batch_size
),
iter
=
20
,
batch_size
=
batch_size
,
allow_op_delay
=
True
)
class
ModelHyperParams
(
object
):
class
ModelHyperParams
(
object
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录