PaddlePaddle / PaddleDetection
Commit d4b461eb

Unified ParallelExecutor and Compiler (#15970)

* Unified ParallelExecutor and Compiler

Authored Mar 01, 2019 by chengduo · Committed by ceci3 on Mar 04, 2019
Parent: ea9d6731
Showing 4 changed files with 65 additions and 179 deletions (+65 −179)
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc   +3 −1
python/paddle/fluid/compiler.py                                      +43 −29
python/paddle/fluid/framework.py                                     +0 −9
python/paddle/fluid/parallel_executor.py                             +19 −140
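Taken together, the four diffs turn ParallelExecutor into a thin compatibility wrapper: device discovery moves into a new compiler.get_available_places helper, the pserver check becomes a private helper in compiler.py, and ParallelExecutor builds a CompiledProgram internally and runs it through the unified Executor. The deprecation notice retained in parallel_executor.py points users at exactly that pattern ("Please use CompiledProgram and Executor... Example can be found in compiler.py"). Below is a minimal sketch of the recommended flow, assuming the 1.3-era Fluid API at this commit; the toy regression network is illustrative and not part of the commit.

```python
# Minimal sketch of the CompiledProgram + Executor pattern this commit
# converges on. The small fc/square_error network is a placeholder.
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
avg_cost = fluid.layers.mean(
    fluid.layers.square_error_cost(input=pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)

place = fluid.CPUPlace()               # or fluid.CUDAPlace(0)
exe = fluid.Executor(place)            # the unified executor
exe.run(fluid.default_startup_program())

# Compile to a Graph; with_data_parallel picks the execution places
# (selected GPUs, or CPU_NUM CPU places) via get_available_places.
compiled = fluid.compiler.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(
        loss_name=avg_cost.name)

loss, = exe.run(compiled,
                feed={'x': np.random.random((32, 13)).astype('float32'),
                      'y': np.random.random((32, 1)).astype('float32')},
                fetch_list=[avg_cost.name])
```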
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc

```diff
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -55,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   std::vector<FetchOpHandle *> fetch_ops;
 
   for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
         fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
```
python/paddle/fluid/compiler.py

```diff
@@ -17,7 +17,6 @@ import os
 import six
 import sys
 from .. import compat as cpt
-from . import framework
 from . import core
 from . import framework
@@ -36,6 +35,30 @@ def _place_obj(place):
     return p
 
 
+def _is_pserver_mode(main_program):
+    main = main_program if main_program \
+        else default_main_program()
+    for op in main.global_block().ops:
+        if op.type in ["send", "recv"]:
+            return True
+    return False
+
+
+def get_available_places(use_cuda):
+    if use_cuda:
+        gpus_env = os.getenv("FLAGS_selected_gpus")
+        if gpus_env:
+            gpus = [int(s) for s in gpus_env.split(",")]
+        else:
+            gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
+        places = [core.CUDAPlace(i) for i in gpus]
+    else:
+        cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+        places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
+    assert places, "no place for execution"
+    return places
+
+
 class CompiledProgram(object):
     """
     Compiles to Graph for execution.
@@ -127,8 +150,7 @@ class CompiledProgram(object):
             self._exec_strategy = ExecutionStrategy()
         if self._build_strategy is None:
             self._build_strategy = BuildStrategy()
-        self._build_strategy.is_distribution = framework.is_pserver_mode(
-            self._program)
+        self._build_strategy.is_distribution = _is_pserver_mode(self._program)
         return self
 
     def with_inference_optimize(self, config):
@@ -153,9 +175,9 @@ class CompiledProgram(object):
     def _with_distributed(self):
         raise NotImplementedError()
 
-    def _compile_data_parallel(self):
+    def _compile_data_parallel(self, use_cuda=False, scope=None):
         if self._share_vars_from:
-            if self._scope:
+            if scope:
                 sys.stderr.write("share_vars_from is set, scope is ignored.\n")
             if not self._share_vars_from._is_data_parallel:
                 raise ValueError("share_vars_from is not data parallel. Cannot "
@@ -166,23 +188,11 @@ class CompiledProgram(object):
                     "var to share.")
             self._local_scopes = self._share_vars_from._executor.local_scopes()
         else:
+            assert scope is not None, ""
             self._local_scopes = []
 
-        self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
-        if self._exec_strategy.use_cuda:
-            gpus_env = os.getenv("FLAGS_selected_gpus")
-            if gpus_env:
-                gpus = [int(s) for s in gpus_env.split(",")]
-            else:
-                gpus = [
-                    i for i in six.moves.range(core.get_cuda_device_count())
-                ]
-            self._places = [core.CUDAPlace(i) for i in gpus]
-        else:
-            cpu_num = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
-        assert self._places, "no place for execution"
+        self._exec_strategy.use_cuda = use_cuda
+        self._places = get_available_places(self._exec_strategy.use_cuda)
 
         if self._exec_strategy.num_threads == 0:
             if self._exec_strategy.use_cuda:
@@ -197,9 +207,11 @@ class CompiledProgram(object):
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
         if self._build_strategy.memory_optimize is None:
-            self._build_strategy.memory_optimize = False if self._program and self._program._is_mem_optimized else True
+            self._build_strategy.memory_optimize = False \
+                if self._program and self._program._is_mem_optimized else True
         if self._build_strategy.enable_inplace is None:
-            self._build_strategy.enable_inplace = False if self._program and self._program._is_mem_optimized else True
+            self._build_strategy.enable_inplace = False \
+                if self._program and self._program._is_mem_optimized else True
 
         # TODO(wuyi): trainer endpoings should be passed in through
         # build_strategy, not program.xxx.
@@ -221,12 +233,12 @@ class CompiledProgram(object):
         places = list(map(_place_obj, self._places))
 
         return core.ParallelExecutor(
             places,
             set(self._persistable_vars),
             cpt.to_text(self._loss_name)
-            if self._loss_name else six.u(''), self._scope, self._local_scopes,
+            if self._loss_name else six.u(''), scope, self._local_scopes,
             self._exec_strategy, self._build_strategy, self._graph)
 
     def _compile_inference(self):
         return core.create_paddle_predictor(self._infer_config)
@@ -253,7 +265,9 @@ class CompiledProgram(object):
         self._scope = scope
         self._place = place
         if self._is_data_parallel:
-            self._executor = self._compile_data_parallel()
+            self._executor = self._compile_data_parallel(
+                use_cuda=isinstance(self._place, core.CUDAPlace),
+                scope=self._scope)
         elif self._is_inference:
             self._executor = self._compile_inference()
         else:
```
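The new get_available_places becomes the single source of truth for device enumeration that was previously duplicated in both CompiledProgram and ParallelExecutor: FLAGS_selected_gpus narrows the GPU set, and CPU_NUM (defaulting to the host core count) decides how many CPU places to replicate. A standalone re-creation of that selection logic, with string stand-ins for core.CUDAPlace/core.CPUPlace and a hypothetical device_count parameter in place of core.get_cuda_device_count(), so it runs without a Paddle build:

```python
# Standalone sketch of get_available_places' selection logic.
# "CUDAPlace(i)"/"CPUPlace()" strings stand in for the core objects.
import multiprocessing
import os


def get_available_places(use_cuda, device_count=0):
    if use_cuda:
        gpus_env = os.getenv("FLAGS_selected_gpus")   # e.g. "0,2,3"
        if gpus_env:
            gpus = [int(s) for s in gpus_env.split(",")]
        else:
            gpus = list(range(device_count))          # all visible GPUs
        places = ["CUDAPlace(%d)" % i for i in gpus]
    else:
        cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
        places = ["CPUPlace()" for _ in range(cpu_num)]
    assert places, "no place for execution"
    return places


os.environ['CPU_NUM'] = '2'
print(get_available_places(use_cuda=False))  # ['CPUPlace()', 'CPUPlace()']
```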
python/paddle/fluid/framework.py

```diff
@@ -87,15 +87,6 @@ def _current_expected_place():
     return _imperative_current_expected_place_
 
 
-def is_pserver_mode(main_program):
-    main = main_program if main_program \
-        else default_main_program()
-    for op in main.global_block().ops:
-        if op.type in ["send", "recv"]:
-            return True
-    return False
-
-
 class NameScope(object):
     def __init__(self, name="", parent=None):
         self._children = dict()
```
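is_pserver_mode leaves framework.py because compiler.py now carries its own private copy (_is_pserver_mode, added above), so compiler no longer has to reach back into framework for it. The rule itself is unchanged; a toy sketch with hypothetical stand-in classes (Op/Block/Program are not the Fluid classes) shows the detection:

```python
# Toy sketch: a program counts as pserver-mode when its global block
# contains a distribution op ("send" or "recv").
class Op(object):
    def __init__(self, type_):
        self.type = type_


class Program(object):
    def __init__(self, op_types):
        self._ops = [Op(t) for t in op_types]

    def global_block(self):
        return self  # toy: the program is its own (only) block

    @property
    def ops(self):
        return self._ops


def _is_pserver_mode(main_program):
    for op in main_program.global_block().ops:
        if op.type in ["send", "recv"]:
            return True
    return False


print(_is_pserver_mode(Program(["mul", "send"])))  # True
print(_is_pserver_mode(Program(["mul", "relu"])))  # False
```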
python/paddle/fluid/parallel_executor.py

```diff
@@ -13,15 +13,11 @@
 # limitations under the License.
 
 from __future__ import print_function
-import multiprocessing
 from . import core
 from . import framework
 from . import executor
-from .. import compat as cpt
-import warnings
+from . import compiler
 import sys
-import six
-import os
 
 __all__ = ['ParallelExecutor']
@@ -97,99 +93,27 @@ class ParallelExecutor(object):
             'Please use CompiledProgram and Executor. CompiledProgram '
             'is a central place for optimization and Executor is the '
             'unified executor. Example can be found in compiler.py.\n')
 
-        # step1: get places, the places are used in run too.
-        self._places = []
-        if use_cuda:
-            gpus_env = os.getenv("FLAGS_selected_gpus")
-            if gpus_env:
-                gpus = [int(s) for s in gpus_env.split(",")]
-            else:
-                gpus = [
-                    i for i in six.moves.range(core.get_cuda_device_count())
-                ]
-            self._places = [core.CUDAPlace(i) for i in gpus]
-        else:
-            cpu_num = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
-        assert self._places, "no place for execution"
-
-        # step2: init exec_strategy
-        if exec_strategy is None:
-            exec_strategy = ExecutionStrategy()
-        exec_strategy.use_cuda = use_cuda
-        if exec_strategy.num_threads == 0:
-            if use_cuda:
-                # Experiments on se-resnext shows that too many threads hurt
-                # performance. Worth tunning for other models in the future.
-                exec_strategy.num_threads = len(self._places) * 4
-            else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                exec_strategy.num_threads = cpu_num * 2
-
-        # step3: init build_strategy
         if build_strategy is None:
             build_strategy = BuildStrategy()
         build_strategy.num_trainers = num_trainers
         build_strategy.trainer_id = trainer_id
-        # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
-        # num_trainers is 1, so the current fields of build_strategy doesn't tell if
-        # it's distributed model.
-        build_strategy.is_distribution = framework.is_pserver_mode(
-            main_program) or num_trainers > 1
-
-        # step4: get main_program, scope, local_scopes
-        main = main_program if main_program \
-            else framework.default_main_program()
-        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
-        # if turn on python memory optimize, turn off the inplace_pass.
-        if build_strategy.memory_optimize is None:
-            build_strategy.memory_optimize = False if main._is_mem_optimized else True
-        if build_strategy.enable_inplace is None:
-            build_strategy.enable_inplace = False if main._is_mem_optimized else True
-        scope = scope if scope is not None else executor.global_scope()
-
-        if share_vars_from and not isinstance(share_vars_from,
-                                              ParallelExecutor):
-            raise TypeError("share_vars_from must be ParallelExecutor.")
-
-        local_scopes = share_vars_from.executor.local_scopes() \
-            if share_vars_from else []
-
-        # step5: check trainers_endpoints, it is used for distribution.
-        trainers_endpoints = main._trainers_endpoints
-        if num_trainers > 1 and trainers_endpoints:
-            assert num_trainers == len(
-                trainers_endpoints), "num_trainers == len(endpoints)"
-            build_strategy.trainers_endpoints = trainers_endpoints
-
-        # step6: get persistable_vars, places. persistable_vars
-        # need be broadcast to other local_scope.
-        persistable_vars = set([
-            cpt.to_text(v.name) for v in [
-                var for var in main.list_vars()
-                if var.persistable and var.type != core.VarDesc.VarType.RAW
-            ]
-        ])
-
-        def place_obj(place):
-            p = core.Place()
-            p.set_place(place)
-            return p
-
-        places = list(map(place_obj, self._places))
-
-        # step7: init ParallelExecutor
-        self._graph = core.Graph(main.desc)
-        self.executor = core.ParallelExecutor(
-            places, persistable_vars,
-            cpt.to_text(loss_name)
-            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
-            build_strategy, self._graph)
-
-        self.scope = scope
+        # ParallelExecutor API will be deprecated, don't support parallel graph.
+        self._places = compiler.get_available_places(use_cuda)
+        self._scope = scope if scope is not None else executor.global_scope()
+
+        main_program = main_program if main_program is not None \
+            else framework.default_main_program()
+
+        self._compiled_program = compiler.CompiledProgram(main_program)
+        self._compiled_program.with_data_parallel(
+            loss_name=loss_name,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy,
+            share_vars_from=share_vars_from)
+        self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
+        self._executor = executor.Executor(self._place)
+        self._compiled_program._compile(place=self._place, scope=self._scope)
 
     def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
         """
@@ -256,56 +180,11 @@ class ParallelExecutor(object):
             loss = pe.run(feed=feeder.feed(cur_batch),
                           fetch_list=[avg_cost.name]))
         """
-        if feed is None and feed_dict is not None:
-            feed = feed_dict
-            print(
-                "`feed_dict` is deprecated. Please use `feed=`",
-                file=sys.stderr)
-
-        if isinstance(feed, dict):
-            feed_tensor_dict = dict()
-            for feed_name in feed:
-                feed_tensor = feed[feed_name]
-                if not isinstance(feed_tensor, core.LoDTensor):
-                    feed_tensor = core.LoDTensor()
-                    # always set to CPU place, since the tensor need to be splitted
-                    # it is fast in CPU
-                    feed_tensor.set(feed[feed_name], core.CPUPlace())
-                feed_tensor_dict[feed_name] = feed_tensor
-
-            self.executor.feed_and_split_tensor_into_local_scopes(
-                feed_tensor_dict)
-        elif isinstance(feed, list) or isinstance(feed, tuple):
-            if len(feed) != len(self._places):
-                raise ValueError(
-                    "Feed a list of tensor, the list should be the same size as places"
-                )
-
-            res = list()
-            for i, each in enumerate(feed):
-                if not isinstance(each, dict):
-                    raise TypeError(
-                        "Each element of feed list should be a dict")
-                res_dict = dict()
-                for feed_name in each:
-                    tensor = each[feed_name]
-                    if not isinstance(tensor, core.LoDTensor):
-                        tmp = core.LoDTensor()
-                        tmp.set(tensor, self._places[i])
-                        tensor = tmp
-                    res_dict[feed_name] = tensor
-                res.append(res_dict)
-            self.executor.feed_tensors_into_local_scopes(res)
-
-        fetch_var_name = 'fetch'
-        self.executor.run(fetch_list, fetch_var_name)
-        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
-
-        if return_numpy:
-            return executor.as_numpy(arr)
-
-        return [arr[i] for i in range(len(arr))]
+        return self._executor.run(program=self._compiled_program,
+                                  scope=self._scope,
+                                  feed=feed,
+                                  fetch_list=fetch_list,
+                                  return_numpy=return_numpy)
 
     @property
     def device_count(self):
```
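From the caller's side the class stays source-compatible: run() keeps its signature, but the hand-rolled feed splitting and fetch handling are gone, replaced by a single forward to Executor.run on the internally compiled program. A usage sketch based on the training loop in the class docstring, assuming the 1.3-era Fluid API; the toy fc/square_error network and random feed data are placeholders:

```python
# Same public surface as before the commit; internally this now builds a
# CompiledProgram and forwards run() to the unified Executor.
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
avg_cost = fluid.layers.mean(
    fluid.layers.square_error_cost(input=pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

pe = fluid.ParallelExecutor(use_cuda=False, loss_name=avg_cost.name)
loss, = pe.run(feed={'x': np.random.random((32, 13)).astype('float32'),
                     'y': np.random.random((32, 1)).astype('float32')},
               fetch_list=[avg_cost.name])
```

Note one behavioral edge: the old run() converted a deprecated feed_dict= argument into feed= with a warning; the new body ignores feed_dict entirely, so callers still on that keyword silently feed nothing.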