Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
b4aaa00a
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b4aaa00a
编写于
4月 17, 2018
作者:
Y
Yu Yang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Polish logic of ParallelExecutor
上级
2ab12ca2
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
96 addition
and
42 deletion
+96
-42
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+22
-12
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+11
-5
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+4
-3
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+53
-16
python/paddle/fluid/tests/unittests/test_parallel_executor.py
...on/paddle/fluid/tests/unittests/test_parallel_executor.py
+6
-6
未找到文件。
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
b4aaa00a
...
@@ -155,13 +155,9 @@ void ParallelExecutor::BCastParamsToGPUs(
...
@@ -155,13 +155,9 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
#endif
}
}
void
ParallelExecutor
::
Run
(
void
ParallelExecutor
::
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
)
{
const
std
::
string
&
fetched_var_name
,
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>
&
feed_tensors
)
{
platform
::
RecordBlock
b
(
0
);
platform
::
RecordBlock
b
(
0
);
SplitTensorToPlaces
(
feed_tensors
);
// Create local scopes.
// Create local scopes.
for
(
auto
&
scope
:
member_
->
local_scopes_
)
{
for
(
auto
&
scope
:
member_
->
local_scopes_
)
{
Scope
&
local_scope
=
scope
->
NewScope
();
Scope
&
local_scope
=
scope
->
NewScope
();
...
@@ -195,14 +191,28 @@ void ParallelExecutor::Run(
...
@@ -195,14 +191,28 @@ void ParallelExecutor::Run(
auto
&
local_scope
=
auto
&
local_scope
=
*
scope
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
();
*
scope
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
();
scope
->
DeleteScope
(
local_scope
);
scope
->
DeleteScope
(
local_scope
);
local_scope
=
nullptr
;
}
}
}
}
void
ParallelExecutor
::
SplitTensorToPlaces
(
void
ParallelExecutor
::
FeedTensorsIntoLocalScopes
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>
&
feed_tensors
)
{
const
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
LoDTensor
>>
&
tensors
)
{
for
(
auto
it
:
feed_tensors
)
{
PADDLE_ENFORCE_EQ
(
member_
->
local_scopes_
.
size
(),
tensors
.
size
());
auto
lod_tensors
=
it
.
second
.
SplitLoDTensor
(
member_
->
places_
);
for
(
size_t
i
=
0
;
i
<
tensors
.
size
();
++
i
)
{
auto
&
map
=
tensors
[
i
];
auto
*
scope
=
member_
->
local_scopes_
[
i
];
for
(
auto
&
pair
:
map
)
{
auto
*
trg
=
scope
->
Var
(
pair
.
first
)
->
GetMutable
<
LoDTensor
>
();
trg
->
ShareDataWith
(
pair
.
second
);
trg
->
set_lod
(
pair
.
second
.
lod
());
}
}
}
void
ParallelExecutor
::
FeedAndSplitTensorIntoLocalScopes
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>
&
tensors
)
{
for
(
auto
pair
:
tensors
)
{
auto
lod_tensors
=
pair
.
second
.
SplitLoDTensor
(
member_
->
places_
);
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
member_
->
places_
.
size
(),
lod_tensors
.
size
(),
member_
->
places_
.
size
(),
lod_tensors
.
size
(),
"The number of samples of current batch is less than the count of "
"The number of samples of current batch is less than the count of "
...
@@ -211,7 +221,7 @@ void ParallelExecutor::SplitTensorToPlaces(
...
@@ -211,7 +221,7 @@ void ParallelExecutor::SplitTensorToPlaces(
for
(
size_t
j
=
0
;
j
<
member_
->
places_
.
size
();
++
j
)
{
for
(
size_t
j
=
0
;
j
<
member_
->
places_
.
size
();
++
j
)
{
// TODO(panxy0718): Do I need to delete this var?
// TODO(panxy0718): Do I need to delete this var?
auto
t
=
auto
t
=
member_
->
local_scopes_
[
j
]
->
Var
(
it
.
first
)
->
GetMutable
<
LoDTensor
>
();
member_
->
local_scopes_
[
j
]
->
Var
(
pair
.
first
)
->
GetMutable
<
LoDTensor
>
();
t
->
ShareDataWith
(
lod_tensors
[
j
]);
t
->
ShareDataWith
(
lod_tensors
[
j
]);
t
->
set_lod
(
lod_tensors
[
j
].
lod
());
t
->
set_lod
(
lod_tensors
[
j
].
lod
());
}
}
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
b4aaa00a
...
@@ -44,16 +44,22 @@ class ParallelExecutor {
...
@@ -44,16 +44,22 @@ class ParallelExecutor {
std
::
vector
<
Scope
*>&
GetLocalScopes
();
std
::
vector
<
Scope
*>&
GetLocalScopes
();
/**
* Feed tensors to local scopes. The size of tensors should be equal to the
* size of local scopes.
*/
void
FeedTensorsIntoLocalScopes
(
const
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
LoDTensor
>>&
tensors
);
void
FeedAndSplitTensorIntoLocalScopes
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>&
tensors
);
void
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
,
void
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
,
const
std
::
string
&
fetched_var_name
);
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>&
feed_tensors
);
void
BCastParamsToGPUs
(
const
std
::
unordered_set
<
std
::
string
>&
vars
)
const
;
void
BCastParamsToGPUs
(
const
std
::
unordered_set
<
std
::
string
>&
vars
)
const
;
private:
private:
void
SplitTensorToPlaces
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>&
feed_tensors
);
ParallelExecutorPrivate
*
member_
;
ParallelExecutorPrivate
*
member_
;
};
};
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
b4aaa00a
...
@@ -514,9 +514,10 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -514,9 +514,10 @@ All parameter, weight, gradient are variables in Paddle.
return
&
self
.
GetLocalScopes
();
return
&
self
.
GetLocalScopes
();
},
},
py
::
return_value_policy
::
reference
)
py
::
return_value_policy
::
reference
)
.
def
(
"local_scope"
,
[](
ParallelExecutor
&
self
,
.
def
(
"feed_tensors_into_local_scopes"
,
size_t
i
)
{
return
self
.
GetLocalScopes
()[
i
];
},
&
ParallelExecutor
::
FeedTensorsIntoLocalScopes
)
py
::
return_value_policy
::
reference
)
.
def
(
"feed_and_split_tensor_into_local_scopes"
,
&
ParallelExecutor
::
FeedAndSplitTensorIntoLocalScopes
)
.
def
(
"run"
,
&
ParallelExecutor
::
Run
);
.
def
(
"run"
,
&
ParallelExecutor
::
Run
);
BindRecordIOWriter
(
&
m
);
BindRecordIOWriter
(
&
m
);
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
b4aaa00a
...
@@ -123,28 +123,65 @@ class ParallelExecutor(object):
...
@@ -123,28 +123,65 @@ class ParallelExecutor(object):
allow_op_delay
)
allow_op_delay
)
self
.
scope
=
scope
self
.
scope
=
scope
def
run
(
self
,
fetch_list
,
feed
_dict
=
{}
):
def
run
(
self
,
fetch_list
,
feed
=
None
,
feed_dict
=
None
):
"""
"""
:param fetch_list: A list of variable names that will be fetched.
:param feed_dict: A dict mapping for feed variable name to LoDTensor
or numpy array.
:return: fetched value list.
"""
if
not
isinstance
(
feed_dict
,
dict
):
raise
TypeError
(
"feed_dict should be a dict"
)
feed_tensor_dict
=
{}
Args:
for
i
,
feed_name
in
enumerate
(
feed_dict
):
fetch_list(list): The fetched variable names
feed_tensor
=
feed_dict
[
feed_name
]
feed(list|dict|None): The feed variables. If the feed is a dict, tensors in that dict will be splitted
if
not
isinstance
(
feed_tensor
,
core
.
LoDTensor
):
into each devices. If the feed is a list, each element of the list will be copied to each device.
feed_tensor
=
core
.
LoDTensor
()
feed_dict: Alias for feed parameter, for backward compatibility.
feed_tensor
.
set
(
feed_dict
[
feed_name
],
self
.
_act_places
[
0
])
feed_tensor_dict
[
feed_name
]
=
feed_tensor
Returns: fetched result list.
"""
if
feed
is
None
:
feed
=
feed_dict
if
isinstance
(
feed
,
dict
):
feed_tensor_dict
=
dict
()
for
feed_name
in
feed
:
feed_tensor
=
feed
[
feed_name
]
if
not
isinstance
(
feed_tensor
,
core
.
LoDTensor
):
feed_tensor
=
core
.
LoDTensor
()
# always set to CPU place, since the tensor need to be splitted
# it is fast in CPU
feed_tensor
.
set
(
feed
[
feed_name
],
core
.
CPUPlace
())
feed_tensor_dict
[
feed_name
]
=
feed_tensor
self
.
executor
.
feed_and_split_tensor_into_local_scopes
(
feed_tensor_dict
)
elif
isinstance
(
feed
,
list
)
or
isinstance
(
feed
,
tuple
):
if
len
(
feed
)
!=
len
(
self
.
_act_places
):
raise
ValueError
(
"Feed a list of tensor, the list should be the same size as places"
)
res
=
list
()
for
i
,
each
in
enumerate
(
feed
):
if
not
isinstance
(
each
,
dict
):
raise
TypeError
(
"Each element of feed list should be a dict"
)
res_dict
=
dict
()
for
feed_name
in
each
:
tensor
=
each
[
feed_name
]
if
not
isinstance
(
tensor
,
core
.
LoDTensor
):
tmp
=
core
.
LoDTensor
()
tmp
.
set
(
tensor
,
self
.
_act_places
[
i
])
tensor
=
tmp
res_dict
[
feed_name
]
=
tensor
res
.
append
(
res_dict
)
self
.
executor
.
feed_tensors_into_local_scopes
(
res
)
fetch_var_name
=
'@FETCHED_VAR_NAME@'
fetch_var_name
=
'@FETCHED_VAR_NAME@'
self
.
executor
.
run
(
fetch_list
,
fetch_var_name
,
feed_tensor_dict
)
self
.
executor
.
run
(
fetch_list
,
fetch_var_name
)
arr
=
self
.
scope
.
find_var
(
fetch_var_name
).
get_lod_tensor_array
()
arr
=
self
.
scope
.
find_var
(
fetch_var_name
).
get_lod_tensor_array
()
return
[
arr
[
i
]
for
i
in
range
(
len
(
arr
))]
return
[
arr
[
i
]
for
i
in
range
(
len
(
arr
))]
def
bcast_params
(
self
):
def
bcast_params
(
self
):
self
.
executor
.
bcast_params
(
set
(
self
.
persistable_vars
))
self
.
executor
.
bcast_params
(
set
(
self
.
persistable_vars
))
@
property
def
device_count
(
self
):
return
len
(
self
.
_act_places
)
python/paddle/fluid/tests/unittests/test_parallel_executor.py
浏览文件 @
b4aaa00a
...
@@ -203,11 +203,11 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -203,11 +203,11 @@ class TestParallelExecutorBase(unittest.TestCase):
iter
=
10
,
iter
=
10
,
batch_size
=
None
,
batch_size
=
None
,
allow_op_delay
=
False
,
allow_op_delay
=
False
,
feed_dict
=
{}
):
feed_dict
=
None
):
main
=
fluid
.
Program
()
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
,
startup
):
with
fluid
.
program_guard
(
main
,
startup
):
loss
=
method
(
use_feed
=
len
(
feed_dict
)
>
0
)
loss
=
method
(
use_feed
=
feed_dict
is
not
None
)
adam
=
fluid
.
optimizer
.
Adam
()
adam
=
fluid
.
optimizer
.
Adam
()
adam
.
minimize
(
loss
)
adam
.
minimize
(
loss
)
if
memory_opt
:
if
memory_opt
:
...
@@ -221,13 +221,13 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -221,13 +221,13 @@ class TestParallelExecutorBase(unittest.TestCase):
if
batch_size
is
not
None
:
if
batch_size
is
not
None
:
batch_size
*=
fluid
.
core
.
get_cuda_device_count
()
batch_size
*=
fluid
.
core
.
get_cuda_device_count
()
begin
=
time
.
time
()
begin
=
time
.
time
()
first_loss
,
=
exe
.
run
([
loss
.
name
],
feed
_dict
=
feed_dict
)
first_loss
,
=
exe
.
run
([
loss
.
name
],
feed
=
feed_dict
)
first_loss
=
numpy
.
array
(
first_loss
)
first_loss
=
numpy
.
array
(
first_loss
)
for
i
in
xrange
(
iter
):
for
i
in
xrange
(
iter
):
exe
.
run
([],
feed
_dict
=
feed_dict
)
exe
.
run
([],
feed
=
feed_dict
)
last_loss
,
=
exe
.
run
([
loss
.
name
],
feed
_dict
=
feed_dict
)
last_loss
,
=
exe
.
run
([
loss
.
name
],
feed
=
feed_dict
)
end
=
time
.
time
()
end
=
time
.
time
()
if
batch_size
is
not
None
:
if
batch_size
is
not
None
:
...
@@ -648,5 +648,5 @@ class TestCRFModel(unittest.TestCase):
...
@@ -648,5 +648,5 @@ class TestCRFModel(unittest.TestCase):
for
i
in
xrange
(
10
):
for
i
in
xrange
(
10
):
cur_batch
=
next
(
data
)
cur_batch
=
next
(
data
)
print
map
(
numpy
.
array
,
print
map
(
numpy
.
array
,
pe
.
run
(
feed
_dict
=
feeder
.
feed
(
cur_batch
),
pe
.
run
(
feed
=
feeder
.
feed
(
cur_batch
),
fetch_list
=
[
avg_cost
.
name
]))[
0
]
fetch_list
=
[
avg_cost
.
name
]))[
0
]
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录