Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
fc6f0be2
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
fc6f0be2
编写于
4月 17, 2018
作者:
Y
Yu Yang
提交者:
GitHub
4月 17, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #9942 from reyoung/feature/tuning_pe_trans
Feature/tuning pe trans
上级
b53f7e2c
72869543
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
139 addition
and
39 deletion
+139
-39
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+22
-12
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+11
-5
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+8
-0
paddle/fluid/pybind/tensor_py.h
paddle/fluid/pybind/tensor_py.h
+10
-0
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+82
-16
python/paddle/fluid/tests/unittests/test_parallel_executor.py
...on/paddle/fluid/tests/unittests/test_parallel_executor.py
+6
-6
未找到文件。
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
fc6f0be2
...
...
@@ -155,13 +155,9 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
}
void
ParallelExecutor
::
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
,
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>
&
feed_tensors
)
{
void
ParallelExecutor
::
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
)
{
platform
::
RecordBlock
b
(
0
);
SplitTensorToPlaces
(
feed_tensors
);
// Create local scopes.
for
(
auto
&
scope
:
member_
->
local_scopes_
)
{
Scope
&
local_scope
=
scope
->
NewScope
();
...
...
@@ -195,14 +191,28 @@ void ParallelExecutor::Run(
auto
&
local_scope
=
*
scope
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
();
scope
->
DeleteScope
(
local_scope
);
local_scope
=
nullptr
;
}
}
void
ParallelExecutor
::
SplitTensorToPlaces
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>
&
feed_tensors
)
{
for
(
auto
it
:
feed_tensors
)
{
auto
lod_tensors
=
it
.
second
.
SplitLoDTensor
(
member_
->
places_
);
void
ParallelExecutor
::
FeedTensorsIntoLocalScopes
(
const
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
LoDTensor
>>
&
tensors
)
{
PADDLE_ENFORCE_EQ
(
member_
->
local_scopes_
.
size
(),
tensors
.
size
());
for
(
size_t
i
=
0
;
i
<
tensors
.
size
();
++
i
)
{
auto
&
map
=
tensors
[
i
];
auto
*
scope
=
member_
->
local_scopes_
[
i
];
for
(
auto
&
pair
:
map
)
{
auto
*
trg
=
scope
->
Var
(
pair
.
first
)
->
GetMutable
<
LoDTensor
>
();
trg
->
ShareDataWith
(
pair
.
second
);
trg
->
set_lod
(
pair
.
second
.
lod
());
}
}
}
void
ParallelExecutor
::
FeedAndSplitTensorIntoLocalScopes
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>
&
tensors
)
{
for
(
auto
pair
:
tensors
)
{
auto
lod_tensors
=
pair
.
second
.
SplitLoDTensor
(
member_
->
places_
);
PADDLE_ENFORCE_EQ
(
member_
->
places_
.
size
(),
lod_tensors
.
size
(),
"The number of samples of current batch is less than the count of "
...
...
@@ -211,7 +221,7 @@ void ParallelExecutor::SplitTensorToPlaces(
for
(
size_t
j
=
0
;
j
<
member_
->
places_
.
size
();
++
j
)
{
// TODO(panxy0718): Do I need to delete this var?
auto
t
=
member_
->
local_scopes_
[
j
]
->
Var
(
it
.
first
)
->
GetMutable
<
LoDTensor
>
();
member_
->
local_scopes_
[
j
]
->
Var
(
pair
.
first
)
->
GetMutable
<
LoDTensor
>
();
t
->
ShareDataWith
(
lod_tensors
[
j
]);
t
->
set_lod
(
lod_tensors
[
j
].
lod
());
}
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
fc6f0be2
...
...
@@ -44,16 +44,22 @@ class ParallelExecutor {
std
::
vector
<
Scope
*>&
GetLocalScopes
();
/**
* Feed tensors to local scopes. The size of tensors should be equal to the
* size of local scopes.
*/
void
FeedTensorsIntoLocalScopes
(
const
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
LoDTensor
>>&
tensors
);
void
FeedAndSplitTensorIntoLocalScopes
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>&
tensors
);
void
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
,
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>&
feed_tensors
);
const
std
::
string
&
fetched_var_name
);
void
BCastParamsToGPUs
(
const
std
::
unordered_set
<
std
::
string
>&
vars
)
const
;
private:
void
SplitTensorToPlaces
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>&
feed_tensors
);
ParallelExecutorPrivate
*
member_
;
};
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
fc6f0be2
...
...
@@ -505,11 +505,19 @@ All parameter, weight, gradient are variables in Paddle.
scope
,
local_scopes
,
allow_op_delay
);
})
.
def
(
"bcast_params"
,
&
ParallelExecutor
::
BCastParamsToGPUs
)
// NOTE: even if we return a vec<Scope*>* to Python using the reference
// policy, we still cannot get local_scope from this vector, since the
// elements of vec<Scope*> will be freed by Python GC. We can only return
// Scope* one by one and mark each of them as a reference.
.
def
(
"local_scopes"
,
[](
ParallelExecutor
&
self
)
->
std
::
vector
<
Scope
*>
*
{
return
&
self
.
GetLocalScopes
();
},
py
::
return_value_policy
::
reference
)
.
def
(
"feed_tensors_into_local_scopes"
,
&
ParallelExecutor
::
FeedTensorsIntoLocalScopes
)
.
def
(
"feed_and_split_tensor_into_local_scopes"
,
&
ParallelExecutor
::
FeedAndSplitTensorIntoLocalScopes
)
.
def
(
"run"
,
&
ParallelExecutor
::
Run
);
BindRecordIOWriter
(
&
m
);
...
...
paddle/fluid/pybind/tensor_py.h
浏览文件 @
fc6f0be2
...
...
@@ -190,6 +190,11 @@ void PyCUDATensorSetFromArray(
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
place
));
paddle
::
platform
::
GpuMemcpyAsync
(
dst
,
array
.
data
(),
sizeof
(
T
)
*
array
.
size
(),
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
());
// NOTE: For safety, wait here until the copy completes.
// This is because the CPU buffer array.data() could be destroyed after this
// method returns. If this method were left asynchronous, it could copy data
// from a memory buffer that has already been freed.
dev_ctx
->
Wait
();
}
template
<
>
...
...
@@ -216,6 +221,11 @@ void PyCUDATensorSetFromArray(
paddle
::
platform
::
GpuMemcpyAsync
(
dst
,
array
.
data
(),
sizeof
(
uint16_t
)
*
array
.
size
(),
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
());
// NOTE: For safety, wait here until the copy completes.
// This is because the CPU buffer array.data() could be destroyed after this
// method returns. If this method were left asynchronous, it could copy data
// from a memory buffer that has already been freed.
dev_ctx
->
Wait
();
}
template
<
typename
T
>
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
fc6f0be2
...
...
@@ -16,6 +16,7 @@ import core
import
multiprocessing
import
framework
import
executor
import
sys
__all__
=
[
'ParallelExecutor'
]
...
...
@@ -123,28 +124,93 @@ class ParallelExecutor(object):
allow_op_delay
)
self
.
scope
=
scope
def
run
(
self
,
fetch_list
,
feed
_dict
=
{}
):
def
run
(
self
,
fetch_list
,
feed
=
None
,
feed_dict
=
None
):
"""
:param fetch_list: A list of variable names that will be fetched.
:param feed_dict: A dict mapping for feed variable name to LoDTensor
or numpy array.
:return: fetched value list.
"""
if
not
isinstance
(
feed_dict
,
dict
):
raise
TypeError
(
"feed_dict should be a dict"
)
Run a parallel executor with fetch_list.
The feed parameter can be a dict or a list. If feed is a dict, the
feed data will be split across multiple devices. If feed is a list, we
assume the data has already been split across the devices, and each
element in the list will be copied to its device directly.
For example, if the feed is a dict:
>>> exe = ParallelExecutor()
>>> # the image will be splitted into devices. If there is two devices
>>> # each device will process an image with shape (24, 1, 28, 28)
>>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
For example, if the feed is a list:
>>> exe = ParallelExecutor()
>>> # each device will process each element in the list.
>>> # the 1st device will process an image with shape (48, 1, 28, 28)
>>> # the 2nd device will process an image with shape (32, 1, 28, 28)
>>> #
>>> # you can use exe.device_count to get the device number.
>>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
>>> {"image": numpy.random.random(size=(32, 1, 28, 28))},
>>> ])
Args:
fetch_list(list): The fetched variable names
feed(list|dict|None): The feed variables. If the feed is a dict,
tensors in that dict will be split across the devices. If
the feed is a list, each element of the list will be copied
to the corresponding device.
feed_dict: Alias for feed parameter, for backward compatibility.
This parameter is deprecated.
feed_tensor_dict
=
{}
for
i
,
feed_name
in
enumerate
(
feed_dict
):
feed_tensor
=
feed_dict
[
feed_name
]
if
not
isinstance
(
feed_tensor
,
core
.
LoDTensor
):
feed_tensor
=
core
.
LoDTensor
()
feed_tensor
.
set
(
feed_dict
[
feed_name
],
self
.
_act_places
[
0
])
feed_tensor_dict
[
feed_name
]
=
feed_tensor
Returns: fetched result list.
"""
if
feed
is
None
:
feed
=
feed_dict
print
>>
sys
.
stderr
,
"`feed_dict` is deprecated. Please use `feed=`"
if
isinstance
(
feed
,
dict
):
feed_tensor_dict
=
dict
()
for
feed_name
in
feed
:
feed_tensor
=
feed
[
feed_name
]
if
not
isinstance
(
feed_tensor
,
core
.
LoDTensor
):
feed_tensor
=
core
.
LoDTensor
()
# always set to CPU place, since the tensor needs to be split and
# splitting is fast on CPU
feed_tensor
.
set
(
feed
[
feed_name
],
core
.
CPUPlace
())
feed_tensor_dict
[
feed_name
]
=
feed_tensor
self
.
executor
.
feed_and_split_tensor_into_local_scopes
(
feed_tensor_dict
)
elif
isinstance
(
feed
,
list
)
or
isinstance
(
feed
,
tuple
):
if
len
(
feed
)
!=
len
(
self
.
_act_places
):
raise
ValueError
(
"Feed a list of tensor, the list should be the same size as places"
)
res
=
list
()
for
i
,
each
in
enumerate
(
feed
):
if
not
isinstance
(
each
,
dict
):
raise
TypeError
(
"Each element of feed list should be a dict"
)
res_dict
=
dict
()
for
feed_name
in
each
:
tensor
=
each
[
feed_name
]
if
not
isinstance
(
tensor
,
core
.
LoDTensor
):
tmp
=
core
.
LoDTensor
()
tmp
.
set
(
tensor
,
self
.
_act_places
[
i
])
tensor
=
tmp
res_dict
[
feed_name
]
=
tensor
res
.
append
(
res_dict
)
self
.
executor
.
feed_tensors_into_local_scopes
(
res
)
fetch_var_name
=
'@FETCHED_VAR_NAME@'
self
.
executor
.
run
(
fetch_list
,
fetch_var_name
,
feed_tensor_dict
)
self
.
executor
.
run
(
fetch_list
,
fetch_var_name
)
arr
=
self
.
scope
.
find_var
(
fetch_var_name
).
get_lod_tensor_array
()
return
[
arr
[
i
]
for
i
in
range
(
len
(
arr
))]
def
bcast_params
(
self
):
self
.
executor
.
bcast_params
(
set
(
self
.
persistable_vars
))
@
property
def
device_count
(
self
):
return
len
(
self
.
_act_places
)
python/paddle/fluid/tests/unittests/test_parallel_executor.py
浏览文件 @
fc6f0be2
...
...
@@ -203,12 +203,12 @@ class TestParallelExecutorBase(unittest.TestCase):
iter
=
10
,
batch_size
=
None
,
allow_op_delay
=
False
,
feed_dict
=
{}
):
feed_dict
=
None
):
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
.
random_seed
=
1
# Fix random seed
with
fluid
.
program_guard
(
main
,
startup
):
loss
=
method
(
use_feed
=
len
(
feed_dict
)
>
0
)
loss
=
method
(
use_feed
=
feed_dict
is
not
None
)
adam
=
fluid
.
optimizer
.
Adam
()
adam
.
minimize
(
loss
)
if
memory_opt
:
...
...
@@ -222,13 +222,13 @@ class TestParallelExecutorBase(unittest.TestCase):
if
batch_size
is
not
None
:
batch_size
*=
fluid
.
core
.
get_cuda_device_count
()
begin
=
time
.
time
()
first_loss
,
=
exe
.
run
([
loss
.
name
],
feed
_dict
=
feed_dict
)
first_loss
,
=
exe
.
run
([
loss
.
name
],
feed
=
feed_dict
)
first_loss
=
numpy
.
array
(
first_loss
)
for
i
in
xrange
(
iter
):
exe
.
run
([],
feed
_dict
=
feed_dict
)
exe
.
run
([],
feed
=
feed_dict
)
last_loss
,
=
exe
.
run
([
loss
.
name
],
feed
_dict
=
feed_dict
)
last_loss
,
=
exe
.
run
([
loss
.
name
],
feed
=
feed_dict
)
end
=
time
.
time
()
if
batch_size
is
not
None
:
...
...
@@ -649,5 +649,5 @@ class TestCRFModel(unittest.TestCase):
for
i
in
xrange
(
10
):
cur_batch
=
next
(
data
)
print
map
(
numpy
.
array
,
pe
.
run
(
feed
_dict
=
feeder
.
feed
(
cur_batch
),
pe
.
run
(
feed
=
feeder
.
feed
(
cur_batch
),
fetch_list
=
[
avg_cost
.
name
]))[
0
]
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录