Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
fc6f0be2
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
fc6f0be2
编写于
4月 17, 2018
作者:
Y
Yu Yang
提交者:
GitHub
4月 17, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #9942 from reyoung/feature/tuning_pe_trans
Feature/tuning pe trans
上级
b53f7e2c
72869543
变更
6
显示空白变更内容
内联
并排
Showing
6 changed file
with
139 addition
and
39 deletion
+139
-39
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+22
-12
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+11
-5
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+8
-0
paddle/fluid/pybind/tensor_py.h
paddle/fluid/pybind/tensor_py.h
+10
-0
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+82
-16
python/paddle/fluid/tests/unittests/test_parallel_executor.py
...on/paddle/fluid/tests/unittests/test_parallel_executor.py
+6
-6
未找到文件。
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
fc6f0be2
...
@@ -155,13 +155,9 @@ void ParallelExecutor::BCastParamsToGPUs(
...
@@ -155,13 +155,9 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
#endif
}
}
void
ParallelExecutor
::
Run
(
void
ParallelExecutor
::
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
)
{
const
std
::
string
&
fetched_var_name
,
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>
&
feed_tensors
)
{
platform
::
RecordBlock
b
(
0
);
platform
::
RecordBlock
b
(
0
);
SplitTensorToPlaces
(
feed_tensors
);
// Create local scopes.
// Create local scopes.
for
(
auto
&
scope
:
member_
->
local_scopes_
)
{
for
(
auto
&
scope
:
member_
->
local_scopes_
)
{
Scope
&
local_scope
=
scope
->
NewScope
();
Scope
&
local_scope
=
scope
->
NewScope
();
...
@@ -195,14 +191,28 @@ void ParallelExecutor::Run(
...
@@ -195,14 +191,28 @@ void ParallelExecutor::Run(
auto
&
local_scope
=
auto
&
local_scope
=
*
scope
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
();
*
scope
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
();
scope
->
DeleteScope
(
local_scope
);
scope
->
DeleteScope
(
local_scope
);
local_scope
=
nullptr
;
}
}
}
}
void
ParallelExecutor
::
SplitTensorToPlaces
(
void
ParallelExecutor
::
FeedTensorsIntoLocalScopes
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>
&
feed_tensors
)
{
const
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
LoDTensor
>>
&
tensors
)
{
for
(
auto
it
:
feed_tensors
)
{
PADDLE_ENFORCE_EQ
(
member_
->
local_scopes_
.
size
(),
tensors
.
size
());
auto
lod_tensors
=
it
.
second
.
SplitLoDTensor
(
member_
->
places_
);
for
(
size_t
i
=
0
;
i
<
tensors
.
size
();
++
i
)
{
auto
&
map
=
tensors
[
i
];
auto
*
scope
=
member_
->
local_scopes_
[
i
];
for
(
auto
&
pair
:
map
)
{
auto
*
trg
=
scope
->
Var
(
pair
.
first
)
->
GetMutable
<
LoDTensor
>
();
trg
->
ShareDataWith
(
pair
.
second
);
trg
->
set_lod
(
pair
.
second
.
lod
());
}
}
}
void
ParallelExecutor
::
FeedAndSplitTensorIntoLocalScopes
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>
&
tensors
)
{
for
(
auto
pair
:
tensors
)
{
auto
lod_tensors
=
pair
.
second
.
SplitLoDTensor
(
member_
->
places_
);
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
member_
->
places_
.
size
(),
lod_tensors
.
size
(),
member_
->
places_
.
size
(),
lod_tensors
.
size
(),
"The number of samples of current batch is less than the count of "
"The number of samples of current batch is less than the count of "
...
@@ -211,7 +221,7 @@ void ParallelExecutor::SplitTensorToPlaces(
...
@@ -211,7 +221,7 @@ void ParallelExecutor::SplitTensorToPlaces(
for
(
size_t
j
=
0
;
j
<
member_
->
places_
.
size
();
++
j
)
{
for
(
size_t
j
=
0
;
j
<
member_
->
places_
.
size
();
++
j
)
{
// TODO(panxy0718): Do I need to delete this var?
// TODO(panxy0718): Do I need to delete this var?
auto
t
=
auto
t
=
member_
->
local_scopes_
[
j
]
->
Var
(
it
.
first
)
->
GetMutable
<
LoDTensor
>
();
member_
->
local_scopes_
[
j
]
->
Var
(
pair
.
first
)
->
GetMutable
<
LoDTensor
>
();
t
->
ShareDataWith
(
lod_tensors
[
j
]);
t
->
ShareDataWith
(
lod_tensors
[
j
]);
t
->
set_lod
(
lod_tensors
[
j
].
lod
());
t
->
set_lod
(
lod_tensors
[
j
].
lod
());
}
}
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
fc6f0be2
...
@@ -44,16 +44,22 @@ class ParallelExecutor {
...
@@ -44,16 +44,22 @@ class ParallelExecutor {
std
::
vector
<
Scope
*>&
GetLocalScopes
();
std
::
vector
<
Scope
*>&
GetLocalScopes
();
/**
* Feed tensors to local scopes. The size of tensors should be equal to the
* size of local scopes.
*/
void
FeedTensorsIntoLocalScopes
(
const
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
LoDTensor
>>&
tensors
);
void
FeedAndSplitTensorIntoLocalScopes
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>&
tensors
);
void
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
,
void
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
,
const
std
::
string
&
fetched_var_name
);
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>&
feed_tensors
);
void
BCastParamsToGPUs
(
const
std
::
unordered_set
<
std
::
string
>&
vars
)
const
;
void
BCastParamsToGPUs
(
const
std
::
unordered_set
<
std
::
string
>&
vars
)
const
;
private:
private:
void
SplitTensorToPlaces
(
const
std
::
unordered_map
<
std
::
string
,
LoDTensor
>&
feed_tensors
);
ParallelExecutorPrivate
*
member_
;
ParallelExecutorPrivate
*
member_
;
};
};
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
fc6f0be2
...
@@ -505,11 +505,19 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -505,11 +505,19 @@ All parameter, weight, gradient are variables in Paddle.
scope
,
local_scopes
,
allow_op_delay
);
scope
,
local_scopes
,
allow_op_delay
);
})
})
.
def
(
"bcast_params"
,
&
ParallelExecutor
::
BCastParamsToGPUs
)
.
def
(
"bcast_params"
,
&
ParallelExecutor
::
BCastParamsToGPUs
)
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
// one by one and mark them as reference.
.
def
(
"local_scopes"
,
.
def
(
"local_scopes"
,
[](
ParallelExecutor
&
self
)
->
std
::
vector
<
Scope
*>
*
{
[](
ParallelExecutor
&
self
)
->
std
::
vector
<
Scope
*>
*
{
return
&
self
.
GetLocalScopes
();
return
&
self
.
GetLocalScopes
();
},
},
py
::
return_value_policy
::
reference
)
py
::
return_value_policy
::
reference
)
.
def
(
"feed_tensors_into_local_scopes"
,
&
ParallelExecutor
::
FeedTensorsIntoLocalScopes
)
.
def
(
"feed_and_split_tensor_into_local_scopes"
,
&
ParallelExecutor
::
FeedAndSplitTensorIntoLocalScopes
)
.
def
(
"run"
,
&
ParallelExecutor
::
Run
);
.
def
(
"run"
,
&
ParallelExecutor
::
Run
);
BindRecordIOWriter
(
&
m
);
BindRecordIOWriter
(
&
m
);
...
...
paddle/fluid/pybind/tensor_py.h
浏览文件 @
fc6f0be2
...
@@ -190,6 +190,11 @@ void PyCUDATensorSetFromArray(
...
@@ -190,6 +190,11 @@ void PyCUDATensorSetFromArray(
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
place
));
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
place
));
paddle
::
platform
::
GpuMemcpyAsync
(
dst
,
array
.
data
(),
sizeof
(
T
)
*
array
.
size
(),
paddle
::
platform
::
GpuMemcpyAsync
(
dst
,
array
.
data
(),
sizeof
(
T
)
*
array
.
size
(),
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
());
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
());
// NOTE: For safety, here wait the copy complete.
// It because the CPU array.data() could be destroyed after this method.
// If we make this method async, it could be copied data from a memory buffer
// that has been freed.
dev_ctx
->
Wait
();
}
}
template
<
>
template
<
>
...
@@ -216,6 +221,11 @@ void PyCUDATensorSetFromArray(
...
@@ -216,6 +221,11 @@ void PyCUDATensorSetFromArray(
paddle
::
platform
::
GpuMemcpyAsync
(
dst
,
array
.
data
(),
paddle
::
platform
::
GpuMemcpyAsync
(
dst
,
array
.
data
(),
sizeof
(
uint16_t
)
*
array
.
size
(),
sizeof
(
uint16_t
)
*
array
.
size
(),
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
());
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
());
// NOTE: For safety, here wait the copy complete.
// It because the CPU array.data() could be destroyed after this method.
// If we make this method async, it could be copied data from a memory buffer
// that has been freed.
dev_ctx
->
Wait
();
}
}
template
<
typename
T
>
template
<
typename
T
>
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
fc6f0be2
...
@@ -16,6 +16,7 @@ import core
...
@@ -16,6 +16,7 @@ import core
import
multiprocessing
import
multiprocessing
import
framework
import
framework
import
executor
import
executor
import
sys
__all__
=
[
'ParallelExecutor'
]
__all__
=
[
'ParallelExecutor'
]
...
@@ -123,28 +124,93 @@ class ParallelExecutor(object):
...
@@ -123,28 +124,93 @@ class ParallelExecutor(object):
allow_op_delay
)
allow_op_delay
)
self
.
scope
=
scope
self
.
scope
=
scope
def
run
(
self
,
fetch_list
,
feed
_dict
=
{}
):
def
run
(
self
,
fetch_list
,
feed
=
None
,
feed_dict
=
None
):
"""
"""
:param fetch_list: A list of variable names that will be fetched.
Run a parallel executor with fetch_list.
:param feed_dict: A dict mapping for feed variable name to LoDTensor
or numpy array.
The feed parameter can be a dict or a list. If feed is a dict, the
:return: fetched value list.
feed data will be split into multiple devices. If feed is a list, we
"""
assume the data has been splitted into multiple devices, the each
if
not
isinstance
(
feed_dict
,
dict
):
element in the list will be copied to each device directly.
raise
TypeError
(
"feed_dict should be a dict"
)
For example, if the feed is a dict:
>>> exe = ParallelExecutor()
>>> # the image will be splitted into devices. If there is two devices
>>> # each device will process an image with shape (24, 1, 28, 28)
>>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
For example, if the feed is a list:
>>> exe = ParallelExecutor()
>>> # each device will process each element in the list.
>>> # the 1st device will process an image with shape (48, 1, 28, 28)
>>> # the 2nd device will process an image with shape (32, 1, 28, 28)
>>> #
>>> # you can use exe.device_count to get the device number.
>>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
>>> {"image": numpy.random.random(size=(32, 1, 28, 28))},
>>> ])
Args:
fetch_list(list): The fetched variable names
feed(list|dict|None): The feed variables. If the feed is a dict,
tensors in that dict will be splitted into each devices. If
the feed is a list, each element of the list will be copied
to each device.
feed_dict: Alias for feed parameter, for backward compatibility.
This parameter is deprecated.
feed_tensor_dict
=
{}
Returns: fetched result list.
for
i
,
feed_name
in
enumerate
(
feed_dict
):
feed_tensor
=
feed_dict
[
feed_name
]
"""
if
feed
is
None
:
feed
=
feed_dict
print
>>
sys
.
stderr
,
"`feed_dict` is deprecated. Please use `feed=`"
if
isinstance
(
feed
,
dict
):
feed_tensor_dict
=
dict
()
for
feed_name
in
feed
:
feed_tensor
=
feed
[
feed_name
]
if
not
isinstance
(
feed_tensor
,
core
.
LoDTensor
):
if
not
isinstance
(
feed_tensor
,
core
.
LoDTensor
):
feed_tensor
=
core
.
LoDTensor
()
feed_tensor
=
core
.
LoDTensor
()
feed_tensor
.
set
(
feed_dict
[
feed_name
],
self
.
_act_places
[
0
])
# always set to CPU place, since the tensor need to be splitted
# it is fast in CPU
feed_tensor
.
set
(
feed
[
feed_name
],
core
.
CPUPlace
())
feed_tensor_dict
[
feed_name
]
=
feed_tensor
feed_tensor_dict
[
feed_name
]
=
feed_tensor
self
.
executor
.
feed_and_split_tensor_into_local_scopes
(
feed_tensor_dict
)
elif
isinstance
(
feed
,
list
)
or
isinstance
(
feed
,
tuple
):
if
len
(
feed
)
!=
len
(
self
.
_act_places
):
raise
ValueError
(
"Feed a list of tensor, the list should be the same size as places"
)
res
=
list
()
for
i
,
each
in
enumerate
(
feed
):
if
not
isinstance
(
each
,
dict
):
raise
TypeError
(
"Each element of feed list should be a dict"
)
res_dict
=
dict
()
for
feed_name
in
each
:
tensor
=
each
[
feed_name
]
if
not
isinstance
(
tensor
,
core
.
LoDTensor
):
tmp
=
core
.
LoDTensor
()
tmp
.
set
(
tensor
,
self
.
_act_places
[
i
])
tensor
=
tmp
res_dict
[
feed_name
]
=
tensor
res
.
append
(
res_dict
)
self
.
executor
.
feed_tensors_into_local_scopes
(
res
)
fetch_var_name
=
'@FETCHED_VAR_NAME@'
fetch_var_name
=
'@FETCHED_VAR_NAME@'
self
.
executor
.
run
(
fetch_list
,
fetch_var_name
,
feed_tensor_dict
)
self
.
executor
.
run
(
fetch_list
,
fetch_var_name
)
arr
=
self
.
scope
.
find_var
(
fetch_var_name
).
get_lod_tensor_array
()
arr
=
self
.
scope
.
find_var
(
fetch_var_name
).
get_lod_tensor_array
()
return
[
arr
[
i
]
for
i
in
range
(
len
(
arr
))]
return
[
arr
[
i
]
for
i
in
range
(
len
(
arr
))]
def
bcast_params
(
self
):
def
bcast_params
(
self
):
self
.
executor
.
bcast_params
(
set
(
self
.
persistable_vars
))
self
.
executor
.
bcast_params
(
set
(
self
.
persistable_vars
))
@
property
def
device_count
(
self
):
return
len
(
self
.
_act_places
)
python/paddle/fluid/tests/unittests/test_parallel_executor.py
浏览文件 @
fc6f0be2
...
@@ -203,12 +203,12 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -203,12 +203,12 @@ class TestParallelExecutorBase(unittest.TestCase):
iter
=
10
,
iter
=
10
,
batch_size
=
None
,
batch_size
=
None
,
allow_op_delay
=
False
,
allow_op_delay
=
False
,
feed_dict
=
{}
):
feed_dict
=
None
):
main
=
fluid
.
Program
()
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
.
random_seed
=
1
# Fix random seed
startup
.
random_seed
=
1
# Fix random seed
with
fluid
.
program_guard
(
main
,
startup
):
with
fluid
.
program_guard
(
main
,
startup
):
loss
=
method
(
use_feed
=
len
(
feed_dict
)
>
0
)
loss
=
method
(
use_feed
=
feed_dict
is
not
None
)
adam
=
fluid
.
optimizer
.
Adam
()
adam
=
fluid
.
optimizer
.
Adam
()
adam
.
minimize
(
loss
)
adam
.
minimize
(
loss
)
if
memory_opt
:
if
memory_opt
:
...
@@ -222,13 +222,13 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -222,13 +222,13 @@ class TestParallelExecutorBase(unittest.TestCase):
if
batch_size
is
not
None
:
if
batch_size
is
not
None
:
batch_size
*=
fluid
.
core
.
get_cuda_device_count
()
batch_size
*=
fluid
.
core
.
get_cuda_device_count
()
begin
=
time
.
time
()
begin
=
time
.
time
()
first_loss
,
=
exe
.
run
([
loss
.
name
],
feed
_dict
=
feed_dict
)
first_loss
,
=
exe
.
run
([
loss
.
name
],
feed
=
feed_dict
)
first_loss
=
numpy
.
array
(
first_loss
)
first_loss
=
numpy
.
array
(
first_loss
)
for
i
in
xrange
(
iter
):
for
i
in
xrange
(
iter
):
exe
.
run
([],
feed
_dict
=
feed_dict
)
exe
.
run
([],
feed
=
feed_dict
)
last_loss
,
=
exe
.
run
([
loss
.
name
],
feed
_dict
=
feed_dict
)
last_loss
,
=
exe
.
run
([
loss
.
name
],
feed
=
feed_dict
)
end
=
time
.
time
()
end
=
time
.
time
()
if
batch_size
is
not
None
:
if
batch_size
is
not
None
:
...
@@ -649,5 +649,5 @@ class TestCRFModel(unittest.TestCase):
...
@@ -649,5 +649,5 @@ class TestCRFModel(unittest.TestCase):
for
i
in
xrange
(
10
):
for
i
in
xrange
(
10
):
cur_batch
=
next
(
data
)
cur_batch
=
next
(
data
)
print
map
(
numpy
.
array
,
print
map
(
numpy
.
array
,
pe
.
run
(
feed
_dict
=
feeder
.
feed
(
cur_batch
),
pe
.
run
(
feed
=
feeder
.
feed
(
cur_batch
),
fetch_list
=
[
avg_cost
.
name
]))[
0
]
fetch_list
=
[
avg_cost
.
name
]))[
0
]
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录