Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
4bcc0b64
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
4bcc0b64
编写于
1月 10, 2018
作者:
Y
Yang Yang(Tony)
提交者:
GitHub
1月 10, 2018
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[WIP] feature/parallel_gpu (#7293)
feature/parallel_gpu
上级
df927768
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
89 addition
and
71 deletion
+89
-71
paddle/framework/lod_tensor.cc
paddle/framework/lod_tensor.cc
+29
-32
paddle/framework/tensor_util.h
paddle/framework/tensor_util.h
+18
-18
paddle/framework/var_desc.cc
paddle/framework/var_desc.cc
+1
-1
paddle/operators/get_places_op.cc
paddle/operators/get_places_op.cc
+2
-1
paddle/operators/parallel_do_op.cc
paddle/operators/parallel_do_op.cc
+38
-18
python/paddle/v2/fluid/tests/test_parallel_op.py
python/paddle/v2/fluid/tests/test_parallel_op.py
+1
-1
未找到文件。
paddle/framework/lod_tensor.cc
浏览文件 @
4bcc0b64
...
...
@@ -44,9 +44,19 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
}
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
LoDTensor
&
t
)
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
t
.
place
()));
PADDLE_ENFORCE
(
t
.
type
().
hash_code
()
==
typeid
(
float
).
hash_code
());
if
(
!
platform
::
is_cpu_place
(
t
.
place
()))
{
LoDTensor
tt
;
framework
::
Copy
(
t
,
platform
::
CPUPlace
(),
&
tt
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
t
.
place
());
dev_ctx
.
Wait
();
os
<<
tt
;
return
os
;
}
os
<<
"dim: "
<<
t
.
dims
()
<<
"
\n
"
;
os
<<
"lod: "
<<
t
.
lod
()
<<
"
\n
"
;
...
...
@@ -211,38 +221,23 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
DeserializeFromStream
(
is
,
static_cast
<
Tensor
*>
(
tensor
),
dev_ctx
);
}
// TODO(tonyyang-svail): make this function support LoD
std
::
vector
<
LoDTensor
>
LoDTensor
::
SplitLoDTensor
(
const
std
::
vector
<
platform
::
Place
>
places
)
const
{
check_memory_size
();
// PADDLE_ENFORCE(lod().empty() || (lod().size() == 1 && lod()[0].empty())
// , "Disable parallel lod for now");
PADDLE_ENFORCE
(
lod
().
empty
(),
"Disable parallel lod for now"
);
PADDLE_ENFORCE
(
dims
()[
0
]
%
places
.
size
()
==
0
,
"Batch size should be divided by places size"
);
std
::
vector
<
LoDTensor
>
lods
;
for
(
size_t
place_idx
=
0
;
place_idx
<
places
.
size
();
++
place_idx
)
{
size_t
begin
=
place_idx
*
dims
()[
0
]
/
places
.
size
();
size_t
end
=
(
place_idx
+
1
)
*
dims
()[
0
]
/
places
.
size
();
auto
src
=
Slice
(
static_cast
<
int
>
(
begin
),
static_cast
<
int
>
(
end
));
int
begin
=
place_idx
*
dims
()[
0
]
/
places
.
size
();
int
end
=
(
place_idx
+
1
)
*
dims
()[
0
]
/
places
.
size
();
LoDTensor
dst
;
dst
.
Resize
(
src
.
dims
());
auto
src
=
Slice
(
begin
,
end
);
auto
&
dst_place
=
places
[
place_idx
];
auto
dst_ptr
=
dst
.
mutable_data
(
dst_place
,
src
.
type
());
// TODO(tonyyang-svail):
// change the following to framework::Copy
auto
src_place
=
src
.
place
();
auto
src_ptr
=
src
.
data
<
void
>
();
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
}
else
{
PADDLE_THROW
(
"Not Implemented"
);
}
LoDTensor
dst
;
framework
::
Copy
(
src
,
dst_place
,
&
dst
);
lods
.
emplace_back
(
dst
);
}
...
...
@@ -250,28 +245,30 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
return
lods
;
}
// TODO(tonyyang-svail): make this function support LoD
void
LoDTensor
::
MergeLoDTensor
(
const
std
::
vector
<
const
LoDTensor
*>
&
lod_tensors
,
platform
::
Place
place
)
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
place
));
const
std
::
vector
<
const
LoDTensor
*>
&
lod_tensors
,
platform
::
Place
dst_place
)
{
PADDLE_ENFORCE
(
!
lod_tensors
.
empty
());
framework
::
DDim
new_dim
=
lod_tensors
[
0
]
->
dims
();
std
::
type_index
new_type
=
lod_tensors
[
0
]
->
type
();
auto
new_layout
=
lod_tensors
[
0
]
->
layout
();
for
(
auto
*
lod
:
lod_tensors
)
{
PADDLE_ENFORCE
(
new_dim
==
lod
->
dims
());
PADDLE_ENFORCE
(
new_type
==
lod
->
type
());
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
lod
->
place
()
));
PADDLE_ENFORCE
(
new_layout
==
lod
->
layout
(
));
}
new_dim
[
0
]
*=
lod_tensors
.
size
();
Resize
(
new_dim
);
set_layout
(
new_layout
);
auto
*
dst_ptr
=
reinterpret_cast
<
uint8_t
*>
(
mutable_data
(
place
,
new_type
));
mutable_data
(
dst_place
,
new_type
);
int
begin
=
0
;
for
(
auto
*
src
:
lod_tensors
)
{
auto
size
=
src
->
numel
()
*
SizeOfType
(
src
->
type
());
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
place
),
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src
->
place
()),
src
->
data
<
void
>
(),
size
);
dst_ptr
+=
size
;
int
end
=
begin
+
src
->
dims
()[
0
];
auto
dst
=
Slice
(
begin
,
end
);
framework
::
Copy
(
*
src
,
dst_place
,
&
dst
);
begin
=
end
;
}
}
...
...
paddle/framework/tensor_util.h
浏览文件 @
4bcc0b64
...
...
@@ -31,9 +31,10 @@ namespace framework {
*
* @note Copy supports CPU <-> GPU, GPU <-> GPU.
*/
inline
void
Copy
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
dst
)
{
VLOG
(
3
)
<<
"Copy "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
src
.
check_memory_size
();
dst
->
Resize
(
src
.
dims
());
...
...
@@ -88,26 +89,25 @@ inline void Copy(const Tensor& src, const platform::Place& dst_place,
}
/**
* @brief Copy supports CPU <-> CPU
* @brief Wrapper on
* Copy(const Tensor& src, const platform::Place& dst_place,
* const platform::DeviceContext& ctx, Tensor* dst);
*
* @param[in] src The external tensor.
* @param[in] dst_place The dst place.
*
* @note Copy supports CPU <-> GPU, GPU <-> GPU.
*/
inline
void
Copy
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
Tensor
*
dst
)
{
src
.
check_memory_size
();
dst
->
Resize
(
src
.
dims
());
dst
->
set_layout
(
src
.
layout
());
auto
src_place
=
src
.
place
();
auto
src_ptr
=
src
.
data
<
void
>
();
auto
dst_ptr
=
dst
->
mutable_data
(
dst_place
,
src
.
type
());
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
));
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
const
platform
::
DeviceContext
*
dev_ctx
;
if
(
platform
::
is_gpu_place
(
src
.
place
()))
{
dev_ctx
=
pool
.
Get
(
src
.
place
());
}
else
{
dev_ctx
=
pool
.
Get
(
dst_place
);
}
Copy
(
src
,
dst_place
,
*
dev_ctx
,
dst
);
}
/**
...
...
paddle/framework/var_desc.cc
浏览文件 @
4bcc0b64
...
...
@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
case
proto
::
VarDesc
::
LOD_TENSOR_ARRAY
:
return
desc_
.
tensor_array
().
tensor
();
default:
PADDLE_THROW
(
"The type of var
'"
,
this
->
Name
(),
"' is unsupported."
);
PADDLE_THROW
(
"The type of var
%s is unsupported."
,
this
->
Name
()
);
}
}
...
...
paddle/operators/get_places_op.cc
浏览文件 @
4bcc0b64
...
...
@@ -111,4 +111,5 @@ class GetPlacesInferShape : public framework::InferShapeBase {
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
get_places
,
ops
::
GetPlacesOp
,
ops
::
GetPlacesOpProtoMaker
,
ops
::
GetPlacesInferVarType
,
ops
::
GetPlacesInferShape
);
ops
::
GetPlacesInferVarType
,
ops
::
GetPlacesInferShape
,
paddle
::
framework
::
EmptyGradOpMaker
);
paddle/operators/parallel_do_op.cc
浏览文件 @
4bcc0b64
...
...
@@ -39,6 +39,7 @@ void SplitTensorAndMoveTensorToScopes(
const
std
::
vector
<
framework
::
Scope
*>
&
sub_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
std
::
string
>
&
names
)
{
PADDLE_ENFORCE_EQ
(
sub_scopes
.
size
(),
places
.
size
());
for
(
auto
&
argu
:
names
)
{
auto
*
var
=
scope
.
FindVar
(
argu
);
const
auto
&
tensor
=
var
->
Get
<
LoDTensor
>
();
...
...
@@ -54,6 +55,15 @@ void SplitTensorAndMoveTensorToScopes(
}
}
void
WaitOnPlaces
(
const
std
::
vector
<
platform
::
Place
>
places
)
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
for
(
auto
&
place
:
places
)
{
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
dev_ctx
.
Wait
();
}
}
class
ParallelDoOp
:
public
framework
::
OperatorBase
{
public:
ParallelDoOp
(
const
std
::
string
&
type
,
...
...
@@ -71,10 +81,7 @@ class ParallelDoOp : public framework::OperatorBase {
auto
*
block
=
Attr
<
framework
::
BlockDesc
*>
(
kParallelBlock
);
auto
*
program
=
block
->
Program
();
// TODO(tonyyang-svail): get places from input
std
::
vector
<
platform
::
Place
>
places
;
places
.
emplace_back
(
platform
::
CPUPlace
());
places
.
emplace_back
(
platform
::
CPUPlace
());
auto
&
places
=
scope
.
FindVar
(
Input
(
kPlaces
))
->
Get
<
platform
::
PlaceList
>
();
auto
&
sub_scopes
=
*
scope
.
FindVar
(
Output
(
kParallelScopes
))
->
GetMutable
<
std
::
vector
<
framework
::
Scope
*>>
();
...
...
@@ -82,8 +89,22 @@ class ParallelDoOp : public framework::OperatorBase {
sub_scopes
.
push_back
(
&
scope
.
NewScope
());
}
// split input
SplitTensorAndMoveTensorToScopes
(
scope
,
sub_scopes
,
places
,
Inputs
(
kInputs
));
// copy parameter
for
(
auto
&
param
:
Inputs
(
kParameters
))
{
PADDLE_ENFORCE
(
scope
.
FindVar
(
param
)
->
IsType
<
LoDTensor
>
(),
"Only support parameter type as LoDTensor"
);
auto
&
src
=
scope
.
FindVar
(
param
)
->
Get
<
LoDTensor
>
();
for
(
size_t
i
=
0
;
i
<
places
.
size
();
++
i
)
{
auto
&
place
=
places
[
i
];
auto
*
sub_scope
=
sub_scopes
[
i
];
auto
*
dst
=
sub_scope
->
Var
(
param
)
->
GetMutable
<
LoDTensor
>
();
framework
::
Copy
(
src
,
place
,
dst
);
}
}
WaitOnPlaces
(
places
);
std
::
vector
<
std
::
future
<
void
>>
workers
;
workers
.
reserve
(
places
.
size
());
...
...
@@ -93,12 +114,6 @@ class ParallelDoOp : public framework::OperatorBase {
auto
&
place
=
places
[
place_idx
];
auto
*
cur_scope
=
sub_scopes
[
place_idx
];
// copy parameter
// some version of boost lacks != for boost::variant
if
(
!
(
dev_ctx
.
GetPlace
()
==
place
))
{
PADDLE_THROW
(
"Not Implemented"
);
}
workers
.
emplace_back
(
framework
::
Async
([
program
,
cur_scope
,
place
,
block
]
{
framework
::
Executor
executor
(
place
);
executor
.
Run
(
*
program
,
cur_scope
,
block
->
ID
(),
...
...
@@ -108,6 +123,7 @@ class ParallelDoOp : public framework::OperatorBase {
for
(
auto
&
worker
:
workers
)
{
worker
.
wait
();
}
WaitOnPlaces
(
places
);
// merge output
for
(
auto
&
o_name
:
Outputs
(
kOutputs
))
{
...
...
@@ -121,6 +137,7 @@ class ParallelDoOp : public framework::OperatorBase {
scope
.
FindVar
(
o_name
)
->
GetMutable
<
LoDTensor
>
();
lod_tensor_to_be_merged
->
MergeLoDTensor
(
lod_tensors
,
dev_ctx
.
GetPlace
());
}
WaitOnPlaces
(
places
);
}
};
...
...
@@ -161,15 +178,14 @@ class ParallelDoGradOp : public OperatorBase {
auto
&
sub_scopes
=
scope
.
FindVar
(
Input
(
kParallelScopes
))
->
Get
<
std
::
vector
<
framework
::
Scope
*>>
();
// TODO(tonyyang-svail): get places from input
std
::
vector
<
platform
::
Place
>
places
;
places
.
emplace_back
(
platform
::
CPUPlace
());
places
.
emplace_back
(
platform
::
CPUPlace
());
auto
&
places
=
scope
.
FindVar
(
Input
(
kPlaces
))
->
Get
<
platform
::
PlaceList
>
();
// feed output@grad
SplitTensorAndMoveTensorToScopes
(
scope
,
sub_scopes
,
places
,
Inputs
(
framework
::
GradVarName
(
kOutputs
)));
WaitOnPlaces
(
places
);
// for debugging
for
(
auto
&
s
:
Inputs
(
framework
::
GradVarName
(
kOutputs
)))
{
VLOG
(
3
)
<<
s
;
VLOG
(
3
)
<<
scope
.
FindVar
(
s
)
->
Get
<
LoDTensor
>
();
...
...
@@ -196,10 +212,11 @@ class ParallelDoGradOp : public OperatorBase {
for
(
auto
&
worker
:
workers
)
{
worker
.
wait
();
}
WaitOnPlaces
(
places
);
// merge grad
for
(
auto
&
s
:
Outputs
(
framework
::
GradVarName
(
kParameters
)))
{
VLOG
(
3
)
<<
s
;
VLOG
(
3
)
<<
"merge grad "
<<
s
;
auto
&
t
=
sub_scopes
[
0
]
->
FindVar
(
s
)
->
Get
<
LoDTensor
>
();
VLOG
(
3
)
<<
t
;
...
...
@@ -216,7 +233,8 @@ class ParallelDoGradOp : public OperatorBase {
auto
sum_op
=
framework
::
OpRegistry
::
CreateOp
(
"sum"
,
{{
"X"
,
{
s
,
s_buf
}}},
{{
"Out"
,
{
s
}}},
framework
::
AttributeMap
{});
sum_op
->
Run
(
*
sub_scopes
[
0
],
place
);
sum_op
->
Run
(
*
sub_scopes
[
0
],
places
[
0
]);
WaitOnPlaces
(
places
);
}
VLOG
(
3
)
<<
t
;
...
...
@@ -236,8 +254,10 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
for
(
auto
&
input_param
:
this
->
InputNames
())
{
VLOG
(
3
)
<<
input_param
;
grad
->
SetInput
(
input_param
,
this
->
Input
(
input_param
));
grad
->
SetOutput
(
framework
::
GradVarName
(
input_param
),
this
->
InputGrad
(
input_param
,
false
));
if
(
input_param
!=
kPlaces
)
{
grad
->
SetOutput
(
framework
::
GradVarName
(
input_param
),
this
->
InputGrad
(
input_param
,
false
));
}
}
for
(
auto
&
output_param
:
this
->
OutputNames
())
{
...
...
python/paddle/v2/fluid/tests/test_parallel_op.py
浏览文件 @
4bcc0b64
...
...
@@ -18,7 +18,7 @@ class ParallelOpTest(unittest.TestCase):
append_batch_size
=
False
,
stop_gradient
=
False
)
places
=
fluid
.
default_main_program
().
global_block
().
create_var
(
)
places
=
layers
.
get_places
(
device_count
=
4
)
pd
=
layers
.
ParallelDo
(
places
=
places
)
with
pd
.
do
():
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录