Commit 64d7a302
Authored Mar 21, 2018 by Yu Yang

Extract SSAGraph

Parent 8dec4ad7
Showing 2 changed files with 98 additions and 93 deletions

paddle/fluid/framework/parallel_executor.cc   +98  -91
paddle/fluid/framework/parallel_executor.h     +0   -2
paddle/fluid/framework/parallel_executor.cc
@@ -37,6 +37,86 @@ using details::ScaleLossGradOpHandle;
 using details::VarHandle;
 using details::VarHandleBase;

+struct SSAGraph {
+  std::vector<std::unordered_map<std::string, std::map<int, VarHandle>>> vars_;
+  std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
+  std::vector<std::unique_ptr<OpHandleBase>> ops_;
+};
+
+/**
+ * We only handle write after read(WAR), since it should not have a write
+ * after write in program. If there are write after write operators, we need
+ * prune them.
+ *
+ * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
+ */
+static void PolishGraphToSupportDataHazards(SSAGraph *graph) {
+  for (auto &var_map : graph->vars_) {
+    for (auto &name_pair : var_map) {
+      if (name_pair.second.size() <= 1) {
+        return;
+      }
+      auto it_new = name_pair.second.rbegin();
+      auto it_old = name_pair.second.rbegin();
+      ++it_old;
+      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
+        auto *write_op = it_new->second.generated_op_;
+        auto &read_ops = it_old->second.pending_ops_;
+        auto *ex_write_op = it_old->second.generated_op_;
+
+        if (ex_write_op == nullptr) {  // Nobody write this var.
+          continue;
+        }
+
+        for (auto *read_op : read_ops) {
+          // Manually add a dependency var from read_op to write_op;
+          if (read_op == write_op) {
+            // Read Write is the same op.
+            continue;
+          }
+
+          auto *dep_var = new DummyVarHandle();
+          read_op->AddOutput(dep_var);
+          write_op->AddInput(dep_var);
+          graph->dep_vars_.emplace(dep_var);
+        }
+      }
+    }
+  }
+}
+
+static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph,
+                                             const std::string &each_var_name,
+                                             const platform::Place &place,
+                                             size_t place_offset) {
+  auto &var_holders = graph->vars_[place_offset];
+  auto &var_holder = var_holders[each_var_name];
+  VarHandle *var = nullptr;
+  if (var_holder.empty()) {
+    auto &init_var = var_holder[0];
+    init_var.place_ = place;
+    init_var.name_ = each_var_name;
+    init_var.generated_op_ = nullptr;
+    init_var.version_ = 0;
+    var = &init_var;
+  } else {
+    var = &var_holder.rbegin()->second;
+  }
+  return var;
+}
+
+static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
+                           const std::string &each_var_name,
+                           const platform::Place &place, size_t place_offset) {
+  auto &vars = graph->vars_[place_offset][each_var_name];
+  size_t version = vars.size();
+  auto &var = vars[version];
+  var.version_ = version;
+  var.name_ = each_var_name;
+  var.place_ = place;
+  op_handle->AddOutput(&var);
+}
+
 class ParallelExecutorPrivate {
  public:
   explicit ParallelExecutorPrivate(size_t num_threads,
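Note on the helpers added above: SSAGraph::vars_ is indexed by place offset, then variable name, then an integer SSA version. CreateOrGetLatestVarHandle returns the newest version of a name (creating version 0 the first time the name is read), while CreateOpOutput appends a fresh version for every write, so each write defines a new value. Below is a minimal, standalone sketch of that versioning behavior; ToyVarHandle/ToyVars are simplified stand-ins, not Paddle's real VarHandle or SSAGraph.

// Standalone sketch (not Paddle code): simplified stand-ins for VarHandle and
// the SSAGraph::vars_ layout, to show how the helpers above version variables.
#include <cassert>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>

struct ToyVarHandle {
  std::string name_;
  int version_ = 0;
};

// vars[place_offset][var_name][version] -> handle, mirroring SSAGraph::vars_.
using ToyVars =
    std::vector<std::unordered_map<std::string, std::map<int, ToyVarHandle>>>;

// Reading: return the newest version, creating version 0 if none exists yet.
ToyVarHandle *GetLatest(ToyVars *vars, const std::string &name, size_t place) {
  auto &versions = (*vars)[place][name];
  if (versions.empty()) {
    auto &v0 = versions[0];
    v0.name_ = name;
    v0.version_ = 0;
    return &v0;
  }
  return &versions.rbegin()->second;
}

// Writing: always append a fresh version (SSA: each write defines a new value).
ToyVarHandle *AppendVersion(ToyVars *vars, const std::string &name,
                            size_t place) {
  auto &versions = (*vars)[place][name];
  int version = static_cast<int>(versions.size());
  auto &v = versions[version];
  v.name_ = name;
  v.version_ = version;
  return &v;
}

int main() {
  ToyVars vars(1);  // one place (e.g. one GPU)
  assert(GetLatest(&vars, "w", 0)->version_ == 0);      // first read creates v0
  assert(AppendVersion(&vars, "w", 0)->version_ == 1);  // a write makes v1
  assert(GetLatest(&vars, "w", 0)->version_ == 1);      // later reads see v1
  return 0;
}

Keeping one map entry per version is what later lets the hazard pass reason about which ops read an old version and which op writes the next one.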
@@ -44,7 +124,7 @@ class ParallelExecutorPrivate {
       : places_(places),
         fetch_dev_ctxs_(places),
         pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) {
-    vars_.resize(places.size());
+    graph_.vars_.resize(places.size());
   }

   std::vector<platform::Place> places_;
@@ -54,35 +134,13 @@ class ParallelExecutorPrivate {
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;

-  std::vector<std::unordered_map<std::string, std::map<int, VarHandle>>> vars_;
+  SSAGraph graph_;
-
-  std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
-
-  std::vector<std::unique_ptr<OpHandleBase>> ops_;

   // Use a simpler thread pool, might be faster.
   std::unique_ptr<ThreadPool> pool_;

   std::unique_ptr<platform::EnforceNotMet> exception_;

-  VarHandle *GetVarHandle(const std::string &each_var_name,
-                          const platform::Place &place, size_t place_offset) {
-    auto &var_holders = vars_[place_offset];
-    auto &var_holder = var_holders[each_var_name];
-    VarHandle *var = nullptr;
-    if (var_holder.empty()) {
-      auto &init_var = var_holder[0];
-      init_var.place_ = place;
-      init_var.name_ = each_var_name;
-      init_var.generated_op_ = nullptr;
-      init_var.version_ = 0;
-      var = &init_var;
-    } else {
-      var = &var_holder.rbegin()->second;
-    }
-    return var;
-  }
-
   void RunOp(
       bool use_event,
       std::unordered_map<VarHandleBase *, std::atomic<bool>> &pending_vars,
@@ -113,17 +171,6 @@ class ParallelExecutorPrivate {
       op_run();
     }
   }
-
-  void GenerateVar(OpHandleBase *op_handle, const std::string &each_var_name,
-                   const platform::Place &place, size_t place_offset) {
-    auto &vars = vars_[place_offset][each_var_name];
-    size_t version = vars.size();
-    auto &var = vars[version];
-    var.version_ = version;
-    var.name_ = each_var_name;
-    var.place_ = place;
-    op_handle->AddOutput(&var);
-  }
 };

 ParallelExecutor::ParallelExecutor(
@@ -189,21 +236,22 @@ void ParallelExecutor::ConstructDependencyGraph(
       auto &p = member_->places_[i];
       auto *s = member_->local_scopes_[i];

-      member_->ops_.emplace_back(new ComputationOpHandle(*op, s, p));
-      auto *op_handle = member_->ops_.back().get();
+      member_->graph_.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
+      auto *op_handle = member_->graph_.ops_.back().get();
       op_handle->dev_ctx_[p] = const_cast<platform::DeviceContext *>(
           platform::DeviceContextPool::Instance().Get(p));

       auto var_names = op->InputArgumentNames();

       for (auto &each_var_name : var_names) {
-        VarHandle *var = member_->GetVarHandle(each_var_name, p, i);
+        VarHandle *var =
+            CreateOrGetLatestVarHandle(&member_->graph_, each_var_name, p, i);
         op_handle->AddInput(var);
       }

       var_names = op->OutputArgumentNames();

       for (auto &each_var_name : var_names) {
-        member_->GenerateVar(op_handle, each_var_name, p, i);
+        CreateOpOutput(&member_->graph_, op_handle, each_var_name, p, i);
       }

       if (is_forwarding) {
@@ -212,7 +260,7 @@ void ParallelExecutor::ConstructDependencyGraph(
         op_handle =
             new ScaleLossGradOpHandle(this->member_->local_scopes_.size(), s,
                                       p, member_->nccl_ctxs_->DevCtx(p));
-        member_->ops_.emplace_back(op_handle);
+        member_->graph_.ops_.emplace_back(op_handle);

         // FIXME: Currently ScaleLossGradOp only use device_count as scale
         // factor. So it does not depend on any other operators.
@@ -220,7 +268,8 @@ void ParallelExecutor::ConstructDependencyGraph(
         // loss->pending_ops_.emplace_back(op_handle);
         // op_handle->inputs_.emplace_back(loss);

-        member_->GenerateVar(op_handle, loss_var_name + "@GRAD", p, i);
+        CreateOpOutput(&member_->graph_, op_handle, loss_var_name + "@GRAD", p,
+                       i);
         change_forward = true;
       }
     }
@@ -235,13 +284,13 @@ void ParallelExecutor::ConstructDependencyGraph(
     for (auto &og : var_names) {
       if (grads.count(og) != 0) {  // is param grad
         // Insert NCCL AllReduce Op
-        member_->ops_.emplace_back(new NCCLAllReduceOpHandle(
-            member_->local_scopes_, member_->places_, *member_->nccl_ctxs_));
-        auto *op_handle = member_->ops_.back().get();
+        member_->graph_.ops_.emplace_back(new NCCLAllReduceOpHandle(
+            member_->local_scopes_, member_->places_, *member_->nccl_ctxs_));
+        auto *op_handle = member_->graph_.ops_.back().get();

         for (size_t i = 0; i < member_->places_.size(); ++i) {
           auto &p = member_->places_[i];
-          auto &vars = member_->vars_[i][og];
+          auto &vars = member_->graph_.vars_[i][og];

           if (vars.empty()) {  // This device has no data. continue.
             continue;
@@ -265,49 +314,7 @@ void ParallelExecutor::ConstructDependencyGraph(
     Dependency graph has been constructed. However, there are still data
     harzaeds need to be handled.
    */
-  PolishGraphToSupportDataHazards();
+  PolishGraphToSupportDataHazards(&member_->graph_);
 }

-/**
- * We only handle write after read(WAR), since it should not have a write
- * after write in program. If there are write after write operators, we need
- * prune them.
- *
- * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
- */
-void ParallelExecutor::PolishGraphToSupportDataHazards() const {
-  for (auto &var_map : member_->vars_) {
-    for (auto &name_pair : var_map) {
-      if (name_pair.second.size() <= 1) {
-        return;
-      }
-      auto it_new = name_pair.second.rbegin();
-      auto it_old = name_pair.second.rbegin();
-      ++it_old;
-      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        auto *write_op = it_new->second.generated_op_;
-        auto &read_ops = it_old->second.pending_ops_;
-        auto *ex_write_op = it_old->second.generated_op_;
-
-        if (ex_write_op == nullptr) {  // Nobody write this var.
-          continue;
-        }
-
-        for (auto *read_op : read_ops) {
-          // Manually add a dependency var from read_op to write_op;
-          if (read_op == write_op) {
-            // Read Write is the same op.
-            continue;
-          }
-
-          auto *dep_var = new DummyVarHandle();
-          read_op->AddOutput(dep_var);
-          write_op->AddInput(dep_var);
-          member_->dep_vars_.emplace(dep_var);
-        }
-      }
-    }
-  }
-}
-
 void ParallelExecutor::BCastParamsToGPUs(
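Note: the WAR pass now lives in the free function PolishGraphToSupportDataHazards(SSAGraph *) shown in the first hunk; the ParallelExecutor member removed above was its previous home. The idea: for each variable, walk versions from newest to oldest; every op that reads version k must finish before the op that writes version k+1, so the pass inserts a dummy variable that the reader produces and the writer consumes. Below is a minimal sketch of that edge insertion with toy stand-in types (not Paddle's OpHandleBase, VarHandle, or DummyVarHandle).

// Standalone sketch (toy types, not Paddle code): the WAR fix inserts a dummy
// variable so an op that reads version k of "w" becomes a dependency of the
// op that writes version k+1.
#include <cassert>
#include <memory>
#include <string>
#include <vector>

struct ToyVar;

struct ToyOp {
  std::string name;
  std::vector<ToyVar *> inputs, outputs;
  void AddInput(ToyVar *v) { inputs.push_back(v); }
  void AddOutput(ToyVar *v) { outputs.push_back(v); }
};

struct ToyVar {
  ToyOp *generated_op = nullptr;     // the op that writes this version
  std::vector<ToyOp *> pending_ops;  // ops that read this version
};

int main() {
  // w@v0 is read by `read_op`; w@v1 is produced by `write_op`.
  ToyVar w_v0, w_v1;
  ToyOp producer{"producer"}, read_op{"read_op"}, write_op{"write_op"};
  w_v0.generated_op = &producer;
  w_v0.pending_ops.push_back(&read_op);
  w_v1.generated_op = &write_op;

  // Without extra edges nothing orders read_op before write_op (WAR hazard).
  // The pass adds a dummy var: output of the reader, input of the writer.
  std::vector<std::unique_ptr<ToyVar>> dep_vars;
  for (ToyOp *reader : w_v0.pending_ops) {
    if (reader == w_v1.generated_op) continue;  // read and write by same op
    auto dep = std::make_unique<ToyVar>();
    reader->AddOutput(dep.get());
    w_v1.generated_op->AddInput(dep.get());
    dep_vars.emplace_back(std::move(dep));
  }

  // write_op now has an input produced by read_op, so it must run after it.
  assert(write_op.inputs.size() == 1 && read_op.outputs.size() == 1);
  return 0;
}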
@@ -365,7 +372,7 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::vector<DummyVarHandle> dummy_vars;

-  for (auto &var_map : member_->vars_) {
+  for (auto &var_map : member_->graph_.vars_) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
         pending_vars[&version_pair.second] =
@@ -374,13 +381,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
     }
   }

-  for (auto &var : member_->dep_vars_) {
+  for (auto &var : member_->graph_.dep_vars_) {
     pending_vars[var.get()] = var->generated_op_ == nullptr;
   }

   std::vector<OpHandleBase *> to_run;

-  for (auto &op : member_->ops_) {
+  for (auto &op : member_->graph_.ops_) {
     if (op->inputs_.empty()) {  // Special case, Op has no input.
       to_run.emplace_back(op.get());
     } else {
@@ -391,7 +398,7 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;

   for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : member_->vars_) {
+    for (auto &var_map : member_->graph_.vars_) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
         fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second);
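Note: the Run() hunks above only retarget container lookups (member_->vars_, dep_vars_ and ops_ become member_->graph_.*); the scheduling they feed is unchanged: a variable with no generating op starts out ready, an op with no inputs is seeded into to_run, and finishing an op readies its outputs so dependent ops can run. Below is a simplified, single-threaded sketch of that kind of dependency-driven execution using toy types; ParallelExecutor itself runs ops on a thread pool with more bookkeeping than shown here.

// Standalone sketch (toy types, single-threaded): the kind of ready-set walk
// suggested by the Run() hunks above, where vars without a generating op start
// ready, ops with no inputs run first, and finishing an op readies its outputs.
#include <cassert>
#include <deque>
#include <string>
#include <unordered_map>
#include <vector>

struct ToyOp;

struct ToyVar {
  ToyOp *generated_op = nullptr;
  std::vector<ToyOp *> pending_ops;  // ops that consume this var
};

struct ToyOp {
  std::string name;
  std::vector<ToyVar *> inputs, outputs;
};

void RunAll(const std::vector<ToyOp *> &ops, std::vector<std::string> *order) {
  std::unordered_map<ToyVar *, bool> ready;
  std::unordered_map<ToyOp *, size_t> pending_inputs;
  std::deque<ToyOp *> to_run;

  // A var is ready up front only if nothing in the graph produces it.
  for (ToyOp *op : ops) {
    for (ToyVar *v : op->inputs) ready[v] = (v->generated_op == nullptr);
    for (ToyVar *v : op->outputs) ready[v] = false;
  }
  for (ToyOp *op : ops) {
    size_t waiting = 0;
    for (ToyVar *v : op->inputs) waiting += ready[v] ? 0 : 1;
    pending_inputs[op] = waiting;
    if (waiting == 0) to_run.push_back(op);  // no unready inputs: run now
  }

  while (!to_run.empty()) {
    ToyOp *op = to_run.front();
    to_run.pop_front();
    order->push_back(op->name);
    for (ToyVar *v : op->outputs) {  // finishing an op readies its outputs
      ready[v] = true;
      for (ToyOp *consumer : v->pending_ops)
        if (--pending_inputs[consumer] == 0) to_run.push_back(consumer);
    }
  }
}

int main() {
  ToyVar x;  // produced by `a`, consumed by `b`
  ToyOp a{"a"}, b{"b"};
  a.outputs.push_back(&x);
  x.generated_op = &a;
  b.inputs.push_back(&x);
  x.pending_ops.push_back(&b);

  std::vector<std::string> order;
  RunAll({&a, &b}, &order);
  assert(order == std::vector<std::string>({"a", "b"}));
  return 0;
}

The dummy dependency variables inserted by the WAR pass take part in exactly this readiness accounting, which is why graph_.dep_vars_ is also scanned when pending_vars is built.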
paddle/fluid/framework/parallel_executor.h
@@ -52,8 +52,6 @@ class ParallelExecutor {
                                  const std::string &loss_var_name) const;

   void BuildNCCLCommunicator() const;
-
-  void PolishGraphToSupportDataHazards() const;
 };

 }  // namespace framework