Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
c750be6d
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c750be6d
编写于
1月 25, 2019
作者:
Q
Qiao Longfei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add some log
上级
d243e555
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
20 addition
and
13 deletion
+20
-13
paddle/fluid/operators/distributed/rpc_server.cc
paddle/fluid/operators/distributed/rpc_server.cc
+13
-12
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+7
-1
未找到文件。
paddle/fluid/operators/distributed/rpc_server.cc
浏览文件 @
c750be6d
...
...
@@ -39,10 +39,11 @@ void RPCServer::SavePort() const {
port_file
.
open
(
file_path
);
port_file
<<
selected_port_
;
port_file
.
close
();
VLOG
(
4
)
<<
"selected port written to "
<<
file_path
;
VLOG
(
3
)
<<
"selected port written to "
<<
file_path
;
}
void
RPCServer
::
WaitBarrier
(
const
std
::
string
&
rpc_name
)
{
VLOG
(
3
)
<<
"WaitBarrier: "
<<
rpc_name
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
this
->
mutex_
);
barrier_cond_
.
wait
(
lock
,
[
this
,
&
rpc_name
]
{
return
((
barrier_counter_
[
rpc_name
]
==
client_num_
&&
client_num_
!=
0
)
||
...
...
@@ -54,7 +55,7 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
}
void
RPCServer
::
IncreaseBatchBarrier
(
const
std
::
string
rpc_name
)
{
VLOG
(
4
)
<<
"RPCServer begin IncreaseBatchBarrier "
<<
rpc_name
;
VLOG
(
3
)
<<
"RPCServer begin IncreaseBatchBarrier "
<<
rpc_name
;
int
b
=
0
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
b
=
++
barrier_counter_
[
rpc_name
];
...
...
@@ -71,7 +72,7 @@ void RPCServer::Complete() {
client_num_
--
;
need_reset_all_vars_
=
true
;
VLOG
(
4
)
<<
"decrease client_num to: "
<<
client_num_
;
VLOG
(
3
)
<<
"decrease client_num to: "
<<
client_num_
;
if
(
cur_cond_
.
load
()
==
rpc_cond_map_
[
kRequestGet
])
{
barrier_counter_
[
kRequestGet
]
--
;
}
...
...
@@ -105,7 +106,7 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
static
int
cond
=
-
1
;
rpc_cond_map_
[
rpc_name
]
=
++
cond
;
VLOG
(
4
)
<<
"RegisterRPC rpc_name:"
<<
rpc_name
<<
", handler:"
<<
handler
VLOG
(
3
)
<<
"RegisterRPC rpc_name:"
<<
rpc_name
<<
", handler:"
<<
handler
<<
", cond:"
<<
rpc_cond_map_
[
rpc_name
];
}
...
...
@@ -120,7 +121,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
}
void
RPCServer
::
WaitCond
(
const
std
::
string
&
rpc_name
)
{
VLOG
(
4
)
<<
"RPCServer WaitCond "
<<
rpc_name
;
VLOG
(
3
)
<<
"RPCServer WaitCond "
<<
rpc_name
;
int
cond
=
0
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
...
...
@@ -151,7 +152,7 @@ void RPCServer::RegisterVar(const std::string& var_name,
}
rpc_cond_
.
notify_all
();
VLOG
(
4
)
<<
"RegisterVar context:"
<<
h
.
String
();
VLOG
(
3
)
<<
"RegisterVar context:"
<<
h
.
String
();
}
void
RPCServer
::
IncreaseVarBarrier
(
const
std
::
string
&
var_name
)
{
...
...
@@ -167,11 +168,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
barrier_cond_
.
notify_all
();
}
VLOG
(
4
)
<<
"IncreaseVarBarrier context:"
<<
h
.
String
();
VLOG
(
3
)
<<
"IncreaseVarBarrier context:"
<<
h
.
String
();
}
void
RPCServer
::
WaitVarBarrier
(
const
std
::
string
&
var_name
)
{
VLOG
(
4
)
<<
"Wait
Barrier var_name:"
<<
var_name
;
VLOG
(
3
)
<<
"WaitVar
Barrier var_name:"
<<
var_name
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
barrier_cond_
.
wait
(
lock
,
[
&
]()
{
...
...
@@ -179,11 +180,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) {
exit_flag_
.
load
());
});
VLOG
(
4
)
<<
"Wait
Barrier context: "
<<
var_map_
[
var_name
].
String
();
VLOG
(
3
)
<<
"WaitVar
Barrier context: "
<<
var_map_
[
var_name
].
String
();
}
void
RPCServer
::
SetVarCond
(
const
std
::
string
&
var_name
)
{
VLOG
(
4
)
<<
"SetVarCond var_name:"
<<
var_name
;
VLOG
(
3
)
<<
"SetVarCond var_name:"
<<
var_name
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
if
(
var_map_
.
find
(
var_name
)
!=
var_map_
.
end
())
{
...
...
@@ -193,14 +194,14 @@ void RPCServer::SetVarCond(const std::string& var_name) {
}
void
RPCServer
::
WaitVarCond
(
const
std
::
string
&
var_name
)
{
VLOG
(
4
)
<<
"WaitVarCond var_name:"
<<
var_name
;
VLOG
(
3
)
<<
"WaitVarCond var_name:"
<<
var_name
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
rpc_cond_
.
wait
(
lock
,
[
=
]
{
return
(
var_map_
.
find
(
var_name
)
!=
var_map_
.
end
()
||
exit_flag_
.
load
());
});
VLOG
(
4
)
<<
"WaitVarCond var_name:"
<<
var_name
<<
" end"
;
VLOG
(
3
)
<<
"WaitVarCond var_name:"
<<
var_name
<<
" end"
;
}
MonomerHandle
RPCServer
::
GetMonomer
(
const
std
::
string
&
var_name
)
{
...
...
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
浏览文件 @
c750be6d
...
...
@@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop(
while
(
true
)
{
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
VLOG
(
3
)
<<
"wait all clients to send gradient"
;
rpc_service_
->
SetCond
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"wait all clients to send send_barrier"
;
rpc_service_
->
WaitBarrier
(
distributed
::
kRequestSend
);
if
(
rpc_service_
->
IsExit
())
{
...
...
@@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop(
}
ParallelExecuteBlocks
(
parallel_blkids
,
executor
,
optimize_prepared
,
program
,
recv_scope
);
VLOG
(
2
)
<<
"run all blocks spent "
<<
GetTimestamp
()
-
ts
<<
"(ms)"
;
VLOG
(
3
)
<<
"run all blocks spent "
<<
GetTimestamp
()
-
ts
<<
"(ms)"
;
VLOG
(
3
)
<<
"ResetReceivedVars"
;
ResetReceivedVars
(
recv_scope
,
dev_ctx
,
rpc_service_
->
NeedResetAllVars
());
VLOG
(
3
)
<<
"wait all clients to get parameters back"
;
rpc_service_
->
SetCond
(
distributed
::
kRequestGet
);
VLOG
(
3
)
<<
"wait all clients to send fetch_barrier"
;
rpc_service_
->
WaitBarrier
(
distributed
::
kRequestGet
);
VLOG
(
3
)
<<
"ResetBarrierCounter"
;
rpc_service_
->
ResetBarrierCounter
();
}
// while(true)
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录