Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
4a91a145
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
4a91a145
编写于
7月 12, 2018
作者:
Y
Yancey1989
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
enforce rpc client timeout
上级
486121d5
变更
10
隐藏空白更改
内联
并排
Showing 10 changed files with 24 additions and 14 deletions
+24
-14
paddle/fluid/operators/checkpoint_notify_op.cc
paddle/fluid/operators/checkpoint_notify_op.cc
+1
-1
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+11
-2
paddle/fluid/operators/distributed/grpc_client.h
paddle/fluid/operators/distributed/grpc_client.h
+3
-2
paddle/fluid/operators/distributed/rpc_client.h
paddle/fluid/operators/distributed/rpc_client.h
+1
-1
paddle/fluid/operators/fetch_barrier_op.cc
paddle/fluid/operators/fetch_barrier_op.cc
+2
-2
paddle/fluid/operators/prefetch_op.cc
paddle/fluid/operators/prefetch_op.cc
+1
-1
paddle/fluid/operators/recv_op.cc
paddle/fluid/operators/recv_op.cc
+1
-1
paddle/fluid/operators/send_barrier_op.cc
paddle/fluid/operators/send_barrier_op.cc
+2
-2
paddle/fluid/operators/send_op.cc
paddle/fluid/operators/send_op.cc
+1
-1
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+1
-1
未找到文件。
paddle/fluid/operators/checkpoint_notify_op.cc
浏览文件 @
4a91a145
...
...
@@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase {
VLOG
(
3
)
<<
"checkpoint notify sending lookup table: "
<<
lookup_table_name
<<
" and dir:"
<<
dir
<<
" to "
<<
epmap
[
i
];
}
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
};
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
4a91a145
...
...
@@ -281,9 +281,10 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
req_count_
++
;
}
void
GRPCClient
::
Wait
()
{
bool
GRPCClient
::
Wait
()
{
std
::
unique_lock
<
std
::
mutex
>
lk
(
sync_mutex_
);
sync_cond_
.
wait
(
lk
,
[
this
]
{
return
req_count_
==
0
;
});
sync_cond_
.
wait
(
lk
,
[
this
]
{
return
(
req_count_
==
0
||
ok_
==
false
);
});
return
ok_
;
}
void
GRPCClient
::
Proceed
()
{
...
...
@@ -297,6 +298,14 @@ void GRPCClient::Proceed() {
if
(
c
->
status_
.
ok
())
{
VLOG
(
3
)
<<
c
->
var_h_
.
String
()
<<
" process"
;
c
->
Process
();
}
else
if
(
c
->
status_
.
error_code
()
==
grpc
::
StatusCode
::
DEADLINE_EXCEEDED
)
{
LOG
(
ERROR
)
<<
c
->
var_h_
.
String
()
<<
" meets grpc error:"
<<
c
->
status_
.
error_message
();
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
sync_mutex_
);
ok_
=
false
;
}
sync_cond_
.
notify_all
();
}
else
{
LOG
(
FATAL
)
<<
c
->
var_h_
.
String
()
<<
" meets grpc error:"
<<
c
->
status_
.
error_message
();
...
...
paddle/fluid/operators/distributed/grpc_client.h
浏览文件 @
4a91a145
...
...
@@ -188,7 +188,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
class
GRPCClient
:
public
RPCClient
{
public:
GRPCClient
()
{}
GRPCClient
()
:
ok_
(
true
)
{}
virtual
~
GRPCClient
();
bool
AsyncSendVar
(
const
std
::
string
&
ep
,
const
platform
::
DeviceContext
&
ctx
,
...
...
@@ -221,7 +221,7 @@ class GRPCClient : public RPCClient {
void
AsyncSendEndPass
(
const
std
::
string
&
ep
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
override
;
void
Wait
()
override
;
bool
Wait
()
override
;
void
SendBeginPass
()
override
;
...
...
@@ -247,6 +247,7 @@ class GRPCClient : public RPCClient {
std
::
mutex
sync_mutex_
;
std
::
condition_variable
sync_cond_
;
std
::
atomic
<
int64_t
>
req_count_
{
0
};
bool
ok_
;
// mutex for GetChannel thread safety
std
::
mutex
chan_mutex_
;
...
...
paddle/fluid/operators/distributed/rpc_client.h
浏览文件 @
4a91a145
...
...
@@ -72,7 +72,7 @@ class RPCClient {
virtual
void
SendBeginPass
()
=
0
;
virtual
void
SendEndPass
()
=
0
;
virtual
void
Wait
()
=
0
;
virtual
bool
Wait
()
=
0
;
template
<
typename
T
>
static
RPCClient
*
GetInstance
()
{
...
...
paddle/fluid/operators/fetch_barrier_op.cc
浏览文件 @
4a91a145
...
...
@@ -45,13 +45,13 @@ class FetchBarrierOp : public framework::OperatorBase {
distributed
::
RPCClient
*
rpc_client
=
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
();
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
for
(
auto
&
ep
:
eps
)
{
VLOG
(
3
)
<<
"fetch barrier, ep: "
<<
ep
;
rpc_client
->
AsyncSendFetchBarrier
(
ep
);
}
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
};
...
...
paddle/fluid/operators/prefetch_op.cc
浏览文件 @
4a91a145
...
...
@@ -53,7 +53,7 @@ class PrefetchOp : public framework::OperatorBase {
VLOG
(
3
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
}
}
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
};
...
...
paddle/fluid/operators/recv_op.cc
浏览文件 @
4a91a145
...
...
@@ -51,7 +51,7 @@ class RecvOp : public framework::OperatorBase {
rpc_client
->
AsyncGetVar
(
epmap
[
i
],
ctx
,
scope
,
outs
[
i
]);
}
if
(
sync_mode
)
{
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
}
};
...
...
paddle/fluid/operators/send_barrier_op.cc
浏览文件 @
4a91a145
...
...
@@ -50,13 +50,13 @@ class SendBarrierOp : public framework::OperatorBase {
VLOG
(
3
)
<<
"SendBarrierOp sync_mode:"
<<
sync_mode
;
// need to wait before sending send_barrier message
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
if
(
sync_mode
)
{
for
(
auto
&
ep
:
eps
)
{
VLOG
(
3
)
<<
"send barrier, ep: "
<<
ep
;
rpc_client
->
AsyncSendBatchBarrier
(
ep
);
}
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
}
};
...
...
paddle/fluid/operators/send_op.cc
浏览文件 @
4a91a145
...
...
@@ -59,7 +59,7 @@ class SendOp : public framework::OperatorBase {
}
}
if
(
sync_send
)
{
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
}
};
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
4a91a145
...
...
@@ -67,7 +67,7 @@ bool IsCompiledWithCUDA() {
}
bool
IsCompiledWithDIST
()
{
#ifdef PADDLE_WITH_DIST
#ifdef PADDLE_WITH_DISTRIBUTE
return
true
;
#else
return
false
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录