Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
45363069
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
45363069
编写于
4月 01, 2018
作者:
Q
qiaolongfei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix prefetch hang problem, add some more logs
上级
fdecae5f
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
24 addition
and
21 deletion
+24
-21
paddle/fluid/operators/detail/grpc_client.cc
paddle/fluid/operators/detail/grpc_client.cc
+9
-7
paddle/fluid/operators/detail/grpc_server.cc
paddle/fluid/operators/detail/grpc_server.cc
+11
-2
paddle/fluid/operators/detail/grpc_service.h
paddle/fluid/operators/detail/grpc_service.h
+2
-2
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+2
-10
未找到文件。
paddle/fluid/operators/detail/grpc_client.cc
浏览文件 @
45363069
...
...
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "grpc_client.h"
#include <sys/time.h>
#include "paddle/fluid/operators/detail/grpc_client.h"
#include <limits>
#include "paddle/fluid/framework/threadpool.h"
namespace
paddle
{
...
...
@@ -52,7 +54,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
auto
call
=
s
->
stub_g_
.
PrepareUnaryCall
(
s
->
context_
.
get
(),
"/sendrecv.SendRecvService/SendVariable"
,
req
,
&
cq_
);
call
->
StartCall
();
call
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
(
void
*
)
s
);
call
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
static_cast
<
void
*>
(
s
)
);
});
req_count_
++
;
...
...
@@ -109,7 +111,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
auto
call
=
s
->
stub_g_
.
PrepareUnaryCall
(
s
->
context_
.
get
(),
"/sendrecv.SendRecvService/GetVariable"
,
buf
,
&
cq_
);
call
->
StartCall
();
call
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
(
void
*
)
s
);
call
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
static_cast
<
void
*>
(
s
)
);
});
req_count_
++
;
...
...
@@ -153,7 +155,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
s
->
context_
.
get
(),
"/sendrecv.SendRecvService/PrefetchVariable"
,
req
,
&
cq_
);
call
->
StartCall
();
call
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
(
void
*
)
s
);
call
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
static_cast
<
void
*>
(
s
)
);
});
req_count_
++
;
...
...
@@ -169,7 +171,7 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
BATCH_BARRIER_MESSAGE
);
auto
rpc
=
s
->
stub_
->
AsyncSendVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
(
void
*
)
s
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
static_cast
<
void
*>
(
s
)
);
req_count_
++
;
}
...
...
@@ -181,7 +183,7 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
FETCH_BARRIER_MESSAGE
);
auto
rpc
=
s
->
stub_
->
AsyncGetVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
(
void
*
)
s
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
static_cast
<
void
*>
(
s
)
);
req_count_
++
;
}
...
...
paddle/fluid/operators/detail/grpc_server.cc
浏览文件 @
45363069
...
...
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_server.h"
#include <paddle/fluid/operators/detail/send_recv.pb.h>
#include <limits>
#include <string>
using
::
grpc
::
ServerAsyncResponseWriter
;
...
...
@@ -224,6 +226,7 @@ void AsyncGRPCServer::ShutdownQueue() {
std
::
unique_lock
<
std
::
mutex
>
lock
(
cq_mutex_
);
cq_send_
->
Shutdown
();
cq_get_
->
Shutdown
();
cq_prefetch_
->
Shutdown
();
}
// This URL explains why shutdown is complicate:
...
...
@@ -236,6 +239,7 @@ void AsyncGRPCServer::ShutDown() {
void
AsyncGRPCServer
::
TryToRegisterNewSendOne
()
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
cq_mutex_
);
if
(
is_shut_down_
)
{
VLOG
(
3
)
<<
"shutdown, do not TryToRegisterNewSendOne"
;
return
;
}
RequestSend
*
send
=
new
RequestSend
(
&
service_
,
cq_send_
.
get
(),
scope_
,
...
...
@@ -246,6 +250,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
void
AsyncGRPCServer
::
TryToRegisterNewGetOne
()
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
cq_mutex_
);
if
(
is_shut_down_
)
{
VLOG
(
3
)
<<
"shutdown, do not TryToRegisterNewGetOne"
;
return
;
}
RequestGet
*
get
=
new
RequestGet
(
&
service_
,
cq_get_
.
get
(),
scope_
,
dev_ctx_
,
...
...
@@ -257,6 +262,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
VLOG
(
4
)
<<
"TryToRegisterNewPrefetchOne in"
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
cq_mutex_
);
if
(
is_shut_down_
)
{
VLOG
(
3
)
<<
"shutdown, do not TryToRegisterNewPrefetchOne"
;
return
;
}
RequestPrefetch
*
prefetch
=
...
...
@@ -274,18 +280,21 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
void
*
tag
=
NULL
;
bool
ok
=
false
;
while
(
true
)
{
VLOG
(
3
)
<<
"HandleRequest for "
<<
cq_name
<<
" while in"
;
if
(
!
cq
->
Next
(
&
tag
,
&
ok
))
{
LOG
(
INFO
)
<<
cq_name
<<
" CompletionQueue shutdown!"
;
break
;
}
VLOG
(
3
)
<<
"HandleRequest for "
<<
cq_name
<<
" while after Next"
;
PADDLE_ENFORCE
(
tag
);
// FIXME(typhoonzero): de-couple the barriers with recv_op
if
(
!
is_shut_down_
&&
cq_name
==
"cq_get"
)
WaitCond
(
1
);
if
(
!
is_shut_down_
&&
cq_name
==
"cq_send"
)
WaitCond
(
0
);
RequestBase
*
base
=
(
RequestBase
*
)
tag
;
RequestBase
*
base
=
reinterpret_cast
<
RequestBase
*>
(
tag
)
;
// reference:
// https://github.com/tensorflow/tensorflow/issues/5596
// https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
...
...
paddle/fluid/operators/detail/grpc_service.h
浏览文件 @
45363069
...
...
@@ -89,7 +89,7 @@ inline const char* GrpcMethodName(GrpcMethod id) {
case
GrpcMethod
::
kGetVariable
:
return
"/sendrecv.SendRecvService/GetVariable"
;
case
GrpcMethod
::
kPrefetchVariable
:
return
"/sendrecv.SendR
E
cvService/PrefetchVariable"
;
return
"/sendrecv.SendR
e
cvService/PrefetchVariable"
;
}
// Shouldn't be reached.
...
...
@@ -117,5 +117,5 @@ class GrpcService final {
};
}
// namespace detail
}
// namespace operator
}
// namespace operator
s
}
// namespace paddle
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
45363069
...
...
@@ -13,22 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <ostream>
#include <thread>
#include <unistd.h>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/proto_desc.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -177,7 +168,8 @@ class ListenAndServOp : public framework::OperatorBase {
}
ParallelExecuteBlocks
(
parallel_blkids
,
&
executor
,
program
,
&
recv_scope
);
VLOG
(
2
)
<<
"run all blocks spent (ms) "
<<
detail
::
GetTimestamp
()
-
ts
;
VLOG
(
3
)
<<
"run all blocks spent "
<<
detail
::
GetTimestamp
()
-
ts
<<
"(ms)"
;
// Reset the received sparse variables, the sum operator would not
// sum the input sparse variables which rows is empty at the next
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录