Commit 7570d8e7
Authored on Jul 18, 2018 by Yancey1989

add rpc complete interface

Parent: 968922bd

Showing 13 changed files with 62 additions and 93 deletions (+62 -93)
paddle/fluid/framework/executor.cc                          +2  -8
paddle/fluid/framework/executor.h                           +2  -6
paddle/fluid/operators/distributed/CMakeLists.txt           +1  -1
paddle/fluid/operators/distributed/grpc_client.cc           +5  -26
paddle/fluid/operators/distributed/grpc_client.h            +3  -8
paddle/fluid/operators/distributed/request_handler.h        +0  -2
paddle/fluid/operators/distributed/request_handler_impl.cc  +4  -7
paddle/fluid/operators/distributed/rpc_client.h             +5  -9
paddle/fluid/operators/distributed/rpc_server.cc            +10 -12
paddle/fluid/operators/distributed/rpc_server.h             +2  -3
paddle/fluid/operators/distributed/rpc_server_test.cc       +25 -4
paddle/fluid/pybind/pybind.cc                               +1  -2
python/paddle/fluid/executor.py                             +2  -5
paddle/fluid/framework/executor.cc

@@ -46,16 +46,10 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
 #ifdef PADDLE_WITH_DISTRIBUTE
-void Executor::BeginPass() {
+void Executor::Complete() {
   ::paddle::operators::distributed::RPCClient::GetInstance<
       ::paddle::operators::distributed::GRPCClient>()
-      ->SendBeginPass();
-}
-
-void Executor::EndPass() {
-  ::paddle::operators::distributed::RPCClient::GetInstance<
-      ::paddle::operators::distributed::GRPCClient>()
-      ->SendEndPass();
+      ->SendComplete();
 }
 #endif
paddle/fluid/framework/executor.h

@@ -46,14 +46,10 @@ class Executor {
 #ifdef PADDLE_WITH_DISTRIBUTE
   /*
-   * Sending signal to pserver to mark current pass started.
+   * Sending signal to pserver to mark current trainer completed.
    */
-  void BeginPass();
-
-  /*
-   * Sending signal to pserver to mark current pass finished.
-   */
-  void EndPass();
+  void Complete();
 #endif
 
   /* @Brief
paddle/fluid/operators/distributed/CMakeLists.txt

@@ -6,7 +6,7 @@ if(WITH_GRPC)
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
     cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
-  cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
+  cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
       grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
       proto_desc lookup_table_op SERIAL)
   return()
paddle/fluid/operators/distributed/grpc_client.cc

@@ -35,18 +35,10 @@ void GRPCClient::InitEventLoop() {
   client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
 
-void GRPCClient::SendBeginPass() {
+void GRPCClient::SendComplete() {
   for (auto& it : channels_) {
-    VLOG(3) << "send begin pass to: " << it.first;
-    this->AsyncSendBeginPass(it.first);
-  }
-  this->Wait();
-}
-
-void GRPCClient::SendEndPass() {
-  for (auto& it : channels_) {
-    VLOG(3) << "send end pass to " << it.first;
-    this->AsyncSendEndPass(it.first);
+    VLOG(3) << "send complete message to " << it.first;
+    this->AsyncSendComplete(it.first);
   }
   this->Wait();
 }

@@ -238,32 +230,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   req_count_++;
 }
 
-void GRPCClient::AsyncSendBeginPass(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
   s->Prepare(time_out);
 
   sendrecv::VariableMessage req;
-  req.set_varname(BEGIN_PASS_MESSAGE);
+  req.set_varname(COMPLETE_MESSAGE);
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
 }
 
-void GRPCClient::AsyncSendEndPass(const std::string& ep, int64_t time_out) {
-  const auto ch = GetChannel(ep);
-
-  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
-  s->Prepare(time_out);
-
-  sendrecv::VariableMessage req;
-  req.set_varname(END_PASS_MESSAGE);
-  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
-  req_count_++;
-}
-
 void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
                                        const std::string& dir,
                                        int64_t time_out) {
paddle/fluid/operators/distributed/grpc_client.h

@@ -215,17 +215,12 @@ class GRPCClient : public RPCClient {
   void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
                              int64_t time_out = FLAGS_rpc_deadline) override;
 
-  void AsyncSendBeginPass(const std::string& ep,
-                          int64_t time_out = FLAGS_rpc_deadline) override;
-
-  void AsyncSendEndPass(const std::string& ep,
-                        int64_t time_out = FLAGS_rpc_deadline) override;
+  void AsyncSendComplete(const std::string& ep,
+                         int64_t time_out = FLAGS_rpc_deadline) override;
 
   bool Wait() override;
 
-  void SendBeginPass() override;
-
-  void SendEndPass() override;
+  void SendComplete() override;
 
  protected:
   void InitImpl() override;
paddle/fluid/operators/distributed/request_handler.h

@@ -43,8 +43,6 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
 #define COMPLETE_MESSAGE "COMPLETE@RECV"
-#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV"
-#define END_PASS_MESSAGE "END_PASS@RECV"
 
 #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
 #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
paddle/fluid/operators/distributed/request_handler_impl.cc

@@ -55,10 +55,9 @@ bool RequestSendHandler::Handle(const std::string& varname,
     if (varname == BATCH_BARRIER_MESSAGE) {
       VLOG(3) << "sync: recv batch barrier message";
       rpc_server_->IncreaseBatchBarrier(kRequestSend);
-    } else if (varname == BEGIN_PASS_MESSAGE) {
-      VLOG(3) << "sync: recv begin pass message";
-      rpc_server_->WaitCond(kRequestSend);
-      rpc_server_->BeginPass();
+    } else if (varname == COMPLETE_MESSAGE) {
+      VLOG(3) << "sync: recv complete message";
+      rpc_server_->Complete();
     } else {
       VLOG(3) << "sync: received var_name: " << varname;
       rpc_server_->WaitCond(kRequestSend);

@@ -95,14 +94,12 @@ bool RequestGetHandler::Handle(const std::string& varname,
     if (varname == FETCH_BARRIER_MESSAGE) {
       VLOG(3) << "sync: recv fetch barrier message";
       rpc_server_->IncreaseBatchBarrier(kRequestGet);
-    } else if (varname == END_PASS_MESSAGE) {
-      rpc_server_->EndPass();
     } else {
       rpc_server_->WaitCond(kRequestGet);
       *outvar = scope_->FindVar(varname);
     }
   } else {
-    if (varname != FETCH_BARRIER_MESSAGE && varname != END_PASS_MESSAGE) {
+    if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) {
       *outvar = scope_->FindVar(varname);
     }
   }
paddle/fluid/operators/distributed/rpc_client.h

@@ -60,17 +60,13 @@ class RPCClient {
                                       const std::string& dir,
                                       int64_t time_out = FLAGS_rpc_deadline) = 0;
 
-  virtual void AsyncSendBeginPass(const std::string& ep,
-                                  int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual void AsyncSendEndPass(const std::string& ep,
-                                int64_t time_out = FLAGS_rpc_deadline) = 0;
+  virtual void AsyncSendComplete(const std::string& ep,
+                                 int64_t time_out = FLAGS_rpc_deadline) = 0;
 
-  // BeginePass/EndPass tells all the pserver that start/end a pass, so that
-  // the pserver can increase/reduce it's barrier count, and continue to train
+  // Complete tells all the pserver instances that finishe the training,
+  // the pserver can reduce it's barrier count, and continue to train
   // with other trainers.
-  virtual void SendBeginPass() = 0;
-  virtual void SendEndPass() = 0;
+  virtual void SendComplete() = 0;
 
   virtual bool Wait() = 0;
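The comment in rpc_client.h sums up the new contract: a trainer sends a single COMPLETE message when it has finished training, and each pserver decrements its live-client count and keeps serving the remaining trainers. A minimal Python sketch of that counting logic (purely illustrative; CompletionTracker is not part of Paddle) could look like this:

    import threading

    # Hypothetical stand-in for the pserver-side bookkeeping: each trainer that
    # sends COMPLETE decrements the live-client count; once it reaches zero the
    # server stops waiting on barriers and may shut down.
    class CompletionTracker:
        def __init__(self, client_num):
            self._cond = threading.Condition()
            self.client_num = client_num
            self.exit_flag = False

        def complete(self):
            """Called when one trainer reports it has finished training."""
            with self._cond:
                self.client_num -= 1
                if self.client_num == 0:
                    self.exit_flag = True  # no active trainer instances left
                self._cond.notify_all()    # wake anyone blocked on the barrier

        def wait_all_done(self):
            with self._cond:
                self._cond.wait_for(lambda: self.exit_flag)

    tracker = CompletionTracker(client_num=2)
    threading.Thread(target=tracker.complete).start()
    threading.Thread(target=tracker.complete).start()
    tracker.wait_all_done()  # returns once both simulated trainers completed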
paddle/fluid/operators/distributed/rpc_server.cc

@@ -64,18 +64,7 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
   }
 }
 
-void RPCServer::BeginPass() {
-  VLOG(4) << "RPCServer begin increase pass barrier";
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    client_num_++;
-    VLOG(4) << "increase client_num to: " << client_num_;
-  }
-  barrier_cond_.notify_all();
-}
-
-void RPCServer::EndPass() {
-  VLOG(4) << "RPCServer begin increase pass barrier";
+void RPCServer::Complete() {
   {
     std::unique_lock<std::mutex> lock(mutex_);
     client_num_--;

@@ -83,10 +72,19 @@ void RPCServer::EndPass() {
     if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
       barrier_counter_[kRequestGet]--;
     }
+    if (client_num_ == 0) {
+      exit_flag_ = true;
+      VLOG(1) << "No activate Trainer instance, PServer will exit...";
+    }
   }
   barrier_cond_.notify_all();
 }
 
+int RPCServer::GetClientNum() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  return client_num_;
+}
+
 void RPCServer::ResetBarrierCounter() {
   VLOG(3) << "RPCServer ResetBarrierCounter ";
   std::unique_lock<std::mutex> lock(mutex_);
paddle/fluid/operators/distributed/rpc_server.h

@@ -44,7 +44,7 @@ class RPCServer {
   int GetSelectedPort() const { return selected_port_; }
 
-  int GetClientNum() const;
+  int GetClientNum();
 
   void SavePort() const;

@@ -64,8 +64,7 @@ class RPCServer {
   void WaitCond(const std::string& rpc_name);
   void IncreaseBatchBarrier(const std::string rpc_name);
 
-  void BeginPass();
-  void EndPass();
+  void Complete();
 
   void ResetBarrierCounter();
paddle/fluid/operators/distributed/rpc_server_test.cc

@@ -91,7 +91,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
   }
 }
 
-void StartServer() {
+void StartServer(const std::string& rpc_name) {
   framework::ProgramDesc program;
   framework::Scope scope;
   platform::CPUPlace place;

@@ -107,14 +107,14 @@ void StartServer() {
                      std::shared_ptr<framework::ExecutorPrepareContext>>
       prefetch_var_name_to_prepared;
   prefetch_var_name_to_prepared[in_var_name] = prepared[0];
   g_req_handler->SetProgram(&program);
   g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
   g_req_handler->SetDevCtx(&ctx);
   g_req_handler->SetScope(&scope);
   g_req_handler->SetExecutor(&exe);
 
-  g_rpc_service->RegisterRPC(distributed::kRequestPrefetch,
-                             g_req_handler.get());
+  g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
   g_req_handler->SetRPCServer(g_rpc_service.get());
 
   std::thread server_thread(

@@ -129,7 +129,7 @@ TEST(PREFETCH, CPU) {
   distributed::RPCClient* client =
       distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
-  std::thread server_thread(StartServer);
+  std::thread server_thread(StartServer, distributed::kRequestPrefetch);
   g_rpc_service->WaitServerReady();
   int port = g_rpc_service->GetSelectedPort();

@@ -162,3 +162,24 @@ TEST(PREFETCH, CPU) {
   g_rpc_service.reset(nullptr);
   g_req_handler.reset(nullptr);
 }
+
+TEST(COMPLETE, CPU) {
+  g_req_handler.reset(new distributed::RequestSendHandler(true));
+  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2));
+  distributed::RPCClient* client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+  PADDLE_ENFORCE(client != nullptr);
+  std::thread server_thread(StartServer, distributed::kRequestSend);
+  g_rpc_service->WaitServerReady();
+  int port = g_rpc_service->GetSelectedPort();
+  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
+  client->AsyncSendComplete(ep);
+  client->Wait();
+
+  EXPECT_EQ(g_rpc_service->GetClientNum(), 1);
+
+  g_rpc_service->ShutDown();
+  server_thread.join();
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
+}
paddle/fluid/pybind/pybind.cc

@@ -503,8 +503,7 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<framework::Executor>(m, "Executor")
       .def(py::init<const platform::Place&>())
 #ifdef PADDLE_WITH_DISTRIBUTE
-      .def("begin_pass", &Executor::BeginPass)
-      .def("end_pass", &Executor::EndPass)
+      .def("complete", &Executor::Complete)
 #endif
       .def("run",
            [](Executor& self, const ProgramDesc& prog, Scope* scope,
              int block_id, bool create_local_scope, bool create_vars) {
python/paddle/fluid/executor.py

@@ -348,11 +348,8 @@ class Executor(object):
         ]
         return outs
 
-    def begin_pass(self):
-        self.executor.begin_pass()
-
-    def end_pass(self):
-        self.executor.end_pass()
+    def complete(self):
+        self.executor.complete()
 
     def run(self,
             program=None,
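On the Python side, the user-facing call becomes a single complete() invoked once per trainer, replacing the begin_pass()/end_pass() pair. A rough usage sketch under the Fluid API of that era (the toy network and training loop below are placeholders, not part of this commit) might look like:

    import numpy
    import paddle.fluid as fluid

    # Toy program so the sketch stands alone; any real trainer program works.
    x = fluid.layers.data(name="x", shape=[1], dtype="float32")
    cost = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(cost)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    for _ in range(10):  # stand-in for the real training loop
        batch = numpy.random.random((8, 1)).astype("float32")
        exe.run(fluid.default_main_program(), feed={"x": batch}, fetch_list=[cost])

    # New in this commit: one completion signal per trainer instead of
    # bracketing every pass. It only takes effect when Paddle is built
    # WITH_DISTRIBUTE and this process runs as a distributed trainer.
    exe.complete()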