Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
7be79231
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7be79231
编写于
12月 18, 2017
作者:
T
typhoonzero
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
wip multi-trainer
上级
1e549563
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
47 additions
and
29 deletions
+47
-29
paddle/operators/detail/send_impl.cc
paddle/operators/detail/send_impl.cc
+6
-0
paddle/operators/detail/send_recv_impl.h
paddle/operators/detail/send_recv_impl.h
+1
-0
paddle/operators/recv_op.cc
paddle/operators/recv_op.cc
+3
-2
paddle/operators/send_op.cc
paddle/operators/send_op.cc
+23
-19
python/paddle/v2/fluid/distribute_transpiler.py
python/paddle/v2/fluid/distribute_transpiler.py
+14
-8
未找到文件。
paddle/operators/detail/send_impl.cc
浏览文件 @
7be79231
...
...
@@ -66,6 +66,12 @@ bool RPCClient::GetVariable(const framework::Scope& scope,
return
true
;
}
void
RPCClient
::
Wait
()
{
ClientContext
context
;
VoidMessage
call_msg
,
ret_msg
;
stub_
->
Wait
(
&
context
,
call_msg
,
&
ret_msg
);
}
}
// namespace detail
}
// namespace operators
}
// namespace paddle
paddle/operators/detail/send_recv_impl.h
浏览文件 @
7be79231
...
...
@@ -81,6 +81,7 @@ class RPCClient {
bool
SendVariable
(
const
framework
::
Scope
&
scope
,
const
std
::
string
&
inname
);
bool
GetVariable
(
const
framework
::
Scope
&
scope
,
const
std
::
string
&
outname
);
void
Wait
();
private:
std
::
unique_ptr
<
SendRecvService
::
Stub
>
stub_
;
...
...
paddle/operators/recv_op.cc
浏览文件 @
7be79231
...
...
@@ -76,14 +76,14 @@ class RecvOp : public framework::OperatorBase {
const
platform
::
DeviceContext
&
dev_ctx
)
const
override
{
// FIXME(typhoonzero): no new scopes for every run.
framework
::
Scope
&
recv_scope
=
scope
.
NewScope
();
rpc_service_
.
SetScope
(
&
recv_scope
);
rpc_service_
->
SetScope
(
&
recv_scope
);
auto
param_list
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"ParamList"
);
auto
grad_list
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"GradList"
);
auto
trainer_count
=
Attr
<
int
>
(
"Trainers"
);
size_t
param_count
=
param_list
.
size
();
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
while
(
true
)
{
rpc_service_
.
Start
();
rpc_service_
->
Start
();
// Get from multiple trainers, we don't care about order in which
// the gradient arrives, just add suffix 0~n then average the gradient.
for
(
size_t
i
=
0
;
i
<
param_count
*
trainer_count
;
++
i
)
{
...
...
@@ -126,6 +126,7 @@ class RecvOp : public framework::OperatorBase {
}
catch
(
std
::
exception
&
e
)
{
LOG
(
ERROR
)
<<
"run sub program error "
<<
e
.
what
();
}
rpc_service_
->
Done
();
// for (size_t i = 0; i < param_count; ++i) {
// auto *out_var = recv_scope.FindVar(param_list[i]);
...
...
paddle/operators/send_op.cc
浏览文件 @
7be79231
...
...
@@ -34,34 +34,36 @@ class SendOp : public framework::OperatorBase {
const
framework
::
AttributeMap
&
attrs
)
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{
// init client when the operator is created at runtime.
if
(
!
client_
)
{
std
::
string
endpoint
=
Attr
<
std
::
string
>
(
"endpoint
"
);
client_
.
reset
(
new
detail
::
RPCClient
(
grpc
::
CreateChannel
(
endpoint
,
grpc
::
InsecureChannelCredentials
())));
// TODO(typhoonzero): how to call InitVariables
std
::
vector
<
std
::
string
>
endpoints
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"endpoints
"
);
for
(
auto
ep
:
endpoints
)
{
client_map_
[
ep
].
reset
(
new
detail
::
RPCClient
(
grpc
::
CreateChannel
(
ep
,
grpc
::
InsecureChannelCredentials
())));
}
}
void
Run
(
const
framework
::
Scope
&
scope
,
const
platform
::
DeviceContext
&
dev_ctx
)
const
override
{
auto
ins
=
Inputs
(
"X"
);
// TODO(typhoonzero): currently it's non-blocking,
//
should block until server responds
.
for
(
auto
in
:
ins
)
{
bool
ret
=
client_
->
SendVariable
(
scope
,
in
);
std
::
vector
<
std
::
string
>
epmap
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
);
//
TODO(typhoonzero): use async calls to send multiple variables asynchronously
.
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
bool
ret
=
client_
map_
[
epmap
[
i
]]
->
SendVariable
(
scope
,
ins
[
i
]
);
if
(
!
ret
)
{
LOG
(
ERROR
)
<<
"send variable error
"
;
LOG
(
ERROR
)
<<
"send variable error
: "
<<
ins
[
i
]
;
}
}
for
(
auto
in
:
ins
)
{
bool
ret
=
client_
->
GetVariable
(
scope
);
client_map_
[
0
]
->
Wait
();
// TODO(typhoonzero): support async optimization
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
bool
ret
=
client_map_
[
epmap
[
i
]]
->
GetVariable
(
scope
,
ins
[
i
]);
if
(
!
ret
)
{
LOG
(
ERROR
)
<<
"GetVariable error
"
;
LOG
(
ERROR
)
<<
"GetVariable error
: "
<<
ins
[
i
]
;
}
}
}
protected:
std
::
shared_ptr
<
detail
::
RPCClient
>
client_
{
nullptr
};
mutable
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
detail
::
RPCClient
>>
client_map_
;
};
class
SendOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
...
@@ -74,11 +76,13 @@ Recv operator
This operator will recv tensor from send_op
)DOC"
);
AddAttr
<
std
::
string
>
(
"endpoint"
,
"(string, default 127.0.0.1:6164)"
"IP address to listen on."
)
.
SetDefault
(
"127.0.0.1:6164"
)
.
AddCustomChecker
([](
const
std
::
string
&
ip
)
{
return
!
ip
.
empty
();
});
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"endpoints"
,
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to."
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
,
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping"
);
}
};
...
...
python/paddle/v2/fluid/distribute_transpiler.py
浏览文件 @
7be79231
...
...
@@ -145,14 +145,20 @@ class DistributeTranspiler:
pserver_endpoints
=
kwargs
[
"pservers"
].
split
(
","
)
self
.
param_grad_map
=
split_method
(
params_and_grads
,
pserver_endpoints
)
for
ep
in
pserver_endpoints
:
# FIXME(typhoonzero): sends to different servers can run in parallel.
send_op
=
program
.
global_block
().
append_op
(
type
=
"send"
,
inputs
=
{
"X"
:
self
.
param_grad_map
[
ep
][
"grads"
]
},
# inputs is a list of tensors to be sent
outputs
=
{},
attrs
=
{
"endpoint"
:
ep
})
send_op_ordered_inputs
=
[]
epmap
=
[]
for
ep
,
v
in
self
.
param_grad_map
.
iteritems
():
send_op_ordered_inputs
.
extend
(
v
[
"grads"
])
for
i
in
v
:
epmap
.
append
(
ep
)
send_op
=
program
.
global_block
().
append_op
(
type
=
"send"
,
inputs
=
{
"X"
:
send_op_ordered_inputs
},
# inputs is a list of tensors to be sent
outputs
=
{},
attrs
=
{
"endpoints"
:
pserver_endpoints
,
"epmap"
:
epmap
})
def
_create_var_for_trainers
(
self
,
block
,
var
,
trainers
):
var_list
=
[]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录