Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
9573d610
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9573d610
编写于
3月 04, 2019
作者:
Q
Qiao Longfei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
use rpc common in parameter send and recv
上级
3691a46f
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
44 addition
and
37 deletion
+44
-37
paddle/fluid/operators/distributed/parameter_recv.cc
paddle/fluid/operators/distributed/parameter_recv.cc
+8
-9
paddle/fluid/operators/distributed/parameter_recv.h
paddle/fluid/operators/distributed/parameter_recv.h
+2
-3
paddle/fluid/operators/distributed/parameter_send.cc
paddle/fluid/operators/distributed/parameter_send.cc
+14
-16
paddle/fluid/operators/distributed/parameter_send.h
paddle/fluid/operators/distributed/parameter_send.h
+2
-4
paddle/fluid/operators/distributed/rpc_common.h
paddle/fluid/operators/distributed/rpc_common.h
+7
-0
paddle/fluid/operators/distributed_ops/recv_op.cc
paddle/fluid/operators/distributed_ops/recv_op.cc
+5
-2
paddle/fluid/operators/distributed_ops/send_op.cc
paddle/fluid/operators/distributed_ops/send_op.cc
+6
-3
未找到文件。
paddle/fluid/operators/distributed/parameter_recv.cc
浏览文件 @
9573d610
...
...
@@ -39,9 +39,7 @@ using SelectedRows = framework::SelectedRows;
using
DDim
=
framework
::
DDim
;
template
<
typename
T
>
void
ParameterRecv
<
T
>::
operator
()(
const
std
::
string
&
var_name
,
const
std
::
vector
<
std
::
string
>
&
recv_varnames
,
const
std
::
vector
<
std
::
string
>
&
epmap
,
void
ParameterRecv
<
T
>::
operator
()(
const
RpcContext
&
rpc_ctx
,
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
Scope
&
scope
)
{
framework
::
Scope
*
local_scope
=
scope
.
NewTmpScope
();
...
...
@@ -53,21 +51,22 @@ void ParameterRecv<T>::operator()(const std::string &var_name,
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
(
ctx
.
Attr
<
int
>
(
"trainer_id"
));
auto
*
recv_var
=
scope
.
FindVar
(
var_name
);
auto
*
recv_var
=
scope
.
FindVar
(
rpc_ctx
.
var_name
);
std
::
vector
<
framework
::
Tensor
*>
recved_tensors
;
// recv all vars to local scope
if
(
recv_var
->
IsType
<
framework
::
LoDTensor
>
())
{
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
for
(
size_t
i
=
0
;
i
<
r
ecv_var
names
.
size
();
i
++
)
{
auto
&
recv_var_name
=
r
ecv_var
names
[
i
];
for
(
size_t
i
=
0
;
i
<
r
pc_ctx
.
splited_var_
names
.
size
();
i
++
)
{
auto
&
recv_var_name
=
r
pc_ctx
.
splited_var_
names
[
i
];
framework
::
Tensor
*
t
=
local_scope
->
Var
(
recv_var_name
)
->
GetMutable
<
framework
::
LoDTensor
>
();
recved_tensors
.
push_back
(
t
);
VLOG
(
3
)
<<
"recv "
<<
recv_var_name
<<
" from "
<<
epmap
[
i
];
rets
.
push_back
(
rpc_client
->
AsyncGetVar
(
epmap
[
i
],
cpu_ctx
,
*
local_scope
,
recv_var_name
,
recv_var_name
));
VLOG
(
3
)
<<
"recv "
<<
recv_var_name
<<
" from "
<<
rpc_ctx
.
epmap
[
i
];
rets
.
push_back
(
rpc_client
->
AsyncGetVar
(
rpc_ctx
.
epmap
[
i
],
cpu_ctx
,
*
local_scope
,
recv_var_name
,
recv_var_name
));
}
for
(
size_t
i
=
0
;
i
<
rets
.
size
();
i
++
)
{
PADDLE_ENFORCE
(
rets
[
i
]
->
Wait
(),
"internal error in RPCClient"
);
...
...
paddle/fluid/operators/distributed/parameter_recv.h
浏览文件 @
9573d610
...
...
@@ -18,6 +18,7 @@
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/distributed/rpc_common.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -25,9 +26,7 @@ namespace distributed {
template
<
typename
T
>
struct
ParameterRecv
{
void
operator
()(
const
std
::
string
&
var_name
,
const
std
::
vector
<
std
::
string
>
&
recv_varnames
,
const
std
::
vector
<
std
::
string
>
&
epmap
,
void
operator
()(
const
RpcContext
&
rpc_ctx
,
const
framework
::
ExecutionContext
&
context
,
const
framework
::
Scope
&
scope
);
};
...
...
paddle/fluid/operators/distributed/parameter_send.cc
浏览文件 @
9573d610
...
...
@@ -38,10 +38,7 @@ using SelectedRows = framework::SelectedRows;
using
DDim
=
framework
::
DDim
;
template
<
typename
T
>
void
ParameterSend
<
T
>::
operator
()(
const
std
::
string
&
var_name
,
const
std
::
vector
<
std
::
string
>
&
send_varnames
,
const
std
::
vector
<
std
::
string
>
&
epmap
,
const
std
::
vector
<
int64_t
>
&
height_sections
,
void
ParameterSend
<
T
>::
operator
()(
const
RpcContext
&
rpc_ctx
,
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
Scope
&
scope
,
bool
sync
)
{
framework
::
Scope
*
local_scope
=
scope
.
NewTmpScope
();
...
...
@@ -53,8 +50,8 @@ void ParameterSend<T>::operator()(const std::string &var_name,
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
(
ctx
.
Attr
<
int
>
(
"trainer_id"
));
auto
*
send_var
=
scope
.
FindVar
(
var_name
);
size_t
out_num
=
send_var
names
.
size
();
auto
*
send_var
=
scope
.
FindVar
(
rpc_ctx
.
var_name
);
size_t
out_num
=
rpc_ctx
.
splited_var_
names
.
size
();
if
(
send_var
->
IsType
<
framework
::
LoDTensor
>
())
{
if
(
out_num
>
1
)
{
auto
&
send_tensor
=
send_var
->
Get
<
framework
::
LoDTensor
>
();
...
...
@@ -63,19 +60,19 @@ void ParameterSend<T>::operator()(const std::string &var_name,
outs_dims
.
reserve
(
out_num
);
// infer output shape
PADDLE_ENFORCE_EQ
(
height_sections
.
size
(),
out_num
,
PADDLE_ENFORCE_EQ
(
rpc_ctx
.
height_sections
.
size
(),
out_num
,
"tensor split sections size"
"should be equal to output size."
);
for
(
size_t
i
=
0
;
i
<
out_num
;
++
i
)
{
auto
dim
=
send_tensor_dims
;
dim
[
0
]
=
height_sections
[
i
];
dim
[
0
]
=
rpc_ctx
.
height_sections
[
i
];
outs_dims
.
push_back
(
dim
);
}
// create output var in local scope
size_t
row_offset
=
0
;
for
(
auto
i
=
0
;
i
<
out_num
;
++
i
)
{
framework
::
Tensor
*
out
=
local_scope
->
Var
(
send_var
names
[
i
])
framework
::
Tensor
*
out
=
local_scope
->
Var
(
rpc_ctx
.
splited_var_
names
[
i
])
->
GetMutable
<
framework
::
LoDTensor
>
();
*
out
=
send_tensor
.
Slice
(
row_offset
,
row_offset
+
outs_dims
[
i
][
0
]);
row_offset
+=
outs_dims
[
i
][
0
];
...
...
@@ -83,7 +80,7 @@ void ParameterSend<T>::operator()(const std::string &var_name,
}
}
else
if
(
send_var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
&
send_slr
=
send_var
->
Get
<
framework
::
SelectedRows
>
();
auto
abs_sections
=
ToAbsoluteSection
(
height_sections
);
auto
abs_sections
=
ToAbsoluteSection
(
rpc_ctx
.
height_sections
);
auto
send_rows
=
send_slr
.
rows
();
std
::
vector
<
std
::
vector
<
int
>>
outs_rows_idx
;
...
...
@@ -97,7 +94,7 @@ void ParameterSend<T>::operator()(const std::string &var_name,
// create output var in local scope
std
::
vector
<
framework
::
SelectedRows
*>
outs
;
for
(
auto
&
name
:
send_var
names
)
{
for
(
auto
&
name
:
rpc_ctx
.
splited_var_
names
)
{
auto
*
out
=
local_scope
->
Var
(
name
)
->
GetMutable
<
framework
::
SelectedRows
>
();
outs
.
push_back
(
out
);
}
...
...
@@ -112,7 +109,7 @@ void ParameterSend<T>::operator()(const std::string &var_name,
for
(
size_t
i
=
0
;
i
<
outs_rows_idx
.
size
();
++
i
)
{
auto
rows_idx
=
outs_rows_idx
[
i
];
outs
[
i
]
->
set_height
(
height_sections
[
i
]);
outs
[
i
]
->
set_height
(
rpc_ctx
.
height_sections
[
i
]);
auto
dims
=
send_slr
.
GetCompleteDims
();
dims
[
0
]
=
rows_idx
.
size
();
outs
[
i
]
->
mutable_value
()
->
mutable_data
<
T
>
(
dims
,
send_slr
.
place
());
...
...
@@ -149,15 +146,16 @@ void ParameterSend<T>::operator()(const std::string &var_name,
}
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
for
(
size_t
i
=
0
;
i
<
send_var
names
.
size
();
i
++
)
{
auto
&
send_var_name
=
send_var
names
[
i
];
auto
&
endpoint
=
epmap
[
i
];
for
(
size_t
i
=
0
;
i
<
rpc_ctx
.
splited_var_
names
.
size
();
i
++
)
{
auto
&
send_var_name
=
rpc_ctx
.
splited_var_
names
[
i
];
auto
&
endpoint
=
rpc_ctx
.
epmap
[
i
];
if
(
NeedSend
(
*
local_scope
,
send_var_name
))
{
VLOG
(
3
)
<<
"sending "
<<
send_var_name
<<
" to "
<<
endpoint
;
rets
.
push_back
(
rpc_client
->
AsyncSendVar
(
endpoint
,
cpu_ctx
,
*
local_scope
,
send_var_name
));
}
else
{
VLOG
(
3
)
<<
"don't send non-initialized variable: "
<<
send_varnames
[
i
];
VLOG
(
3
)
<<
"don't send non-initialized variable: "
<<
rpc_ctx
.
splited_var_names
[
i
];
}
}
...
...
paddle/fluid/operators/distributed/parameter_send.h
浏览文件 @
9573d610
...
...
@@ -18,6 +18,7 @@
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/distributed/rpc_common.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -25,10 +26,7 @@ namespace distributed {
template
<
typename
T
>
struct
ParameterSend
{
void
operator
()(
const
std
::
string
&
var_name
,
const
std
::
vector
<
std
::
string
>
&
send_varnames
,
const
std
::
vector
<
std
::
string
>
&
epmap
,
const
std
::
vector
<
int64_t
>
&
height_sections
,
void
operator
()(
const
RpcContext
&
rpc_ctx
,
const
framework
::
ExecutionContext
&
context
,
const
framework
::
Scope
&
scope
,
bool
sync
);
};
...
...
paddle/fluid/operators/distributed/rpc_common.h
浏览文件 @
9573d610
...
...
@@ -22,6 +22,13 @@ namespace operators {
namespace
distributed
{
struct
RpcContext
{
RpcContext
(
const
std
::
string
&
name
,
const
std
::
vector
<
std
::
string
>&
names
,
const
std
::
vector
<
std
::
string
>&
emap
,
const
std
::
vector
<
int64_t
>&
sections
)
:
var_name
(
name
),
splited_var_names
(
names
),
epmap
(
emap
),
height_sections
(
sections
)
{}
std
::
string
var_name
;
std
::
vector
<
std
::
string
>
splited_var_names
;
std
::
vector
<
std
::
string
>
epmap
;
...
...
paddle/fluid/operators/distributed_ops/recv_op.cc
浏览文件 @
9573d610
...
...
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/parameter_recv.h"
#include "paddle/fluid/operators/distributed/rpc_common.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
...
...
@@ -57,9 +58,11 @@ class RecvOp : public framework::OperatorBase {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
pool
.
Get
(
place
);
auto
exe_ctx
=
framework
::
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
,
nullptr
);
auto
exe_ctx
=
framework
::
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
,
nullptr
);
auto
recv_functor
=
distributed
::
ParameterRecv
<
float
>
();
recv_functor
(
outs
[
0
],
recv_varnames
,
epmap
,
exe_ctx
,
scope
);
auto
rpc_ctx
=
distributed
::
RpcContext
(
outs
[
0
],
recv_varnames
,
epmap
,
{});
recv_functor
(
rpc_ctx
,
exe_ctx
,
scope
);
}
else
{
if
(
with_barrier
)
{
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
...
...
paddle/fluid/operators/distributed_ops/send_op.cc
浏览文件 @
9573d610
...
...
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/parameter_send.h"
#include "paddle/fluid/operators/distributed/rpc_common.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -50,10 +51,12 @@ class SendOp : public framework::OperatorBase {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
pool
.
Get
(
place
);
auto
exe_ctx
=
framework
::
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
,
nullptr
);
auto
exe_ctx
=
framework
::
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
,
nullptr
);
auto
send_functor
=
distributed
::
ParameterSend
<
float
>
();
send_functor
(
ins
[
0
],
send_varnames
,
epmap
,
height_sections
,
exe_ctx
,
scope
,
static_cast
<
bool
>
(
sync_send
));
auto
rpc_ctx
=
distributed
::
RpcContext
(
ins
[
0
],
send_varnames
,
epmap
,
height_sections
);
send_functor
(
rpc_ctx
,
exe_ctx
,
scope
,
static_cast
<
bool
>
(
sync_send
));
}
else
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录