PaddlePaddle / Paddle

Commit d21ab2e2 (unverified)
Authored Mar 29, 2018 by 武毅 (Wu Yi); committed via GitHub on Mar 29, 2018

Merge pull request #9448 from typhoonzero/fix_dist_slr_height

fix dist train selected rows height missing

Parents: 24100e1f, 96192a85

Showing 8 changed files with 45 additions and 17 deletions (+45 −17)
Changed files:

paddle/fluid/operators/detail/grpc_client.cc         +0  -1
paddle/fluid/operators/detail/send_recv.proto        +4  -4
paddle/fluid/operators/detail/sendrecvop_utils.cc    +2  -1
paddle/fluid/operators/detail/sendrecvop_utils.h     +7  -0
paddle/fluid/operators/detail/test_serde.cc          +13 -8
paddle/fluid/operators/detail/variable_response.cc   +15 -1
paddle/fluid/operators/listen_and_serv_op.cc         +2  -0
paddle/fluid/operators/send_op.cc                    +2  -2
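At a glance, the fix threads the SelectedRows height (the original dim0 of the full parameter) through the send/recv path: the sender writes it into VariableMessage as a new slr_height field, and the receiver restores it with set_height() when deserializing. The sketch below is not Paddle code; it is a minimal, self-contained illustration with hypothetical types of why the height must travel with the rows: a sparse update carries only the selected row indices and their values, so the receiver cannot infer the full first dimension from the payload alone.

#include <cstdint>
#include <vector>

// Hypothetical stand-in for framework::SelectedRows: a sparse slice of a
// [height x width] parameter. Only `rows` and `value` go on the wire;
// without `height`, the receiver cannot rebuild the original dim0.
struct SparseSlice {
  int64_t height;              // original dim0 of the full parameter
  std::vector<int64_t> rows;   // indices of the selected rows
  std::vector<float> value;    // rows.size() * width elements
};

int main() {
  // A 1000 x 128 embedding table from which only 3 rows were touched.
  SparseSlice grad{/*height=*/1000, /*rows=*/{7, 42, 563},
                   /*value=*/std::vector<float>(3 * 128, 0.1f)};
  // rows.size() == 3 and value describes a 3 x 128 tensor; nothing in the
  // payload says the full table has 1000 rows unless height is sent too.
  return grad.height == 1000 ? 0 : 1;
}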
paddle/fluid/operators/detail/grpc_client.cc

@@ -204,7 +204,6 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
   }

   grpc::ChannelArguments args;
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 5000);
   args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
   args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
   args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
paddle/fluid/operators/detail/send_recv.proto

@@ -59,12 +59,12 @@ message VariableMessage {
   // lod details:
   int64 lod_level = 5;
   repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
+  int64 slr_height = 7;
   // tensor data
-  bytes serialized = 7;
+  bytes serialized = 8;
   // selected_rows data
-  bytes rows = 8;
+  bytes rows = 9;
 }

 message VoidMessage {}
-
-message TestMessage { int64 test_1 = 1; }
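Because the C++ side hand-encodes and hand-decodes VariableMessage (ProtoEncodeHelper on the sender, a manual tag switch in VariableResponse::Parse on the receiver), renumbering serialized from 7 to 8 and rows from 8 to 9 only works if every peer is rebuilt from the same .proto; old and new binaries would disagree on what field 7 means. As a reminder of how those field numbers appear on the wire, here is a small standalone sketch (not Paddle code) computing protobuf tag bytes: the tag is (field_number << 3) | wire_type, with wire type 0 for varint fields such as slr_height and 2 for length-delimited fields such as serialized and rows.

#include <cstdio>

// Protobuf wire tag: (field_number << 3) | wire_type.
// wire_type 0 = varint (int64 slr_height), 2 = length-delimited (bytes).
constexpr unsigned MakeTag(unsigned field_number, unsigned wire_type) {
  return (field_number << 3) | wire_type;
}

int main() {
  std::printf("slr_height (7, varint): 0x%02x\n", MakeTag(7, 0));  // 0x38
  std::printf("serialized (8, bytes) : 0x%02x\n", MakeTag(8, 2));  // 0x42
  std::printf("rows       (9, bytes) : 0x%02x\n", MakeTag(9, 2));  // 0x4a
  return 0;
}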
paddle/fluid/operators/detail/sendrecvop_utils.cc

@@ -108,6 +108,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
       e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
     }
     e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
+    e.WriteUint64(VarMsg::kSlrHeightFieldNumber, slr->height());
     auto* tensor = slr->mutable_value();
     if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA

@@ -154,7 +155,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
     ProtoEncodeHelper e2((char*)buf, 128);
     // NOTE: rows is of type int64_t
     size_t rows_memory_size =
-        slr->rows().capacity() * framework::SizeOfType(typeid(int64_t));
+        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
     e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
     slices[2] = ::grpc::Slice(e2.size());
     memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
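The second hunk also corrects how many row-index bytes are announced for the rows payload: std::vector::capacity() is the allocated storage and can exceed size(), the number of elements actually held, so sizing the buffer from capacity() could announce more bytes than there are valid row indices. A tiny standalone illustration of the difference, unrelated to Paddle itself:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> rows;
  rows.reserve(1024);              // allocate room for 1024 indices
  for (int64_t i = 0; i < 3; ++i) rows.push_back(i);

  // Only size() reflects the bytes that actually need to go on the wire.
  size_t valid_bytes = rows.size() * sizeof(int64_t);      // 24
  size_t alloc_bytes = rows.capacity() * sizeof(int64_t);  // >= 8192

  assert(valid_bytes == 24);
  assert(alloc_bytes >= valid_bytes);
  return 0;
}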
paddle/fluid/operators/detail/sendrecvop_utils.h

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <sys/time.h>
 #include <iostream>
 #include <string>
 #include <vector>

@@ -35,6 +36,12 @@ namespace detail {
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"

+static int64_t GetTimestamp() {
+  struct timeval tp;
+  gettimeofday(&tp, NULL);
+  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
+}
+
 typedef void (*DestroyCallback)(void*);

 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
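The new GetTimestamp() helper returns a millisecond wall-clock value via gettimeofday(); listen_and_serv_op.cc uses it further down to log how long running all blocks took. As a side note, a portable equivalent could be written with <chrono> instead of <sys/time.h>; a minimal sketch, not part of this commit:

#include <chrono>
#include <cstdint>
#include <cstdio>

// Millisecond wall-clock timestamp, analogous in spirit to the
// gettimeofday()-based GetTimestamp() added in this commit.
static int64_t GetTimestampMs() {
  using namespace std::chrono;
  return duration_cast<milliseconds>(system_clock::now().time_since_epoch())
      .count();
}

int main() {
  int64_t ts = GetTimestampMs();
  // ... run some work ...
  std::printf("elapsed (ms): %lld\n",
              static_cast<long long>(GetTimestampMs() - ts));
  return 0;
}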
paddle/fluid/operators/detail/test_serde.cc

@@ -40,14 +40,14 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   // serialize var to ByteBuffer
   framework::Variable var;
   auto* slr = var.GetMutable<framework::SelectedRows>();
+  slr->set_height(1000);
   auto* tensor = slr->mutable_value();
   auto* rows = slr->mutable_rows();
-  tensor->Resize(framework::make_ddim({2, 10}));
+  tensor->Resize(framework::make_ddim({564, 128}));
   tensor->mutable_data<float>(place);
-  int tensor_numel = 2 * 10;
+  int tensor_numel = 564 * 128;
   math::set_constant(ctx, tensor, 32.7);
-  rows->push_back(3);
-  rows->push_back(10);
+  for (int i = 0; i < 564; ++i) rows->push_back(i);

   ::grpc::ByteBuffer msg;
   operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);

@@ -64,6 +64,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   sendrecv::VariableMessage varmsg;
   EXPECT_TRUE(varmsg.ParseFromString(tmp));
+  // deserialize bytebuffer
   EXPECT_EQ(varmsg.varname(), "myvar");
   EXPECT_EQ(varmsg.type(), 1);

@@ -74,8 +75,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   for (int i = 0; i < tensor_numel; ++i) {
     EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
   }
-  EXPECT_EQ(rows_data[0], 3);
-  EXPECT_EQ(rows_data[1], 10);
+  for (int i = 0; i < 564; ++i) {
+    EXPECT_EQ(rows_data[i], i);
+  }
+
   // deserialize zero-copy
   // framework::Variable var2;
   // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);

@@ -104,8 +107,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   for (int i = 0; i < tensor_numel; ++i) {
     EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
   }
-  EXPECT_EQ(rows_data2[0], 3);
-  EXPECT_EQ(rows_data2[1], 10);
+  for (int i = 0; i < rows2->size(); ++i) {
+    EXPECT_EQ(rows_data2[i], i);
+  }
+  EXPECT_EQ(slr2->height(), 1000);
 }

 void RunTestLodTensor(platform::Place place, int from_type = 0) {
paddle/fluid/operators/detail/variable_response.cc

@@ -147,8 +147,13 @@ bool VariableResponse::CopySelectRowsTensorData(
     const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
   auto var = scope_->FindVar(meta_.varname());
   auto* slr = var->GetMutable<framework::SelectedRows>();
+  slr->set_height(meta_.slr_height());
   auto* tensor = slr->mutable_value();
   tensor->Resize(dims);
+  PADDLE_ENFORCE_EQ(
+      tensor->numel(),
+      length / framework::SizeOfType(
+                   paddle::operators::detail::ToTypeIndex(meta_.data_type())));
   void* tensor_data = tensor->mutable_data(
       ctx.GetPlace(),
       paddle::operators::detail::ToTypeIndex(meta_.data_type()));

@@ -165,7 +170,8 @@ bool VariableResponse::CopySelectRowsData(
     const platform::DeviceContext& ctx, int length) {
   auto var = scope_->FindVar(meta_.varname());
   auto* slr = var->GetMutable<framework::SelectedRows>();
-  slr->mutable_rows()->resize(length / 8);  // int64
+  slr->mutable_rows()->resize(
+      length / framework::SizeOfType(typeid(int64_t)));  // int64
   int64_t* rows_data = slr->mutable_rows()->data();
   // copy rows CPU data, GPU data will be copied lazily.

@@ -348,6 +354,14 @@ int VariableResponse::Parse(Source* source) {
        }
        break;
      }
+     case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
+       uint64_t v = 0;
+       if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+         return tag;
+       }
+       meta_.set_slr_height(static_cast<int64_t>(v));
+       break;
+     }
      case sendrecv::VariableMessage::kSerializedFieldNumber: {
        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
                        meta_.type() == sendrecv::LOD_TENSOR) &&
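The new kSlrHeightFieldNumber case reads the height as a protobuf varint (WIRETYPE_VARINT) through the coded input stream and stores it with set_slr_height(). For readers unfamiliar with the encoding the parser is consuming, here is a small self-contained sketch of base-128 varint decoding, the format protobuf uses for int64 fields like slr_height; it is illustrative only, not the CodedInputStream implementation:

#include <cassert>
#include <cstdint>
#include <vector>

// Decode a base-128 varint: 7 payload bits per byte, least-significant group
// first; the high bit of each byte says whether another byte follows.
static bool ReadVarint64(const std::vector<uint8_t>& buf, size_t* pos,
                         uint64_t* out) {
  uint64_t result = 0;
  for (int shift = 0; shift < 64; shift += 7) {
    if (*pos >= buf.size()) return false;
    uint8_t byte = buf[(*pos)++];
    result |= static_cast<uint64_t>(byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) {
      *out = result;
      return true;
    }
  }
  return false;  // malformed: too many continuation bytes
}

int main() {
  // 1000 (a plausible slr_height) encodes as 0xe8 0x07.
  std::vector<uint8_t> buf = {0xe8, 0x07};
  size_t pos = 0;
  uint64_t v = 0;
  assert(ReadVarint64(buf, &pos, &v) && v == 1000);
  return 0;
}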
paddle/fluid/operators/listen_and_serv_op.cc

@@ -141,6 +141,7 @@ class ListenAndServOp : public framework::OperatorBase {
       // and this will still work.
       std::vector<std::future<void>> fs;
+      double ts = detail::GetTimestamp();
       // block0 contains only listen_and_serv op, start run from block1.
       for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
         fs.push_back(

@@ -162,6 +163,7 @@ class ListenAndServOp : public framework::OperatorBase {
           LOG(ERROR) << "run sub program error " << e.what();
         }
       }
+      VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts;
       // Reset the received sparse variables, the sum operator would not
       // sum the input sparse variables which rows is empty at the next
paddle/fluid/operators/send_op.cc

@@ -72,7 +72,7 @@ class SendOp : public framework::OperatorBase {
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
-        VLOG(2) << "sending " << ins[i] << " to " << epmap[i];
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
         rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];

@@ -81,7 +81,7 @@ class SendOp : public framework::OperatorBase {
     PADDLE_ENFORCE(rpc_client->Wait());

     for (auto& ep : endpoints) {
-      VLOG(2) << "batch barrier, ep: " << ep;
+      VLOG(3) << "batch barrier, ep: " << ep;
       rpc_client->AsyncSendBatchBarrier(ep);
     }
     PADDLE_ENFORCE(rpc_client->Wait());