BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit d21ab2e2 (unverified)
Authored on Mar 29, 2018 by 武毅 (Wu Yi); committed via GitHub on Mar 29, 2018
Merge pull request #9448 from typhoonzero/fix_dist_slr_height
fix dist train selected rows height missing
Parents: 24100e1f, 96192a85
Showing 8 changed files with 45 additions and 17 deletions (+45 -17)
paddle/fluid/operators/detail/grpc_client.cc         +0   -1
paddle/fluid/operators/detail/send_recv.proto        +4   -4
paddle/fluid/operators/detail/sendrecvop_utils.cc    +2   -1
paddle/fluid/operators/detail/sendrecvop_utils.h     +7   -0
paddle/fluid/operators/detail/test_serde.cc          +13  -8
paddle/fluid/operators/detail/variable_response.cc   +15  -1
paddle/fluid/operators/listen_and_serv_op.cc         +2   -0
paddle/fluid/operators/send_op.cc                    +2   -2
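In distributed training, a sparse gradient travels between trainer and parameter server as a framework::SelectedRows: only the touched rows plus their indices, with height() recording dim 0 of the full dense parameter. The message format below carried the row indices and the dense values but not the height, so the receiving side had nothing to restore it from; this commit adds an slr_height field and sets it on both ends. A minimal sketch of the structure involved (illustrative only, using the framework::SelectedRows calls that appear in the diffs below; not code from the patch):

// Illustration only: a sparse gradient for a 1000-row parameter where just
// rows 3 and 10 were updated.
framework::SelectedRows grad;
grad.set_height(1000);                           // dim 0 of the full parameter
grad.mutable_rows()->push_back(3);               // indices of the rows present
grad.mutable_rows()->push_back(10);
grad.mutable_value()->Resize(framework::make_ddim({2, 128}));  // one dense row per index
// Before this commit the serialized message carried rows and value but not
// height, so the deserialized SelectedRows never had set_height() called.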
paddle/fluid/operators/detail/grpc_client.cc

@@ -204,7 +204,6 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
   }
 
   grpc::ChannelArguments args;
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 5000);
   args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
   args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
   args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
paddle/fluid/operators/detail/send_recv.proto

@@ -59,12 +59,12 @@ message VariableMessage {
   // lod details:
   int64 lod_level = 5;
   repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
+  int64 slr_height = 7;
   // tensor data
-  bytes serialized = 7;
+  bytes serialized = 8;
   // selected_rows data
-  bytes rows = 8;
+  bytes rows = 9;
 }
 
 message VoidMessage {}
 
 message TestMessage { int64 test_1 = 1; }
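Adding slr_height as field 7 renumbers serialized and rows to 8 and 9. Renumbering existing fields breaks wire compatibility with any peer still built from the old .proto, but here the sender (send_op) and receiver (listen_and_serv_op) ship in the same build, so the format only has to agree with itself. A small sketch of the generated C++ accessors the later diffs rely on (illustrative, not from the patch):

// Illustration only: protoc generates these accessors for the new field.
sendrecv::VariableMessage meta;
meta.set_slr_height(1000);        // written on the sending side
int64_t h = meta.slr_height();    // read back on the receiving side
// protoc also generates VariableMessage::kSlrHeightFieldNumber (== 7), the
// constant sendrecvop_utils.cc uses when hand-encoding the message with
// ProtoEncodeHelper.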
paddle/fluid/operators/detail/sendrecvop_utils.cc

@@ -108,6 +108,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
       e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
     }
     e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
+    e.WriteUint64(VarMsg::kSlrHeightFieldNumber, slr->height());
     auto* tensor = slr->mutable_value();
     if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA

@@ -154,7 +155,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
     ProtoEncodeHelper e2((char*)buf, 128);
     // NOTE: rows is of type int64_t
     size_t rows_memory_size =
-        slr->rows().capacity() * framework::SizeOfType(typeid(int64_t));
+        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
     e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
     slices[2] = ::grpc::Slice(e2.size());
     memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
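The second hunk sizes the rows payload by rows().size() instead of rows().capacity(). capacity() is only a lower bound on the allocation, not the count of valid row indices, so it can announce a byte length that has nothing to do with the data actually present. A tiny self-contained sketch of the difference (illustrative only, with std::vector standing in for the rows storage):

#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> rows;
  rows.reserve(1024);  // capacity() >= 1024 after this
  rows.push_back(3);
  rows.push_back(10);  // size() == 2
  // capacity() * sizeof(int64_t) would claim >= 8192 bytes for a payload that
  // really holds 16 bytes of valid indices, so the length written ahead of the
  // rows data would not match what follows; size() gives the true byte count.
  return rows.size() * sizeof(int64_t) == 16 ? 0 : 1;
}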
paddle/fluid/operators/detail/sendrecvop_utils.h

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <sys/time.h>
 #include <iostream>
 #include <string>
 #include <vector>

@@ -35,6 +36,12 @@ namespace detail {
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
 
+static int64_t GetTimestamp() {
+  struct timeval tp;
+  gettimeofday(&tp, NULL);
+  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
+}
+
 typedef void (*DestroyCallback)(void*);
 
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
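The header gains <sys/time.h> and a GetTimestamp() helper that returns wall-clock time in whole milliseconds (tv_sec * 1000 + tv_usec / 1000); listen_and_serv_op.cc uses it further down to time one pass over the server's sub-blocks. A self-contained usage sketch mirroring that call site (illustrative only):

#include <sys/time.h>
#include <cstdint>
#include <iostream>

// Same helper as above: wall-clock time in whole milliseconds.
static int64_t GetTimestamp() {
  struct timeval tp;
  gettimeofday(&tp, NULL);
  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
}

int main() {
  double ts = GetTimestamp();
  // ... run the server sub-blocks here ...
  std::cout << "run all blocks spent (ms) " << GetTimestamp() - ts << "\n";
  return 0;
}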
paddle/fluid/operators/detail/test_serde.cc

@@ -40,14 +40,14 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   // serialize var to ByteBuffer
   framework::Variable var;
   auto* slr = var.GetMutable<framework::SelectedRows>();
+  slr->set_height(1000);
   auto* tensor = slr->mutable_value();
   auto* rows = slr->mutable_rows();
-  tensor->Resize(framework::make_ddim({2, 10}));
+  tensor->Resize(framework::make_ddim({564, 128}));
   tensor->mutable_data<float>(place);
-  int tensor_numel = 2 * 10;
+  int tensor_numel = 564 * 128;
   math::set_constant(ctx, tensor, 32.7);
-  rows->push_back(3);
-  rows->push_back(10);
+  for (int i = 0; i < 564; ++i) rows->push_back(i);
 
   ::grpc::ByteBuffer msg;
   operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);

@@ -64,6 +64,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   sendrecv::VariableMessage varmsg;
   EXPECT_TRUE(varmsg.ParseFromString(tmp));
+  // deserialize bytebuffer
   EXPECT_EQ(varmsg.varname(), "myvar");
   EXPECT_EQ(varmsg.type(), 1);

@@ -74,8 +75,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   for (int i = 0; i < tensor_numel; ++i) {
     EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
   }
-  EXPECT_EQ(rows_data[0], 3);
-  EXPECT_EQ(rows_data[1], 10);
+  for (int i = 0; i < 564; ++i) {
+    EXPECT_EQ(rows_data[i], i);
+  }
 
   // deserialize zero-copy
   // framework::Variable var2;
   // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);

@@ -104,8 +107,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   for (int i = 0; i < tensor_numel; ++i) {
     EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
   }
-  EXPECT_EQ(rows_data2[0], 3);
-  EXPECT_EQ(rows_data2[1], 10);
+  for (int i = 0; i < rows2->size(); ++i) {
+    EXPECT_EQ(rows_data2[i], i);
+  }
+  EXPECT_EQ(slr2->height(), 1000);
 }
 
 void RunTestLodTensor(platform::Place place, int from_type = 0) {
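The test now mirrors the realistic shape of a sparse update: a logical height of 1000 but only 564 populated rows of width 128, serialized, parsed back, and checked, including the height that used to go missing. The invariant being exercised, spelled out (illustrative comments, not code from the patch):

// slr->height()          == 1000   // logical dim 0 of the full parameter
// slr->rows().size()     == 564    // indices of the rows actually present
// slr->value().dims()[0] == 564    // one dense row of width 128 per index
// So height >= rows().size(), and only the 564 row indices plus the 564 x 128
// block go on the wire; EXPECT_EQ(slr2->height(), 1000) after deserialization
// is exactly the property this commit restores.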
paddle/fluid/operators/detail/variable_response.cc

@@ -147,8 +147,13 @@ bool VariableResponse::CopySelectRowsTensorData(
     const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
   auto var = scope_->FindVar(meta_.varname());
   auto* slr = var->GetMutable<framework::SelectedRows>();
+  slr->set_height(meta_.slr_height());
   auto* tensor = slr->mutable_value();
   tensor->Resize(dims);
+  PADDLE_ENFORCE_EQ(
+      tensor->numel(),
+      length / framework::SizeOfType(
+                   paddle::operators::detail::ToTypeIndex(meta_.data_type())));
   void* tensor_data = tensor->mutable_data(
       ctx.GetPlace(),
       paddle::operators::detail::ToTypeIndex(meta_.data_type()));

@@ -165,7 +170,8 @@ bool VariableResponse::CopySelectRowsData(
     const platform::DeviceContext& ctx, int length) {
   auto var = scope_->FindVar(meta_.varname());
   auto* slr = var->GetMutable<framework::SelectedRows>();
-  slr->mutable_rows()->resize(length / 8);  // int64
+  slr->mutable_rows()->resize(
+      length / framework::SizeOfType(typeid(int64_t)));  // int64
   int64_t* rows_data = slr->mutable_rows()->data();
 
   // copy rows CPU data, GPU data will be copied lazily.

@@ -348,6 +354,14 @@ int VariableResponse::Parse(Source* source) {
         }
         break;
       }
+      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_slr_height(static_cast<int64_t>(v));
+        break;
+      }
       case sendrecv::VariableMessage::kSerializedFieldNumber: {
         PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
                         meta_.type() == sendrecv::LOD_TENSOR) &&
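The new case in VariableResponse::Parse handles slr_height the way the other scalar fields are handled: confirm the wire type is varint, read the value with ReadVarint64, store it via set_slr_height, and bail out with the tag otherwise. For orientation, the wire-format key it matches (standard protobuf encoding, shown as a check rather than patch code):

// A varint field's key byte is (field_number << 3) | wire_type, so for
// slr_height (field 7, WIRETYPE_VARINT == 0) the stream contains 0x38 followed
// by the height encoded as a base-128 varint.
static_assert(((7 << 3) | 0) == 0x38, "wire-format key for slr_height");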
paddle/fluid/operators/listen_and_serv_op.cc

@@ -141,6 +141,7 @@ class ListenAndServOp : public framework::OperatorBase {
       // and this will still work.
 
       std::vector<std::future<void>> fs;
+      double ts = detail::GetTimestamp();
       // block0 contains only listen_and_serv op, start run from block1.
       for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
         fs.push_back(

@@ -162,6 +163,7 @@ class ListenAndServOp : public framework::OperatorBase {
           LOG(ERROR) << "run sub program error " << e.what();
         }
       }
+      VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts;
 
       // Reset the received sparse variables, the sum operator would not
       // sum the input sparse variables which rows is empty at the next
paddle/fluid/operators/send_op.cc

@@ -72,7 +72,7 @@ class SendOp : public framework::OperatorBase {
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
-        VLOG(2) << "sending " << ins[i] << " to " << epmap[i];
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
         rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];

@@ -81,7 +81,7 @@ class SendOp : public framework::OperatorBase {
     PADDLE_ENFORCE(rpc_client->Wait());
 
     for (auto& ep : endpoints) {
-      VLOG(2) << "batch barrier, ep: " << ep;
+      VLOG(3) << "batch barrier, ep: " << ep;
       rpc_client->AsyncSendBatchBarrier(ep);
     }
     PADDLE_ENFORCE(rpc_client->Wait());
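The send_op.cc change only demotes two log lines from VLOG(2) to VLOG(3). With glog, VLOG(n) output appears only when the verbose level is at least n (for example --v=3 or GLOG_v=3), so the per-variable "sending" and per-endpoint "batch barrier" messages stop appearing at the commonly used level 2. Illustrative fragment (glog's VLOG macro, not patch code):

// Illustration only: the runtime verbose level gates these lines.
VLOG(2) << "still emitted when the verbose level is >= 2";
VLOG(3) << "emitted only when the verbose level is >= 3";  // new level for the send logs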