Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
45af8c1e
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
45af8c1e
编写于
3月 09, 2018
作者:
武
武毅
提交者:
gongweibao
3月 09, 2018
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Performance/zero copy variable seriralization (#8839)
上级
12fc76e1
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
786 addition
and
4 deletion
+786
-4
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+0
-1
paddle/fluid/operators/detail/CMakeLists.txt
paddle/fluid/operators/detail/CMakeLists.txt
+4
-1
paddle/fluid/operators/detail/bytebuffer_stream.cc
paddle/fluid/operators/detail/bytebuffer_stream.cc
+88
-0
paddle/fluid/operators/detail/bytebuffer_stream.h
paddle/fluid/operators/detail/bytebuffer_stream.h
+51
-0
paddle/fluid/operators/detail/proto_encoder_helper.h
paddle/fluid/operators/detail/proto_encoder_helper.h
+147
-0
paddle/fluid/operators/detail/send_recv.proto
paddle/fluid/operators/detail/send_recv.proto
+25
-1
paddle/fluid/operators/detail/sendrecvop_utils.cc
paddle/fluid/operators/detail/sendrecvop_utils.cc
+242
-1
paddle/fluid/operators/detail/sendrecvop_utils.h
paddle/fluid/operators/detail/sendrecvop_utils.h
+34
-0
paddle/fluid/operators/detail/test_serde.cc
paddle/fluid/operators/detail/test_serde.cc
+195
-0
未找到文件。
paddle/fluid/framework/tensor_util.cc
浏览文件 @
45af8c1e
...
...
@@ -187,7 +187,6 @@ bool TensorContainsInf(const framework::Tensor& tensor) {
void
TensorToStream
(
std
::
ostream
&
os
,
const
Tensor
&
tensor
,
const
platform
::
DeviceContext
&
dev_ctx
)
{
// TODO(typhoonzero): serialize to ostream
{
// the 1st field, uint32_t version
constexpr
uint32_t
version
=
0
;
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
version
),
sizeof
(
version
));
...
...
paddle/fluid/operators/detail/CMakeLists.txt
浏览文件 @
45af8c1e
if
(
WITH_DISTRIBUTE
)
grpc_library
(
sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows
)
grpc_library
(
sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set_source_files_properties
(
test_serde.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
cc_test
(
serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc
)
endif
()
paddle/fluid/operators/detail/bytebuffer_stream.cc
0 → 100644
浏览文件 @
45af8c1e
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#include "bytebuffer_stream.h"
namespace
paddle
{
namespace
operators
{
namespace
detail
{
GrpcByteBufferSource
::
GrpcByteBufferSource
()
{}
bool
GrpcByteBufferSource
::
Init
(
const
grpc
::
ByteBuffer
&
src
)
{
cur_
=
-
1
;
left_
=
0
;
ptr_
=
nullptr
;
byte_count_
=
0
;
bool
ok
=
src
.
Dump
(
&
slices_
).
ok
();
if
(
!
ok
)
{
slices_
.
clear
();
}
return
ok
;
}
bool
GrpcByteBufferSource
::
Next
(
const
void
**
data
,
int
*
size
)
{
// Use loop instead of if in case buffer contained empty slices.
while
(
left_
==
0
)
{
// Advance to next slice.
cur_
++
;
if
(
cur_
>=
slices_
.
size
())
{
return
false
;
}
const
::
grpc
::
Slice
&
s
=
slices_
[
cur_
];
left_
=
s
.
size
();
ptr_
=
reinterpret_cast
<
const
char
*>
(
s
.
begin
());
}
*
data
=
ptr_
;
*
size
=
left_
;
byte_count_
+=
left_
;
ptr_
+=
left_
;
left_
=
0
;
return
true
;
}
void
GrpcByteBufferSource
::
BackUp
(
int
count
)
{
ptr_
-=
count
;
left_
+=
count
;
byte_count_
-=
count
;
}
bool
GrpcByteBufferSource
::
Skip
(
int
count
)
{
const
void
*
data
;
int
size
;
while
(
Next
(
&
data
,
&
size
))
{
if
(
size
>=
count
)
{
BackUp
(
size
-
count
);
return
true
;
}
// size < count;
count
-=
size
;
}
// error or we have too large count;
return
false
;
}
google
::
protobuf
::
int64
GrpcByteBufferSource
::
ByteCount
()
const
{
return
byte_count_
;
}
}
// namespace detail
}
// namespace operators
}
// namespace paddle
\ No newline at end of file
paddle/fluid/operators/detail/bytebuffer_stream.h
0 → 100644
浏览文件 @
45af8c1e
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#pragma once
#include <grpc++/grpc++.h>
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
namespace
paddle
{
namespace
operators
{
namespace
detail
{
// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
class
GrpcByteBufferSource
:
public
::
google
::
protobuf
::
io
::
ZeroCopyInputStream
{
public:
GrpcByteBufferSource
();
bool
Init
(
const
::
grpc
::
ByteBuffer
&
src
);
// Can be called multiple times.
bool
Next
(
const
void
**
data
,
int
*
size
)
override
;
void
BackUp
(
int
count
)
override
;
bool
Skip
(
int
count
)
override
;
::
google
::
protobuf
::
int64
ByteCount
()
const
override
;
private:
std
::
vector
<::
grpc
::
Slice
>
slices_
;
size_t
cur_
;
// Current slice index.
int
left_
;
// Number of bytes in slices_[cur_] left to yield.
const
char
*
ptr_
;
// Address of next byte in slices_[cur_] to yield.
::
google
::
protobuf
::
int64
byte_count_
;
};
}
// namespace detail
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/detail/proto_encoder_helper.h
0 → 100644
浏览文件 @
45af8c1e
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#pragma once
#include <grpc++/grpc++.h>
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
operators
{
namespace
detail
{
char
*
EncodeVarint32
(
char
*
dst
,
uint32_t
v
)
{
// Operate on characters as unsigneds
unsigned
char
*
ptr
=
reinterpret_cast
<
unsigned
char
*>
(
dst
);
static
const
int
B
=
128
;
if
(
v
<
(
1
<<
7
))
{
*
(
ptr
++
)
=
v
;
}
else
if
(
v
<
(
1
<<
14
))
{
*
(
ptr
++
)
=
v
|
B
;
*
(
ptr
++
)
=
v
>>
7
;
}
else
if
(
v
<
(
1
<<
21
))
{
*
(
ptr
++
)
=
v
|
B
;
*
(
ptr
++
)
=
(
v
>>
7
)
|
B
;
*
(
ptr
++
)
=
v
>>
14
;
}
else
if
(
v
<
(
1
<<
28
))
{
*
(
ptr
++
)
=
v
|
B
;
*
(
ptr
++
)
=
(
v
>>
7
)
|
B
;
*
(
ptr
++
)
=
(
v
>>
14
)
|
B
;
*
(
ptr
++
)
=
v
>>
21
;
}
else
{
*
(
ptr
++
)
=
v
|
B
;
*
(
ptr
++
)
=
(
v
>>
7
)
|
B
;
*
(
ptr
++
)
=
(
v
>>
14
)
|
B
;
*
(
ptr
++
)
=
(
v
>>
21
)
|
B
;
*
(
ptr
++
)
=
v
>>
28
;
}
return
reinterpret_cast
<
char
*>
(
ptr
);
}
char
*
EncodeVarint64
(
char
*
dst
,
uint64_t
v
)
{
static
const
int
B
=
128
;
unsigned
char
*
ptr
=
reinterpret_cast
<
unsigned
char
*>
(
dst
);
while
(
v
>=
B
)
{
*
(
ptr
++
)
=
(
v
&
(
B
-
1
))
|
B
;
v
>>=
7
;
}
*
(
ptr
++
)
=
static_cast
<
unsigned
char
>
(
v
);
return
reinterpret_cast
<
char
*>
(
ptr
);
}
int
VarintLength
(
uint64_t
v
)
{
int
len
=
1
;
while
(
v
>=
128
)
{
v
>>=
7
;
len
++
;
}
return
len
;
}
class
ProtoEncodeHelper
{
public:
ProtoEncodeHelper
(
char
*
buf
,
int
max_size
)
:
base_
(
buf
),
p_
(
buf
),
limit_
(
base_
+
max_size
)
{}
~
ProtoEncodeHelper
()
{
// Make sure callers didn't do operations that went over max_size promised
PADDLE_ENFORCE_LE
(
p_
,
limit_
);
}
const
char
*
data
()
const
{
return
base_
;
}
size_t
size
()
const
{
return
p_
-
base_
;
}
void
WriteUint64
(
int
tag
,
uint64_t
v
)
{
Encode32
(
combine
(
tag
,
WIRETYPE_VARINT
));
Encode64
(
v
);
}
void
WriteBool
(
int
tag
,
bool
v
)
{
Encode32
(
combine
(
tag
,
WIRETYPE_VARINT
));
EncodeBool
(
v
);
}
void
WriteString
(
int
tag
,
const
std
::
string
&
v
)
{
Encode32
(
combine
(
tag
,
WIRETYPE_LENGTH_DELIMITED
));
Encode32
(
v
.
size
());
EncodeBytes
(
v
.
data
(),
v
.
size
());
}
void
WriteVarlengthBeginning
(
int
tag
,
uint32_t
len
)
{
Encode32
(
combine
(
tag
,
WIRETYPE_LENGTH_DELIMITED
));
Encode32
(
len
);
}
void
WriteRawBytes
(
const
std
::
string
&
v
)
{
EncodeBytes
(
v
.
data
(),
v
.
size
());
}
private:
// Note: this module's behavior must match the protocol buffer wire encoding
// format.
enum
{
WIRETYPE_VARINT
=
0
,
WIRETYPE_LENGTH_DELIMITED
=
2
,
};
static
uint32_t
combine
(
uint32_t
tag
,
uint32_t
type
)
{
return
((
tag
<<
3
)
|
type
);
}
inline
void
Encode32
(
uint32_t
v
)
{
if
(
v
<
128
)
{
// Fast path for single-byte values. Many of the calls will use a
// constant value for v, so the comparison will get optimized away
// when Encode32 is inlined into the caller.
*
p_
=
v
;
p_
++
;
}
else
{
p_
=
EncodeVarint32
(
p_
,
v
);
}
}
void
Encode64
(
uint64_t
v
)
{
p_
=
EncodeVarint64
(
p_
,
v
);
}
void
EncodeBool
(
bool
v
)
{
*
p_
=
(
v
?
1
:
0
);
// Equal to varint32 encoding of 0 or 1
p_
++
;
}
void
EncodeBytes
(
const
char
*
bytes
,
int
N
)
{
memcpy
(
p_
,
bytes
,
N
);
p_
+=
N
;
}
char
*
base_
;
char
*
p_
;
char
*
limit_
;
// Just for CHECKs
};
}
// detail
}
// operators
}
// paddle
paddle/fluid/operators/detail/send_recv.proto
浏览文件 @
45af8c1e
...
...
@@ -33,10 +33,34 @@ enum VarType {
}
message
VariableMessage
{
enum
Type
{
// Pod Types
BOOL
=
0
;
INT16
=
1
;
INT32
=
2
;
INT64
=
3
;
FP16
=
4
;
FP32
=
5
;
FP64
=
6
;
}
message
LodData
{
repeated
int64
lod_data
=
1
;
}
string
varname
=
1
;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType
VarType
type
=
2
;
bytes
serialized
=
3
;
// bool persistable is not needed for sending.
// tensor info:
Type
data_type
=
3
;
repeated
int64
dims
=
4
;
// lod details:
int64
lod_level
=
5
;
repeated
LodData
lod
=
6
;
// tensor data
bytes
serialized
=
7
;
// selected_rows data
bytes
rows
=
8
;
}
message
VoidMessage
{}
paddle/fluid/operators/detail/sendrecvop_utils.cc
浏览文件 @
45af8c1e
...
...
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -63,6 +68,242 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
}
}
void
SerializeToByteBuffer
(
const
std
::
string
&
name
,
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
::
grpc
::
ByteBuffer
*
msg
)
{
using
VarMsg
=
sendrecv
::
VariableMessage
;
sendrecv
::
VariableMessage
request
;
std
::
string
header
;
request
.
AppendToString
(
&
header
);
// When using GPU, need to free the copied CPU buffer
// when the ByteBuffer destroies
// TODO(typhoonzero): add unref here, if we have dependent
// parallelism execution, need to know when to free the tensor.
DestroyCallback
destroy_callback
=
[](
void
*
backing
)
{};
void
*
buf
=
malloc
(
1024
);
void
*
payload
;
size_t
payload_size
;
ProtoEncodeHelper
e
((
char
*
)
buf
,
1024
);
e
.
WriteString
(
VarMsg
::
kVarnameFieldNumber
,
name
);
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
e
.
WriteUint64
(
VarMsg
::
kTypeFieldNumber
,
0
);
}
else
if
(
var
->
IsType
<
framework
::
SelectedRows
>
())
{
e
.
WriteUint64
(
VarMsg
::
kTypeFieldNumber
,
1
);
}
switch
(
framework
::
ToVarType
(
var
->
Type
()))
{
case
framework
::
proto
::
VarType_Type_LOD_TENSOR
:
{
auto
tensor
=
var
->
Get
<
framework
::
LoDTensor
>
();
e
.
WriteUint64
(
VarMsg
::
kDataTypeFieldNumber
,
framework
::
ToDataType
(
tensor
.
type
()));
for
(
auto
&
dim
:
framework
::
vectorize
(
tensor
.
dims
()))
{
e
.
WriteUint64
(
VarMsg
::
kDimsFieldNumber
,
dim
);
}
auto
lod
=
tensor
.
lod
();
// std::vector<Vector<size_t>>
if
(
lod
.
size
()
>
0
)
{
e
.
WriteUint64
(
VarMsg
::
kLodLevelFieldNumber
,
lod
.
size
());
for
(
auto
&
each
:
lod
)
{
e
.
WriteVarlengthBeginning
(
VarMsg
::
kLodFieldNumber
,
2
+
// tag + varintlength of submessage
1
+
// kLodDataFieldNumber
each
.
size
());
// auto copied from GPU
for
(
auto
&
d
:
each
)
{
e
.
WriteUint64
(
VarMsg
::
LodData
::
kLodDataFieldNumber
,
d
);
}
}
}
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
tensor
.
place
()));
platform
::
CPUPlace
cpu
;
auto
&
gpu_dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
);
auto
copy_size
=
tensor
.
memory_size
();
payload
=
memory
::
Alloc
(
cpu
,
copy_size
);
memory
::
Copy
(
cpu
,
payload
,
boost
::
get
<
platform
::
CUDAPlace
>
(
tensor
.
place
()),
reinterpret_cast
<
const
void
*>
(
tensor
.
data
<
void
>
()),
copy_size
,
gpu_dev_ctx
.
stream
());
destroy_callback
=
[](
void
*
backing
)
{
std
::
cout
<<
"destroy payload"
<<
std
::
endl
;
platform
::
CPUPlace
cpu
;
memory
::
Free
(
cpu
,
backing
);
};
#endif
}
else
{
payload
=
tensor
.
data
<
void
>
();
}
payload_size
=
tensor
.
memory_size
();
std
::
string
tmp
(
reinterpret_cast
<
char
*>
(
payload
),
payload_size
);
for
(
int
i
=
0
;
i
<
tmp
.
size
();
++
i
)
{
printf
(
"%02X "
,
tmp
.
data
()[
i
]);
}
printf
(
"
\n
"
);
e
.
WriteVarlengthBeginning
(
VarMsg
::
kSerializedFieldNumber
,
payload_size
);
}
break
;
case
framework
::
proto
::
VarType_Type_SELECTED_ROWS
:
{
// TODO(typhoonzero): selectedrows implement should not use unique_ptr
auto
*
slr
=
var
->
GetMutable
<
framework
::
SelectedRows
>
();
e
.
WriteUint64
(
VarMsg
::
kDataTypeFieldNumber
,
framework
::
ToDataType
(
slr
->
value
().
type
()));
for
(
auto
&
dim
:
framework
::
vectorize
(
slr
->
value
().
dims
()))
{
e
.
WriteUint64
(
VarMsg
::
kDimsFieldNumber
,
dim
);
}
e
.
WriteUint64
(
VarMsg
::
kLodLevelFieldNumber
,
0
);
auto
*
tensor
=
slr
->
mutable_value
();
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef PADDLE_WITH_CUDA
platform
::
CPUPlace
cpu
;
auto
&
gpu_dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
);
auto
copy_size
=
tensor
->
memory_size
();
payload
=
memory
::
Alloc
(
cpu
,
copy_size
);
memory
::
Copy
(
cpu
,
payload
,
boost
::
get
<
platform
::
CUDAPlace
>
(
tensor
->
place
()),
reinterpret_cast
<
const
void
*>
(
tensor
->
data
<
void
>
()),
copy_size
,
gpu_dev_ctx
.
stream
());
ctx
.
Wait
();
float
*
ttt
=
reinterpret_cast
<
float
*>
(
payload
);
for
(
int
i
=
0
;
i
<
copy_size
/
4
;
i
++
)
{
std
::
cout
<<
"copied to cpu: "
<<
ttt
[
i
]
<<
std
::
endl
;
}
destroy_callback
=
[](
void
*
backing
)
{
std
::
cout
<<
"destroy..."
<<
std
::
endl
;
// platform::CPUPlace cpu;
// memory::Free(cpu, backing);
};
#endif
}
else
{
payload
=
slr
->
mutable_value
()
->
data
<
void
>
();
}
payload_size
=
tensor
->
memory_size
();
e
.
WriteVarlengthBeginning
(
VarMsg
::
kSerializedFieldNumber
,
payload_size
);
}
break
;
default:
PADDLE_THROW
(
"Serialize does not support type: %s"
,
typeid
(
var
->
Type
()).
name
());
break
;
}
// steal reference of tensor data
::
grpc
::
Slice
slices
[
4
];
// metadata, tensor, rows meta, rows
int
num_slices
=
2
;
// only SelectedRows have rows buffer
slices
[
0
]
=
::
grpc
::
Slice
(
e
.
size
());
memcpy
(
const_cast
<
uint8_t
*>
(
slices
[
0
].
begin
()),
e
.
data
(),
e
.
size
());
slices
[
1
]
=
::
grpc
::
Slice
(
grpc_slice_new_with_user_data
(
payload
,
payload_size
,
destroy_callback
,
static_cast
<
char
*>
(
payload
)),
::
grpc
::
Slice
::
STEAL_REF
);
if
(
framework
::
ToVarType
(
var
->
Type
())
==
framework
::
proto
::
VarType_Type_SELECTED_ROWS
)
{
auto
*
slr
=
var
->
GetMutable
<
framework
::
SelectedRows
>
();
ProtoEncodeHelper
e2
((
char
*
)
buf
,
128
);
// NOTE: rows is of type int64_t
size_t
rows_memory_size
=
slr
->
rows
().
capacity
()
*
framework
::
SizeOfType
(
typeid
(
int64_t
));
e2
.
WriteVarlengthBeginning
(
VarMsg
::
kRowsFieldNumber
,
rows_memory_size
);
slices
[
2
]
=
::
grpc
::
Slice
(
e2
.
size
());
memcpy
(
const_cast
<
uint8_t
*>
(
slices
[
2
].
begin
()),
e2
.
data
(),
e2
.
size
());
slices
[
3
]
=
::
grpc
::
Slice
(
grpc_slice_new_with_user_data
(
const_cast
<
void
*>
(
reinterpret_cast
<
const
void
*>
(
slr
->
rows
().
data
())),
rows_memory_size
,
[](
void
*
backing
)
{
// TODO(typhoonzero): add unref here, same as above.
},
const_cast
<
char
*>
(
reinterpret_cast
<
const
char
*>
(
slr
->
rows
().
data
()))),
::
grpc
::
Slice
::
STEAL_REF
);
num_slices
=
4
;
}
::
grpc
::
ByteBuffer
tmp
(
&
slices
[
0
],
num_slices
);
msg
->
Swap
(
&
tmp
);
}
void
DeserializeFromByteBuffer
(
const
::
grpc
::
ByteBuffer
&
msg
,
const
platform
::
DeviceContext
&
ctx
,
framework
::
Variable
*
var
)
{
sendrecv
::
VariableMessage
meta
;
GrpcByteBufferSource
source
;
source
.
Init
(
msg
);
::
google
::
protobuf
::
io
::
CodedInputStream
input
(
&
source
);
// do zerocopy parsing
PADDLE_ENFORCE
(
meta
.
ParseFromCodedStream
(
&
input
));
PADDLE_ENFORCE
(
input
.
ConsumedEntireMessage
());
// dims is needed by both tensor and selectedrows
std
::
vector
<
int
>
vecdims
;
for
(
auto
&
d
:
meta
.
dims
())
{
vecdims
.
push_back
(
d
);
}
framework
::
DDim
dims
=
framework
::
make_ddim
(
vecdims
);
if
(
meta
.
type
()
==
sendrecv
::
LOD_TENSOR
)
{
auto
*
tensor
=
var
->
GetMutable
<
framework
::
LoDTensor
>
();
tensor
->
Resize
(
dims
);
void
*
tensor_data
=
tensor
->
mutable_data
(
ctx
.
GetPlace
(),
paddle
::
operators
::
detail
::
ToTypeIndex
(
meta
.
data_type
()));
framework
::
LoD
lod
;
for
(
int
i
=
0
;
i
<
meta
.
lod_level
();
++
i
)
{
framework
::
Vector
<
size_t
>
v
;
for
(
int
j
=
0
;
j
<
meta
.
lod
(
i
).
lod_data_size
();
++
j
)
{
v
.
push_back
(
meta
.
lod
(
i
).
lod_data
(
j
));
}
lod
.
push_back
(
v
);
}
tensor
->
set_lod
(
lod
);
// How to avoid copying and use the message buffer directly?
// Maybe need to find a way to release all memory except tensor content.
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef PADDLE_WITH_CUDA
platform
::
CPUPlace
cpu
;
auto
&
gpu_dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
);
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
tensor
->
place
()),
tensor_data
,
cpu
,
reinterpret_cast
<
const
void
*>
(
meta
.
serialized
().
data
()),
meta
.
serialized
().
size
(),
gpu_dev_ctx
.
stream
());
#endif
}
else
{
memcpy
(
tensor_data
,
reinterpret_cast
<
const
void
*>
(
meta
.
serialized
().
data
()),
meta
.
serialized
().
size
());
}
}
else
if
(
meta
.
type
()
==
sendrecv
::
SELECTED_ROWS
)
{
auto
*
slr
=
var
->
GetMutable
<
framework
::
SelectedRows
>
();
auto
*
tensor
=
slr
->
mutable_value
();
int64_t
*
rows_data
=
slr
->
mutable_rows
()
->
data
();
tensor
->
Resize
(
dims
);
void
*
tensor_data
=
tensor
->
mutable_data
(
ctx
.
GetPlace
(),
paddle
::
operators
::
detail
::
ToTypeIndex
(
meta
.
data_type
()));
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef PADDLE_WITH_CUDA
platform
::
CPUPlace
cpu
;
auto
&
gpu_dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
);
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
tensor
->
place
()),
tensor_data
,
cpu
,
reinterpret_cast
<
const
void
*>
(
meta
.
serialized
().
data
()),
meta
.
serialized
().
size
(),
gpu_dev_ctx
.
stream
());
#endif
}
else
{
memcpy
(
tensor_data
,
reinterpret_cast
<
const
void
*>
(
meta
.
serialized
().
data
()),
meta
.
serialized
().
size
());
}
// copy rows CPU data, GPU data will be copied lazly
memcpy
(
rows_data
,
reinterpret_cast
<
const
void
*>
(
meta
.
rows
().
data
()),
meta
.
rows
().
size
());
}
}
}
// namespace detail
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
\ No newline at end of file
paddle/fluid/operators/detail/sendrecvop_utils.h
浏览文件 @
45af8c1e
...
...
@@ -33,6 +33,14 @@ namespace detail {
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
typedef
void
(
*
DestroyCallback
)(
void
*
);
inline
int64_t
GetTimestamp
()
{
return
std
::
chrono
::
duration_cast
<
std
::
chrono
::
milliseconds
>
(
std
::
chrono
::
system_clock
::
now
().
time_since_epoch
())
.
count
();
}
void
SerializeToMessage
(
const
std
::
string
&
name
,
const
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
sendrecv
::
VariableMessage
*
msg
);
...
...
@@ -40,6 +48,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var,
void
DeserializeFromMessage
(
const
sendrecv
::
VariableMessage
&
msg
,
const
platform
::
DeviceContext
&
ctx
,
framework
::
Variable
*
var
);
void
SerializeToByteBuffer
(
const
std
::
string
&
name
,
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
::
grpc
::
ByteBuffer
*
msg
);
void
DeserializeFromByteBuffer
(
const
::
grpc
::
ByteBuffer
&
msg
,
const
platform
::
DeviceContext
&
ctx
,
framework
::
Variable
*
var
);
inline
std
::
type_index
ToTypeIndex
(
sendrecv
::
VariableMessage
::
Type
type
)
{
switch
(
type
)
{
case
sendrecv
::
VariableMessage
::
FP32
:
return
typeid
(
float
);
// NOLINT
case
sendrecv
::
VariableMessage
::
FP64
:
return
typeid
(
double
);
// NOLINT
case
sendrecv
::
VariableMessage
::
INT32
:
return
typeid
(
int
);
// NOLINT
case
sendrecv
::
VariableMessage
::
INT64
:
return
typeid
(
int64_t
);
// NOLINT
case
sendrecv
::
VariableMessage
::
BOOL
:
return
typeid
(
bool
);
// NOLINT
default:
PADDLE_THROW
(
"Not support type %d"
,
type
);
}
}
}
// namespace detail
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/detail/test_serde.cc
0 → 100644
浏览文件 @
45af8c1e
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
namespace
framework
=
paddle
::
framework
;
namespace
platform
=
paddle
::
platform
;
namespace
operators
=
paddle
::
operators
;
namespace
math
=
paddle
::
operators
::
math
;
namespace
memory
=
paddle
::
memory
;
void
RunSerdeTestTensor
(
platform
::
Place
place
)
{
// serialize var to ByteBuffer
framework
::
Variable
var
;
auto
*
tensor
=
var
.
GetMutable
<
framework
::
LoDTensor
>
();
tensor
->
Resize
(
framework
::
make_ddim
({
4
,
8
,
4
,
2
}));
framework
::
LoD
lod
;
lod
.
push_back
(
framework
::
Vector
<
size_t
>
({
1
,
3
,
8
}));
tensor
->
set_lod
(
lod
);
int
tensor_numel
=
4
*
8
*
4
*
2
;
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
ctx
=
*
pool
.
Get
(
place
);
float
*
orig_tensor_data
=
tensor
->
mutable_data
<
float
>
(
place
);
math
::
set_constant
(
ctx
,
tensor
,
31.9
);
::
grpc
::
ByteBuffer
msg
;
operators
::
detail
::
SerializeToByteBuffer
(
"myvar"
,
&
var
,
ctx
,
&
msg
);
EXPECT_GT
(
msg
.
Length
(),
0
);
// deserialize
std
::
vector
<::
grpc
::
Slice
>
slices
;
(
void
)
msg
.
Dump
(
&
slices
);
std
::
string
tmp
;
for
(
const
auto
&
s
:
slices
)
{
tmp
.
append
(
reinterpret_cast
<
const
char
*>
(
s
.
begin
()),
s
.
size
());
}
sendrecv
::
VariableMessage
varmsg
;
EXPECT_TRUE
(
varmsg
.
ParseFromString
(
tmp
));
EXPECT_EQ
(
varmsg
.
varname
(),
"myvar"
);
EXPECT_EQ
(
varmsg
.
type
(),
0
);
EXPECT_EQ
(
varmsg
.
dims
()[
0
],
4
);
EXPECT_EQ
(
varmsg
.
dims
()[
1
],
8
);
EXPECT_EQ
(
varmsg
.
dims
()[
2
],
4
);
EXPECT_EQ
(
varmsg
.
dims
()[
3
],
2
);
EXPECT_EQ
(
varmsg
.
lod_level
(),
1
);
EXPECT_EQ
(
varmsg
.
lod
(
0
).
lod_data
(
0
),
1
);
EXPECT_EQ
(
varmsg
.
lod
(
0
).
lod_data
(
1
),
3
);
EXPECT_EQ
(
varmsg
.
lod
(
0
).
lod_data
(
2
),
8
);
const
float
*
tensor_data
=
reinterpret_cast
<
const
float
*>
(
varmsg
.
serialized
().
data
());
for
(
int
i
=
0
;
i
<
varmsg
.
serialized
().
size
();
++
i
)
{
printf
(
"%02X "
,
varmsg
.
serialized
().
data
()[
i
]);
}
printf
(
"
\n
"
);
for
(
int
i
=
0
;
i
<
tensor_numel
;
++
i
)
{
std
::
cout
<<
"#####tensor data: "
<<
tensor_data
[
i
]
<<
std
::
endl
;
EXPECT_EQ
(
tensor_data
[
i
],
orig_tensor_data
[
i
]);
std
::
cout
<<
"test end 1 "
<<
std
::
endl
;
}
std
::
cout
<<
"tensor data end "
<<
std
::
endl
;
// deserialize zero-copy
framework
::
Variable
var2
;
operators
::
detail
::
DeserializeFromByteBuffer
(
msg
,
ctx
,
&
var2
);
auto
tensor2
=
var2
.
Get
<
framework
::
LoDTensor
>
();
float
*
tensor_data2
=
nullptr
;
framework
::
Tensor
tmp_tensor
;
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
platform
::
CPUPlace
cpu
;
framework
::
TensorCopy
(
tensor2
,
cpu
,
&
tmp_tensor
);
tensor_data2
=
tmp_tensor
.
data
<
float
>
();
}
else
{
tensor_data2
=
const_cast
<
float
*>
(
tensor2
.
data
<
float
>
());
}
EXPECT_EQ
(
varmsg
.
lod_level
(),
1
);
EXPECT_EQ
(
varmsg
.
lod
(
0
).
lod_data
(
0
),
1
);
EXPECT_EQ
(
varmsg
.
lod
(
0
).
lod_data
(
1
),
3
);
EXPECT_EQ
(
varmsg
.
lod
(
0
).
lod_data
(
2
),
8
);
for
(
int
i
=
0
;
i
<
tensor_numel
;
++
i
)
EXPECT_EQ
(
tensor_data2
[
i
],
orig_tensor_data
[
i
]);
}
void
RunSerdeTestSelectedRows
(
platform
::
Place
place
)
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
ctx
=
*
pool
.
Get
(
place
);
// serialize var to ByteBuffer
framework
::
Variable
var
;
auto
*
slr
=
var
.
GetMutable
<
framework
::
SelectedRows
>
();
auto
*
tensor
=
slr
->
mutable_value
();
auto
*
rows
=
slr
->
mutable_rows
();
tensor
->
Resize
(
framework
::
make_ddim
({
2
,
10
}));
int
tensor_numel
=
2
*
10
;
float
*
orig_tensor_data
=
tensor
->
mutable_data
<
float
>
(
place
);
math
::
set_constant
(
ctx
,
tensor
,
32.7
);
rows
->
push_back
(
3
);
rows
->
push_back
(
10
);
::
grpc
::
ByteBuffer
msg
;
operators
::
detail
::
SerializeToByteBuffer
(
"myvar"
,
&
var
,
ctx
,
&
msg
);
EXPECT_GT
(
msg
.
Length
(),
0
);
// deserialize
std
::
vector
<::
grpc
::
Slice
>
slices
;
(
void
)
msg
.
Dump
(
&
slices
);
std
::
string
tmp
;
for
(
const
auto
&
s
:
slices
)
{
tmp
.
append
(
reinterpret_cast
<
const
char
*>
(
s
.
begin
()),
s
.
size
());
}
sendrecv
::
VariableMessage
varmsg
;
EXPECT_TRUE
(
varmsg
.
ParseFromString
(
tmp
));
EXPECT_EQ
(
varmsg
.
varname
(),
"myvar"
);
EXPECT_EQ
(
varmsg
.
type
(),
1
);
const
float
*
tensor_data
=
reinterpret_cast
<
const
float
*>
(
varmsg
.
serialized
().
data
());
const
int64_t
*
rows_data
=
reinterpret_cast
<
const
int64_t
*>
(
varmsg
.
rows
().
data
());
for
(
int
i
=
0
;
i
<
tensor_numel
;
++
i
)
{
EXPECT_EQ
(
tensor_data
[
i
],
orig_tensor_data
[
i
]);
}
EXPECT_EQ
(
rows_data
[
0
],
3
);
EXPECT_EQ
(
rows_data
[
1
],
10
);
// deserialize zero-copy
framework
::
Variable
var2
;
operators
::
detail
::
DeserializeFromByteBuffer
(
msg
,
ctx
,
&
var2
);
auto
*
slr2
=
var2
.
GetMutable
<
framework
::
SelectedRows
>
();
auto
*
tensor2
=
slr2
->
mutable_value
();
auto
*
rows2
=
slr2
->
mutable_rows
();
float
*
tensor_data2
=
nullptr
;
framework
::
Tensor
tmp_tensor
;
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
platform
::
CPUPlace
cpu
;
framework
::
TensorCopy
(
*
tensor2
,
cpu
,
&
tmp_tensor
);
tensor_data2
=
tmp_tensor
.
data
<
float
>
();
}
else
{
tensor_data2
=
const_cast
<
float
*>
(
tensor2
->
data
<
float
>
());
}
const
int64_t
*
rows_data2
=
rows2
->
data
();
for
(
int
i
=
0
;
i
<
tensor_numel
;
++
i
)
{
EXPECT_EQ
(
tensor_data2
[
i
],
orig_tensor_data
[
i
]);
}
EXPECT_EQ
(
rows_data2
[
0
],
3
);
EXPECT_EQ
(
rows_data2
[
1
],
10
);
}
// TEST(SelectedRows, CPU) {
// platform::CPUPlace place;
// RunSerdeTestSelectedRows(place);
// }
// TEST(SelectedRows, GPU) {
// platform::CUDAPlace place;
// RunSerdeTestSelectedRows(place);
// }
TEST
(
Tensor
,
CPU
)
{
platform
::
CPUPlace
place
;
RunSerdeTestTensor
(
place
);
}
TEST
(
Tensor
,
GPU
)
{
platform
::
CUDAPlace
place
;
RunSerdeTestTensor
(
place
);
}
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录