Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
8310ce60
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8310ce60
编写于
10月 25, 2018
作者:
Y
Yu Yang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix cluster memory
test=develop
上级
71c846ef
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
62 addition
and
34 deletion
+62
-34
.gitignore
.gitignore
+1
-0
paddle/fluid/framework/tensor.h
paddle/fluid/framework/tensor.h
+1
-0
paddle/fluid/operators/distributed/grpc_serde.cc
paddle/fluid/operators/distributed/grpc_serde.cc
+9
-12
paddle/fluid/operators/distributed/sendrecvop_utils.cc
paddle/fluid/operators/distributed/sendrecvop_utils.cc
+21
-10
paddle/fluid/operators/distributed/sendrecvop_utils.h
paddle/fluid/operators/distributed/sendrecvop_utils.h
+23
-6
paddle/fluid/operators/distributed/variable_response.cc
paddle/fluid/operators/distributed/variable_response.cc
+4
-4
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+3
-2
未找到文件。
.gitignore
浏览文件 @
8310ce60
...
...
@@ -29,3 +29,4 @@ third_party/
build_*
# clion workspace.
cmake-build-*
paddle/fluid/operators/distributed/send_recv.proto
paddle/fluid/framework/tensor.h
浏览文件 @
8310ce60
...
...
@@ -156,6 +156,7 @@ class Tensor {
void
clear
()
{
holder_
=
nullptr
;
}
const
std
::
shared_ptr
<
memory
::
Allocation
>&
Holder
()
const
{
return
holder_
;
}
size_t
offset
()
const
{
return
offset_
;
}
private:
/*! holds the memory block if allocated. */
...
...
paddle/fluid/operators/distributed/grpc_serde.cc
浏览文件 @
8310ce60
...
...
@@ -34,8 +34,7 @@ namespace distributed {
static
void
SerializeDestroyCallback
(
void
*
payload
)
{
if
(
payload
!=
nullptr
)
{
auto
*
shared_payload
=
reinterpret_cast
<
std
::
shared_ptr
<
memory
::
Allocation
>*>
(
payload
);
auto
*
shared_payload
=
reinterpret_cast
<
TensorPayload
*>
(
payload
);
delete
shared_payload
;
}
}
...
...
@@ -46,7 +45,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const
std
::
string
&
out_name
)
{
platform
::
RecordRPCEvent
record_event
(
"serial"
,
&
ctx
);
VarMsg
request
;
std
::
shared_ptr
<
memory
::
Allocation
>
*
payload
=
nullptr
;
TensorPayload
*
payload
=
nullptr
;
request
.
set_varname
(
name
);
// Note: normally the profiler is enabled in 1 trainer, hence only
...
...
@@ -65,12 +64,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
}
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
request
.
set_type
(
::
sendrecv
::
LOD_TENSOR
);
payload
=
new
std
::
shared_ptr
<
memory
::
Allocation
>
(
GetTensorPayload
(
var
,
ctx
,
&
request
));
payload
=
new
TensorPayload
(
GetTensorPayload
(
var
,
ctx
,
&
request
));
}
else
if
(
var
->
IsType
<
framework
::
SelectedRows
>
())
{
request
.
set_type
(
::
sendrecv
::
SELECTED_ROWS
);
payload
=
new
std
::
shared_ptr
<
memory
::
Allocation
>
(
GetSelectedRowsPayload
(
var
,
ctx
,
&
request
));
payload
=
new
TensorPayload
(
GetSelectedRowsPayload
(
var
,
ctx
,
&
request
));
#ifdef PADDLE_WITH_CUDA
}
else
if
(
var
->
IsType
<
ncclUniqueId
>
())
{
request
.
set_type
(
::
sendrecv
::
NCCL_ID
);
...
...
@@ -106,16 +103,16 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
PADDLE_ENFORCE_NOT_NULL
(
payload
);
e
.
WriteVarlengthBeginning
(
VarMsg
::
kSerializedFieldNumber
,
payload
->
get
()
->
size
());
payload
->
memory_
size
());
// steal reference of tensor data
::
grpc
::
Slice
slices
[
4
];
// metadata, tensor, rows meta, rows
int
num_slices
=
2
;
// only SelectedRows have rows buffer
slices
[
0
]
=
::
grpc
::
Slice
(
e
.
size
());
memcpy
(
const_cast
<
uint8_t
*>
(
slices
[
0
].
begin
()),
e
.
data
(),
e
.
size
());
slices
[
1
]
=
::
grpc
::
Slice
(
grpc_slice_new_with_user_data
(
payload
->
get
()
->
ptr
(),
payload
->
get
()
->
size
(),
SerializeDestroyCallback
,
payload
),
::
grpc
::
Slice
::
STEAL_REF
);
slices
[
1
]
=
::
grpc
::
Slice
(
grpc_slice_new_with_user_data
(
payload
->
ptr
(),
payload
->
memory_
size
(),
SerializeDestroyCallback
,
payload
),
::
grpc
::
Slice
::
STEAL_REF
);
if
(
var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
*
slr
=
var
->
GetMutable
<
framework
::
SelectedRows
>
();
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.cc
浏览文件 @
8310ce60
...
...
@@ -28,7 +28,7 @@ namespace distributed {
using
VarMsg
=
sendrecv
::
VariableMessage
;
static
std
::
shared_ptr
<
memory
::
Allocation
>
GetCommunicationAllocationFromTensor
(
static
TensorPayload
GetCommunicationAllocationFromTensor
(
const
platform
::
DeviceContext
&
ctx
,
const
framework
::
Tensor
&
tensor
)
{
if
(
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef PADDLE_WITH_CUDA
...
...
@@ -45,17 +45,17 @@ static std::shared_ptr<memory::Allocation> GetCommunicationAllocationFromTensor(
tensor
.
data
<
void
>
(),
copy_size
,
gpu_dev_ctx
.
stream
());
ctx
.
Wait
();
return
result
;
return
TensorPayload
(
result
)
;
#else
return
nullptr
;
// THIS SHOULD NOT HAPPENED.
PADDLE_THROW
(
"This situation should not be happened"
);
#endif
}
else
{
return
tensor
.
Holder
(
);
return
TensorPayload
(
tensor
);
}
}
std
::
shared_ptr
<
memory
::
Allocation
>
GetTensorPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
)
{
TensorPayload
GetTensorPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
)
{
auto
tensor
=
var
->
Get
<
framework
::
LoDTensor
>
();
// FIXME(wuyi): data types in send_recv.proto is copied from
// framework.proto
...
...
@@ -77,9 +77,9 @@ std::shared_ptr<memory::Allocation> GetTensorPayload(
return
GetCommunicationAllocationFromTensor
(
ctx
,
tensor
);
}
std
::
shared_ptr
<
memory
::
Allocation
>
GetSelectedRowsPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
)
{
TensorPayload
GetSelectedRowsPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
)
{
auto
*
slr
=
var
->
GetMutable
<
framework
::
SelectedRows
>
();
request
->
set_data_type
(
static_cast
<
VarMsg
::
Type
>
(
framework
::
ToDataType
(
slr
->
value
().
type
())));
...
...
@@ -94,6 +94,17 @@ std::shared_ptr<memory::Allocation> GetSelectedRowsPayload(
return
GetCommunicationAllocationFromTensor
(
ctx
,
*
tensor
);
}
TensorPayload
::
TensorPayload
(
std
::
shared_ptr
<
memory
::
Allocation
>
allocation
)
:
allocation_
(
allocation
),
offset_
(
0
),
memory_size_
(
allocation
->
size
())
{}
TensorPayload
::
TensorPayload
(
const
framework
::
Tensor
&
tensor
)
:
allocation_
(
tensor
.
Holder
()),
offset_
(
tensor
.
offset
()),
memory_size_
(
tensor
.
numel
()
*
framework
::
SizeOfType
(
tensor
.
type
()))
{}
void
*
TensorPayload
::
ptr
()
const
{
return
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
uintptr_t
>
(
allocation_
->
ptr
())
+
offset_
);
}
size_t
TensorPayload
::
memory_size
()
const
{
return
memory_size_
;
}
}
// namespace distributed
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/distributed/sendrecvop_utils.h
浏览文件 @
8310ce60
...
...
@@ -33,13 +33,30 @@ namespace distributed {
using
VarMsg
=
sendrecv
::
VariableMessage
;
std
::
shared_ptr
<
memory
::
Allocation
>
GetTensorPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
);
class
TensorPayload
final
{
public:
explicit
TensorPayload
(
const
framework
::
Tensor
&
tensor
);
explicit
TensorPayload
(
std
::
shared_ptr
<
memory
::
Allocation
>
allocation
);
std
::
shared_ptr
<
memory
::
Allocation
>
GetSelectedRowsPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
);
TensorPayload
(
const
TensorPayload
&
o
)
=
default
;
TensorPayload
&
operator
=
(
const
TensorPayload
&
o
)
=
default
;
void
*
ptr
()
const
;
size_t
memory_size
()
const
;
private:
std
::
shared_ptr
<
memory
::
Allocation
>
allocation_
;
size_t
offset_
;
size_t
memory_size_
;
};
TensorPayload
GetTensorPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
);
TensorPayload
GetSelectedRowsPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
);
inline
std
::
type_index
ToTypeIndex
(
sendrecv
::
VariableMessage
::
Type
type
)
{
switch
(
type
)
{
...
...
paddle/fluid/operators/distributed/variable_response.cc
浏览文件 @
8310ce60
...
...
@@ -112,11 +112,11 @@ bool VariableResponse::CopyLodTensorData(
void
*
tensor_data
=
tensor
->
mutable_data
(
ctx
.
GetPlace
(),
ToTypeIndex
(
meta_
.
data_type
()));
if
(
!
ReadRaw
(
input
,
ctx
,
tensor
->
place
(),
tensor_data
,
length
))
{
return
false
;
}
return
true
;
VLOG
(
6
)
<<
"Tensor.memory_size = "
<<
tensor
->
memory_size
()
<<
", Buffer Size = "
<<
length
;
PADDLE_ENFORCE_EQ
(
tensor
->
memory_size
(),
length
);
return
ReadRaw
(
input
,
ctx
,
tensor
->
place
(),
tensor_data
,
length
);
}
inline
framework
::
DDim
GetDims
(
...
...
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
浏览文件 @
8310ce60
...
...
@@ -42,11 +42,12 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
self
.
_sync_mode
=
False
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
#FIXME(typhoonzero): fix async tests later
def
notest_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'0'
,
'IS_SELF_CONTAINED_LR'
:
'1'
'IS_SELF_CONTAINED_LR'
:
'1'
,
}
self
.
check_with_place
(
"dist_simnet_bow.py"
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录