Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
8310ce60
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8310ce60
编写于
10月 25, 2018
作者:
Y
Yu Yang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix cluster memory
test=develop
上级
71c846ef
变更
7
显示空白变更内容
内联
并排
Showing
7 changed file
with
62 addition
and
34 deletion
+62
-34
.gitignore
.gitignore
+1
-0
paddle/fluid/framework/tensor.h
paddle/fluid/framework/tensor.h
+1
-0
paddle/fluid/operators/distributed/grpc_serde.cc
paddle/fluid/operators/distributed/grpc_serde.cc
+9
-12
paddle/fluid/operators/distributed/sendrecvop_utils.cc
paddle/fluid/operators/distributed/sendrecvop_utils.cc
+21
-10
paddle/fluid/operators/distributed/sendrecvop_utils.h
paddle/fluid/operators/distributed/sendrecvop_utils.h
+23
-6
paddle/fluid/operators/distributed/variable_response.cc
paddle/fluid/operators/distributed/variable_response.cc
+4
-4
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+3
-2
未找到文件。
.gitignore
浏览文件 @
8310ce60
...
...
@@ -29,3 +29,4 @@ third_party/
build_*
# clion workspace.
cmake-build-*
paddle/fluid/operators/distributed/send_recv.proto
paddle/fluid/framework/tensor.h
浏览文件 @
8310ce60
...
...
@@ -156,6 +156,7 @@ class Tensor {
void
clear
()
{
holder_
=
nullptr
;
}
const
std
::
shared_ptr
<
memory
::
Allocation
>&
Holder
()
const
{
return
holder_
;
}
size_t
offset
()
const
{
return
offset_
;
}
private:
/*! holds the memory block if allocated. */
...
...
paddle/fluid/operators/distributed/grpc_serde.cc
浏览文件 @
8310ce60
...
...
@@ -34,8 +34,7 @@ namespace distributed {
static
void
SerializeDestroyCallback
(
void
*
payload
)
{
if
(
payload
!=
nullptr
)
{
auto
*
shared_payload
=
reinterpret_cast
<
std
::
shared_ptr
<
memory
::
Allocation
>*>
(
payload
);
auto
*
shared_payload
=
reinterpret_cast
<
TensorPayload
*>
(
payload
);
delete
shared_payload
;
}
}
...
...
@@ -46,7 +45,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const
std
::
string
&
out_name
)
{
platform
::
RecordRPCEvent
record_event
(
"serial"
,
&
ctx
);
VarMsg
request
;
std
::
shared_ptr
<
memory
::
Allocation
>
*
payload
=
nullptr
;
TensorPayload
*
payload
=
nullptr
;
request
.
set_varname
(
name
);
// Note: normally the profiler is enabled in 1 trainer, hence only
...
...
@@ -65,12 +64,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
}
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
request
.
set_type
(
::
sendrecv
::
LOD_TENSOR
);
payload
=
new
std
::
shared_ptr
<
memory
::
Allocation
>
(
GetTensorPayload
(
var
,
ctx
,
&
request
));
payload
=
new
TensorPayload
(
GetTensorPayload
(
var
,
ctx
,
&
request
));
}
else
if
(
var
->
IsType
<
framework
::
SelectedRows
>
())
{
request
.
set_type
(
::
sendrecv
::
SELECTED_ROWS
);
payload
=
new
std
::
shared_ptr
<
memory
::
Allocation
>
(
GetSelectedRowsPayload
(
var
,
ctx
,
&
request
));
payload
=
new
TensorPayload
(
GetSelectedRowsPayload
(
var
,
ctx
,
&
request
));
#ifdef PADDLE_WITH_CUDA
}
else
if
(
var
->
IsType
<
ncclUniqueId
>
())
{
request
.
set_type
(
::
sendrecv
::
NCCL_ID
);
...
...
@@ -106,14 +103,14 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
PADDLE_ENFORCE_NOT_NULL
(
payload
);
e
.
WriteVarlengthBeginning
(
VarMsg
::
kSerializedFieldNumber
,
payload
->
get
()
->
size
());
payload
->
memory_
size
());
// steal reference of tensor data
::
grpc
::
Slice
slices
[
4
];
// metadata, tensor, rows meta, rows
int
num_slices
=
2
;
// only SelectedRows have rows buffer
slices
[
0
]
=
::
grpc
::
Slice
(
e
.
size
());
memcpy
(
const_cast
<
uint8_t
*>
(
slices
[
0
].
begin
()),
e
.
data
(),
e
.
size
());
slices
[
1
]
=
::
grpc
::
Slice
(
grpc_slice_new_with_user_data
(
payload
->
get
()
->
ptr
(),
payload
->
get
()
->
size
(),
slices
[
1
]
=
::
grpc
::
Slice
(
grpc_slice_new_with_user_data
(
payload
->
ptr
(),
payload
->
memory_
size
(),
SerializeDestroyCallback
,
payload
),
::
grpc
::
Slice
::
STEAL_REF
);
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.cc
浏览文件 @
8310ce60
...
...
@@ -28,7 +28,7 @@ namespace distributed {
using
VarMsg
=
sendrecv
::
VariableMessage
;
static
std
::
shared_ptr
<
memory
::
Allocation
>
GetCommunicationAllocationFromTensor
(
static
TensorPayload
GetCommunicationAllocationFromTensor
(
const
platform
::
DeviceContext
&
ctx
,
const
framework
::
Tensor
&
tensor
)
{
if
(
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef PADDLE_WITH_CUDA
...
...
@@ -45,16 +45,16 @@ static std::shared_ptr<memory::Allocation> GetCommunicationAllocationFromTensor(
tensor
.
data
<
void
>
(),
copy_size
,
gpu_dev_ctx
.
stream
());
ctx
.
Wait
();
return
result
;
return
TensorPayload
(
result
)
;
#else
return
nullptr
;
// THIS SHOULD NOT HAPPENED.
PADDLE_THROW
(
"This situation should not be happened"
);
#endif
}
else
{
return
tensor
.
Holder
(
);
return
TensorPayload
(
tensor
);
}
}
std
::
shared_ptr
<
memory
::
Allocation
>
GetTensorPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
TensorPayload
GetTensorPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
)
{
auto
tensor
=
var
->
Get
<
framework
::
LoDTensor
>
();
// FIXME(wuyi): data types in send_recv.proto is copied from
...
...
@@ -77,8 +77,8 @@ std::shared_ptr<memory::Allocation> GetTensorPayload(
return
GetCommunicationAllocationFromTensor
(
ctx
,
tensor
);
}
std
::
shared_ptr
<
memory
::
Allocation
>
GetSelectedRowsPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
TensorPayload
GetSelectedRowsPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
)
{
auto
*
slr
=
var
->
GetMutable
<
framework
::
SelectedRows
>
();
request
->
set_data_type
(
...
...
@@ -94,6 +94,17 @@ std::shared_ptr<memory::Allocation> GetSelectedRowsPayload(
return
GetCommunicationAllocationFromTensor
(
ctx
,
*
tensor
);
}
TensorPayload
::
TensorPayload
(
std
::
shared_ptr
<
memory
::
Allocation
>
allocation
)
:
allocation_
(
allocation
),
offset_
(
0
),
memory_size_
(
allocation
->
size
())
{}
TensorPayload
::
TensorPayload
(
const
framework
::
Tensor
&
tensor
)
:
allocation_
(
tensor
.
Holder
()),
offset_
(
tensor
.
offset
()),
memory_size_
(
tensor
.
numel
()
*
framework
::
SizeOfType
(
tensor
.
type
()))
{}
void
*
TensorPayload
::
ptr
()
const
{
return
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
uintptr_t
>
(
allocation_
->
ptr
())
+
offset_
);
}
size_t
TensorPayload
::
memory_size
()
const
{
return
memory_size_
;
}
}
// namespace distributed
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/distributed/sendrecvop_utils.h
浏览文件 @
8310ce60
...
...
@@ -33,12 +33,29 @@ namespace distributed {
using
VarMsg
=
sendrecv
::
VariableMessage
;
std
::
shared_ptr
<
memory
::
Allocation
>
GetTensorPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
class
TensorPayload
final
{
public:
explicit
TensorPayload
(
const
framework
::
Tensor
&
tensor
);
explicit
TensorPayload
(
std
::
shared_ptr
<
memory
::
Allocation
>
allocation
);
TensorPayload
(
const
TensorPayload
&
o
)
=
default
;
TensorPayload
&
operator
=
(
const
TensorPayload
&
o
)
=
default
;
void
*
ptr
()
const
;
size_t
memory_size
()
const
;
private:
std
::
shared_ptr
<
memory
::
Allocation
>
allocation_
;
size_t
offset_
;
size_t
memory_size_
;
};
TensorPayload
GetTensorPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
);
std
::
shared_ptr
<
memory
::
Allocation
>
GetSelectedRowsPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
TensorPayload
GetSelectedRowsPayload
(
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
VarMsg
*
request
);
inline
std
::
type_index
ToTypeIndex
(
sendrecv
::
VariableMessage
::
Type
type
)
{
...
...
paddle/fluid/operators/distributed/variable_response.cc
浏览文件 @
8310ce60
...
...
@@ -112,11 +112,11 @@ bool VariableResponse::CopyLodTensorData(
void
*
tensor_data
=
tensor
->
mutable_data
(
ctx
.
GetPlace
(),
ToTypeIndex
(
meta_
.
data_type
()));
if
(
!
ReadRaw
(
input
,
ctx
,
tensor
->
place
(),
tensor_data
,
length
))
{
return
false
;
}
return
true
;
VLOG
(
6
)
<<
"Tensor.memory_size = "
<<
tensor
->
memory_size
()
<<
", Buffer Size = "
<<
length
;
PADDLE_ENFORCE_EQ
(
tensor
->
memory_size
(),
length
);
return
ReadRaw
(
input
,
ctx
,
tensor
->
place
(),
tensor_data
,
length
);
}
inline
framework
::
DDim
GetDims
(
...
...
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
浏览文件 @
8310ce60
...
...
@@ -42,11 +42,12 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
self
.
_sync_mode
=
False
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
#FIXME(typhoonzero): fix async tests later
def
notest_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'0'
,
'IS_SELF_CONTAINED_LR'
:
'1'
'IS_SELF_CONTAINED_LR'
:
'1'
,
}
self
.
check_with_place
(
"dist_simnet_bow.py"
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录