机器未来 / Paddle (forked from PaddlePaddle/Paddle)
Commit 2a9c993e (unverified)
Authored Jan 20, 2022 by yaoxuefeng; committed by GitHub on Jan 20, 2022

mod communicator (#39064)

Parent: 6b0c57cf
Showing 4 changed files with 225 additions and 19 deletions (+225 -19):

  paddle/fluid/distributed/service/communicator.cc             +177   -0
  paddle/fluid/distributed/service/communicator.h               +13   -0
  paddle/fluid/operators/pscore/distributed_lookup_table_op.h   +12   -9
  paddle/fluid/operators/pscore/distributed_push_sparse_op.h    +23  -10
paddle/fluid/distributed/service/communicator.cc

@@ -30,6 +30,8 @@ namespace distributed {
 using framework::LoDTensor;
 using framework::SelectedRows;

+const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100;
+
 inline double GetCurrentUS() {
   struct timeval time;
   gettimeofday(&time, NULL);
...

@@ -576,6 +578,181 @@ void AsyncCommunicator::MainThread() {
   VLOG(1) << "communicator stopped, send thread exit";
 }

+void AsyncCommunicator::PullSparseToTensorSync(
+    const uint64_t table_id, int fea_dim, uint64_t padding_id,
+    platform::Place place, bool is_training,
+    std::vector<const LoDTensor*>* inputs, std::vector<LoDTensor*>* outputs) {
+  std::vector<uint64_t> fea_keys;
+  std::vector<float*> pull_result_ptr;
+  fea_keys.reserve(MAX_FEASIGN_NUM / 100);
+  pull_result_ptr.reserve(MAX_FEASIGN_NUM / 100);
+  std::vector<float> init_value(fea_dim, 0);
+  framework::LoDTensor* output = nullptr;
+  float* output_data = nullptr;
+  size_t output_index = -1;
+  size_t output_len = 0;
+  for (size_t index = 0; index < inputs->size(); ++index) {
+    const framework::LoDTensor* tensor = inputs->at(index);
+    const int64_t* ids = tensor->data<int64_t>();
+    size_t len = tensor->numel();
+    for (size_t i = 0; i < len; ++i, output_len += fea_dim) {
+      if (!output || output_len == size_t(output->numel())) {
+        ++output_index;
+        CHECK(output_index < outputs->size());  // NOLINT
+        output = outputs->at(output_index);
+        output->set_lod(tensor->lod());
+        output_data = output->mutable_data<float>(place);
+        output_len = 0;
+        CHECK(output->numel() % fea_dim == 0);  // NOLINT
+        CHECK(output_data != nullptr);          // NOLINT
+      }
+      uint64_t real_id = static_cast<uint64_t>(ids[i]);
+      if (real_id == padding_id) {
+        memcpy(output_data + output_len, init_value.data(),
+               sizeof(float) * fea_dim);
+        continue;
+      }
+      fea_keys.push_back(real_id);
+      pull_result_ptr.push_back(output_data + output_len);
+    }
+  }
+  auto status = _worker_ptr->pull_sparse(pull_result_ptr.data(), table_id,
+                                         fea_keys.data(), fea_keys.size(),
+                                         is_training);
+  status.wait();
+  auto ret = status.get();
+  if (ret != 0) {
+    LOG(ERROR) << "fleet pull sparse failed, status[" << ret << "]";
+    sleep(sleep_seconds_before_fail_exit_);
+  }
+}
+
+void AsyncCommunicator::PushSparseFromTensorAsync(
+    const uint64_t table_id, int fea_dim, uint64_t padding_id,
+    platform::Place place, std::vector<const framework::LoDTensor*>* inputs,
+    const framework::LoDTensor* shows, const framework::LoDTensor* clks,
+    std::vector<framework::LoDTensor*>* outputs) {
+  int batch_size = -1;
+  bool batch_size_consist = true;
+  for (auto* input : *inputs) {
+    int cur_batch_size =
+        input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
+    if (batch_size == -1) {
+      batch_size = cur_batch_size;
+    } else {
+      // CHECK(batch_size == cur_batch_size);  // NOLINT
+      batch_size_consist = false;
+      break;
+    }
+  }
+  CHECK(batch_size > 0);  // NOLINT
+
+  int show_size =
+      shows->lod().size() ? shows->lod()[0].size() - 1 : shows->dims()[0];
+  CHECK(show_size == batch_size || show_size == 1);
+  int clk_size =
+      clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0];
+  CHECK(clk_size == batch_size || clk_size == 1);
+
+  CHECK(outputs->size() == inputs->size());
+  std::vector<uint64_t> push_keys;
+  push_keys.reserve(MAX_FEASIGN_NUM / 100);
+  std::vector<std::vector<float>> push_values;
+  push_values.reserve(MAX_FEASIGN_NUM / 100);
+  size_t output_len = 0;
+  size_t input_idx = 0;
+
+  VLOG(2) << "fleet.cc::emb_dim: " << fea_dim;
+
+  // TODO(zhaocaibei123): check type of show/clk is int? float? uint64?
+  // const long int* show_tensor = shows->data<int64_t>();
+  // const long int* clk_tensor = clks->data<int64_t>();
+  const int64_t* show_tensor = shows->data<int64_t>();
+  const int64_t* clk_tensor = clks->data<int64_t>();
+
+  for (size_t index = 0; index < inputs->size(); ++index) {
+    framework::LoDTensor* g_tensor = outputs->at(index);
+    float* g = g_tensor->data<float>();
+    // no cvm
+    if (batch_size_consist) {  // TODO(zhaocaibei123): add config
+                               // scale_sparse_gradient_with_batch_size_
+      Eigen::Map<
+          Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+          g_mat(g, g_tensor->numel() / fea_dim, fea_dim);
+      g_mat.rightCols(fea_dim) *= batch_size;
+    }
+
+    const framework::LoDTensor* tensor = inputs->at(index);
+    const int64_t* ids = tensor->data<int64_t>();
+    size_t len = tensor->numel();
+    output_len = 0;
+
+    if (tensor->lod().size() > 0) {
+      for (size_t i = 0; i < tensor->lod()[0].size() - 1; ++i) {
+        for (int j = tensor->lod()[0][i]; j < tensor->lod()[0][i + 1];
+             ++j, output_len += fea_dim) {
+          uint64_t real_id = static_cast<uint64_t>(ids[j]);
+          if (real_id == padding_id) {
+            continue;
+          }
+          push_keys.emplace_back(real_id);
+          push_values.emplace_back(fea_dim + 3);
+          // slot show clk grad... consistent with CtrCommonPushValue defined
+          // in ctr_accessor.h
+          push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
+          push_values.back()[1] =
+              (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+          push_values.back()[2] =
+              (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
+
+          float* data = push_values.back().data() + 3;
+
+          memcpy(data, g + output_len, sizeof(float) * fea_dim);
+
+          ++input_idx;
+        }
+      }
+    } else {
+      for (size_t i = 0; i < len; ++i, output_len += fea_dim) {
+        uint64_t real_id = static_cast<uint64_t>(ids[i]);
+        if (real_id == padding_id) {
+          continue;
+        }
+        push_keys.emplace_back(real_id);
+        push_values.emplace_back(fea_dim + 3);
+        // slot show clk grad... consistent with CtrCommonPushValue defined
+        // in ctr_accessor.h
+        push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
+        push_values.back()[1] =
+            (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+        push_values.back()[2] =
+            (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
+
+        float* data = push_values.back().data() + 3;
+
+        memcpy(data, g + output_len, sizeof(float) * fea_dim);
+
+        ++input_idx;
+      }
+    }
+    CHECK(output_len == g_tensor->numel());
+  }
+
+  std::vector<float*> push_g_vec(input_idx, nullptr);
+
+  for (auto i = 0u; i < push_keys.size(); ++i) {
+    push_g_vec[i] = push_values.at(i).data();
+  }
+
+  PADDLE_ENFORCE_EQ(
+      this->Check(table_id), true,
+      platform::errors::InvalidArgument(
+          "can not find table: %s, please check your config", table_id));
+  auto status = _worker_ptr->push_sparse(table_id, push_keys.data(),
+                                         (const float**)push_g_vec.data(),
+                                         push_keys.size());
+}
+
 void HalfAsyncCommunicator::MainThread() {
   VLOG(3) << "HalfAsyncCommunicator MainThread start and wait";
...
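Each record pushed above is fea_dim + 3 floats laid out as [slot, show, click, grad[0], ..., grad[fea_dim-1]], consistent with the CtrCommonPushValue layout referenced in the code comment. The following standalone sketch (no Paddle dependencies; all values are hypothetical) isolates just that packing step:

#include <cstring>
#include <iostream>
#include <vector>

// Sketch of the push-value layout used above: each sparse key carries
// fea_dim + 3 floats: [slot, show, click, grad...]. Values are hypothetical.
int main() {
  const int fea_dim = 4;                               // embedding width
  std::vector<float> grad = {0.1f, 0.2f, 0.3f, 0.4f};  // gradient for one key

  std::vector<float> record(fea_dim + 3);
  record[0] = 2;     // slot id (the commit hard-codes 2; see the TODO)
  record[1] = 1.0f;  // show count for this example
  record[2] = 0.0f;  // click count for this example
  std::memcpy(record.data() + 3, grad.data(), sizeof(float) * fea_dim);

  for (float v : record) std::cout << v << ' ';  // 2 1 0 0.1 0.2 0.3 0.4
  std::cout << '\n';
}

Packing the three statistics ahead of the gradient gives the server-side accessor one contiguous float block per key, which is why push_g_vec below can simply collect raw data() pointers.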
paddle/fluid/distributed/service/communicator.h

@@ -453,6 +453,18 @@ class AsyncCommunicator : public Communicator {
   void PushDensePostProcessing();

+  void PullSparseToTensorSync(
+      const uint64_t table_id, int fea_dim, uint64_t padding_id,
+      platform::Place place, bool is_training,
+      std::vector<const framework::LoDTensor*>* inputs,  // NOLINT
+      std::vector<framework::LoDTensor*>* outputs);      // NOLINT
+
+  void PushSparseFromTensorAsync(
+      const uint64_t table_id, int fea_dim, uint64_t padding_id,
+      platform::Place place, std::vector<const framework::LoDTensor*>* inputs,
+      const framework::LoDTensor* shows, const framework::LoDTensor* clicks,
+      std::vector<framework::LoDTensor*>* outputs);
+
  protected:
   std::unordered_map<std::string,
                      std::shared_ptr<BlockingQueue<std::shared_ptr<Variable>>>>
...

@@ -467,6 +479,7 @@ class AsyncCommunicator : public Communicator {
   bool need_global_step_ = false;
   bool independent_recv_ = true;
   int parallel_task_nums_ = 0;
+  int32_t sleep_seconds_before_fail_exit_;

   std::unique_ptr<std::thread> main_thread_{nullptr};
   std::unique_ptr<std::thread> recv_thread_{nullptr};
...
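These two declarations are the new public entry points. The op kernels in the remaining two files reach them through the Communicator base-class singleton followed by a downcast. The stub below (not Paddle code; all types are minimal stand-ins) isolates that retrieval pattern:

#include <iostream>
#include <memory>

// Stub model of the pattern the kernels use: a base-class singleton whose
// concrete type is AsyncCommunicator.
struct Communicator {
  virtual ~Communicator() = default;
  static Communicator* GetInstance() { return instance_.get(); }
  static std::unique_ptr<Communicator> instance_;
};
std::unique_ptr<Communicator> Communicator::instance_;

struct AsyncCommunicator : Communicator {
  void PullSparseToTensorSync() { std::cout << "pull\n"; }  // stand-in
};

int main() {
  Communicator::instance_ = std::make_unique<AsyncCommunicator>();
  // The kernels use a C-style cast; static_cast behaves the same here and
  // is only safe because the runtime initializes the singleton as an
  // AsyncCommunicator.
  auto* comm = static_cast<AsyncCommunicator*>(Communicator::GetInstance());
  comm->PullSparseToTensorSync();
}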
paddle/fluid/operators/pscore/distributed_lookup_table_op.h

@@ -14,6 +14,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/distributed/fleet.h"
+#include "paddle/fluid/distributed/service/communicator.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
...

@@ -51,13 +52,15 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
     auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
     auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");

-    auto fleet = distributed::FleetWrapper::GetInstance();
+    // auto fleet = distributed::FleetWrapper::GetInstance();
+    auto* communicator = (distributed::AsyncCommunicator*)
+        distributed::Communicator::GetInstance();

     if (platform::is_cpu_place(context.GetPlace())) {
-      fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
-                                    static_cast<uint64_t>(padding_idx),
-                                    context.GetPlace(), !is_test, &inputs,
-                                    &outputs);
+      communicator->PullSparseToTensorSync(
+          static_cast<uint64_t>(table_id), emb_dim,
+          static_cast<uint64_t>(padding_idx), context.GetPlace(), !is_test,
+          &inputs, &outputs);
     } else {
       auto inputs_variable = context.MultiInputVar("Ids");
       auto outputs_variable = context.MultiOutputVar("Outputs");
...

@@ -93,10 +96,10 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
       }

       // use fleet->PullSparse
-      fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
-                                    static_cast<uint64_t>(padding_idx),
-                                    cpu_place, !is_test, &tmp_input_vec,
-                                    &tmp_output_vec);
+      communicator->PullSparseToTensorSync(
+          static_cast<uint64_t>(table_id), emb_dim,
+          static_cast<uint64_t>(padding_idx), cpu_place, !is_test,
+          &tmp_input_vec, &tmp_output_vec);

       // cp temp to origin
       for (size_t idx = 0; idx < output_var_size; ++idx) {
...
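Note the padding semantics this kernel inherits from PullSparseToTensorSync above: ids equal to padding_idx are never sent to the parameter server, and their output rows are zero-filled from init_value instead. A toy illustration of that rule, with hypothetical ids and dimensions:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const int fea_dim = 3;
  const uint64_t padding_id = 0;          // hypothetical padding id
  std::vector<int64_t> ids = {7, 0, 42};  // id 0 is padding here

  std::vector<float> out(ids.size() * fea_dim, -1.0f);  // poison fill
  std::vector<float> init_value(fea_dim, 0.0f);         // zeros for padding
  std::vector<uint64_t> keys_to_pull;

  for (size_t i = 0; i < ids.size(); ++i) {
    if (static_cast<uint64_t>(ids[i]) == padding_id) {
      // Padding rows get zeros and are skipped in the server request.
      std::memcpy(out.data() + i * fea_dim, init_value.data(),
                  sizeof(float) * fea_dim);
      continue;
    }
    keys_to_pull.push_back(static_cast<uint64_t>(ids[i]));
  }
  std::cout << "keys sent: " << keys_to_pull.size() << '\n';  // prints 2
}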
paddle/fluid/operators/pscore/distributed_push_sparse_op.h

@@ -14,6 +14,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/distributed/fleet.h"
+#include "paddle/fluid/distributed/service/communicator.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
...

@@ -32,20 +33,21 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> {
     auto table_id = context.Attr<int>("table_id");
     auto emb_dim = context.Attr<int>("size");
     VLOG(1) << "push_sparse.h::emb_dim: " << emb_dim;
     bool is_test = context.Attr<bool>("is_test");

     auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
     auto shows = context.Input<framework::LoDTensor>("Shows");
     auto clks = context.Input<framework::LoDTensor>("Clicks");
     auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");

-    auto fleet = distributed::FleetWrapper::GetInstance();
+    // auto fleet = distributed::FleetWrapper::GetInstance();
+    auto* communicator = (distributed::AsyncCommunicator*)
+        distributed::Communicator::GetInstance();

     if (platform::is_cpu_place(context.GetPlace())) {
-      fleet->PushSparseFromTensorAsync(static_cast<uint64_t>(table_id),
-                                       emb_dim,
-                                       static_cast<uint64_t>(padding_idx),
-                                       context.GetPlace(), &inputs, shows,
-                                       clks, &outputs);
+      communicator->PushSparseFromTensorAsync(
+          static_cast<uint64_t>(table_id), emb_dim,
+          static_cast<uint64_t>(padding_idx), context.GetPlace(), &inputs,
+          shows, clks, &outputs);
     } else {
       auto inputs_variable = context.MultiInputVar("Ids");
       auto outputs_variable = context.MultiOutputVar("Outputs");
...

@@ -71,6 +73,17 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> {
         tmp_input_vec.push_back(tmp_input_tensor);
       }

+      framework::Variable* tmp_shows_var = tmp_scope->Var("Shows");
+      framework::LoDTensor* tmp_shows_tensor =
+          tmp_shows_var->GetMutable<framework::LoDTensor>();
+      framework::Variable* tmp_clicks_var = tmp_scope->Var("Clicks");
+      framework::LoDTensor* tmp_clicks_tensor =
+          tmp_clicks_var->GetMutable<framework::LoDTensor>();
+      framework::TensorCopy(*shows, cpu_place, context.device_context(),
+                            tmp_shows_tensor);
+      framework::TensorCopy(*clks, cpu_place, context.device_context(),
+                            tmp_clicks_tensor);
+
       // create temp output
       for (size_t idx = 0; idx < output_var_size; ++idx) {
         framework::Variable* tmp_output_var = tmp_scope->Var(outputs_name[idx]);
...

@@ -81,10 +94,10 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> {
       }

       // use fleet->PullSparse
-      fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
-                                    static_cast<uint64_t>(padding_idx),
-                                    cpu_place, !is_test, &tmp_input_vec,
-                                    &tmp_output_vec);
+      communicator->PushSparseFromTensorAsync(
+          static_cast<uint64_t>(table_id), emb_dim,
+          static_cast<uint64_t>(padding_idx), context.GetPlace(),
+          &tmp_input_vec, tmp_shows_tensor, tmp_clicks_tensor,
+          &tmp_output_vec);

       // cp temp to origin
       for (size_t idx = 0; idx < output_var_size; ++idx) {
...
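Per the CHECKs in PushSparseFromTensorAsync earlier in this commit, the Shows and Clicks tensors this kernel forwards must hold either batch_size entries or exactly one, and batch size is derived from LoD offsets when a LoD is set. A standalone sketch of both rules, with hypothetical numbers:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Batch-size rule from PushSparseFromTensorAsync: with LoD, it is
// lod()[0].size() - 1; without LoD, it is the leading dimension dims()[0].
int batch_size_from_lod(const std::vector<size_t>& lod0, int dim0) {
  return !lod0.empty() ? static_cast<int>(lod0.size()) - 1 : dim0;
}

int main() {
  std::vector<size_t> lod0 = {0, 3, 5, 9};  // 3 sequences of lengths 3, 2, 4
  int batch_size = batch_size_from_lod(lod0, /*dim0=*/9);
  std::cout << "batch_size = " << batch_size << '\n';  // prints 3

  // Shows may have batch_size entries or a single entry. Indices past its
  // end fall back to 1, mirroring (i >= show_size ? 1 : show_tensor[i]).
  std::vector<int64_t> shows = {5};  // hypothetical single-entry Shows
  int show_size = static_cast<int>(shows.size());
  assert(show_size == batch_size || show_size == 1);
  for (int i = 0; i < batch_size; ++i) {
    int64_t show = (i >= show_size) ? 1 : shows[i];
    std::cout << "example " << i << " show = " << show << '\n';  // 5, 1, 1
  }
}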