PaddlePaddle / Paddle
Commit b8d106e1 (unverified), authored Jul 20, 2022 by danleifeng, committed by GitHub on Jul 20, 2022
【GPUPS】Adam accessor (#43919)
* add adam/sharedadam optimizer for gpups; edit optimizer struct; test=develop
Parent: 1882ffd5
Showing 36 changed files with 2714 additions and 1282 deletions (+2714 −1282)
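The core change: for GPUPS sparse tables, standard Adam keeps a first and second moment per embedding dimension (2*n+2 floats of optimizer state for an n-dim embedding), while the new "shared" Adam variant keeps a single gsum/g2sum pair that is averaged across dimensions plus the two beta powers (4 floats in total). That is what SparseSharedAdamSGDRule in this commit implements. The following is a minimal standalone sketch of that update written only to make the state layout concrete; the function name SharedAdamUpdate and its parameters are illustrative, not Paddle API.

  #include <cmath>
  // State layout: sgd[0]=gsum, sgd[1]=g2sum, sgd[2]=beta1_pow, sgd[3]=beta2_pow
  // (4 floats regardless of n, versus 2*n+2 for the per-dimension Adam rule).
  void SharedAdamUpdate(float* w, float* sgd, const float* g, int n,
                        float lr, float beta1, float beta2, float eps) {
    float gsum = sgd[0], g2sum = sgd[1];
    float beta1_pow = sgd[2], beta2_pow = sgd[3];
    lr *= std::sqrt(1 - beta2_pow) / (1 - beta1_pow);  // bias correction
    double sum_gsum = 0.0, sum_g2sum = 0.0;
    for (int i = 0; i < n; ++i) {
      double new_gsum = beta1 * gsum + (1 - beta1) * g[i];
      double new_g2sum = beta2 * g2sum + (1 - beta2) * g[i] * g[i];
      w[i] -= lr * (new_gsum / (std::sqrt(new_g2sum) + eps));
      sum_gsum += new_gsum;
      sum_g2sum += new_g2sum;
    }
    sgd[0] = sum_gsum / n;   // the moments are shared: store the mean over dims
    sgd[1] = sum_g2sum / n;
    sgd[2] *= beta1;
    sgd[3] *= beta2;
  }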
paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc  +13 −15
paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h  +17 −2
paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc  +83 −1
paddle/fluid/distributed/ps/table/sparse_sgd_rule.h  +23 −0
paddle/fluid/distributed/ps/table/table.cc  +1 −0
paddle/fluid/distributed/ps/wrapper/CMakeLists.txt  +1 −0
paddle/fluid/distributed/ps/wrapper/fleet.cc  +26 −19
paddle/fluid/framework/distributed_strategy.proto  +3 −3
paddle/fluid/framework/fleet/CMakeLists.txt  +11 −4
paddle/fluid/framework/fleet/heter_context.h  +0 −2
paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt  +4 −4
paddle/fluid/framework/fleet/heter_ps/feature_value.cu  +192 −0
paddle/fluid/framework/fleet/heter_ps/feature_value.h  +705 −0
paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h  +4 −2
paddle/fluid/framework/fleet/heter_ps/hashtable.h  +8 −4
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu  +74 −88
paddle/fluid/framework/fleet/heter_ps/heter_comm.h  +18 −9
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h  +248 −185
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu  +101 −72
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h  +23 −29
paddle/fluid/framework/fleet/heter_ps/heter_ps.cc  +26 −17
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu  +102 −43
paddle/fluid/framework/fleet/heter_ps/heter_ps.h  +14 −11
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h  +10 −10
paddle/fluid/framework/fleet/heter_ps/mem_pool.h  +0 −14
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h  +399 −73
paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h  +26 −2
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc  +99 −211
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu  +27 −299
paddle/fluid/framework/fleet/ps_gpu_wrapper.h  +154 −57
paddle/fluid/framework/fleet/ps_gpu_wrapper.kps  +75 −104
python/paddle/distributed/fleet/base/distributed_strategy.py  +15 −0
python/paddle/distributed/ps/the_one_ps.py  +1 −1
python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py  +201 −0
python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py  +8 −0
tools/parallel_UT_rule.py  +2 −1
paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc (view file @ b8d106e1)

@@ -31,6 +31,7 @@ int CtrDymfAccessor::Initialize() {
   _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name);
   _embedx_sgd_rule->LoadConfig(_config.embedx_sgd_param(), _config.embedx_dim());
+  common_feature_value.optimizer_name = name;
   common_feature_value.embed_sgd_dim = _embed_sgd_rule->Dim();
   common_feature_value.embedx_dim = _config.embedx_dim();

@@ -42,7 +43,10 @@ int CtrDymfAccessor::Initialize() {
   if (_config.ctr_accessor_param().show_scale()) {
     _show_scale = true;
   }
-  VLOG(0) << " INTO CtrDymfAccessor::Initialize()";
+  VLOG(0) << " INTO CtrDymfAccessor::Initialize(); embed_sgd_dim:"
+          << common_feature_value.embed_sgd_dim
+          << " embedx_dim:" << common_feature_value.embedx_dim
+          << " embedx_sgd_dim:" << common_feature_value.embedx_sgd_dim;
   InitAccessorInfo();
   return 0;
 }

@@ -53,9 +57,9 @@ void CtrDymfAccessor::InitAccessorInfo() {
   auto embedx_dim = _config.embedx_dim();
   VLOG(0) << "InitAccessorInfo embedx_dim:" << embedx_dim;
-  _accessor_info.select_dim = 3 + embedx_dim;
+  _accessor_info.select_dim = 4 + embedx_dim;
   _accessor_info.select_size = _accessor_info.select_dim * sizeof(float);
-  _accessor_info.update_dim = 4 + embedx_dim;
+  _accessor_info.update_dim = 5 + embedx_dim;
   _accessor_info.update_size = _accessor_info.update_dim * sizeof(float);
   _accessor_info.mf_size =
       (embedx_dim + common_feature_value.embedx_sgd_dim) * sizeof(float);

@@ -179,8 +183,10 @@ int32_t CtrDymfAccessor::Create(float** values, size_t num) {
   value[common_feature_value.ClickIndex()] = 0;
   value[common_feature_value.SlotIndex()] = -1;
   value[common_feature_value.MfDimIndex()] = -1;
-  _embed_sgd_rule->InitValue(value + common_feature_value.EmbedWIndex(),
-                             value + common_feature_value.EmbedG2SumIndex());
+  _embed_sgd_rule->InitValue(value + common_feature_value.EmbedWIndex(),
+                             value + common_feature_value.EmbedG2SumIndex(),
+                             false);
   // adam embed init not zero, adagrad embed init zero
   _embedx_sgd_rule->InitValue(value + common_feature_value.EmbedxWIndex(),
                               value + common_feature_value.EmbedxG2SumIndex(),
                               false);

@@ -293,22 +299,14 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) {
        i++) {
     os << " " << v[i];
   }
   // os << " " << common_feature_value.Slot(const_cast<float*>(v)) << " "
   //    << common_feature_value.MfDim(const_cast<float*>(v));
   auto show = common_feature_value.Show(const_cast<float*>(v));
   auto click = common_feature_value.Click(const_cast<float*>(v));
   auto score = ShowClickScore(show, click);
+  auto mf_dim = int(common_feature_value.MfDim(const_cast<float*>(v)));
   if (score >= _config.embedx_threshold() &&
       param > common_feature_value.EmbedxG2SumIndex()) {
     // VLOG(1) << "common_feature_value.EmbedxG2SumIndex():"
     //         << common_feature_value.EmbedxG2SumIndex();
     // VLOG(1) << "common_feature_value.EmbedxWIndex():"
     //         << common_feature_value.EmbedxWIndex();
     // VLOG(1) << "common_feature_value.MfDim():"
     //         << common_feature_value.MfDim(const_cast<float*>(v));
     for (auto i = common_feature_value.EmbedxG2SumIndex();
-         i < common_feature_value.EmbedxWIndex() +
-                 common_feature_value.MfDim(const_cast<float*>(v));
+         i < common_feature_value.Dim(mf_dim);
          ++i) {
       os << " " << v[i];
     }
paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h (view file @ b8d106e1)

@@ -54,10 +54,24 @@ class CtrDymfAccessor : public ValueAccessor {
     int ClickIndex() { return ShowIndex() + 1; }
     int EmbedWIndex() { return ClickIndex() + 1; }
     int EmbedG2SumIndex() { return EmbedWIndex() + 1; }
-    int SlotIndex() { return EmbedG2SumIndex() + 1; }
+    int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; }
     int MfDimIndex() { return SlotIndex() + 1; }
     int EmbedxG2SumIndex() { return MfDimIndex() + 1; }
-    int EmbedxWIndex() { return EmbedxG2SumIndex() + 1; }
+    int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; }
+
+    // total length computed from mf_dim
+    int Dim(int& mf_dim) {
+      int tmp_embedx_sgd_dim = 1;
+      if (optimizer_name == "SparseAdamSGDRule") {  // adam
+        tmp_embedx_sgd_dim = mf_dim * 2 + 2;
+      } else if (optimizer_name == "SparseSharedAdamSGDRule") {  // shared_adam
+        tmp_embedx_sgd_dim = 4;
+      }
+      return 7 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim;
+    }
+
+    // total number of bytes computed from mf_dim
+    int Size(int& mf_dim) { return (Dim(mf_dim)) * sizeof(float); }
+
     float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; }
     float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; }

@@ -73,6 +87,7 @@ class CtrDymfAccessor : public ValueAccessor {
     int embed_sgd_dim;
     int embedx_dim;
     int embedx_sgd_dim;
+    std::string optimizer_name;
   };

   struct CtrDymfPushValue {
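To make the Dim()/Size() helpers added above concrete, a worked example (the numbers follow directly from the code; this is not part of the diff and assumes the embed slot also uses an Adam-style rule, so embed_sgd_dim = 4): with mf_dim = 8, the standard Adam rule needs tmp_embedx_sgd_dim = 8*2+2 = 18 floats of embedx optimizer state, so Dim = 7 + 4 + 18 + 8 = 37 floats and Size = 37 * 4 = 148 bytes per feature; the shared-Adam rule needs only 4, so Dim = 7 + 4 + 4 + 8 = 23 floats and Size = 92 bytes per feature.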
paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc (view file @ b8d106e1)

@@ -213,7 +213,6 @@ void SparseAdamSGDRule::UpdateValueWork(float* w,
   float beta1_pow_ = *beta1_pow;
   float beta2_pow_ = *beta2_pow;
-  // lr not change in one update
   lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
   for (size_t i = 0; i < _embedding_dim; i++) {
     // Calculation

@@ -252,5 +251,88 @@ void SparseAdamSGDRule::InitValueWork(float* value,
   *(sgd + Beta1PowIndex()) = _beta1_decay_rate;
   *(sgd + Beta2PowIndex()) = _beta2_decay_rate;
 }

void SparseSharedAdamSGDRule::LoadConfig(const SparseCommonSGDRuleParameter& param,
                                         size_t emb_dim) {
  _embedding_dim = emb_dim;
  auto adam_param = param.adam();
  learning_rate_ = adam_param.learning_rate();
  _initial_range = adam_param.initial_range();
  _beta1_decay_rate = adam_param.beta1_decay_rate();
  _beta2_decay_rate = adam_param.beta2_decay_rate();
  _ada_epsilon = adam_param.ada_epsilon();
  if (adam_param.weight_bounds_size() == 0) {
    _min_bound = -std::numeric_limits<float>::max();
    _max_bound = std::numeric_limits<float>::max();
  } else {
    CHECK(adam_param.weight_bounds_size() >= 2)
        << "invalid repeated size for weight_bounds:"
        << adam_param.weight_bounds_size();
    _min_bound = adam_param.weight_bounds(0);
    _max_bound = adam_param.weight_bounds(1);
  }
}

void SparseSharedAdamSGDRule::UpdateValueWork(float* w,
                                              float* sgd,
                                              const float* grad,
                                              float scale) {
  float* gsum = sgd + GSumIndex();
  float* g2sum = sgd + G2SumIndex();
  float* beta1_pow = sgd + Beta1PowIndex();
  float* beta2_pow = sgd + Beta2PowIndex();
  const float* g = grad;
  float lr = learning_rate_;
  float beta1_pow_ = *beta1_pow;
  float beta2_pow_ = *beta2_pow;
  float gsum_ = *gsum;
  float g2sum_ = *g2sum;
  lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
  double sum_gsum = 0.0;
  double sum_g2sum = 0.0;
  for (int i = 0; i < _embedding_dim; i++) {
    // Calculation
    double new_gsum =
        _beta1_decay_rate * gsum_ + (1 - _beta1_decay_rate) * g[i];
    double new_g2sum =
        _beta2_decay_rate * g2sum_ + (1 - _beta2_decay_rate) * g[i] * g[i];
    w[i] = w[i] - lr * (new_gsum / (sqrt(new_g2sum) + _ada_epsilon));
    BoundValue(w[i]);
    sum_gsum += new_gsum;
    sum_g2sum += new_g2sum;
  }
  // update beta_pow_decay
  (*gsum) = sum_gsum / _embedding_dim;
  (*g2sum) = sum_g2sum / _embedding_dim;
  (*beta1_pow) *= _beta1_decay_rate;
  (*beta2_pow) *= _beta2_decay_rate;
}

void SparseSharedAdamSGDRule::InitValueWork(float* value, float* sgd, bool zero_init) {
  for (int i = 0; i < _embedding_dim; ++i) {
    if (zero_init) {
      value[i] = 0.0;
      BoundValue(value[i]);
    } else {
      value[i] =
          (local_uniform_real_distribution<double>()(local_random_engine()) * 2 -
           1) *
          _initial_range;
      BoundValue(value[i]);
    }
  }
  // init rule gsum and g2sum
  for (int i = GSumIndex(); i < Beta1PowIndex(); i++) {
    sgd[i] = 0.0;
  }
  // init beta1_pow and beta2_pow
  *(sgd + Beta1PowIndex()) = _beta1_decay_rate;
  *(sgd + Beta2PowIndex()) = _beta2_decay_rate;
}
}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/ps/table/sparse_sgd_rule.h (view file @ b8d106e1)

@@ -144,5 +144,28 @@ class SparseAdamSGDRule : public SparseValueSGDRule {
   float _beta2_decay_rate;
   float _ada_epsilon;
 };

class SparseSharedAdamSGDRule : public SparseValueSGDRule {
 public:
  virtual void LoadConfig(const SparseCommonSGDRuleParameter& param, size_t emb_dim);
  virtual void UpdateValueWork(float* w, float* sgd, const float* push_value, float scale);
  virtual void InitValueWork(float* value, float* sgd, bool zero_init);
  virtual size_t Dim() { return 4; }
  size_t GSumIndex() { return 0; }
  size_t G2SumIndex() { return GSumIndex() + 1; }
  size_t Beta1PowIndex() { return G2SumIndex() + 1; }
  size_t Beta2PowIndex() { return Beta1PowIndex() + 1; }

 protected:
  float learning_rate_;
  float _beta1_decay_rate;
  float _beta2_decay_rate;
  float _ada_epsilon;
};
}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/ps/table/table.cc (view file @ b8d106e1)

@@ -49,6 +49,7 @@ REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseSharedAdamSGDRule);

 int32_t TableManager::Initialize() {
   static bool initialized = false;
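The registration above makes the new rule constructible by name through the PSCORE class registry; CtrDymfAccessor::Initialize() in this same commit builds its embedx rule exactly that way. A hedged sketch of that wiring (the param and dim variables here are placeholders, not code from this commit):

  // _config.embedx_sgd_param() names the rule class, e.g. "SparseSharedAdamSGDRule".
  _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, "SparseSharedAdamSGDRule");
  _embedx_sgd_rule->LoadConfig(embedx_sgd_param, embedx_dim);  // reads the adam sub-config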
paddle/fluid/distributed/ps/wrapper/CMakeLists.txt (view file @ b8d106e1)

@@ -13,6 +13,7 @@ cc_library(
   op_registry
   fs
   shell
+  ps_gpu_wrapper
   ${RPC_DEPS})

 target_link_libraries(fleet z)
paddle/fluid/distributed/ps/wrapper/fleet.cc (view file @ b8d106e1)

@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
 #include "paddle/fluid/distributed/ps/table/table.h"
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
+#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE
+#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
+#endif

 namespace paddle {
 namespace distributed {

@@ -129,6 +133,13 @@ void FleetWrapper::InitWorker(const std::string& dist_desc,
       worker_ptr_ = std::shared_ptr<paddle::distributed::PSClient>(
           paddle::distributed::PSClientFactory::Create(ps_param));
       worker_ptr_->Configure(ps_param, dense_pull_regions, ps_env_, index);
+#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE
+      VLOG(3) << "FleetWrapper::InitWorker InitializeGPUServer";
+      auto* accessor = worker_ptr_->GetTableAccessor(0);
+      auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance();
+      ps_gpu_wrapper->InitializeGPUServer(ps_param);
+      ps_gpu_wrapper->SetTableAccessor(accessor);
+#endif
     }
   } else {
     VLOG(3) << "Client can be initialized only once";

@@ -525,11 +536,11 @@ void FleetWrapper::PushSparseFromTensorAsync(
   int batch_size = -1;
   bool batch_size_consist = true;
   for (auto* input : *inputs) {
-    int cur_batch_size =
+    size_t cur_batch_size =
         input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
     if (batch_size == -1) {
-      batch_size = cur_batch_size;
-    } else if (batch_size != cur_batch_size) {
+      batch_size = int(cur_batch_size);
+    } else if (batch_size != int(cur_batch_size)) {
       // CHECK(batch_size == cur_batch_size);  // NOLINT
       batch_size_consist = false;
       break;

@@ -537,12 +548,12 @@ void FleetWrapper::PushSparseFromTensorAsync(
   }
   CHECK(batch_size > 0);  // NOLINT

-  int show_size =
+  size_t show_size =
       shows->lod().size() ? shows->lod()[0].size() - 1 : shows->dims()[0];
-  CHECK(show_size == batch_size || show_size == 1);
-  int clk_size =
+  CHECK(show_size == size_t(batch_size) || show_size == 1);
+  size_t clk_size =
       clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0];
-  CHECK(clk_size == batch_size || clk_size == 1);
+  CHECK(clk_size == size_t(batch_size) || clk_size == 1);

   CHECK(outputs->size() == inputs->size());
   std::vector<uint64_t> push_keys;

@@ -601,12 +612,10 @@ void FleetWrapper::PushSparseFromTensorAsync(
       // in
       // ctr_accessor.h
       push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
-      push_values.back()[1] =
-          (static_cast<int>(i) >= show_size ? 1 : static_cast<float>(show_tensor[i]));
-      push_values.back()[2] =
-          (static_cast<int>(i) >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
+      push_values.back()[1] =
+          (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+      push_values.back()[2] =
+          (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
       float* data = push_values.back().data() + 3;
       memcpy(data, g + output_len, sizeof(float) * fea_dim);
     }

@@ -630,12 +639,10 @@ void FleetWrapper::PushSparseFromTensorAsync(
       // slot show clk grad... consistent with CtrCommonPushValue defined in
       // ctr_accessor.h
       push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
-      push_values.back()[1] =
-          (static_cast<int>(i) >= show_size ? 1 : static_cast<float>(show_tensor[i]));
-      push_values.back()[2] =
-          (static_cast<int>(i) >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
+      push_values.back()[1] =
+          (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+      push_values.back()[2] =
+          (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
       float* data = push_values.back().data() + 3;
       memcpy(data, g + output_len, sizeof(float) * fea_dim);
     }
paddle/fluid/framework/distributed_strategy.proto (view file @ b8d106e1)

@@ -197,14 +197,14 @@ message TableParameter {
 message TableAccessorParameter {
   optional string accessor_class = 1;
-  optional SGDParameter embed_sgd_param = 2;
-  optional SGDParameter embedx_sgd_param = 3;
   optional uint32 fea_dim = 4 [ default = 11 ];  // field size of one value
   optional uint32 embedx_dim = 5 [ default = 8 ];  // embedx feature size
   optional uint32 embedx_threshold = 6 [ default = 10 ];  // embedx feature create threshold
   optional CtrAccessorParameter ctr_accessor_param = 7;
   repeated TableAccessorSaveParameter table_accessor_save_param = 8;
+  optional SGDParameter embed_sgd_param = 10;
+  optional SGDParameter embedx_sgd_param = 11;
 }

 message SGDParameter {

@@ -228,7 +228,7 @@ message
   repeated float weight_bounds = 4;
 }

-message SparseAdamSGDParameter {  // SparseAdamSGDRule
+message SparseAdamSGDParameter {  // SparseAdamSGDRule | SparseSharedAdamSGDRule
   optional double learning_rate = 1 [ default = 0.001 ];
   optional double initial_range = 2 [ default = 0.0001 ];
   optional double beta1_decay_rate = 3 [ default = 0.9 ];
paddle/fluid/framework/fleet/CMakeLists.txt (view file @ b8d106e1)

@@ -25,10 +25,17 @@ endif()
 if(WITH_HETERPS)
   if(WITH_NCCL AND WITH_GPU)
-    nv_library(
-      ps_gpu_wrapper
-      SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
-      DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
+    if(WITH_PSCORE)
+      nv_library(
+        ps_gpu_wrapper
+        SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+        DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS})
+    else()
+      nv_library(
+        ps_gpu_wrapper
+        SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+        DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
+    endif()
     add_subdirectory(heter_ps)
   elseif(WITH_XPU_KP)
     xpu_library(
paddle/fluid/framework/fleet/heter_context.h (view file @ b8d106e1)

@@ -81,7 +81,6 @@ class HeterContext {
   std::vector<std::vector<FeatureValue>> device_values_;
   std::vector<std::vector<FeatureKey>> device_keys_;
   std::vector<std::vector<std::vector<FeatureKey>>> device_dim_keys_;
-  std::vector<std::vector<std::vector<FeatureValue>>> device_dim_values_;
   std::vector<std::mutex*> mutex_;
   std::vector<std::vector<std::mutex*>> dim_mutex_;
   int multi_mf_dim_ = 0;

@@ -114,7 +113,6 @@ class HeterContext {
       value_dim_ptr_[i].resize(dim_num);
     }
     device_values_.resize(device_num);
-    device_dim_values_.resize(device_num);
     device_keys_.resize(device_num);
     device_dim_keys_.resize(device_num);
paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt (view file @ b8d106e1)

@@ -9,16 +9,16 @@ if(WITH_GPU)
   endif()
   nv_library(
     heter_comm_kernel
-    SRCS heter_comm_kernel.cu feature_value.h
+    SRCS heter_comm_kernel.cu feature_value.h feature_value.cu
     DEPS ${HETERPS_DEPS})
   nv_library(
     hashtable_kernel
-    SRCS hashtable_kernel.cu feature_value.h
+    SRCS hashtable_kernel.cu feature_value.h feature_value.cu
     DEPS ${HETERPS_DEPS})
   nv_library(
     heter_comm
-    SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h
+    SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc
+         heter_resource.h mem_pool.h
     DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
   nv_test(
     test_heter_comm
paddle/fluid/framework/fleet/heter_ps/feature_value.cu (new file, mode 100644, view @ b8d106e1)

/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"

namespace paddle {
namespace framework {

template <typename FVAccessor>
__global__ void PullCopy(float** dest, const float* src, const int64_t* len,
                         int slot_num, int total_len, uint64_t** keys,
                         uint64_t max_val_size, int* gpu_dim,
                         FVAccessor feature_value_accessor) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[x - 1] : 0);
    float* feature_value_ptr =
        (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
    int mf_dim = gpu_dim[x] - 3;
    feature_value_accessor.Select(
        dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
  }
}

template <typename FVAccessor>
__global__ void PushCopyWithPool(float* dest, float** src, int64_t* len,
                                 int slot_num, uint64_t total_len, int bs,
                                 int* slot_vector, int* mf_dim_vector,
                                 size_t grad_value_size,
                                 FVAccessor feature_value_accessor) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[low - 1] : 0);
    float* cur = (float*)((char*)dest + i * grad_value_size);

    cur[feature_value_accessor.common_push_value.SlotIndex()] =
        (float)slot_vector[x];
    int mf_dim = mf_dim_vector[x];
    cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim;

    cur[feature_value_accessor.common_push_value.ShowIndex()] =
        *(src[x] + y * (mf_dim + 3));
    cur[feature_value_accessor.common_push_value.ClickIndex()] =
        *(src[x] + y * (mf_dim + 3) + 1);
    cur[feature_value_accessor.common_push_value.EmbedGIndex()] =
        *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs;
    for (int j = 0; j < mf_dim; j++) {
      cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] =
          *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs;
    }
  }
}

template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPullImpl(
    const paddle::platform::Place& place, uint64_t** gpu_keys,
    const std::vector<float*>& values, const float* total_values_gpu,
    const int64_t* gpu_len, const int slot_num, const int hidden_size,
    const int64_t total_length, int* gpu_dim, int feature_value_size) {
  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
                    paddle::platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
  float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
  cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
             cudaMemcpyHostToDevice);
  PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
      gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys,
      feature_value_size, gpu_dim, gpu_accessor_);
  cudaStreamSynchronize(stream);
}

template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
    const paddle::platform::Place& place,
    const std::vector<const float*>& grad_values, float* total_grad_values_gpu,
    const std::vector<int64_t>& slot_lengths, const uint64_t total_length,
    const int batch_size, size_t grad_value_size, std::vector<int>& slot_vector,
    std::vector<int>& slot_mf_dim_vector) {
  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
                    paddle::platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  auto slot_lengths_lod = slot_lengths;
  for (int i = 1; i < slot_lengths_lod.size(); i++) {
    slot_lengths_lod[i] += slot_lengths_lod[i - 1];
  }
  auto buf_grad_value = memory::Alloc(place, grad_values.size() * sizeof(float*));
  auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
  auto buf_slot_vector = memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
  auto buf_mf_dim_vector = memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
  float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
  int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
  int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
  int* d_mf_dim_vector = reinterpret_cast<int*>(buf_mf_dim_vector->ptr());
  cudaMemcpy(gpu_values, grad_values.data(), grad_values.size() * sizeof(float*),
             cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_len, slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_slot_vector, slot_vector.data(), slot_lengths_lod.size() * sizeof(int),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector.data(),
             slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);
  PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
      total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(),
      total_length, batch_size, d_slot_vector, d_mf_dim_vector, grad_value_size,
      gpu_accessor_);
  cudaStreamSynchronize(stream);
}

#ifdef PADDLE_WITH_PSCORE
template class AccessorWrapper<CommonFeatureValueAccessor>;
#endif

}  // namespace framework
}  // namespace paddle
#endif
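A hedged usage sketch of the new accessor plumbing instantiated above; the types come from feature_value.h (next file in this diff), and the surrounding variables (place, gpu_keys, values, etc.) are placeholders rather than code from this commit:

  // One-time selection of the GPU-side accessor by the CPU accessor's class name.
  GlobalAccessorTransfor::GetInstance().Init("CtrDymfAccessor");
  VirtualAccessor* wrapper = GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();

  int mf_dim = 8;
  size_t val_bytes = wrapper->GetFeatureValueSize(mf_dim);   // per-feature bytes on the GPU
  size_t grad_bytes = wrapper->GetPushValueSize(mf_dim);     // per-feature gradient bytes
  // Pull path (launches the PullCopy kernel defined above):
  // wrapper->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len,
  //                      slot_num, hidden_size, total_length, gpu_dim, val_bytes);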
paddle/fluid/framework/fleet/heter_ps/feature_value.h (view file @ b8d106e1)

@@ -17,12 +17,547 @@ limitations under the License. */
#ifdef PADDLE_WITH_HETERPS
#include <iostream>
#include <sstream>
#include <unordered_map>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#endif

namespace paddle {
namespace framework {

#define MF_DIM 8
typedef uint64_t FeatureKey;

#define TYPEALIGN(ALIGNVAL, LEN) \
  (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))

class FeatureValueAccessor {
 public:
  __host__ __device__ FeatureValueAccessor() {}
  __host__ __device__ ~FeatureValueAccessor() {}

  __host__ __device__ virtual int Configure(
      std::unordered_map<std::string, float> config) {
    _config = config;
    Initialize();
    return 0;
  }
  __host__ __device__ virtual int Initialize() = 0;

 protected:
  std::unordered_map<std::string, float> _config;
};

// adagrad: embed_sgd_dim=1, embedx_sgd_dim=1, embedx_dim=n
// adam std: embed_sgd_dim=4, embedx_sgd_dim=n*2+2, embedx_dim=n
// adam shared: embed_sgd_dim=4, embedx_sgd_dim=4, embedx_dim=n
class CommonFeatureValueAccessor : public FeatureValueAccessor {
 public:
  struct CommonFeatureValue {
    /*
      uint64_t cpu_ptr;
      float delta_score;
      float show;
      float click;
      float embed_w;
      std::vector<float> embed_g2sum;
      float slot;
      float mf_dim
      float mf_size
      std::vector<float> embedx_g2sum;
      std::vector<float> embedx_w;
    */
    __host__ __device__ int Dim() {
      return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim;  // has cpu_ptr(2)
    }
    __host__ __device__ int DimSize(size_t dim, int embedx_dim) { return sizeof(float); }
    __host__ __device__ size_t Size() {
      return TYPEALIGN(8, Dim() * sizeof(float));  // cpu_ptr:uint64=2float
    }
    __host__ __device__ int EmbedDim() { return embed_sgd_dim; }
    __host__ __device__ int EmbedXDim() { return embedx_sgd_dim; }
    __host__ __device__ int EmbedWDim() { return embedx_dim; }
    __host__ __device__ int CpuPtrIndex() { return 0; }  // cpuprt uint64
    __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; }
    __host__ __device__ int ShowIndex() { return DeltaScoreIndex() + 1; }
    __host__ __device__ int ClickIndex() { return ShowIndex() + 1; }
    __host__ __device__ int EmbedWIndex() { return ClickIndex() + 1; }
    __host__ __device__ int EmbedG2SumIndex() { return EmbedWIndex() + 1; }
    __host__ __device__ int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; }
    __host__ __device__ int MfDimIndex() { return SlotIndex() + 1; }
    __host__ __device__ int MfSizeIndex() {
      return MfDimIndex() + 1;  // actual mf size (ex. 0)
    }
    __host__ __device__ int EmbedxG2SumIndex() { return MfSizeIndex() + 1; }
    __host__ __device__ int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; }

    // total length computed from mf_dim
    __host__ __device__ int Dim(int& mf_dim) {
      int tmp_embedx_sgd_dim = 1;
      if (optimizer_type_ == 3) {  // adam
        tmp_embedx_sgd_dim = mf_dim * 2 + 2;
      } else if (optimizer_type_ == 4) {  // shared_adam
        tmp_embedx_sgd_dim = 4;
      }
      return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim;
    }

    // total bytes computed from mf_dim
    __host__ __device__ size_t Size(int& mf_dim) {
      return TYPEALIGN(8, Dim(mf_dim) * sizeof(float));  // cpu_ptr:2float
    }

    // mf_size in bytes computed from mf_dim
    __host__ __device__ size_t MFSize(int& mf_dim) {
      int tmp_embedx_sgd_dim = 1;
      if (optimizer_type_ == 3) {  // adam
        tmp_embedx_sgd_dim = mf_dim * 2 + 2;
      } else if (optimizer_type_ == 4) {  // shared_adam
        tmp_embedx_sgd_dim = 4;
      }
      return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float);
    }

    __host__ __device__ int EmbedxG2SumOffsetIndex() { return 0; }
    __host__ __device__ int EmbedxWOffsetIndex(float* val) {
      // has mf
      int tmp_embedx_sgd_dim = 1;
      if (int(MfSize(val)) > 0) {
        if (optimizer_type_ == 3) {  // adam
          tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2;
        } else if (optimizer_type_ == 4) {  // shared_adam
          tmp_embedx_sgd_dim = 4;
        }
        return EmbedxG2SumIndex() + tmp_embedx_sgd_dim;
      } else {
        // no mf
        return 0;
      }
    }

    __host__ __device__ uint64_t CpuPtr(float* val) {
      return *(reinterpret_cast<uint64_t*>(val));
    }
    __host__ __device__ float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; }
    __host__ __device__ float& Show(float* val) { return val[ShowIndex()]; }
    __host__ __device__ float& Click(float* val) { return val[ClickIndex()]; }
    __host__ __device__ float& Slot(float* val) { return val[SlotIndex()]; }
    __host__ __device__ float& MfDim(float* val) { return val[MfDimIndex()]; }
    __host__ __device__ float& MfSize(float* val) { return val[MfSizeIndex()]; }
    __host__ __device__ float& EmbedW(float* val) { return val[EmbedWIndex()]; }
    __host__ __device__ float& EmbedG2Sum(float* val) { return val[EmbedG2SumIndex()]; }
    __host__ __device__ float& EmbedxG2Sum(float* val) { return val[EmbedxG2SumIndex()]; }
    __host__ __device__ float& EmbedxW(float* val) { return val[EmbedxWIndex()]; }

    int embed_sgd_dim;
    int embedx_dim;
    int embedx_sgd_dim;
    int optimizer_type_;
  };

  struct CommonPushValue {
    /*
      float slot;
      float show;
      float click;
      float mf_dim;
      float embed_g;
      std::vector<float> embedx_g;
    */
    __host__ __device__ int Dim(int embedx_dim) { return 5 + embedx_dim; }
    __host__ __device__ int DimSize(int dim, int embedx_dim) { return sizeof(float); }
    __host__ __device__ int Size(int embedx_dim) {
      return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float));
    }
    __host__ __device__ int SlotIndex() { return 0; }
    __host__ __device__ int ShowIndex() { return CommonPushValue::SlotIndex() + 1; }
    __host__ __device__ int ClickIndex() { return CommonPushValue::ShowIndex() + 1; }
    __host__ __device__ int MfDimIndex() { return CommonPushValue::ClickIndex() + 1; }
    __host__ __device__ int EmbedGIndex() { return CommonPushValue::MfDimIndex() + 1; }
    __host__ __device__ int EmbedxGIndex() { return CommonPushValue::EmbedGIndex() + 1; }
    __host__ __device__ float& Slot(float* val) { return val[CommonPushValue::SlotIndex()]; }
    __host__ __device__ float& Show(float* val) { return val[CommonPushValue::ShowIndex()]; }
    __host__ __device__ float& Click(float* val) { return val[CommonPushValue::ClickIndex()]; }
    __host__ __device__ float& MfDim(float* val) { return val[CommonPushValue::MfDimIndex()]; }
    __host__ __device__ float& EmbedG(float* val) { return val[CommonPushValue::EmbedGIndex()]; }
    __host__ __device__ float* EmbedxG(float* val) {
      return val + CommonPushValue::EmbedxGIndex();
    }
  };

  struct CommonPullValue {
    /*
      float show;
      float click;
      float embed_w;
      std::vector<float> embedx_w;
    */
    __host__ __device__ static int Dim(int embedx_dim) { return 3 + embedx_dim; }
    __host__ __device__ int DimSize(size_t dim) { return sizeof(float); }
    __host__ __device__ int Size(int embedx_dim) {
      return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float));
    }
    __host__ __device__ int ShowIndex() { return 0; }
    __host__ __device__ int ClickIndex() { return 1; }
    __host__ __device__ int EmbedWIndex() { return 2; }
    __host__ __device__ int EmbedxWIndex() { return 3; }
    __host__ __device__ float& Show(float* val) { return val[CommonPullValue::ShowIndex()]; }
    __host__ __device__ float& Click(float* val) { return val[CommonPullValue::ClickIndex()]; }
    __host__ __device__ float& EmbedW(float* val) { return val[CommonPullValue::EmbedWIndex()]; }
    __host__ __device__ float* EmbedxW(float* val) {
      return val + CommonPullValue::EmbedxWIndex();
    }
  };

  __host__ __device__ CommonFeatureValueAccessor() {}
  __host__ __device__ ~CommonFeatureValueAccessor() {}

  __host__ __device__ virtual int Initialize() {
    int optimizer_type = (_config.find("optimizer_type") == _config.end())
                             ? 1
                             : int(_config["optimizer_type"]);
    int sparse_embedx_dim = (_config.find("embedx_dim") == _config.end())
                                ? 8
                                : int(_config["embedx_dim"]);
    if (optimizer_type == 3) {  // adam
      common_feature_value.embed_sgd_dim = 4;
      common_feature_value.embedx_sgd_dim = sparse_embedx_dim * 2 + 2;
    } else if (optimizer_type == 4) {  // shared_adam
      common_feature_value.embed_sgd_dim = 4;
      common_feature_value.embedx_sgd_dim = 4;
    } else {
      common_feature_value.embed_sgd_dim = 1;
      common_feature_value.embedx_sgd_dim = 1;
    }
    common_feature_value.optimizer_type_ = optimizer_type;
    common_feature_value.embedx_dim = sparse_embedx_dim;
    return 0;
  }

  // build stage: fill gpu_val from cpu_val
  __host__ void BuildFill(float* gpu_val, void* cpu,
                          paddle::distributed::ValueAccessor* cpu_table_accessor,
                          int mf_dim) {
#ifdef PADDLE_WITH_PSCORE
    paddle::distributed::CtrDymfAccessor* cpu_accessor =
        dynamic_cast<paddle::distributed::CtrDymfAccessor*>(cpu_table_accessor);
    paddle::distributed::FixedFeatureValue* cpu_ptr =
        (paddle::distributed::FixedFeatureValue*)(cpu);
    float* cpu_val = cpu_ptr->data();
    size_t cpu_dim = cpu_ptr->size();

    gpu_val[common_feature_value.DeltaScoreIndex()] =
        cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()];
    gpu_val[common_feature_value.ShowIndex()] =
        cpu_val[cpu_accessor->common_feature_value.ShowIndex()];
    gpu_val[common_feature_value.ClickIndex()] =
        cpu_val[cpu_accessor->common_feature_value.ClickIndex()];
    gpu_val[common_feature_value.SlotIndex()] =
        cpu_val[cpu_accessor->common_feature_value.SlotIndex()];
    gpu_val[common_feature_value.EmbedWIndex()] =
        cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()];
    for (int i = 0; i < common_feature_value.EmbedDim(); i++) {
      gpu_val[common_feature_value.EmbedG2SumIndex() + i] =
          cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i];
    }
    *(reinterpret_cast<uint64_t*>(gpu_val + common_feature_value.CpuPtrIndex())) =
        (uint64_t)(cpu);
    cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = float(mf_dim);
    gpu_val[common_feature_value.MfDimIndex()] = mf_dim;
    if (cpu_dim > cpu_accessor->GetAccessorInfo().dim -
                      cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) {
      gpu_val[common_feature_value.MfSizeIndex()] =
          common_feature_value.MFSize(mf_dim) / sizeof(float);
      for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) {
        gpu_val[common_feature_value.EmbedxG2SumIndex() + x] =
            cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x];
      }
    } else {
      gpu_val[common_feature_value.MfSizeIndex()] = 0;
      for (int x = common_feature_value.EmbedxG2SumIndex();
           x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++) {
        gpu_val[x] = 0;
      }
    }
#endif
  }

  // dump_to_cpu stage: fill cpu_val from gpu_val
  __host__ void DumpFill(float* gpu_val,
                         paddle::distributed::ValueAccessor* cpu_table_accessor,
                         int mf_dim) {
#ifdef PADDLE_WITH_PSCORE
    paddle::distributed::CtrDymfAccessor* cpu_accessor =
        dynamic_cast<paddle::distributed::CtrDymfAccessor*>(cpu_table_accessor);
    auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*(
        reinterpret_cast<uint64_t*>(gpu_val + common_feature_value.CpuPtrIndex())));
    size_t downpour_value_size = downpour_value->size();
    if (gpu_val[common_feature_value.MfSizeIndex()] > 0 &&
        downpour_value_size ==
            (cpu_accessor->GetAccessorInfo().dim -
             int(cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)))) {
      // cpu_accessor
      downpour_value->resize(cpu_accessor->common_feature_value.Dim(mf_dim));
    }
    float* cpu_val = downpour_value->data();
    cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()] =
        gpu_val[common_feature_value.DeltaScoreIndex()];
    cpu_val[cpu_accessor->common_feature_value.ShowIndex()] =
        gpu_val[common_feature_value.ShowIndex()];
    cpu_val[cpu_accessor->common_feature_value.ClickIndex()] =
        gpu_val[common_feature_value.ClickIndex()];
    cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()] =
        gpu_val[common_feature_value.EmbedWIndex()];
    cpu_val[cpu_accessor->common_feature_value.SlotIndex()] =
        gpu_val[common_feature_value.SlotIndex()];
    for (int i = 0; i < common_feature_value.EmbedDim(); i++) {
      cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i] =
          gpu_val[common_feature_value.EmbedG2SumIndex() + i];
    }
    if (gpu_val[common_feature_value.MfSizeIndex()] > 0) {
      for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) {
        cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x] =
            gpu_val[common_feature_value.EmbedxG2SumIndex() + x];
      }
    }
#endif
  }

  // dy_mf_fill_dvals_kernel / dy_mf_search_kernel:
  // copy src_val into dest_val inside the GPU kernel
  __host__ __device__ void FeatureValueFill(float* dest_val, float* src_val, int mf_dim) {
    *(reinterpret_cast<uint64_t*>(dest_val + common_feature_value.CpuPtrIndex())) =
        *(reinterpret_cast<uint64_t*>(src_val + common_feature_value.CpuPtrIndex()));
    dest_val[common_feature_value.DeltaScoreIndex()] =
        src_val[common_feature_value.DeltaScoreIndex()];
    dest_val[common_feature_value.ShowIndex()] = src_val[common_feature_value.ShowIndex()];
    dest_val[common_feature_value.ClickIndex()] = src_val[common_feature_value.ClickIndex()];
    dest_val[common_feature_value.EmbedWIndex()] = src_val[common_feature_value.EmbedWIndex()];
    for (int i = 0; i < common_feature_value.EmbedDim(); i++) {
      dest_val[common_feature_value.EmbedG2SumIndex() + i] =
          src_val[common_feature_value.EmbedG2SumIndex() + i];
    }
    dest_val[common_feature_value.SlotIndex()] = src_val[common_feature_value.SlotIndex()];
    dest_val[common_feature_value.MfDimIndex()] = mf_dim;
    dest_val[common_feature_value.MfSizeIndex()] = src_val[common_feature_value.MfSizeIndex()];
    for (int x = common_feature_value.EmbedxG2SumIndex();
         x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++) {
      dest_val[x] = src_val[x];
    }
  }

  // dy_mf_fill_shard_grads_kernel / update_one:
  // copy src_val into dest_val inside the GPU kernel
  __host__ __device__ void PushValueFill(float* dest_val, const float* src_val) {
    dest_val[common_push_value.SlotIndex()] = src_val[common_push_value.SlotIndex()];
    dest_val[common_push_value.ShowIndex()] = src_val[common_push_value.ShowIndex()];
    dest_val[common_push_value.ClickIndex()] = src_val[common_push_value.ClickIndex()];
    dest_val[common_push_value.MfDimIndex()] = src_val[common_push_value.MfDimIndex()];
    dest_val[common_push_value.EmbedGIndex()] = src_val[common_push_value.EmbedGIndex()];
    for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) {
      dest_val[common_push_value.EmbedxGIndex() + x] =
          src_val[common_push_value.EmbedxGIndex() + x];
    }
  }

  // update_basic: copy src_val into dest_val inside the GPU kernel
  __host__ __device__ void PushValueFillBasic(float* dest_val, const float* src_val) {
    dest_val[common_push_value.SlotIndex()] = src_val[common_push_value.SlotIndex()];
    dest_val[common_push_value.ShowIndex()] = src_val[common_push_value.ShowIndex()];
    dest_val[common_push_value.ClickIndex()] = src_val[common_push_value.ClickIndex()];
    dest_val[common_push_value.MfDimIndex()] = src_val[common_push_value.MfDimIndex()];
    dest_val[common_push_value.EmbedGIndex()] = src_val[common_push_value.EmbedGIndex()];
  }

  // merge_one: accumulate the PushValue src_val into dest_val inside the GPU kernel
  __host__ __device__ void MergePushValue(float* dest_val, const float* src_val) {
    dest_val[common_push_value.ShowIndex()] += src_val[common_push_value.ShowIndex()];
    dest_val[common_push_value.ClickIndex()] += src_val[common_push_value.ClickIndex()];
    dest_val[common_push_value.EmbedGIndex()] += src_val[common_push_value.EmbedGIndex()];
    for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) {
      dest_val[common_push_value.EmbedxGIndex() + j] +=
          src_val[common_push_value.EmbedxGIndex() + j];
    }
  }

  // merge_basic: accumulate the PushValue src_val into dest_val inside the GPU kernel
  __host__ __device__ void MergePushValueBasic(float* dest_val, const float* src_val) {
    dest_val[common_push_value.ShowIndex()] += src_val[common_push_value.ShowIndex()];
    dest_val[common_push_value.ClickIndex()] += src_val[common_push_value.ClickIndex()];
    dest_val[common_push_value.EmbedGIndex()] += src_val[common_push_value.EmbedGIndex()];
  }

  // PullCopy: write the FeatureValue back into the PullValue inside the GPU kernel
  __host__ __device__ void Select(float* dest_val, float* src_val, uint64_t* key,
                                  int mf_dim) {
    if (*key == 0) {
      *(dest_val + common_pull_value.ShowIndex()) = 0;
      *(dest_val + common_pull_value.ClickIndex()) = 0;
      *(dest_val + common_pull_value.EmbedWIndex()) = 0;
    } else {
      *(dest_val + common_pull_value.ShowIndex()) =
          src_val[common_feature_value.ShowIndex()];
      *(dest_val + common_pull_value.ClickIndex()) =
          src_val[common_feature_value.ClickIndex()];
      *(dest_val + common_pull_value.EmbedWIndex()) =
          src_val[common_feature_value.EmbedWIndex()];
    }
    if (src_val[common_feature_value.MfSizeIndex()] == 0 || *key == 0) {
      for (int j = 0; j < mf_dim; j++) {
        *(dest_val + common_pull_value.EmbedxWIndex() + j) = 0;
      }
    } else {
      for (int j = 0; j < mf_dim; j++) {
        *(dest_val + common_pull_value.EmbedxWIndex() + j) =
            src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j];
      }
    }
  }

  __host__ __device__ std::string ParseToString(const float* v, int param_size) {
    /*
      uint64_t cpu_ptr;  // 2float
      float delta_score;
      float show;
      float click;
      float embed_w;
      std::vector<float> embed_g2sum;
      float slot;
      float mf_dim
      float mf_size
      std::vector<float> embedx_g2sum;
      std::vector<float> embedx_w;
    */
    std::stringstream os;
    os << "cpuptr: " << common_feature_value.CpuPtr(const_cast<float*>(v))
       << " delta_score: " << v[2] << " show: " << v[3] << " click: " << v[4]
       << " embed_w:" << v[5] << " embed_g2sum:";
    for (int i = common_feature_value.EmbedG2SumIndex();
         i < common_feature_value.SlotIndex(); i++) {
      os << " " << v[i];
    }
    int mf_dim = int(common_feature_value.MfDim(const_cast<float*>(v)));
    os << " slot: " << common_feature_value.Slot(const_cast<float*>(v))
       << " mf_dim: " << mf_dim
       << " mf_size: " << common_feature_value.MfSize(const_cast<float*>(v))
       << " mf: ";
    if (param_size > common_feature_value.EmbedxG2SumIndex()) {
      for (auto i = common_feature_value.EmbedxG2SumIndex();
           i < common_feature_value.Dim(mf_dim); ++i) {
        os << " " << v[i];
      }
    }
    return os.str();
  }

 public:
  CommonFeatureValue common_feature_value;
  CommonPushValue common_push_value;
  CommonPullValue common_pull_value;
};

struct FeatureValue {
  float delta_score;
...

@@ -95,6 +630,176 @@ struct FeaturePushValue {
  }
};

class VirtualAccessor {
 public:
  virtual int Configure(std::unordered_map<std::string, float> config) = 0;
  virtual size_t GetFeatureValueSize(int& mf_dim) = 0;
  virtual size_t GetPushValueSize(int& mf_dim) = 0;
  virtual void BuildFill(void* gpu_val, void* cpu_val,
                         paddle::distributed::ValueAccessor* cpu_table_accessor,
                         int mf_dim) = 0;
  virtual void DumpFill(float* gpu_val,
                        paddle::distributed::ValueAccessor* cpu_table_accessor,
                        int mf_dim) = 0;
  virtual void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys,
                           const std::vector<float*>& values,
                           const float* total_values_gpu, const int64_t* gpu_len,
                           const int slot_num, const int hidden_size,
                           const int64_t total_length, int* gpu_dim,
                           int feature_value_size) = 0;
  virtual void CopyForPush(const paddle::platform::Place& place,
                           const std::vector<const float*>& grad_values,
                           float* total_grad_values_gpu,
                           const std::vector<int64_t>& slot_lengths,
                           const uint64_t total_length, const int batch_size,
                           size_t grad_value_size, std::vector<int>& slot_vector,
                           std::vector<int>& slot_mf_dim_vector) = 0;
  virtual std::string ParseToString(const float* v, int param_size) = 0;
};

template <typename GPUAccessor>
class AccessorWrapper : public VirtualAccessor {
 public:
  explicit AccessorWrapper() {}
  virtual ~AccessorWrapper() {}
  AccessorWrapper(const AccessorWrapper&) = delete;
  AccessorWrapper& operator=(const AccessorWrapper&) = delete;

  virtual int Configure(std::unordered_map<std::string, float> config) {
    return gpu_accessor_.Configure(config);
  }
  virtual size_t GetFeatureValueSize(int& mf_dim) {
    return gpu_accessor_.common_feature_value.Size(mf_dim);
  }
  virtual size_t GetPushValueSize(int& mf_dim) {
    return gpu_accessor_.common_push_value.Size(mf_dim);
  }
  virtual void BuildFill(void* gpu_val, void* cpu_val,
                         paddle::distributed::ValueAccessor* cpu_table_accessor,
                         int mf_dim) {
    gpu_accessor_.BuildFill((float*)(gpu_val), cpu_val, cpu_table_accessor, mf_dim);
  }
  virtual void DumpFill(float* gpu_val,
                        paddle::distributed::ValueAccessor* cpu_table_accessor,
                        int mf_dim) {
    gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim);
  }
  virtual void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys,
                           const std::vector<float*>& values,
                           const float* total_values_gpu, const int64_t* gpu_len,
                           const int slot_num, const int hidden_size,
                           const int64_t total_length, int* gpu_dim,
                           int feature_value_size) {
    CopyForPullImpl(place, gpu_keys, values, total_values_gpu, gpu_len, slot_num,
                    hidden_size, total_length, gpu_dim, feature_value_size);
  }
  virtual void CopyForPush(const paddle::platform::Place& place,
                           const std::vector<const float*>& grad_values,
                           float* total_grad_values_gpu,
                           const std::vector<int64_t>& slot_lengths,
                           const uint64_t total_length, const int batch_size,
                           size_t grad_value_size, std::vector<int>& slot_vector,
                           std::vector<int>& slot_mf_dim_vector) {
    CopyForPushImpl(place, grad_values, total_grad_values_gpu, slot_lengths,
                    total_length, batch_size, grad_value_size, slot_vector,
                    slot_mf_dim_vector);
  }

  void CopyForPullImpl(const paddle::platform::Place& place, uint64_t** gpu_keys,
                       const std::vector<float*>& values,
                       const float* total_values_gpu, const int64_t* gpu_len,
                       const int slot_num, const int hidden_size,
                       const int64_t total_length, int* gpu_dim,
                       int feature_value_size);
  void CopyForPushImpl(const paddle::platform::Place& place,
                       const std::vector<const float*>& grad_values,
                       float* total_grad_values_gpu,
                       const std::vector<int64_t>& slot_lengths,
                       const uint64_t total_length, const int batch_size,
                       size_t grad_value_size, std::vector<int>& slot_vector,
                       std::vector<int>& slot_mf_dim_vector);

  virtual std::string ParseToString(const float* v, int param_size) {
    return gpu_accessor_.ParseToString(v, param_size);
  }

  GPUAccessor gpu_accessor_;
};

class GlobalAccessorTransfor {
 public:
  static GlobalAccessorTransfor& GetInstance() {
    static GlobalAccessorTransfor ins;
    return ins;
  }
  void Init(std::string accessor_type) {
    if (accessor_wrapper_ptr_ != nullptr) {
      return;
    }
    if (accessor_type == "CtrDymfAccessor") {
      accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>();
    } else {
      VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:"
              << accessor_type;
      accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>();
    }
  }
  VirtualAccessor* GetAccessorWrapper() { return accessor_wrapper_ptr_; }

 private:
  VirtualAccessor* accessor_wrapper_ptr_ = nullptr;
};

}  // end namespace framework
}  // end namespace paddle
#endif
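To make CommonFeatureValue's float layout concrete, here are the indices for one configuration (optimizer_type 4 = shared_adam, embedx_dim = mf_dim = 8), derived from the index helpers above; this is a worked example, not code from the commit:

  // cpu_ptr            floats 0-1   (a uint64_t stored as 2 floats)
  // delta_score        float  2
  // show               float  3
  // click              float  4
  // embed_w            float  5
  // embed_g2sum[4]     floats 6-9    (embed_sgd_dim = 4)
  // slot               float  10
  // mf_dim             float  11
  // mf_size            float  12
  // embedx_g2sum[4]    floats 13-16  (embedx_sgd_dim = 4)
  // embedx_w[8]        floats 17-24  (embedx_dim = 8)
  // Dim() = 9 + 4 + 4 + 8 = 25 floats; Size() = TYPEALIGN(8, 25 * 4) = 104 bytes.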
paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h (view file @ b8d106e1)

@@ -25,10 +25,12 @@
 #ifdef PADDLE_WITH_HETERPS
 namespace paddle {
 namespace framework {
-class GpuPsGraphTable : public HeterComm<uint64_t, int64_t, int> {
+class GpuPsGraphTable
+    : public HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor> {
  public:
   GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource, int topo_aware)
-      : HeterComm<uint64_t, int64_t, int>(1, resource) {
+      : HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor>(1, resource) {
     load_factor_ = 0.25;
     rw_lock.reset(new pthread_rwlock_t());
     gpu_num = resource_->total_device();
paddle/fluid/framework/fleet/heter_ps/hashtable.h (view file @ b8d106e1)

@@ -137,8 +137,12 @@ class HashTable {
            size_t len,
            StreamType stream);

-  template <typename StreamType>
-  void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream);
+  template <typename StreamType, typename FVAccessor>
+  void get(const KeyType* d_keys,
+           char* d_vals,
+           size_t len,
+           StreamType stream,
+           FVAccessor& fv_accessor);

   void show();

@@ -150,9 +154,9 @@ class HashTable {
 #if defined(PADDLE_WITH_CUDA)
-  template <typename GradType, typename Sgd, typename StreamType>
+  template <typename Sgd, typename StreamType>
   void update(const KeyType* d_keys,
-              const GradType* d_grads,
+              const float* d_grads,
               size_t len,
               Sgd sgd,
               StreamType stream);
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
浏览文件 @
b8d106e1
...
...
@@ -83,36 +83,25 @@ __global__ void search_kernel(Table* table,
}
}
template
<
typename
Table
>
template
<
typename
Table
,
typename
FVAccessor
>
__global__
void
dy_mf_search_kernel
(
Table
*
table
,
const
typename
Table
::
key_type
*
const
keys
,
char
*
vals
,
size_t
len
,
size_t
pull_feature_value_size
)
{
size_t
pull_feature_value_size
,
FVAccessor
feature_value_accessor
)
{
const
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
len
)
{
auto
it
=
table
->
find
(
keys
[
i
]);
if
(
it
!=
table
->
end
())
{
uint64_t
offset
=
i
*
pull_feature_value_size
;
FeatureValue
*
cur
=
(
FeatureValue
*
)(
vals
+
offset
);
FeatureValue
&
input
=
*
(
FeatureValue
*
)(
it
->
second
);
cur
->
slot
=
input
.
slot
;
cur
->
show
=
input
.
show
;
cur
->
clk
=
input
.
clk
;
cur
->
mf_dim
=
input
.
mf_dim
;
cur
->
lr
=
input
.
lr
;
cur
->
mf_size
=
input
.
mf_size
;
cur
->
cpu_ptr
=
input
.
cpu_ptr
;
cur
->
delta_score
=
input
.
delta_score
;
cur
->
lr_g2sum
=
input
.
lr_g2sum
;
for
(
int
j
=
0
;
j
<
cur
->
mf_dim
+
1
;
++
j
)
{
cur
->
mf
[
j
]
=
input
.
mf
[
j
];
}
}
else
{
if
(
keys
[
i
]
!=
0
)
{
printf
(
"warning::pull miss key: %llu"
,
keys
[
i
]);
}
float
*
cur
=
(
float
*
)(
vals
+
offset
);
float
*
input
=
it
->
second
;
int
mf_dim
=
int
(
input
[
feature_value_accessor
.
common_feature_value
.
MfDimIndex
()]);
feature_value_accessor
.
FeatureValueFill
(
cur
,
input
,
mf_dim
);
}
}
}
...
...
@@ -145,8 +134,8 @@ __global__ void dy_mf_update_kernel(Table* table,
if
(
i
<
len
)
{
auto
it
=
table
->
find
(
keys
[
i
]);
if
(
it
!=
table
->
end
())
{
FeaturePushValue
*
cur
=
(
FeaturePushValue
*
)(
grads
+
i
*
grad_value_size
);
sgd
.
dy_mf_update_value
(
optimizer_config
,
(
it
.
getter
())
->
second
,
*
cur
);
float
*
cur
=
(
float
*
)(
grads
+
i
*
grad_value_size
);
sgd
.
dy_mf_update_value
(
optimizer_config
,
(
it
.
getter
())
->
second
,
cur
);
}
else
{
if
(
keys
[
i
]
!=
0
)
{
printf
(
"warning::push miss key: %llu"
,
keys
[
i
]);
...
...
@@ -212,17 +201,18 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys,
}
template
<
typename
KeyType
,
typename
ValType
>
template
<
typename
StreamType
>
template
<
typename
StreamType
,
typename
FVAccessor
>
void
HashTable
<
KeyType
,
ValType
>::
get
(
const
KeyType
*
d_keys
,
char
*
d_vals
,
size_t
len
,
StreamType
stream
)
{
StreamType
stream
,
FVAccessor
&
fv_accessor
)
{
if
(
len
==
0
)
{
return
;
}
const
int
grid_size
=
(
len
-
1
)
/
BLOCK_SIZE_
+
1
;
dy_mf_search_kernel
<<<
grid_size
,
BLOCK_SIZE_
,
0
,
stream
>>>
(
container_
,
d_keys
,
d_vals
,
len
,
pull_feature_value_size_
);
container_
,
d_keys
,
d_vals
,
len
,
pull_feature_value_size_
,
fv_accessor
);
}
template
<
typename
KeyType
,
typename
ValType
>
...
...
@@ -298,27 +288,6 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
           cpu_val[x + 7] = gpu_val.mf[x];
         }
       }
 #endif
-#ifdef PADDLE_WITH_PSCORE
-      auto* downpour_value =
-          (paddle::distributed::FixedFeatureValue*)(gpu_val.cpu_ptr);
-      int downpour_value_size = downpour_value->size();
-      if (gpu_val.mf_size > 0 && downpour_value_size == 7) {
-        downpour_value->resize(gpu_val.mf_size + downpour_value_size);
-      }
-      float* cpu_val = downpour_value->data();
-      // cpu_val[0] = 0;
-      cpu_val[2] = gpu_val.delta_score;
-      cpu_val[3] = gpu_val.show;
-      cpu_val[4] = gpu_val.clk;
-      cpu_val[5] = gpu_val.lr;
-      cpu_val[6] = gpu_val.lr_g2sum;
-      cpu_val[0] = gpu_val.slot;
-      if (gpu_val.mf_size > 0) {
-        for (int x = 0; x < gpu_val.mf_size; x++) {
-          cpu_val[x + 7] = gpu_val.mf[x];
-        }
-      }
-#endif
     }
   };
@@ -336,9 +305,9 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
 }
 
 template <typename KeyType, typename ValType>
-template <typename GradType, typename Sgd, typename StreamType>
+template <typename Sgd, typename StreamType>
 void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
-                                         const GradType* d_grads,
+                                         const float* d_grads,
                                          size_t len, Sgd sgd,
                                          StreamType stream) {
@@ -371,8 +340,8 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
       push_grad_value_size_);
 }
 
-template class HashTable<unsigned long, paddle::framework::FeatureValue>;
-template class HashTable<unsigned long, paddle::framework::FeatureValue*>;
+template class HashTable<unsigned long, float>;
+template class HashTable<unsigned long, float*>;
 template class HashTable<long, int>;
 template class HashTable<unsigned long, int>;
 template class HashTable<unsigned long, unsigned long>;
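Because the member templates of HashTable are defined in a .cu file, every key/value/optimizer/stream combination used elsewhere has to be instantiated explicitly, which is why this list grows with each new optimizer. A small stand-alone illustration of the same C++ mechanism (illustrative only, not Paddle code):

#include <iostream>

template <typename K, typename V>
struct Table {
  template <typename Stream>
  void get(const K* keys, V* vals, size_t len, Stream stream);
};

template <typename K, typename V>
template <typename Stream>
void Table<K, V>::get(const K* keys, V* vals, size_t len, Stream /*stream*/) {
  for (size_t i = 0; i < len; ++i) vals[i] = static_cast<V>(keys[i]);
}

// Explicit instantiation: emits code for this combination in this translation
// unit, mirroring the "template void HashTable<...>::get<...>(...)" lines above.
template void Table<unsigned long, float>::get<int>(const unsigned long*,
                                                    float*, size_t, int);

int main() {
  unsigned long keys[2] = {1, 2};
  float vals[2];
  Table<unsigned long, float> t;
  t.get(keys, vals, 2, /*stream=*/0);
  std::cout << vals[1] << "\n";
  return 0;
}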
@@ -382,15 +351,19 @@ template class HashTable<long, long>;
 template class HashTable<long, unsigned long>;
 template class HashTable<long, unsigned int>;
 
-template void HashTable<unsigned long, paddle::framework::FeatureValue>::get<
-    cudaStream_t>(const unsigned long* d_keys,
-                  paddle::framework::FeatureValue* d_vals, size_t len,
-                  cudaStream_t stream);
+template void HashTable<unsigned long, float>::get<cudaStream_t>(
+    const unsigned long* d_keys, float* d_vals, size_t len,
+    cudaStream_t stream);
-template void HashTable<unsigned long, paddle::framework::FeatureValue*>::get<
-    cudaStream_t>(const unsigned long* d_keys, char* d_vals, size_t len,
-                  cudaStream_t stream);
+template void HashTable<unsigned long, float*>::
+    get<cudaStream_t, CommonFeatureValueAccessor>(
+        const unsigned long* d_keys, char* d_vals, size_t len,
+        cudaStream_t stream, CommonFeatureValueAccessor& fv_accessor);
 template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys,
                                                       int* d_vals,
@@ -399,6 +372,12 @@ template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys,
 template void HashTable<unsigned long, int>::get<cudaStream_t>(
     const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream);
 template void HashTable<unsigned long, unsigned long>::get<cudaStream_t>(
     const unsigned long* d_keys, unsigned long* d_vals, size_t len,
     cudaStream_t stream);
 template void HashTable<long, unsigned long>::get<cudaStream_t>(
     const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream);
 template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys,
@@ -414,19 +393,19 @@ template void HashTable<unsigned long, long>::get<cudaStream_t>(
 // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t
 // stream);
 
-template void HashTable<unsigned long, paddle::framework::FeatureValue>::
-    insert<cudaStream_t>(const unsigned long* d_keys,
-                         const paddle::framework::FeatureValue* d_vals,
-                         size_t len, cudaStream_t stream);
+template void HashTable<unsigned long, float>::insert<cudaStream_t>(
+    const unsigned long* d_keys, const float* d_vals, size_t len,
+    cudaStream_t stream);
-template void HashTable<unsigned long, paddle::framework::FeatureValue*>::
-    insert<cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool,
-                         size_t feature_value_size, size_t start_index,
-                         cudaStream_t stream);
+template void HashTable<unsigned long, float*>::insert<cudaStream_t>(
+    const unsigned long* d_keys, size_t len, char* pool,
+    size_t feature_value_size, size_t start_index, cudaStream_t stream);
 template void HashTable<long, int>::insert<cudaStream_t>(const long* d_keys,
                                                          const int* d_vals,
@@ -460,30 +439,37 @@ template void HashTable<unsigned long, long>::insert<cudaStream_t>(
     size_t len, cudaStream_t stream);
 
-template void HashTable<unsigned long, paddle::framework::FeatureValue>::
-    dump_to_cpu<cudaStream_t>(int devid, cudaStream_t stream);
 template void HashTable<unsigned long, unsigned long>::insert<cudaStream_t>(
     const unsigned long* d_keys, const unsigned long* d_vals, size_t len,
     cudaStream_t stream);
 
-template void HashTable<unsigned long, paddle::framework::FeatureValue>::update<
-    paddle::framework::FeaturePushValue,
-    Optimizer<paddle::framework::FeatureValue,
-              paddle::framework::FeaturePushValue>,
-    cudaStream_t>(const unsigned long* d_keys,
-                  const paddle::framework::FeaturePushValue* d_grads,
-                  size_t len,
-                  Optimizer<paddle::framework::FeatureValue,
-                            paddle::framework::FeaturePushValue> sgd,
-                  cudaStream_t stream);
-template void HashTable<unsigned long, paddle::framework::FeatureValue*>::update<
-    Optimizer<paddle::framework::FeatureValue,
-              paddle::framework::FeaturePushValue>,
-    cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t len,
-                  Optimizer<paddle::framework::FeatureValue,
-                            paddle::framework::FeaturePushValue> sgd,
-                  cudaStream_t stream);
+template void HashTable<unsigned long, float*>::dump_to_cpu<cudaStream_t>(
+    int devid, cudaStream_t stream);
+
+template void HashTable<unsigned long, float*>::update<SparseAdagradOptimizer,
+                                                        cudaStream_t>(
+    const unsigned long* d_keys, const char* d_grads, size_t len,
+    SparseAdagradOptimizer sgd, cudaStream_t stream);
+template void HashTable<unsigned long, float*>::update<SparseAdamOptimizer,
+                                                        cudaStream_t>(
+    const unsigned long* d_keys, const char* d_grads, size_t len,
+    SparseAdamOptimizer sgd, cudaStream_t stream);
+template void HashTable<unsigned long, float*>::update<
+    SparseAdamSharedOptimizer, cudaStream_t>(
+    const unsigned long* d_keys, const char* d_grads, size_t len,
+    SparseAdamSharedOptimizer sgd, cudaStream_t stream);
// template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::update<
// Optimizer<paddle::framework::FeatureValue,
paddle/fluid/framework/fleet/heter_ps/heter_comm.h
@@ -46,7 +46,10 @@ namespace framework {
#define TYPEALIGN(ALIGNVAL, LEN) \
(((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))
 
-template <typename KeyType, typename ValType, typename GradType>
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
 class HeterComm {
  public:
   HeterComm(size_t capacity, std::shared_ptr<HeterPsResource> resource);
@@ -65,12 +68,9 @@ class HeterComm {
                   GradType* d_grads,
                   size_t len,
                   int& uniq_len);  // NOLINT
-  void dynamic_merge_grad(int gpu_num,
-                          KeyType* d_keys,
-                          GradType* d_grads,
-                          size_t len,
-                          int& uniq_len);
-  void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len);
+  void dynamic_merge_grad(
+      int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len);
+  void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len);
   void build_ps(int num, KeyType* h_keys, ValType* h_vals,
@@ -92,7 +92,7 @@ class HeterComm {
   template <typename Sgd>
   void push_sparse(int num,
                    KeyType* d_keys,
-                   GradType* d_grads,
+                   float* d_grads,
                    size_t len,
                    Sgd& sgd);  // NOLINT
#elif defined(PADDLE_WITH_XPU_KP)
@@ -149,6 +149,13 @@ class HeterComm {
     multi_mf_dim_ = multi_mf_dim;
     max_mf_dim_ = max_mf_dim;
   }
+
+  void set_accessor(FVAccessor& accessor) {
+    feature_value_accessor_ = accessor;
+    // for (auto& ptr_table: ptr_tables_) {
+    //   ptr_table->set_accessor(feature_value_accessor_);
+    // }
+  }
 #endif
 
   bool need_transfer(int send_id, int receive_id) {
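The accessor is configured once and then handed to the comm object by value. A minimal sketch of that flow, assuming hypothetical ToyAccessor/ToyComm names rather than the real Paddle classes:

#include <iostream>
#include <string>
#include <unordered_map>

struct ToyAccessor {
  int embedx_dim = 8;
  void Configure(const std::unordered_map<std::string, float>& cfg) {
    auto it = cfg.find("embedx_dim");
    if (it != cfg.end()) embedx_dim = static_cast<int>(it->second);
  }
};

template <typename FVAccessor>
class ToyComm {
 public:
  void set_accessor(FVAccessor& accessor) { feature_value_accessor_ = accessor; }
  int dim() const { return feature_value_accessor_.embedx_dim; }

 private:
  FVAccessor feature_value_accessor_;  // held by value, as in HeterComm above
};

int main() {
  std::unordered_map<std::string, float> fleet_config = {{"embedx_dim", 16.f}};
  ToyAccessor acc;
  acc.Configure(fleet_config);
  ToyComm<ToyAccessor> comm;
  comm.set_accessor(acc);
  std::cout << comm.dim() << "\n";  // 16
  return 0;
}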
@@ -282,9 +289,11 @@ class HeterComm {
                 char* src_val,
                 size_t val_size);
 
+  FVAccessor feature_value_accessor_;
+
  protected:
   using Table = HashTable<KeyType, ValType>;
-  using PtrTable = HashTable<KeyType, ValType*>;
+  using PtrTable = HashTable<KeyType, float*>;
   std::vector<Table*> tables_;
   std::vector<PtrTable*> ptr_tables_;
   std::shared_ptr<HeterPsResource> resource_;
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -24,8 +24,12 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-template <typename KeyType, typename ValType, typename GradType>
-HeterComm<KeyType, ValType, GradType>::HeterComm(
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm(
     size_t capacity, std::shared_ptr<HeterPsResource> resource) {
   VLOG(1) << "Construct new HeterComm";
   resource_ = resource;
@@ -42,10 +46,14 @@ HeterComm<KeyType, ValType, GradType>::HeterComm(
       tables_.push_back(table);
     } else {
       max_mf_dim_ = resource_->max_mf_dim();
-      size_t val_type_size =
-          TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1));
-      size_t grad_type_size =
-          TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float)));
+      auto accessor_wrapper_ptr =
+          GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
+      size_t val_type_size =
+          accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
+      size_t grad_type_size =
+          accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
       VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size
               << ", feature_value_push_size:" << grad_type_size;
       auto ptr_table = new PtrTable(capacity / load_factor_);
       ptr_table->set_feature_value_size(val_type_size, grad_type_size);
       ptr_tables_.push_back(ptr_table);
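The change above replaces fixed TYPEALIGN-based struct sizes with sizes reported by the accessor wrapper. A hedged host-only sketch of what such size helpers could compute, assuming a layout of a few header floats plus mf_dim embedding floats and 8-byte alignment (the real byte layout lives in the accessor):

#include <cstddef>
#include <iostream>

inline size_t TypeAlign(size_t alignval, size_t len) {
  return (len + alignval - 1) & ~(alignval - 1);
}

// Assumed layouts: pull value = header + (mf_dim + 1) floats,
// push value = header + mf_dim gradient floats.
size_t GetFeatureValueSize(int mf_dim, int header_floats = 8) {
  return TypeAlign(8, sizeof(float) * (header_floats + mf_dim + 1));
}
size_t GetPushValueSize(int mf_dim, int header_floats = 5) {
  return TypeAlign(8, sizeof(float) * (header_floats + mf_dim));
}

int main() {
  int max_mf_dim = 64;
  std::cout << "val bytes:  " << GetFeatureValueSize(max_mf_dim) << "\n"
            << "push bytes: " << GetPushValueSize(max_mf_dim) << "\n";
  return 0;
}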
@@ -58,8 +66,11 @@ HeterComm<KeyType, ValType, GradType>::HeterComm(
   init_path();
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::init_path() {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::init_path() {
   int total_device = resource_->total_device();
   path_.resize(total_device);
   if (!topo_aware_) {
@@ -111,14 +122,18 @@ void HeterComm<KeyType, ValType, GradType>::init_path() {
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
 template <typename DstPlace, typename SrcPlace, typename StreamType>
-void HeterComm<KeyType, ValType, GradType>::memory_copy(DstPlace dst_place,
-                                                        void* dst,
-                                                        SrcPlace src_place,
-                                                        const void* src,
-                                                        size_t count,
-                                                        StreamType stream) {
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy(
+    DstPlace dst_place, void* dst, SrcPlace src_place, const void* src,
+    size_t count, StreamType stream) {
 #if defined(PADDLE_WITH_CUDA)
   cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
   if (stream == 0) {
@@ -129,11 +144,12 @@ void HeterComm<KeyType, ValType, GradType>::memory_copy(DstPlace dst_place,
 #endif
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::create_storage(int start_index,
-                                                           int end_index,
-                                                           int keylen,
-                                                           int vallen) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::create_storage(
+    int start_index, int end_index, int keylen, int vallen) {
 #if defined(PADDLE_WITH_CUDA)
   auto& allocator = allocators_[start_index];
   auto& nodes = path_[start_index][end_index].nodes_;
@@ -167,9 +183,12 @@ void HeterComm<KeyType, ValType, GradType>::create_storage(int start_index,
 #endif
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::destroy_storage(int start_index,
-                                                            int end_index) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage(
+    int start_index, int end_index) {
 #if defined(PADDLE_WITH_CUDA)
   auto& allocator = allocators_[start_index];
   auto& nodes = path_[start_index][end_index].nodes_;
@@ -184,13 +203,17 @@ void HeterComm<KeyType, ValType, GradType>::destroy_storage(int start_index,
 #endif
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index,
-                                                         int num,
-                                                         int* h_left,
-                                                         int* h_right,
-                                                         KeyType* src_key,
-                                                         GradType* src_val) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
+    int start_index, int num, int* h_left, int* h_right, KeyType* src_key,
+    GradType* src_val) {
   int need_copy_val = 0;
   if (src_val) {
     need_copy_val = 1;
@@ -267,14 +290,18 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index,
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index,
-                                                         int gpu_num,
-                                                         int* h_left,
-                                                         int* h_right,
-                                                         KeyType* src_key,
-                                                         char* src_val,
-                                                         size_t val_size) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
+    int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key,
+    char* src_val, size_t val_size) {
   int need_copy_val = 0;
   if (src_val) {
     need_copy_val = 1;
@@ -327,13 +354,17 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index,
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::walk_to_src(int start_index,
-                                                        int gpu_num,
-                                                        int* h_left,
-                                                        int* h_right,
-                                                        char* src_val,
-                                                        size_t val_size) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src(
+    int start_index, int gpu_num, int* h_left, int* h_right, char* src_val,
+    size_t val_size) {
   std::queue<CopyTask> que;
   for (int i = 0; i < gpu_num; i++) {
     if (h_left[i] == -1 || h_right[i] == -1) {
@@ -383,8 +414,11 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_src(int start_index,
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-HeterComm<KeyType, ValType, GradType>::~HeterComm() {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+HeterComm<KeyType, ValType, GradType, FVAccessor>::~HeterComm() {
   if (!multi_mf_dim_) {
     for (auto& table : tables_) {
       delete table;
@@ -402,15 +436,22 @@ HeterComm<KeyType, ValType, GradType>::~HeterComm() {
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::show_one_table(int gpu_num) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::show_one_table(
+    int gpu_num) {
   if (!multi_mf_dim_) {
     tables_[gpu_num]->show();
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-int HeterComm<KeyType, ValType, GradType>::log2i(int x) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+int HeterComm<KeyType, ValType, GradType, FVAccessor>::log2i(int x) {
   unsigned res = 0;
   while (x >>= 1) {
     ++res;
@@ -418,13 +459,20 @@ int HeterComm<KeyType, ValType, GradType>::log2i(int x) {
   return res;
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-int HeterComm<KeyType, ValType, GradType>::get_index_by_devid(int devid) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+int HeterComm<KeyType, ValType, GradType, FVAccessor>::get_index_by_devid(
+    int devid) {
   return resource_->get_index_by_devid(devid);
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::set_sparse_sgd(
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_sparse_sgd(
     const OptimizerConfig& optimizer_config) {
   for (int i = 0; i < resource_->total_device(); ++i) {
     AnyDeviceGuard guard(resource_->dev_id(i));
@@ -436,8 +484,11 @@ void HeterComm<KeyType, ValType, GradType>::set_sparse_sgd(
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::set_embedx_sgd(
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_embedx_sgd(
    const OptimizerConfig& optimizer_config) {
   for (int i = 0; i < resource_->total_device(); ++i) {
     AnyDeviceGuard guard(resource_->dev_id(i));
@@ -449,13 +500,17 @@ void HeterComm<KeyType, ValType, GradType>::set_embedx_sgd(
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::build_ps(int dev_num,
-                                                     KeyType* h_keys,
-                                                     ValType* h_vals,
-                                                     size_t len,
-                                                     size_t chunk_size,
-                                                     int stream_num) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
+    int dev_num, KeyType* h_keys, ValType* h_vals, size_t len,
+    size_t chunk_size, int stream_num) {
   if (len <= 0) {
     return;
   }
@@ -518,14 +573,18 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int dev_num,
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::build_ps(int num,
-                                                     KeyType* h_keys,
-                                                     char* pool,
-                                                     size_t len,
-                                                     size_t feature_value_size,
-                                                     size_t chunk_size,
-                                                     int stream_num) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
+    int num, KeyType* h_keys, char* pool, size_t len,
+    size_t feature_value_size, size_t chunk_size, int stream_num) {
   if (len <= 0) {
     return;
   }
@@ -580,8 +639,11 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int num,
   }
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::merge_grad(
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::merge_grad(
     int dev_num, KeyType* d_keys, GradType* d_grads,
@@ -654,13 +716,12 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(
   sync_stream(stream);
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad(
-    int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len,
-    int& uniq_len) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad(
+    int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len) {
   int dev_id = resource_->dev_id(gpu_num);
   platform::CUDAPlace place = platform::CUDAPlace(dev_id);
   platform::CUDADeviceGuard guard(dev_id);
@@ -668,16 +729,15 @@ void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad(
   size_t temp_storage_bytes;
 
   // VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_;
-  size_t grad_value_size =
-      TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float)));
+  auto accessor_wrapper_ptr =
+      GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
+  size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
 
   auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType));
   KeyType* d_merge_keys_ptr = reinterpret_cast<KeyType*>(d_merge_keys->ptr());
 
   auto d_merge_grads = memory::Alloc(place, len * grad_value_size);
-  GradType* d_merge_grads_ptr =
-      reinterpret_cast<GradType*>(d_merge_grads->ptr());
+  float* d_merge_grads_ptr = reinterpret_cast<float*>(d_merge_grads->ptr());
 
   auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1));
   uint32_t* d_fea_num_info_ptr =
@@ -772,7 +832,8 @@ void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad(
       uniq_len,
       grad_value_size,
      merger_,
-      stream);
+      stream,
+      feature_value_accessor_);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads,
                                              d_merge_grads_ptr,
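The dynamic merge groups duplicate keys and accumulates their gradients. Below is a CPU analogue of that idea (a sketch only; the field layout of four floats per push value is an assumption, and the real work happens in merge_gradients_kernel on the GPU):

#include <cstdint>
#include <iostream>
#include <vector>

struct ToyMerger {
  void update_one(float* out, const float* in, int width) {
    for (int k = 0; k < width; ++k) out[k] = in[k];      // first duplicate
  }
  void merge_one(float* out, const float* in, int width) {
    for (int k = 0; k < width; ++k) out[k] += in[k];     // remaining duplicates
  }
};

int main() {
  const int width = 4;                          // floats per push value
  std::vector<float> input = {1, 0, .1f, .2f,   // grad for key A (dup 1)
                              1, 1, .3f, .4f,   // grad for key A (dup 2)
                              1, 0, .5f, .6f};  // grad for key B
  std::vector<uint32_t> offset = {0, 2};        // group start per unique key
  std::vector<uint32_t> fea_num = {2, 1};       // duplicates per unique key
  std::vector<uint32_t> index = {0, 1, 2};      // original row of each duplicate
  std::vector<float> output(2 * width, 0.f);

  ToyMerger merger;
  for (size_t i = 0; i < offset.size(); ++i) {
    float* out = output.data() + i * width;
    merger.update_one(out, input.data() + index[offset[i]] * width, width);
    for (uint32_t j = 1; j < fea_num[i]; ++j)
      merger.merge_one(out, input.data() + index[offset[i] + j] * width, width);
  }
  std::cout << "merged show for key A = " << output[0] << "\n";  // 2
  return 0;
}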
@@ -782,8 +843,11 @@ void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad(
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::split_input_to_shard(
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard(
     KeyType* d_keys, int* d_idx_ptr, size_t len,
@@ -843,11 +907,12 @@ void HeterComm<KeyType, ValType, GradType>::split_input_to_shard(
   sync_stream(stream);
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
-                                                        KeyType* d_keys,
-                                                        ValType* d_vals,
-                                                        size_t len) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
+    int num, KeyType* d_keys, float* d_vals, size_t len) {
   if (len == 0) {
     return;
   }
@@ -893,12 +958,15 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
   auto d_idx = memory::Alloc(place, len * sizeof(int));
   int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
 
-  size_t val_type_size =
-      TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1));
+  auto accessor_wrapper_ptr =
+      GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
+  size_t val_type_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
+  VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size;
 
   auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
   KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
   auto d_shard_vals = memory::Alloc(place, len * val_type_size);
-  ValType* d_shard_vals_ptr = reinterpret_cast<ValType*>(d_shard_vals->ptr());
+  float* d_shard_vals_ptr = reinterpret_cast<float*>(d_shard_vals->ptr());
 
   split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num);
@@ -944,7 +1012,8 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
     ptr_tables_[i]->get(reinterpret_cast<KeyType*>(node.key_storage),
                         node.val_storage,
                         h_right[i] - h_left[i] + 1,
-                        resource_->remote_stream(i, num));
+                        resource_->remote_stream(i, num),
+                        feature_value_accessor_);
   }
 
   for (int i = 0; i < total_device; ++i) {
@@ -964,10 +1033,16 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
     auto& node = path_[num][i].nodes_.front();
     sync_stream(node.out_stream);
   }
-  heter_comm_kernel_->dy_mf_fill_dvals(
-      d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream);
+  heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr,
+                                       d_vals,
+                                       d_idx_ptr,
+                                       len,
+                                       val_type_size,
+                                       stream,
+                                       feature_value_accessor_);
   sync_stream(stream);
 
   for (int i = 0; i < total_device; ++i) {
     if (h_left[i] == -1 || h_right[i] == -1) {
       continue;
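After the per-shard lookups, values come back in shard order and dy_mf_fill_dvals scatters them back to the caller's original order using the index array. A small host-side analogue of that scatter step (record width is a stand-in for val_type_size / sizeof(float)):

#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const int width = 3;
  std::vector<float> shard_vals = {10, 11, 12,   // row 0 of shard output
                                   20, 21, 22};  // row 1 of shard output
  std::vector<int> idx = {1, 0};  // shard row i belongs at slot idx[i]
  std::vector<float> d_vals(2 * width, 0.f);

  for (size_t i = 0; i < idx.size(); ++i) {
    std::memcpy(d_vals.data() + idx[i] * width,
                shard_vals.data() + i * width, sizeof(float) * width);
  }
  std::cout << d_vals[0] << " " << d_vals[3] << "\n";  // 20 10
  return 0;
}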
@@ -977,13 +1052,17 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
 }
 
 #if defined(PADDLE_WITH_CUDA)
-template <typename KeyType, typename ValType, typename GradType>
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
 template <typename Sgd>
-void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
-                                                        KeyType* d_keys,
-                                                        GradType* d_grads,
-                                                        size_t len,
-                                                        Sgd& sgd) {  // NOLINT
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
+    int dev_num, KeyType* d_keys, float* d_grads, size_t len,
+    Sgd& sgd) {  // NOLINT
   if (len == 0) {
     return;
   }
@@ -991,8 +1070,9 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
   int total_device = resource_->total_device();
   int dev_id = resource_->dev_id(dev_num);
 
-  size_t grad_value_size =
-      TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float)));
+  auto accessor_wrapper_ptr =
+      GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
+  size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
 
   DevPlace place = DevPlace(dev_id);
   AnyDeviceGuard guard(dev_id);
   auto stream = resource_->local_stream(dev_num, 0);
@@ -1037,8 +1117,7 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
   KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
   auto d_shard_grads = memory::Alloc(place, len * grad_value_size);
-  GradType* d_shard_grads_ptr =
-      reinterpret_cast<GradType*>(d_shard_grads->ptr());
+  float* d_shard_grads_ptr = reinterpret_cast<float*>(d_shard_grads->ptr());
 
   int uniq_len = len;
   dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len);
@@ -1048,24 +1127,15 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
   split_input_to_shard(
       d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num);
 
-  if (!multi_mf_dim_) {
-    heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr,
-                                         d_keys,
-                                         d_shard_grads_ptr,
-                                         d_grads,
-                                         d_idx_ptr,
-                                         uniq_len,
-                                         stream);
-  } else {
-    heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr,
-                                               d_keys,
-                                               d_shard_grads_ptr,
-                                               d_grads,
-                                               d_idx_ptr,
-                                               uniq_len,
-                                               grad_value_size,
-                                               stream);
-  }
+  heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr,
+                                             d_keys,
+                                             d_shard_grads_ptr,
+                                             d_grads,
+                                             d_idx_ptr,
+                                             uniq_len,
+                                             grad_value_size,
+                                             stream,
+                                             feature_value_accessor_);
 
   sync_stream(stream);
@@ -1089,33 +1159,17 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
     if (h_left[i] == -1 || h_right[i] == -1) {
       continue;
     }
-    if (!multi_mf_dim_) {
-      create_storage(
-          dev_num, i, shard_len * sizeof(KeyType), shard_len * sizeof(GradType));
-    } else {
-      create_storage(
-          dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size);
-    }
+    create_storage(
+        dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size);
   }
 
-  if (!multi_mf_dim_) {
-    walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr,
-                 d_shard_grads_ptr);
-  } else {
-    walk_to_dest(dev_num,
-                 total_device,
-                 h_left,
-                 h_right,
-                 d_shard_keys_ptr,
-                 reinterpret_cast<char*>(d_shard_grads_ptr),
-                 grad_value_size);
-  }
+  walk_to_dest(dev_num,
+               total_device,
+               h_left,
+               h_right,
+               d_shard_keys_ptr,
+               reinterpret_cast<char*>(d_shard_grads_ptr),
+               grad_value_size);
 
   for (int i = 0; i < total_device; ++i) {
     if (h_left[i] == -1 || h_right[i] == -1) {
@@ -1125,21 +1179,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
     sync_stream(node.in_stream);
 
     AnyDeviceGuard guard(resource_->dev_id(i));
-    if (!multi_mf_dim_) {
-      tables_[i]->rwlock_->WRLock();
-      tables_[i]->update(reinterpret_cast<KeyType*>(node.key_storage),
-                         reinterpret_cast<GradType*>(node.val_storage),
-                         h_right[i] - h_left[i] + 1,
-                         sgd,
-                         resource_->remote_stream(i, dev_num));
-    } else {
-      ptr_tables_[i]->rwlock_->WRLock();
-      ptr_tables_[i]->update(reinterpret_cast<KeyType*>(node.key_storage),
-                             node.val_storage,
-                             h_right[i] - h_left[i] + 1,
-                             sgd,
-                             resource_->remote_stream(i, dev_num));
-    }
+    ptr_tables_[i]->rwlock_->WRLock();
+    ptr_tables_[i]->update(reinterpret_cast<KeyType*>(node.key_storage),
+                           node.val_storage,
+                           h_right[i] - h_left[i] + 1,
+                           sgd,
+                           resource_->remote_stream(i, dev_num));
   }
 
   for (int i = 0; i < total_device; ++i) {
@@ -1162,11 +1207,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
 }
 
 #elif defined(PADDLE_WITH_XPU_KP)
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
-                                                        KeyType* d_keys,
-                                                        GradType* d_grads,
-                                                        size_t len) {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
+    int dev_num, KeyType* d_keys, GradType* d_grads, size_t len) {
   if (len == 0) {
     return;
   }
@@ -1302,9 +1348,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
 #endif
 
 #if defined(PADDLE_WITH_CUDA)
-template <typename KeyType, typename ValType, typename GradType>
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
 template <typename Sgd>
-void HeterComm<KeyType, ValType, GradType>::update_one_table(
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::update_one_table(
     int gpu_num, KeyType* d_keys, GradType* d_grads,
@@ -1323,9 +1372,12 @@ void HeterComm<KeyType, ValType, GradType>::update_one_table(
   cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num));
 }
 
-template <typename KeyType, typename ValType, typename GradType>
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
 template <typename Sgd>
-void HeterComm<KeyType, ValType, GradType>::push_sparse_multi_node(
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse_multi_node(
     int gpu_num, KeyType* d_keys, GradType* d_grads,
@@ -1352,8 +1404,11 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse_multi_node(
       sgd);
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-int HeterComm<KeyType, ValType, GradType>::gather_one_node_grad(
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_one_node_grad(
     int gpu_num, KeyType* d_keys, GradType* d_grads, int len) {
   int total_gpu = resource_->total_device();
   int dev_id = resource_->dev_id(gpu_num);
@@ -1454,8 +1509,11 @@ int HeterComm<KeyType, ValType, GradType>::gather_one_node_grad(
   return ret;
 }
 
-template <typename KeyType, typename ValType, typename GradType>
-int HeterComm<KeyType, ValType, GradType>::gather_multi_node_grad(
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_multi_node_grad(
    int gpu_num, KeyType* d_keys, GradType* d_grads, int len) {
   int dev_id = resource_->dev_id(gpu_num);
   auto& storage = storage_[gpu_num];
@@ -1525,8 +1583,11 @@ int HeterComm<KeyType, ValType, GradType>::gather_multi_node_grad(
 }
 #endif
 
-template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::end_pass() {
+template <typename KeyType, typename ValType, typename GradType,
+          typename FVAccessor>
+void HeterComm<KeyType, ValType, GradType, FVAccessor>::end_pass() {
   int total_device = resource_->total_device();
   std::vector<std::thread> threads;
@@ -1547,8 +1608,10 @@ void HeterComm<KeyType, ValType, GradType>::end_pass() {
}
}
// template <typename KeyType, typename ValType, typename GradType>
// void HeterComm<KeyType, ValType, GradType>::dump_to_cpu(int index) {
// template <typename KeyType, typename ValType, typename GradType, typename
// FVAccessor>
// void HeterComm<KeyType, ValType, GradType, FVAccessor>::dump_to_cpu(int
// index) {
// auto stream = resource_->local_stream(index, 0);
// int dev_id = resource_->dev_id(index);
// platform::CUDADeviceGuard guard(dev_id);
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
@@ -128,22 +128,28 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals,
   }
 }
 
-template <typename KeyType, typename GradType, typename T>
+template <typename KeyType, typename T, typename FVAccessor>
 __global__ void dy_mf_fill_shard_grads_kernel(KeyType* d_shard_keys,
                                               KeyType* d_keys,
-                                              GradType* d_shard_grads,
-                                              GradType* d_grads,
+                                              float* d_shard_grads,
+                                              float* d_grads,
                                               T* idx,
                                               size_t len,
-                                              size_t grad_value_size) {
+                                              size_t grad_value_size,
+                                              FVAccessor feature_value_accessor) {
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < len) {
     d_shard_keys[i] = d_keys[idx[i]];
-    *(GradType*)((char*)d_shard_grads + i * grad_value_size) =
-        *(GradType*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size);
+    float* cur = (float*)((char*)d_shard_grads + i * grad_value_size);
+    float* shard_val =
+        (float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size);
+    feature_value_accessor.PushValueFill(cur, shard_val);
   }
 }
 
+template <typename FVAccessor>
 __global__ void merge_gradients_kernel(const uint32_t* offset,
                                        const uint32_t* fea_num,
                                        const uint32_t* index,
@@ -151,36 +157,40 @@ __global__ void merge_gradients_kernel(const uint32_t* offset,
                                        char* output,
                                        int n,
                                        size_t grad_value_size,
-                                       DynamicGradMerger& merger_) {
+                                       DynamicGradMerger& merger,
+                                       FVAccessor& feature_value_accessor) {
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < n) {
     uint32_t start = offset[i];
     uint32_t num = fea_num[i];
     int ori_index = index[start];
-    FeaturePushValue& out = *(FeaturePushValue*)(output + i * grad_value_size);
-    FeaturePushValue& in =
-        *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size);
-    merger_.update_one(out, in);
+    float* out = (float*)(output + i * grad_value_size);
+    float* in = (float*)(input + size_t(ori_index) * grad_value_size);
+    merger.update_one(out, in, feature_value_accessor);
     for (int j = 1; j < num; ++j) {
       ori_index = index[start + j];
-      FeaturePushValue& rhs =
-          *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size);
-      merger_.merge_one(out, rhs);
+      in = (float*)(input + size_t(ori_index) * grad_value_size);
+      merger.merge_one(out, in, feature_value_accessor);
     }
   }
 }
 
-template <typename ValType, typename T>
-__global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals,
-                                        ValType* d_vals,
+template <typename T, typename FVAccessor>
+__global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals,
+                                        float* d_vals,
                                         T* idx,
                                         size_t len,
-                                        size_t val_size) {
+                                        size_t val_size,
+                                        FVAccessor feature_value_accessor) {
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < len) {
     uint64_t new_offset = uint64_t(idx[i]) * val_size;
-    *(ValType*)((char*)d_vals + new_offset) =
-        *(ValType*)((char*)d_shard_vals + i * val_size);
+    float* cur = (float*)((char*)d_vals + new_offset);
+    float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size);
+    int mf_dim =
+        int(shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]);
+    feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim);
   }
 }
@@ -312,15 +322,20 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage,
       debug_synchronous));
 }
 
-template <typename KeyType, typename GradType, typename T, typename StreamType>
+template <typename KeyType, typename T, typename StreamType,
+          typename FVAccessor>
 void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys,
                                              KeyType* d_keys,
-                                             GradType* d_shard_grads,
-                                             GradType* d_grads,
+                                             float* d_shard_grads,
+                                             float* d_grads,
                                              T* idx,
                                              long long len,
                                              size_t grad_value_size,
-                                             const StreamType& stream) {
+                                             const StreamType& stream,
+                                             FVAccessor& feature_value_accessor) {
   int grid_size = (len - 1) / block_size_ + 1;
   size_t c_len = (size_t)len;
   dy_mf_fill_shard_grads_kernel<<<grid_size, block_size_, 0, stream>>>(
@@ -330,10 +345,11 @@ void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys,
       d_grads,
       idx,
      c_len,
-      grad_value_size);
+      grad_value_size,
+      feature_value_accessor);
 }
 
-template <typename StreamType>
+template <typename StreamType, typename FVAccessor>
 void HeterCommKernel::merge_gradient(const uint32_t* offset,
                                      const uint32_t* fea_num,
                                      const uint32_t* index,
@@ -342,23 +358,33 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset,
                                      int n,
                                      size_t grad_value_size,
                                      DynamicGradMerger& merger_,
-                                     const StreamType& stream) {
+                                     const StreamType& stream,
+                                     FVAccessor& feature_value_accessor) {
   int grid_size = (n - 1) / block_size_ + 1;
   merge_gradients_kernel<<<grid_size, block_size_, 0, stream>>>(
-      offset, fea_num, index, input, output, n, grad_value_size, merger_);
+      offset, fea_num, index, input, output, n, grad_value_size, merger_,
+      feature_value_accessor);
 }
 
-template <typename ValType, typename T, typename StreamType>
-void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals,
-                                       ValType* d_vals,
+template <typename T, typename StreamType, typename FVAccessor>
+void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals,
+                                       float* d_vals,
                                        T* idx,
                                        long long len,
                                        size_t val_size,
-                                       const StreamType& stream) {
+                                       const StreamType& stream,
+                                       FVAccessor& feature_value_accessor) {
   int grid_size = (len - 1) / block_size_ + 1;
   size_t c_len = (size_t)len;
   dy_mf_fill_dvals_kernel<<<grid_size, block_size_, 0, stream>>>(
-      d_shard_vals, d_vals, idx, c_len, val_size);
+      d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor);
 }
 
 template void HeterCommKernel::fill_idx<int, cudaStream_t>(
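All of these host wrappers derive the launch shape the same way. A quick host-only sanity check of the grid_size arithmetic used above (block_size_ defaults to 256 in this class):

#include <iostream>

int grid_for(long long len, int block_size = 256) {
  // Ceiling division: enough blocks so every element gets a thread.
  return static_cast<int>((len - 1) / block_size + 1);
}

int main() {
  std::cout << grid_for(1) << " " << grid_for(256) << " " << grid_for(257)
            << "\n";  // 1 1 2
  return 0;
}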
@@ -402,17 +428,15 @@ template void HeterCommKernel::fill_shard_key<unsigned long, int, cudaStream_t>(
     long long len,
     const cudaStream_t& stream);
 
-template void HeterCommKernel::fill_shard_grads<
-    unsigned long,
-    paddle::framework::FeaturePushValue,
-    int,
-    cudaStream_t>(unsigned long* d_shard_keys,
-                  unsigned long* d_keys,
-                  paddle::framework::FeaturePushValue* d_shard_grads,
-                  paddle::framework::FeaturePushValue* d_grads,
-                  int* idx,
-                  long long len,
-                  const cudaStream_t& stream);
+template void HeterCommKernel::fill_shard_grads<unsigned long, float, int,
+                                                cudaStream_t>(
+    unsigned long* d_shard_keys,
+    unsigned long* d_keys,
+    float* d_shard_grads,
+    float* d_grads,
+    int* idx,
+    long long len,
+    const cudaStream_t& stream);
 
 template void HeterCommKernel::fill_dvals<paddle::framework::FeatureValue, int, cudaStream_t>(
@@ -467,20 +491,23 @@ template void HeterCommKernel::reduce_by_key<
     cudaStream_t stream,
     bool debug_synchronous);
 
-template void HeterCommKernel::dy_mf_fill_shard_grads<
-    unsigned long,
-    paddle::framework::FeaturePushValue,
-    int,
-    cudaStream_t>(unsigned long* d_shard_keys,
-                  unsigned long* d_keys,
-                  paddle::framework::FeaturePushValue* d_shard_grads,
-                  paddle::framework::FeaturePushValue* d_grads,
-                  int* idx,
-                  long long len,
-                  size_t grad_value_size,
-                  const cudaStream_t& stream);
+template void HeterCommKernel::dy_mf_fill_shard_grads<
+    unsigned long, int, cudaStream_t, CommonFeatureValueAccessor>(
+    unsigned long* d_shard_keys,
+    unsigned long* d_keys,
+    float* d_shard_grads,
+    float* d_grads,
+    int* idx,
+    long long len,
+    size_t grad_value_size,
+    const cudaStream_t& stream,
+    CommonFeatureValueAccessor& feature_value_accessor);
 
-template void HeterCommKernel::merge_gradient<cudaStream_t>(
+template void
+HeterCommKernel::merge_gradient<cudaStream_t, CommonFeatureValueAccessor>(
     const uint32_t* offset,
     const uint32_t* fea_num,
     const uint32_t* index,
@@ -489,16 +516,18 @@ template void HeterCommKernel::merge_gradient<cudaStream_t>(
     int n,
     size_t grad_value_size,
     DynamicGradMerger& merger_,
-    const cudaStream_t& stream);
+    const cudaStream_t& stream,
+    CommonFeatureValueAccessor& feature_value_accessor);
 
 template void HeterCommKernel::
-    dy_mf_fill_dvals<paddle::framework::FeatureValue, int, cudaStream_t>(
-        paddle::framework::FeatureValue* d_shard_vals,
-        paddle::framework::FeatureValue* d_vals,
+    dy_mf_fill_dvals<int, cudaStream_t, CommonFeatureValueAccessor>(
+        float* d_shard_vals,
+        float* d_vals,
         int* idx,
        long long len,
         size_t val_size,
-        const cudaStream_t& stream);
+        const cudaStream_t& stream,
+        CommonFeatureValueAccessor& feature_value_accessor);
 #endif
 }  // namespace framework
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h
@@ -41,25 +41,16 @@ struct DynamicGradMerger {
     return out;
   }
 
-  template <typename T>
-  __device__ __forceinline__ void update_one(T& output, const T& input) {
-    output.slot = input.slot;
-    output.show = input.show;
-    output.clk = input.clk;
-    output.mf_dim = input.mf_dim;
-    output.lr_g = input.lr_g;
-    for (int i = 0; i < output.mf_dim; ++i) {
-      output.mf_g[i] = input.mf_g[i];
-    }
+  template <typename FVAccessor>
+  __device__ __forceinline__ void update_one(
+      float* output, const float* input, FVAccessor& feature_value_accessor) {
+    feature_value_accessor.PushValueFill(output, input);
   }
-  template <typename T>
-  __device__ __forceinline__ void merge_one(T& output, const T& input) {
-    output.show += input.show;
-    output.clk += input.clk;
-    output.lr_g += input.lr_g;
-    for (int i = 0; i < input.mf_dim; ++i) {
-      output.mf_g[i] += input.mf_g[i];
-    }
+
+  template <typename FVAccessor>
+  __device__ __forceinline__ void merge_one(
+      float* output, const float* input, FVAccessor& feature_value_accessor) {
+    feature_value_accessor.MergePushValue(output, input);
   }
 };
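The merger now delegates field-by-field copy and accumulation to the accessor. A minimal host-only stand-in for the two accessor calls it relies on (the two-field show/clk layout is an illustrative assumption, not the real push-value layout):

#include <iostream>

struct ToyPushAccessor {
  static const int kWidth = 2;  // show, clk
  void PushValueFill(float* output, const float* input) const {
    for (int i = 0; i < kWidth; ++i) output[i] = input[i];
  }
  void MergePushValue(float* output, const float* input) const {
    for (int i = 0; i < kWidth; ++i) output[i] += input[i];
  }
};

int main() {
  ToyPushAccessor acc;
  float a[2] = {1.f, 0.f}, b[2] = {1.f, 1.f}, out[2];
  acc.PushValueFill(out, a);    // update_one on the first duplicate
  acc.MergePushValue(out, b);   // merge_one on later duplicates
  std::cout << out[0] << " " << out[1] << "\n";  // 2 1
  return 0;
}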
@@ -146,19 +137,20 @@ class HeterCommKernel {
                      bool debug_synchronous = false);
 
   template <typename KeyType,
-            typename GradType,
             typename T,
-            typename StreamType>
+            typename StreamType,
+            typename FVAccessor>
   void dy_mf_fill_shard_grads(KeyType* d_shard_keys,
                               KeyType* d_keys,
-                              GradType* d_shard_grads,
-                              GradType* d_grads,
+                              float* d_shard_grads,
+                              float* d_grads,
                               T* idx,
                               long long len,
                               size_t grad_value_size,
-                              const StreamType& stream);
+                              const StreamType& stream,
+                              FVAccessor& feature_value_accessor);
 
-  template <typename StreamType>
+  template <typename StreamType, typename FVAccessor>
   void merge_gradient(const uint32_t* offset,
                       const uint32_t* fea_num,
                       const uint32_t* index,
@@ -167,15 +159,17 @@ class HeterCommKernel {
                       int n,
                       size_t grad_value_size,
                       DynamicGradMerger& merger_,
-                      const StreamType& stream);
+                      const StreamType& stream,
+                      FVAccessor& feature_value_accessor);
 
-  template <typename ValType, typename T, typename StreamType>
-  void dy_mf_fill_dvals(ValType* d_shard_vals,
-                        ValType* d_vals,
+  template <typename T, typename StreamType, typename FVAccessor>
+  void dy_mf_fill_dvals(float* d_shard_vals,
+                        float* d_vals,
                         T* idx,
                         long long len,
                         size_t val_size,
-                        const StreamType& stream);
+                        const StreamType& stream,
+                        FVAccessor& feature_value_accessor);
 
  private:
  int block_size_{256};
paddle/fluid/framework/fleet/heter_ps/heter_ps.cc
@@ -22,34 +22,43 @@ namespace paddle {
 namespace framework {
 
 HeterPsBase* HeterPsBase::get_instance(
-    size_t capacity, std::shared_ptr<HeterPsResource> resource) {
-  return new HeterPs(capacity, resource);
+    size_t capacity,
+    std::shared_ptr<HeterPsResource> resource,
+    std::unordered_map<std::string, float> fleet_config,
+    std::string accessor_type,
+    int optimizer_type) {
+  if (accessor_type == "CtrDymfAccessor" &&
+      (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) {
+    return new HeterPs<CommonFeatureValueAccessor>(
+        capacity, resource, accessor_type, fleet_config, optimizer_type);
+  } else {
+    VLOG(0) << " HeterPsBase get_instance Warning: now only support "
+               "CtrDymfAccessor, but get "
+            << accessor_type_;
+    return new HeterPs<CommonFeatureValueAccessor>(
+        capacity, resource, accessor_type, fleet_config, optimizer_type);
+  }
 }
 
-HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) {
-  comm_ = std::make_shared<
-      HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>(capacity, resource);
+HeterPs::HeterPs(size_t capacity,
+                 std::shared_ptr<HeterPsResource> resource,
+                 std::unordered_map<std::string, float> fleet_config,
+                 std::string accessor_type,
+                 int optimizer_type) {
+  comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>(
+      capacity, resource);
+  optimizer_type_ = optimizer_type;
 }
 
 HeterPs::~HeterPs() {}
 
 void HeterPs::pull_sparse(int num,
                           FeatureKey* d_keys,
-                          FeatureValue* d_vals,
+                          float* d_vals,
                           size_t len) {
   comm_->pull_sparse(num, d_keys, d_vals, len);
 }
 
 void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
                        size_t len, size_t chunk_size, int stream_num) {
   comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num);
 }
 
 int HeterPs::get_index_by_devid(int devid) {
   return comm_->get_index_by_devid(devid);
 }
@@ -68,7 +77,7 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
 
 void HeterPs::push_sparse(int num,
                           FeatureKey* d_keys,
-                          FeaturePushValue* d_grads,
+                          float* d_grads,
                           size_t len) {
   comm_->push_sparse(num, d_keys, d_grads, len);
// comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_);
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
@@ -22,80 +22,139 @@ namespace paddle {
 namespace framework {
 
 HeterPsBase* HeterPsBase::get_instance(
-    size_t capacity, std::shared_ptr<HeterPsResource> resource) {
-  return new HeterPs(capacity, resource);
+    size_t capacity,
+    std::shared_ptr<HeterPsResource> resource,
+    std::unordered_map<std::string, float> fleet_config,
+    std::string accessor_type,
+    int optimizer_type) {
+  if (accessor_type == "CtrDymfAccessor" &&
+      (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) {
+    return new HeterPs<CommonFeatureValueAccessor>(
+        capacity, resource, fleet_config, accessor_type, optimizer_type);
+  } else {
+    VLOG(0) << " HeterPsBase get_instance Warning: now only support "
+               "CtrDymfAccessor, but get "
+            << accessor_type;
+    return new HeterPs<CommonFeatureValueAccessor>(
+        capacity, resource, fleet_config, accessor_type, optimizer_type);
+  }
 }
 
-HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) {
-  comm_ = std::make_shared<
-      HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>(capacity, resource);
-  opt_ = Optimizer<FeatureValue, FeaturePushValue>();
+template <typename FVAccessor>
+HeterPs<FVAccessor>::HeterPs(size_t capacity,
+                             std::shared_ptr<HeterPsResource> resource,
+                             std::unordered_map<std::string, float> fleet_config,
+                             std::string accessor_type, int optimizer_type) {
+  comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>(
+      capacity, resource);
+  feature_value_accessor_.Configure(fleet_config);
+  set_accessor(feature_value_accessor_);
+  accessor_type_ = accessor_type;
+  optimizer_type_ = optimizer_type;
 }
 
-HeterPs::~HeterPs() {}
+template <typename FVAccessor>
+HeterPs<FVAccessor>::~HeterPs() {}
 
-void HeterPs::pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals,
-                          size_t len) {
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::pull_sparse(int num, FeatureKey* d_keys,
+                                      float* d_vals, size_t len) {
   comm_->pull_sparse(num, d_keys, d_vals, len);
 }
 
-void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
-                       size_t len, size_t chunk_size, int stream_num) {
-  comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num);
-}
-
-void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, size_t len,
-                       size_t feature_value_size, size_t chunk_size,
-                       int stream_num) {
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::build_ps(int num, FeatureKey* h_keys, char* pool,
+                                   size_t len, size_t feature_value_size,
+                                   size_t chunk_size, int stream_num) {
   comm_->build_ps(num, h_keys, pool, len, feature_value_size, chunk_size,
                   stream_num);
 }
 
-int HeterPs::get_index_by_devid(int devid) {
+template <typename FVAccessor>
+int HeterPs<FVAccessor>::get_index_by_devid(int devid) {
   return comm_->get_index_by_devid(devid);
 }
 
-void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) {
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::set_sparse_sgd(const OptimizerConfig& optimizer_config) {
   comm_->set_sparse_sgd(optimizer_config);
 }
 
-void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) {
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::set_embedx_sgd(const OptimizerConfig& optimizer_config) {
  comm_->set_embedx_sgd(optimizer_config);
 }
 
-void HeterPs::end_pass() { comm_->end_pass(); }
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::end_pass() { comm_->end_pass(); }
 
-void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::show_one_table(int gpu_num) {
+  comm_->show_one_table(gpu_num);
+}
 
-void HeterPs::push_sparse(int num, FeatureKey* d_keys,
-                          FeaturePushValue* d_grads, size_t len) {
-  comm_->push_sparse(num, d_keys, d_grads, len, opt_);
-  // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_);
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::push_sparse(int num, FeatureKey* d_keys,
+                                      float* d_grads, size_t len) {
+  if (accessor_type_ == "CtrDymfAccessor") {
+    if (optimizer_type_ == 3) {  // adam
+      auto optimizer = SparseAdamOptimizer(feature_value_accessor_);
+      VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():"
+              << optimizer.EmbedDim();
+      comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
+    } else if (optimizer_type_ == 4) {  // shared_adam
+      auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_);
+      VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():"
+              << optimizer.EmbedDim();
+      comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
+    } else if (optimizer_type_ == 1) {  // adagrad
+      auto optimizer = SparseAdagradOptimizer(feature_value_accessor_);
+      VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():"
+              << optimizer.EmbedDim();
+      comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
+    } else {
+      VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1),"
+                 "adam(3) or shared_adam(4), bug get optimizer type:"
+              << optimizer_type_;
+    }
+  } else {
+    VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get "
+            << accessor_type_;
+  }
 }
 
-void HeterPs::set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms,
-                                     const std::vector<ncclComm_t>& inter_comms,
-                                     int comm_size) {
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::set_nccl_comm_and_size(
+    const std::vector<ncclComm_t>& inner_comms,
+    const std::vector<ncclComm_t>& inter_comms, int comm_size) {
   comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size);
 }
 
-void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) {
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) {
   comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim);
 }
 
+template <typename FVAccessor>
+void HeterPs<FVAccessor>::set_accessor(FVAccessor& accessor) {
+  comm_->set_accessor(accessor);
+}
+
 }  // end namespace framework
 }  // end namespace paddle
 #endif
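The push path above just dispatches on optimizer_type (1 = adagrad, 3 = adam, 4 = shared_adam, the same values checked in get_instance). A host-only sketch of that selection logic with hypothetical toy optimizer types, not the real device-side optimizers:

#include <iostream>
#include <memory>
#include <string>

struct OptBase {
  virtual ~OptBase() = default;
  virtual std::string name() const = 0;
};
struct Adagrad : OptBase { std::string name() const override { return "adagrad"; } };
struct Adam : OptBase { std::string name() const override { return "adam"; } };
struct SharedAdam : OptBase { std::string name() const override { return "shared_adam"; } };

std::unique_ptr<OptBase> make_optimizer(int optimizer_type) {
  switch (optimizer_type) {
    case 1: return std::make_unique<Adagrad>();
    case 3: return std::make_unique<Adam>();
    case 4: return std::make_unique<SharedAdam>();
    default: return nullptr;  // unsupported, mirrors the VLOG(0) error branch
  }
}

int main() {
  for (int t : {1, 3, 4}) std::cout << make_optimizer(t)->name() << "\n";
  return 0;
}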
paddle/fluid/framework/fleet/heter_ps/heter_ps.h
@@ -26,24 +26,23 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+template <typename FVAccessor>
 class HeterPs : public HeterPsBase {
  public:
   HeterPs() {}
-  HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource);
+  HeterPs(size_t capacity,
+          std::shared_ptr<HeterPsResource> resource,
+          std::unordered_map<std::string, float> fleet_config,
+          std::string accessor_type,
+          int optimizer_type);
   virtual ~HeterPs();
   HeterPs(const HeterPs&) = delete;
   HeterPs& operator=(const HeterPs&) = delete;
 
   void pull_sparse(int num,
                    FeatureKey* d_keys,
-                   FeatureValue* d_vals,
+                   float* d_vals,
                    size_t len) override;
-  void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len,
-                size_t chunk_size, int stream_num) override;
   void build_ps(int num, FeatureKey* h_keys, char* pool,
@@ -56,6 +55,8 @@ class HeterPs : public HeterPsBase {
                               const std::vector<ncclComm_t>& inter_comms,
                               int comm_size) override;
   void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override;
+
+  void set_accessor(FVAccessor& accessor);
 #endif
 
   void set_sparse_sgd(const OptimizerConfig& optimizer_config) override;
@@ -66,13 +67,15 @@ class HeterPs : public HeterPsBase {
   void show_one_table(int gpu_num) override;
   void push_sparse(int num,
                    FeatureKey* d_keys,
-                   FeaturePushValue* d_grads,
+                   float* d_grads,
                    size_t len) override;
 
  private:
-  std::shared_ptr<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>> comm_;
+  std::shared_ptr<HeterComm<FeatureKey, float*, float*, FVAccessor>> comm_;
 #if defined(PADDLE_WITH_CUDA)
-  Optimizer<FeatureValue, FeaturePushValue> opt_;
+  FVAccessor feature_value_accessor_;
+  std::string accessor_type_;
+  int optimizer_type_;
 #endif
 };
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
@@ -34,14 +34,8 @@ class HeterPsBase {
   virtual void pull_sparse(int num,
                            FeatureKey* d_keys,
-                           FeatureValue* d_vals,
+                           float* d_vals,
                            size_t len) = 0;
-  virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
-                        size_t len, size_t chunk_size, int stream_num) = 0;
   virtual void build_ps(int num, FeatureKey* h_keys, char* pool,
@@ -56,19 +50,25 @@ class HeterPsBase {
                                       const std::vector<ncclComm_t>& inter_comms,
                                       int comm_size) = 0;
   virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0;
 #endif
   virtual void end_pass() = 0;
   virtual void show_one_table(int gpu_num) = 0;
   virtual void push_sparse(int num,
                            FeatureKey* d_keys,
-                           FeaturePushValue* d_grads,
+                           float* d_grads,
                            size_t len) = 0;
 
   virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0;
   virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0;
 
-  static HeterPsBase* get_instance(size_t capacity,
-                                   std::shared_ptr<HeterPsResource> resource);
+  static HeterPsBase* get_instance(
+      size_t capacity,
+      std::shared_ptr<HeterPsResource> resource,
+      // CommonFeatureValueAccessor feature_value_accessor,
+      std::unordered_map<std::string, float> fleet_config,
+      std::string accessor_type,
+      int optimizer_type);
 };
 
 }  // end namespace framework
paddle/fluid/framework/fleet/heter_ps/mem_pool.h
@@ -82,20 +82,6 @@ class HBMMemoryPool : public managed {
     cudaMemset(mem_, 0, block_size_ * capacity);
   }
 
-  friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) {
-    for (size_t k = 0; k < 5; k++) {
-      auto x = (FeatureValue*)(p.mem() + k * p.capacity());
-      out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot
-          << " lr: " << x->lr << " mf_dim: " << x->mf_size
-          << " mf_size: " << x->mf_size << " mf:";
-      for (int i = 0; i < x->mf_size + 1; ++i) {
-        out << " " << x->mf[i];
-      }
-      out << "\n";
-    }
-    return out;
-  }
-
   char* mem() { return mem_; }
 
   size_t capacity() { return capacity_; }
...
...
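The removed stream operator walked the pool by casting raw bytes back to a FeatureValue struct; with the accessor rework the pool is addressed purely as bytes, each value occupying a runtime-computed feature_value_size. A minimal sketch of that addressing, assuming a fixed per-value byte size (standalone helper, not part of the patch):

  #include <cstddef>

  // Hypothetical helper: pointer to the k-th value in a byte pool where every
  // value occupies `value_bytes` bytes (e.g. the accessor's feature_value_size).
  inline char* value_at(char* pool, std::size_t k, std::size_t value_bytes) {
    return pool + k * value_bytes;
  }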
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
View file @ b8d106e1
...
@@ -27,134 +27,460 @@ namespace paddle {
namespace framework {

#if defined(PADDLE_WITH_CUDA)

template <typename ValType, typename GradType>
class Optimizer {
 public:
  Optimizer() {}
  __host__ Optimizer(CommonFeatureValueAccessor feature_value_accessor) {
    feature_value_accessor_ = feature_value_accessor;
  }
  __host__ ~Optimizer() {}

  __device__ void update_value(const OptimizerConfig& optimizer_config,
                               float& val,  // NOLINT
                               const float& grad) {
    printf(
        "Warning: update_value will not used. Please use dy_mf_update_value\n");
  }
  __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
                                     float* ptr,
                                     const float* grad) {}
  ~Optimizer() {}

  CommonFeatureValueAccessor feature_value_accessor_;
  void initialize() {}

  size_t _embedding_dim;
  size_t _lr_embedding_dim;
};
class SparseAdagradOptimizer : public Optimizer {
 public:
  __host__ SparseAdagradOptimizer(
      CommonFeatureValueAccessor feature_value_accessor)
      : Optimizer(feature_value_accessor) {
    _lr_embedding_dim = 1;
    _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim();
  }

  __device__ void update_value_work(const OptimizerConfig& optimizer_config,
                                    int n,
                                    float* w,
                                    float* sgd,  // NOLINT
                                    const float* g,
                                    float scale) {
    float& g2sum = sgd[G2SumIndex()];
    double add_g2sum = 0;
    double ratio = optimizer_config.mf_learning_rate *
                   sqrt(optimizer_config.mf_initial_g2sum /
                        (optimizer_config.mf_initial_g2sum + g2sum));
    for (int i = 0; i < n; ++i) {
      double scaled_grad = g[i] / scale;
      w[i] += scaled_grad * ratio;
      if (w[i] < optimizer_config.mf_min_bound)
        w[i] = optimizer_config.mf_min_bound;
      if (w[i] > optimizer_config.mf_max_bound)
        w[i] = optimizer_config.mf_max_bound;
      add_g2sum += scaled_grad * scaled_grad;
    }
    g2sum += add_g2sum / n;
  }

  __device__ void update_value(const OptimizerConfig& optimizer_config,
                               float& val,  // NOLINT
                               const float& grad) {
    printf(
        "Warning: update_value will not used. Please use dy_mf_update_value\n");
  }

  __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
                                     float* ptr,
                                     const float* grad) {
    float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()];
    float g_click =
        grad[feature_value_accessor_.common_push_value.ClickIndex()];

    ptr[feature_value_accessor_.common_feature_value.SlotIndex()] =
        grad[feature_value_accessor_.common_push_value.SlotIndex()];
    ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show;
    ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click;
    ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
        optimizer_config.nonclk_coeff * (g_show - g_click) +
        optimizer_config.clk_coeff * g_click;

    update_value_work(
        optimizer_config,
        1,
        ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(),
        ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(),
        grad + feature_value_accessor_.common_push_value.EmbedGIndex(),
        g_show);

    int mf_dim =
        int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]);
    if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
      if (optimizer_config.mf_create_thresholds <=
          optimizer_config.nonclk_coeff *
                  (ptr[feature_value_accessor_.common_feature_value
                           .ShowIndex()] -
                   ptr[feature_value_accessor_.common_feature_value
                           .ClickIndex()]) +
              optimizer_config.clk_coeff *
                  ptr[feature_value_accessor_.common_feature_value
                          .ClickIndex()]) {
        ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] =
            feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
            sizeof(float);
        int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
        curandState state;
        curand_init(clock64(), tid_x, 0, &state);
        for (int i = 0; i < mf_dim; ++i) {
          ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] =
              (curand_uniform(&state)) * optimizer_config.mf_initial_range;
        }
      }
    } else {
      update_value_work(
          optimizer_config,
          mf_dim,
          ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(),
          ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(),
          grad + feature_value_accessor_.common_push_value.EmbedxGIndex(),
          g_show);
    }
  }

  __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); }
  __host__ __device__ size_t EmbedDim() { return _lr_embedding_dim; }
  __host__ __device__ size_t EmbedxDim() { return _embedding_dim; }
  __host__ __device__ size_t G2SumIndex() { return 0; }
  __host__ __device__ size_t EmbedxG2SumIndex() { return 0; }
};
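The embedding update above is a per-feature Adagrad variant: the learning rate is scaled by sqrt(mf_initial_g2sum / (mf_initial_g2sum + g2sum)), each updated weight is clipped into [mf_min_bound, mf_max_bound], and the mean squared gradient of the slice is accumulated into one shared g2sum. A minimal host-side restatement of the same arithmetic (hypothetical standalone function; the cfg_* parameter names stand in for the mf_* fields of OptimizerConfig and are illustration only):

  #include <algorithm>
  #include <cmath>

  void adagrad_update_sketch(int n, float* w, float& g2sum, const float* g,
                             float scale, float cfg_lr, float cfg_init_g2sum,
                             float cfg_min, float cfg_max) {
    double add_g2sum = 0.0;
    // Effective step size shrinks as the accumulated squared gradient grows.
    double ratio = cfg_lr * std::sqrt(cfg_init_g2sum / (cfg_init_g2sum + g2sum));
    for (int i = 0; i < n; ++i) {
      double scaled_grad = g[i] / scale;  // gradients are averaged by show count
      w[i] += static_cast<float>(scaled_grad * ratio);
      w[i] = std::min(std::max(w[i], cfg_min), cfg_max);  // clip after the step
      add_g2sum += scaled_grad * scaled_grad;
    }
    g2sum += static_cast<float>(add_g2sum / n);  // one accumulator per slice
  }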
class SparseAdamOptimizer : public Optimizer {
 public:
  __host__ SparseAdamOptimizer(
      CommonFeatureValueAccessor feature_value_accessor)
      : Optimizer(feature_value_accessor) {
    _lr_embedding_dim = 1;
    _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim();
  }

  __device__ void update_lr(const OptimizerConfig& optimizer_config,
                            float& w,  // NOLINT
                            float& g2sum,
                            float g,  // NOLINT
                            int n,
                            float* w,
                            float* sgd,
                            const float* g,
                            float scale) {
    double add_g2sum = 0;
    double ratio = optimizer_config.learning_rate *
                   sqrt(optimizer_config.initial_g2sum /
                        (optimizer_config.initial_g2sum + g2sum));
    double scaled_grad = g / scale;
    float* moment1 = sgd + GSumIndex();
    float* moment2 = sgd + G2SumIndex();
    float* beta1_pow = sgd + Beta1PowIndex();
    float* beta2_pow = sgd + Beta2PowIndex();
    w += scaled_grad * ratio;
    float beta1_pow_ = *beta1_pow;
    float beta2_pow_ = *beta2_pow;
    if (w < optimizer_config.min_bound) w = optimizer_config.min_bound;
    if (w > optimizer_config.max_bound) w = optimizer_config.max_bound;
    float epsilon = 1e-08;
    double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) /
                   (1.0 - beta1_pow_);
    for (int i = 0; i < n; ++i) {
      double scaled_grad = g[i] / scale;
      add_g2sum += scaled_grad * scaled_grad;

      double new_moment1 =
          optimizer_config.beta1_decay_rate * moment1[i] +
          (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad;
      double new_moment2 =
          optimizer_config.beta2_decay_rate * moment2[i] +
          (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad;
      w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon));

      if (w[i] < optimizer_config.mf_min_bound)
        w[i] = optimizer_config.mf_min_bound;
      if (w[i] > optimizer_config.mf_max_bound)
        w[i] = optimizer_config.mf_max_bound;
      g2sum += add_g2sum;

      moment1[i] = new_moment1;
      moment2[i] = new_moment2;
    }
    (*beta1_pow) *= optimizer_config.beta1_decay_rate;
    (*beta2_pow) *= optimizer_config.beta2_decay_rate;
  }

  __device__ void update_mf(const OptimizerConfig& optimizer_config,
                            int n,
                            float* w,
                            float& g2sum,  // NOLINT
                            float* sgd,
                            const float* g,
                            float scale) {
    double add_g2sum = 0;
    double ratio = optimizer_config.mf_learning_rate *
                   sqrt(optimizer_config.mf_initial_g2sum /
                        (optimizer_config.mf_initial_g2sum + g2sum));
    float* moment1 = sgd + EmbedxGSumIndex();
    float* moment2 = sgd + EmbedxG2SumIndex();
    float* beta1_pow = sgd + EmbedxBeta1PowIndex();
    float* beta2_pow = sgd + EmbedxBeta2PowIndex();
    float beta1_pow_ = *beta1_pow;
    float beta2_pow_ = *beta2_pow;
    float epsilon = 1e-08;
    double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) /
                   (1.0 - beta1_pow_);
    for (int i = 0; i < n; ++i) {
      double scaled_grad = g[i] / scale;
      w[i] += scaled_grad * ratio;

      double new_moment1 =
          optimizer_config.beta1_decay_rate * moment1[i] +
          (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad;
      double new_moment2 =
          optimizer_config.beta2_decay_rate * moment2[i] +
          (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad;
      w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon));

      if (w[i] < optimizer_config.mf_min_bound)
        w[i] = optimizer_config.mf_min_bound;
      if (w[i] > optimizer_config.mf_max_bound)
        w[i] = optimizer_config.mf_max_bound;
      add_g2sum += scaled_grad * scaled_grad;
    }
    g2sum += add_g2sum / n;

      moment1[i] = new_moment1;
      moment2[i] = new_moment2;
    }
    (*beta1_pow) *= optimizer_config.beta1_decay_rate;
    (*beta2_pow) *= optimizer_config.beta2_decay_rate;
  }

  __device__ void update_value(const OptimizerConfig& optimizer_config,
                               ValType& val,  // NOLINT
                               const GradType& grad) {
    val.slot = grad.slot;
    val.show += grad.show;
    val.clk += grad.clk;
    val.delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) +
                       optimizer_config.clk_coeff * grad.clk;
                               float& val,  // NOLINT
                               const float& grad) {
    printf(
        "Warning: update_value will not used. Please use dy_mf_update_value\n");
  }

  __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
                                     float* ptr,
                                     const float* grad) {
    float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()];
    float g_click =
        grad[feature_value_accessor_.common_push_value.ClickIndex()];

    update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show);
    ptr[feature_value_accessor_.common_feature_value.SlotIndex()] =
        grad[feature_value_accessor_.common_push_value.SlotIndex()];
    ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show;
    ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click;
    ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
        optimizer_config.nonclk_coeff * (g_show - g_click) +
        optimizer_config.clk_coeff * g_click;
    if (val.mf_size == 0) {
    update_lr(
        optimizer_config,
        1,
        ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(),
        ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(),
        grad + feature_value_accessor_.common_push_value.EmbedGIndex(),
        g_show);
    int mf_dim =
        int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]);
    if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
      if (optimizer_config.mf_create_thresholds <=
          optimizer_config.nonclk_coeff * (val.show - val.clk) +
              optimizer_config.clk_coeff * val.clk) {
        val.mf_size = MF_DIM + 1;
        val.mf[0] = 0;
          optimizer_config.nonclk_coeff *
                  (ptr[feature_value_accessor_.common_feature_value
                           .ShowIndex()] -
                   ptr[feature_value_accessor_.common_feature_value
                           .ClickIndex()]) +
              optimizer_config.clk_coeff *
                  ptr[feature_value_accessor_.common_feature_value
                          .ClickIndex()]) {
        ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] =
            feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
            sizeof(float);
        int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
        curandState state;
        curand_init(clock64(), tid_x, 0, &state);
        for (int i = 0; i < MF_DIM; ++i) {
          val.mf[i + 1] =
        for (int i = 0; i < mf_dim; ++i) {
          ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] =
              (curand_uniform(&state)) * optimizer_config.mf_initial_range;
        }
        ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
            EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate;
        ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
            EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate;
      }
    } else {
      update_mf(optimizer_config, MF_DIM, &val.mf[1], val.mf[0], grad.mf_g,
                grad.show);
      update_mf(
          optimizer_config,
          mf_dim,
          ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(),
          ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(),
          grad + feature_value_accessor_.common_push_value.EmbedxGIndex(),
          g_show);
    }
    // printf("EmbedxGIndex: %f, mf_gsum: %f, ",
    //        feature_value_accessor_.common_push_value.EmbedxGIndex(),
    //        ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex()]);
  }

  __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); }
  __host__ __device__ size_t EmbedDim() { return _lr_embedding_dim * 2 + 2; }
  __host__ __device__ size_t EmbedxDim() { return _embedding_dim * 2 + 2; }
  __host__ __device__ size_t GSumIndex() { return 0; }
  __host__ __device__ size_t G2SumIndex() {
    return GSumIndex() + _lr_embedding_dim;
  }
  __host__ __device__ size_t Beta1PowIndex() {
    return G2SumIndex() + _lr_embedding_dim;
  }
  __host__ __device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; }
  __host__ __device__ size_t EmbedxGSumIndex() { return 0; }
  __host__ __device__ size_t EmbedxG2SumIndex() {
    return EmbedxGSumIndex() + _embedding_dim;
  }
  __host__ __device__ size_t EmbedxBeta1PowIndex() {
    return EmbedxG2SumIndex() + _embedding_dim;
  }
  __host__ __device__ size_t EmbedxBeta2PowIndex() {
    return EmbedxBeta1PowIndex() + 1;
  }
};
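The embedding-slice update above is standard Adam with the bias correction folded into the step size: each dimension keeps a first and second moment, while beta1_pow and beta2_pow are stored alongside the moments and decayed once per update. A minimal host-side sketch of one such step under those assumptions (hypothetical standalone function; lr, beta1, beta2 and eps stand in for the OptimizerConfig fields):

  #include <cmath>

  void adam_step_sketch(int n, float* w, float* m1, float* m2,
                        float* beta1_pow, float* beta2_pow,
                        const float* g, float scale,
                        float lr, float beta1, float beta2, float eps) {
    // Bias-corrected step size, computed once per feature from the stored powers.
    double ratio = lr * std::sqrt(1.0 - *beta2_pow) / (1.0 - *beta1_pow);
    for (int i = 0; i < n; ++i) {
      double grad = g[i] / scale;
      m1[i] = beta1 * m1[i] + (1.0 - beta1) * grad;
      m2[i] = beta2 * m2[i] + (1.0 - beta2) * grad * grad;
      w[i] += static_cast<float>(ratio * (m1[i] / (std::sqrt(m2[i]) + eps)));
    }
    *beta1_pow *= beta1;  // decay the stored powers for the next step
    *beta2_pow *= beta2;
  }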
class SparseAdamSharedOptimizer : public Optimizer {
 public:
  __host__ SparseAdamSharedOptimizer(
      CommonFeatureValueAccessor feature_value_accessor)
      : Optimizer(feature_value_accessor) {
    _lr_embedding_dim = 1;
    _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim();
  }

  __device__ void update_value_work(const OptimizerConfig& optimizer_config,
                                    int n,
                                    float* w,
                                    float* sgd,
                                    const float* g,
                                    float scale) {
    float* moment1 = sgd + GSumIndex();
    float* moment2 = sgd + G2SumIndex();
    float* beta1_pow = sgd + Beta1PowIndex();
    float* beta2_pow = sgd + Beta2PowIndex();

    float beta1_pow_ = *beta1_pow;
    float beta2_pow_ = *beta2_pow;
    float moment1_ = *moment1;
    float moment2_ = *moment2;
    float epsilon = 1e-08;
    double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) /
                   (1.0 - beta1_pow_);
    double sum_mom1 = 0.0;
    double sum_mom2 = 0.0;
    for (int i = 0; i < n; ++i) {
      double scaled_grad = g[i] / scale;

      double new_moment1 =
          optimizer_config.beta1_decay_rate * moment1_ +
          (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad;
      double new_moment2 =
          optimizer_config.beta2_decay_rate * moment2_ +
          (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad;
      w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon));

      if (w[i] < optimizer_config.mf_min_bound)
        w[i] = optimizer_config.mf_min_bound;
      if (w[i] > optimizer_config.mf_max_bound)
        w[i] = optimizer_config.mf_max_bound;

      sum_mom1 += new_moment1;
      sum_mom2 += new_moment2;
    }
    (*moment1) = sum_mom1 / n;
    (*moment2) = sum_mom2 / n;
    (*beta1_pow) *= optimizer_config.beta1_decay_rate;
    (*beta2_pow) *= optimizer_config.beta2_decay_rate;
  }

  __device__ void update_value(const OptimizerConfig& optimizer_config,
                               float& val,  // NOLINT
                               const float& grad) {
    printf(
        "Warning: update_value will not used. Please use dy_mf_update_value\n");
  }

  __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
                                     ValType* ptr,
                                     const GradType& grad) {
    ptr->slot = grad.slot;
    ptr->show += grad.show;
    ptr->clk += grad.clk;
    ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) +
                        optimizer_config.clk_coeff * grad.clk;
    update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show);
    // use MF_DIM temporarily
    // ptr->mf_dim = grad.mf_dim;
    if (ptr->mf_size == 0) {
                                     float* ptr,
                                     const float* grad) {
    float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()];
    float g_click =
        grad[feature_value_accessor_.common_push_value.ClickIndex()];

    ptr[feature_value_accessor_.common_feature_value.SlotIndex()] =
        grad[feature_value_accessor_.common_push_value.SlotIndex()];
    ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show;
    ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click;
    ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
        optimizer_config.nonclk_coeff * (g_show - g_click) +
        optimizer_config.clk_coeff * g_click;

    update_value_work(
        optimizer_config,
        1,
        ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(),
        ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(),
        grad + feature_value_accessor_.common_push_value.EmbedGIndex(),
        g_show);
    int mf_dim =
        int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]);
    if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
      if (optimizer_config.mf_create_thresholds <=
          optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) +
              optimizer_config.clk_coeff * ptr->clk) {
        ptr->mf_size = ptr->mf_dim + 1;
          optimizer_config.nonclk_coeff *
                  (ptr[feature_value_accessor_.common_feature_value
                           .ShowIndex()] -
                   ptr[feature_value_accessor_.common_feature_value
                           .ClickIndex()]) +
              optimizer_config.clk_coeff *
                  ptr[feature_value_accessor_.common_feature_value
                          .ClickIndex()]) {
        ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] =
            feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
            sizeof(float);
        // ptr->mf_size = MF_DIM + 1;
        ptr->mf[0] = 0;
        int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
        curandState state;
        curand_init(clock64(), tid_x, 0, &state);
        for (int i = 0; i < ptr->mf_dim; ++i) {
          ptr->mf[i + 1] =
        for (int i = 0; i < mf_dim; ++i) {
          ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] =
              (curand_uniform(&state)) * optimizer_config.mf_initial_range;
        }
        ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
            EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate;
        ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
            EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate;
      }
    } else {
      update_mf(optimizer_config, ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0],
                grad.mf_g, grad.show);
      // for local test
      update_value_work(
          optimizer_config,
          mf_dim,
          ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(),
          ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(),
          grad + feature_value_accessor_.common_push_value.EmbedxGIndex(),
          g_show);
    }
  }

  __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); }
  __host__ __device__ size_t EmbedDim() { return 4; }
  __host__ __device__ size_t EmbedxDim() { return 4; }
  __host__ __device__ size_t GSumIndex() { return 0; }
  __host__ __device__ size_t G2SumIndex() { return GSumIndex() + 1; }
  __host__ __device__ size_t Beta1PowIndex() { return G2SumIndex() + 1; }
  __host__ __device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; }
  __host__ __device__ size_t EmbedxGSumIndex() { return 0; }
  __host__ __device__ size_t EmbedxG2SumIndex() {
    return EmbedxGSumIndex() + 1;
  }
  __host__ __device__ size_t EmbedxBeta1PowIndex() {
    return EmbedxG2SumIndex() + 1;
  }
  __host__ __device__ size_t EmbedxBeta2PowIndex() {
    return EmbedxBeta1PowIndex() + 1;
  }
};

#endif
...
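SparseAdamSharedOptimizer differs from SparseAdamOptimizer mainly in state size: instead of per-dimension moments it keeps a single shared (moment1, moment2) pair plus the two beta powers, which is why its EmbedDim() and EmbedxDim() collapse to 4. The shared moments are written back as the average of the per-dimension updates. A compact host-side sketch of that reduction (hypothetical standalone function, not part of the patch):

  #include <cmath>

  void shared_adam_step_sketch(int n, float* w, float* m1, float* m2,
                               float* beta1_pow, float* beta2_pow,
                               const float* g, float scale,
                               float lr, float beta1, float beta2, float eps) {
    double ratio = lr * std::sqrt(1.0 - *beta2_pow) / (1.0 - *beta1_pow);
    double sum_m1 = 0.0, sum_m2 = 0.0;
    for (int i = 0; i < n; ++i) {
      double grad = g[i] / scale;
      // Every dimension reads the same shared moments.
      double new_m1 = beta1 * (*m1) + (1.0 - beta1) * grad;
      double new_m2 = beta2 * (*m2) + (1.0 - beta2) * grad * grad;
      w[i] += static_cast<float>(ratio * (new_m1 / (std::sqrt(new_m2) + eps)));
      sum_m1 += new_m1;
      sum_m2 += new_m2;
    }
    *m1 = static_cast<float>(sum_m1 / n);  // one shared moment per slice
    *m2 = static_cast<float>(sum_m2 / n);
    *beta1_pow *= beta1;
    *beta2_pow *= beta2;
  }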
paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h
View file @ b8d106e1
...
@@ -27,13 +27,19 @@ class OptimizerConfig {
  float learning_rate = 0.05;
  float initial_g2sum = 3.0;
  float initial_range = 0;
  float beta1_decay_rate = 0.9;    // adam
  float beta2_decay_rate = 0.999;  // adam
  float ada_epsilon = 1e-8;

  float mf_create_thresholds = 10;
  float mf_learning_rate = 0.05;
  float mf_initial_g2sum = 3.0;
  float mf_initial_range = 1e-4;
  float mf_beta1_decay_rate = 0.9;    // adam
  float mf_beta2_decay_rate = 0.999;  // adam
  float mf_min_bound = -10;
  float mf_max_bound = 10;
  float mf_ada_epsilon = 1e-8;

  void set_sparse_sgd(float nonclk_coeff,
                      float clk_coeff,
...
@@ -41,7 +47,10 @@ class OptimizerConfig {
                      float max_bound,
                      float learning_rate,
                      float initial_g2sum,
                      float initial_range) {
                      float initial_range,
                      float beta1_decay_rate,
                      float beta2_decay_rate,
                      float ada_epsilon) {
    this->nonclk_coeff = nonclk_coeff;
    this->clk_coeff = clk_coeff;
    this->min_bound = min_bound;
...
@@ -49,6 +58,9 @@ class OptimizerConfig {
    this->learning_rate = learning_rate;
    this->initial_g2sum = initial_g2sum;
    this->initial_range = initial_range;
    this->beta1_decay_rate = beta1_decay_rate;
    this->beta2_decay_rate = beta2_decay_rate;
    this->ada_epsilon = ada_epsilon;
  }

  void set_sparse_sgd(const OptimizerConfig& optimizer_config) {
...
@@ -59,6 +71,9 @@ class OptimizerConfig {
    this->learning_rate = optimizer_config.learning_rate;
    this->initial_g2sum = optimizer_config.initial_g2sum;
    this->initial_range = optimizer_config.initial_range;
    this->beta1_decay_rate = optimizer_config.beta1_decay_rate;
    this->beta2_decay_rate = optimizer_config.beta2_decay_rate;
    this->ada_epsilon = optimizer_config.ada_epsilon;
  }

  void set_embedx_sgd(float mf_create_thresholds,
...
@@ -66,13 +81,19 @@ class OptimizerConfig {
                      float mf_initial_g2sum,
                      float mf_initial_range,
                      float mf_min_bound,
                      float mf_max_bound) {
                      float mf_max_bound,
                      float mf_beta1_decay_rate,
                      float mf_beta2_decay_rate,
                      float mf_ada_epsilon) {
    this->mf_create_thresholds = mf_create_thresholds;
    this->mf_learning_rate = mf_learning_rate;
    this->mf_initial_g2sum = mf_initial_g2sum;
    this->mf_initial_range = mf_initial_range;
    this->mf_min_bound = mf_min_bound;
    this->mf_max_bound = mf_max_bound;
    this->mf_beta1_decay_rate = mf_beta1_decay_rate;
    this->mf_beta2_decay_rate = mf_beta2_decay_rate;
    this->mf_ada_epsilon = mf_ada_epsilon;
  }

  void set_embedx_sgd(const OptimizerConfig& optimizer_config) {
...
@@ -82,6 +103,9 @@ class OptimizerConfig {
    this->mf_initial_range = optimizer_config.mf_initial_range;
    this->mf_min_bound = optimizer_config.mf_min_bound;
    this->mf_max_bound = optimizer_config.mf_max_bound;
    this->mf_beta1_decay_rate = optimizer_config.mf_beta1_decay_rate;
    this->mf_beta2_decay_rate = optimizer_config.mf_beta2_decay_rate;
    this->mf_ada_epsilon = optimizer_config.mf_ada_epsilon;
  }
};
...
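For orientation, the new Adam-related fields ride along with the existing SGD fields, and the copy-style setters shown above move them from a host-side config into the device-side config in one call. A small usage sketch (illustrative values; host_cfg and device_cfg are made-up names, not code from the patch):

  OptimizerConfig host_cfg;             // defaults as listed in the hunk above
  host_cfg.beta1_decay_rate = 0.9f;     // adam, embed_w (lr) part
  host_cfg.beta2_decay_rate = 0.999f;
  host_cfg.ada_epsilon = 1e-8f;
  host_cfg.mf_beta1_decay_rate = 0.9f;  // adam, embedx (mf) part
  host_cfg.mf_beta2_decay_rate = 0.999f;
  host_cfg.mf_ada_epsilon = 1e-8f;

  OptimizerConfig device_cfg;
  device_cfg.set_sparse_sgd(host_cfg);  // copies lr/g2sum/range plus the adam fields
  device_cfg.set_embedx_sgd(host_cfg);  // copies the mf_* counterparts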
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
View file @ b8d106e1
...
@@ -33,9 +33,9 @@ limitations under the License. */
#include <algorithm>
#include <deque>

#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/platform/timer.h"

#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#endif
...
@@ -135,7 +135,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
  std::string data_set_name = std::string(typeid(*dataset_).name());

  if (data_set_name.find("SlotRecordDataset") != std::string::npos) {
    SlotRecordDataset* dataset = dynamic_cast<SlotRecordDataset*>(dataset_);
    SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
    auto input_channel = dataset->GetInputChannel();
    VLOG(0) << "psgpu wrapperinputslotchannle size: " << input_channel->Size();
    const std::deque<SlotRecord>& vec_data = input_channel->GetData();
...
@@ -185,7 +185,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
  } else {
    CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos);
    VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset";
    MultiSlotDataset* dataset = dynamic_cast<MultiSlotDataset*>(dataset_);
    MultiSlotDataset* dataset = (MultiSlotDataset*)(dataset_);
    auto input_channel = dataset->GetInputChannel();
    const std::deque<Record>& vec_data = input_channel->GetData();
...
@@ -540,17 +540,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
                       &device_vals,
                       &device_task_keys,
                       &device_task_ptrs](int dev, int shard_id) {
    auto& task_keys = device_task_keys[shard_id];
    // auto& task_keys = device_task_keys[shard_id];
#ifdef PADDLE_WITH_PSLIB
    auto& task_ptrs = device_task_ptrs[shard_id];
#endif

#ifdef PADDLE_WITH_PSCORE
    auto& task_ptrs = device_task_ptrs[shard_id];
#endif
    // #ifdef PADDLE_WITH_PSCORE
    //     auto& task_ptrs = device_task_ptrs[shard_id];
    // #endif
    int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id];
    int cur = prefix_sum[dev][shard_id];
    // int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id];
    // int cur = prefix_sum[dev][shard_id];
#ifdef PADDLE_WITH_PSLIB
    for (int j = 0; j < len; ++j) {
      device_keys[dev][cur + j] = task_keys[dev][j];
...
@@ -579,33 +579,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
      }
    }
#endif
#ifdef PADDLE_WITH_PSCORE
    for (int j = 0; j < len; ++j) {
      device_keys[dev][cur + j] = task_keys[dev][j];
      float* ptr_val = task_ptrs[dev][j]->data();
      FeatureValue& val = device_vals[dev][cur + j];
      size_t dim = task_ptrs[dev][j]->size();
      val.delta_score = ptr_val[2];
      val.show = ptr_val[3];
      val.clk = ptr_val[4];
      val.slot = ptr_val[0];
      val.lr = ptr_val[5];
      val.lr_g2sum = ptr_val[6];
      val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]);

      if (dim > 7) {
        val.mf_size = MF_DIM + 1;
        for (int x = 0; x < val.mf_size; x++) {
          val.mf[x] = ptr_val[x + 7];
        }
      } else {
        val.mf_size = 0;
        for (int x = 0; x < MF_DIM + 1; x++) {
          val.mf[x] = 0;
        }
      }
    }
#endif
    VLOG(3) << "GpuPs build hbmps done";
  };
...
@@ -665,16 +638,25 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
    return;
  }
  std::vector<std::thread> threads(device_num);
  HeterPs_ = HeterPsBase::get_instance(size_max, resource_);
  auto accessor_wrapper_ptr =
      GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
  HeterPs_ = HeterPsBase::get_instance(
      size_max, resource_, fleet_config_, accessor_class_, optimizer_type_);
#ifdef PADDLE_WITH_CUDA
  HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_);
  HeterPs_->set_sparse_sgd(optimizer_config_);
  HeterPs_->set_embedx_sgd(optimizer_config_);
#endif
  auto build_dymf_mem_pool = [this, &gpu_task](int i, int j) {
  auto build_dymf_mem_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i,
                                                                      int j) {
    this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_);
    int mf_dim = this->index_dim_vec_[j];
    VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim
            << " feature_value_size:"
            << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim);
    size_t feature_value_size =
        TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float)));
        accessor_wrapper_ptr->GetFeatureValueSize(mf_dim);
    auto& device_dim_keys = gpu_task->device_dim_keys_[i][j];
    auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j];
    size_t len = device_dim_keys.size();
...
@@ -682,12 +664,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
    this->mem_pools_[i * this->multi_mf_dim_ + j] =
        new MemoryPool(len, feature_value_size);
  };
  auto build_dymf_hbm_pool = [this, &gpu_task](int i, int j) {
  auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i,
                                                                      int j) {
    auto& device_dim_keys = gpu_task->device_dim_keys_[i][j];
    size_t len = device_dim_keys.size();
    int mf_dim = this->index_dim_vec_[j];
    size_t feature_value_size =
        TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float)));
        accessor_wrapper_ptr->GetFeatureValueSize(mf_dim);
    auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j];
    platform::CUDADeviceGuard guard(resource_->dev_id(i));
...
@@ -710,13 +693,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
    delete mem_pool;
  };
  int thread_num = 16;
  auto build_dynamic_mf_func = [this, &gpu_task, thread_num](int i, int j,
                                                             int z) {
  auto build_dynamic_mf_func = [this, &gpu_task, thread_num,
                                &accessor_wrapper_ptr](int i, int j, int z) {
    // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_);
    int mf_dim = this->index_dim_vec_[j];
    VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim;
    // size_t feature_value_size =
    //     TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float)));
    auto& device_dim_keys = gpu_task->device_dim_keys_[i][j];
    auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j];
    size_t len = device_dim_keys.size();
...
@@ -743,10 +726,10 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
    // ============ add for multi-thread ================
    for (size_t k = left; k < right; k++) {
      FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k));
#ifdef PADDLE_WITH_PSLIB
      float* val = (float*)(mem_pool->mem_address(k));
      float* ptr_val = device_dim_ptrs[k]->data();
      size_t dim = device_dim_ptrs[k]->size();
#ifdef PADDLE_WITH_PSLIB
      val->delta_score =
          ptr_val[paddle::ps::DownpourCtrDymfAccessor::
                      DownpourCtrDymfFeatureValue::delta_score_index()];
...
@@ -765,23 +748,6 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
      ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
                  mf_dim_index()] = float(mf_dim);
      val->mf_dim = mf_dim;
#endif
#ifdef PADDLE_WITH_PSCORE
      paddle::distributed::CtrDymfAccessor accessor;
      val->delta_score =
          ptr_val[accessor.common_feature_value.DeltaScoreIndex()];
      val->show = ptr_val[accessor.common_feature_value.ShowIndex()];
      val->clk = ptr_val[accessor.common_feature_value.ClickIndex()];
      val->slot = int(ptr_val[accessor.common_feature_value.SlotIndex()]);
      val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()];
      val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()];
      val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]);

      // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor
      ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim);
      val->mf_dim = mf_dim;
#endif
      if (dim > 8) {
        // CpuPS alreay expand as mf_dim
        val->mf_size = mf_dim + 1;
        for (int x = 0; x < val->mf_dim + 1; x++) {
...
@@ -793,6 +759,12 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
          val->mf[x] = 0;
        }
      }
#endif
#ifdef PADDLE_WITH_PSCORE
      void* val = mem_pool->mem_address(k);
      accessor_wrapper_ptr->BuildFill(
          val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim);
#endif
    }
  };
...
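Throughout BuildGPUTask the per-value byte size moves from the hard-coded TYPEALIGN expression to accessor_wrapper_ptr->GetFeatureValueSize(mf_dim). As a reminder of what the old expression evaluates to, a small worked example follows; the 48-byte sizeof(FeatureValue) is an assumed figure for illustration only:

  #include <cstdint>
  #include <cstdio>
  #include <initializer_list>

  // Same macro as declared in ps_gpu_wrapper.h: round LEN up to a multiple of ALIGNVAL.
  #define TYPEALIGN(ALIGNVAL, LEN) \
    (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))

  int main() {
    const uint64_t assumed_sizeof_feature_value = 48;  // illustrative only
    for (int mf_dim : {8, 64}) {
      uint64_t raw = assumed_sizeof_feature_value + (mf_dim + 1) * sizeof(float);
      // mf_dim = 8  -> 48 + 36  = 84  -> aligned to 88
      // mf_dim = 64 -> 48 + 260 = 308 -> aligned to 312
      std::printf("mf_dim=%d raw=%llu aligned=%llu\n", mf_dim,
                  (unsigned long long)raw,
                  (unsigned long long)TYPEALIGN(8, raw));
    }
    return 0;
  }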
@@ -945,7 +917,10 @@ void PSGPUWrapper::EndPass() {
    }
  }
  int thread_num = 8;
  auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) {
  auto accessor_wrapper_ptr =
      GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
  auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr](
                                   int i, int j, int z) {
    PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i)));
    auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j];
    auto& device_keys = this->current_task_->device_dim_keys_[i][j];
...
@@ -965,9 +940,11 @@ void PSGPUWrapper::EndPass() {
    }
    // ============ multi-thread process feasign============
    int mf_dim = this->index_dim_vec_[j];
    VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim;
    size_t feature_value_size =
        TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float)));
        accessor_wrapper_ptr->GetFeatureValueSize(mf_dim);
    VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim
            << " key_len :" << len
            << " feature_value_size:" << feature_value_size;
    char* test_build_values = (char*)malloc(feature_value_size * real_len);
    uint64_t offset = left * feature_value_size;
    cudaMemcpy(test_build_values,
...
@@ -981,7 +958,7 @@ void PSGPUWrapper::EndPass() {
        continue;
      }
      size_t local_offset = (i - left) * feature_value_size;
      FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset);
      float* gpu_val = (float*)(test_build_values + local_offset);
#ifdef PADDLE_WITH_PSLIB
      auto* downpour_value =
          (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr);
...
@@ -1002,31 +979,15 @@ void PSGPUWrapper::EndPass() {
                  embed_g2sum_index()] = gpu_val->lr_g2sum;
      cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
                  slot_index()] = gpu_val->slot;
#endif
#ifdef PADDLE_WITH_PSCORE
      auto* downpour_value =
          (paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr);
      int downpour_value_size = downpour_value->size();
      if (gpu_val->mf_size > 0 && downpour_value_size == 8) {
        downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size);
      }
      float* cpu_val = downpour_value->data();
      paddle::distributed::CtrDymfAccessor accessor;
      cpu_val[accessor.common_feature_value.DeltaScoreIndex()] =
          gpu_val->delta_score;
      cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show;
      cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk;
      cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr;
      cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] =
          gpu_val->lr_g2sum;
      cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot;
#endif
      if (gpu_val->mf_size > 0) {
        for (int x = 0; x < gpu_val->mf_dim + 1; x++) {
          cpu_val[x + 8] = gpu_val->mf[x];
        }
      }
#endif
#ifdef PADDLE_WITH_PSCORE
      accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim);
#endif
    }
    free(test_build_values);
  };
...
@@ -1066,79 +1027,8 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
                              const std::vector<float*>& values,
                              const std::vector<int64_t>& slot_lengths,
                              const int hidden_size) {
  platform::Timer all_timer;
  platform::Timer pull_gpups_timer;
  all_timer.Start();
  int64_t total_length =
      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
  VLOG(3) << "Begine Gpu/Xpu Ps PullSparse";
  auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue));
  FeatureValue* total_values_gpu = reinterpret_cast<FeatureValue*>(buf->ptr());
  if (platform::is_cpu_place(place)) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Warning:: CPUPlace is not supported in GpuPs now."));
  } else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
    VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
    int device_id = place.GetDeviceId();
    int devid_2_index = HeterPs_->get_index_by_devid(device_id);
    LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
    uint64_t* total_keys = reinterpret_cast<uint64_t*>(
        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
    // construct slot_level lod info
    auto slot_lengths_lod = slot_lengths;
    for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
      slot_lengths_lod[i] += slot_lengths_lod[i - 1];
    }
    auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*));
    auto buf_length =
        memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
    uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
    int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
    cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
               cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_len, slot_lengths_lod.data(),
               slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
    this->CopyKeys(place, gpu_keys, total_keys, gpu_len,
                   static_cast<int>(slot_lengths.size()),
                   static_cast<int>(total_length));
    VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index
            << " len: " << total_length;
    pull_gpups_timer.Start();
    HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu,
                          static_cast<int>(total_length));
    pull_gpups_timer.Pause();
    VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
            << "]";
    this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len,
                      static_cast<int>(slot_lengths.size()), hidden_size,
                      total_length);
  } else {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "GpuPs: PullSparse Only Support CUDAPlace Now."));
  }
  all_timer.Pause();
  VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec()
          << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec()
          << " s";
  VLOG(3) << "End PullSparse";
  VLOG(0) << "Warning:: recommand use pull_gpups_sparse op instead. This "
             "PullSparse is not used.";
}

void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
...
@@ -1156,13 +1046,16 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
  size_t feature_value_size = 0;
  feature_value_size = TYPEALIGN(
      8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1));
  auto accessor_wrapper_ptr =
      GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
  feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
  VLOG(3) << "PullSparse max_dim:" << max_mf_dim_
          << " feature_value_size:" << feature_value_size;

#ifdef PADDLE_WITH_CUDA
  VLOG(3) << "Begine Gpu Ps PullSparse";
  auto buf = memory::Alloc(place, total_length * feature_value_size);
  FeatureValue* total_values_gpu = reinterpret_cast<FeatureValue*>(buf->ptr());
  float* total_values_gpu = reinterpret_cast<float*>(buf->ptr());
#endif
#ifdef PADDLE_WITH_XPU_KP
  VLOG(3) << "Begine Xpu Ps PullSparse";
...
@@ -1224,19 +1117,19 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
    VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
            << "]";
    this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len,
                      static_cast<int>(slot_lengths.size()), hidden_size,
                      total_length, gpu_dim);
    accessor_wrapper_ptr->CopyForPull(
        place, gpu_keys, values, total_values_gpu, gpu_len,
        static_cast<int>(slot_lengths.size()), hidden_size, total_length,
        gpu_dim, val_type_size_);
    pull_gpups_timer.Pause();
#endif
  } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU_KP
    VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
...
@@ -1283,14 +1176,15 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
    VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
            << "]";
    this->CopyForPull(place, xpu_keys, values, total_values_gpu, xpu_len,
                      static_cast<int>(slot_lengths.size()), hidden_size,
                      total_length);
    accessor_wrapper_ptr->CopyForPull(
        place, xpu_keys, values, total_values_gpu, xpu_len,
        static_cast<int>(slot_lengths.size()), hidden_size, total_length,
        val_type_size_);
#endif
  } else {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
...
@@ -1317,12 +1211,13 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
  // #ifdef PADDLE_WITH_CUDA
  VLOG(3) << "Begin GPUPS PushSparseGrad";
  size_t grad_value_size =
      TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float)));
  auto accessor_wrapper_ptr =
      GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
  size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
  auto buf = memory::Alloc(place, total_length * grad_value_size);
  VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_;
  FeaturePushValue* total_grad_values_gpu =
      reinterpret_cast<FeaturePushValue*>(buf->ptr());
  VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_
          << "grad_value_size:" << grad_value_size;
  float* total_grad_values_gpu = reinterpret_cast<float*>(buf->ptr());
  if (platform::is_cpu_place(place)) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Warning:: CPUPlace is not supported in GPUPS now."
...
@@ -1334,23 +1229,15 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
    uint64_t* total_keys =
        reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
    VLOG(3) << "Begin copy grad tensor to gpups struct";
    if (!multi_mf_dim_) {
      this->CopyForPush(place, grad_values, total_grad_values_gpu,
                        slot_lengths, hidden_size, total_length, batch_size);
    } else {
      this->CopyForPush(place, grad_values, total_grad_values_gpu,
                        slot_lengths, total_length, batch_size,
                        grad_value_size);
    }
    accessor_wrapper_ptr->CopyForPush(place, grad_values, total_grad_values_gpu,
                                      slot_lengths, total_length, batch_size,
                                      grad_value_size, slot_vector_,
                                      slot_mf_dim_vector_);
    VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index
            << " len: " << total_length;
...
@@ -1369,13 +1256,14 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
    uint64_t* total_keys =
        reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
    VLOG(3) << "Begin copy grad tensor to xpups struct";
    this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths,
                      hidden_size, total_length, batch_size);
    accessor_wrapper_ptr->CopyForPush(place, grad_values, total_grad_values_gpu,
                                      slot_lengths, hidden_size, total_length,
                                      batch_size, slot_vector_);
    VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index
            << " len: " << total_length;
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
View file @ b8d106e1
...
@@ -26,90 +26,6 @@ limitations under the License. */
namespace paddle {
namespace framework {

__global__ void PullCopy(float** dest,
                         const FeatureValue* src,
                         const int64_t* len,
                         int hidden,
                         int slot_num,
                         int total_len,
                         uint64_t** keys) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[x - 1] : 0);
    if (*(keys[x] + y) == 0) {
      *(dest[x] + y * hidden) = 0;
      *(dest[x] + y * hidden + 1) = 0;
      *(dest[x] + y * hidden + 2) = 0;
    } else {
      *(dest[x] + y * hidden) = (src + i)->show;
      *(dest[x] + y * hidden + 1) = (src + i)->clk;
      *(dest[x] + y * hidden + 2) = (src + i)->lr;
    }
    if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) {
      for (int j = 0; j < hidden - 3; j++) {
        *(dest[x] + y * hidden + 3 + j) = 0;
      }
    } else {
      for (int j = 0; j < hidden - 3; j++) {
        *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j];
      }
    }
  }
}
__global__ void PullCopy(float** dest,
                         const FeatureValue* src,
                         const int64_t* len,
                         int slot_num,
                         int total_len,
                         uint64_t** keys,
                         uint64_t max_val_size,
                         int* gpu_dim) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[x - 1] : 0);
    FeatureValue* feature_value_ptr =
        (FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
    int mf_dim = gpu_dim[x] - 3;
    if (*(keys[x] + y) == 0) {
      *(dest[x] + y * (mf_dim + 3)) = 0;
      *(dest[x] + y * (mf_dim + 3) + 1) = 0;
      *(dest[x] + y * (mf_dim + 3) + 2) = 0;
    } else {
      *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show;
      *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk;
      *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr;
    }
    if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0) {
      for (int j = 0; j < mf_dim; j++) {
        *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0;
      }
    } else {
      for (int j = 0; j < mf_dim; j++) {
        *(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j];
      }
    }
  }
}

__global__ void CopyKeysKernel(uint64_t** src_keys,
                               uint64_t* dest_total_keys,
                               const int64_t* len,
...
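Both PullCopy variants map the flat thread index i back to a (slot, row) pair with a binary search over the prefix-summed slot lengths. A standalone host-side restatement of that lookup for readers of the diff (hypothetical helper, not part of the patch):

  #include <cstdint>
  #include <utility>

  // `len` holds the inclusive prefix sums of the per-slot lengths, so the
  // first slot whose prefix sum exceeds i owns element i.
  std::pair<int, int64_t> locate_slot_sketch(int64_t i, const int64_t* len,
                                             int slot_num) {
    int low = 0, high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid]) {
        high = mid;
      } else {
        low = mid + 1;
      }
    }
    int x = low;                           // slot index
    int64_t y = i - (x ? len[x - 1] : 0);  // row offset inside that slot
    return {x, y};
  }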
@@ -161,101 +77,8 @@ __global__ void PushCopy(FeaturePushValue* dest,
  }
}

__global__ void PushCopyWithPool(FeaturePushValue* dest,
                                 float** src,
                                 int64_t* len,
                                 int slot_num,
                                 uint64_t total_len,
                                 int bs,
                                 int* slot_vector,
                                 int* mf_dim_vector,
                                 size_t grad_value_size) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[low - 1] : 0);
    FeaturePushValue* cur =
        (FeaturePushValue*)((char*)dest + i * grad_value_size);
    cur->slot = slot_vector[x];
    int mf_dim = mf_dim_vector[x];
    cur->mf_dim = mf_dim;
    cur->show = *(src[x] + y * (mf_dim + 3));
    cur->clk = *(src[x] + y * (mf_dim + 3) + 1);
    cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs;
    for (int j = 0; j < cur->mf_dim; j++) {
      cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs;
    }
  }
}

PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; }

void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
                               uint64_t** gpu_keys,
                               const std::vector<float*>& values,
                               const FeatureValue* total_values_gpu,
                               const int64_t* gpu_len,
                               const int slot_num,
                               const int hidden_size,
                               const int64_t total_length) {
  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                    platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
  float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
  cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
             cudaMemcpyHostToDevice);
  PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
      gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num,
      total_length, gpu_keys);
  cudaStreamSynchronize(stream);
}

void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
                               uint64_t** gpu_keys,
                               const std::vector<float*>& values,
                               const FeatureValue* total_values_gpu,
                               const int64_t* gpu_len,
                               const int slot_num,
                               const int hidden_size,
                               const int64_t total_length,
                               int* gpu_dim) {
  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                    platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
  float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
  cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
             cudaMemcpyHostToDevice);
  PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
      gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys,
      val_type_size_, gpu_dim);
  cudaStreamSynchronize(stream);
}

void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
                            uint64_t** origin_keys,
                            uint64_t* total_keys,
...
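Each output slot tensor filled by these kernels is laid out as mf_dim + 3 floats per key: show, click, the embed_w (lr) value, then the mf_dim embedx values, so dest[x] + y * (mf_dim + 3) is the row base for key y of slot x. A small sketch of reading one such row under that assumption (hypothetical helper, not in the patch):

  // Hypothetical view over one pulled row, assuming the (mf_dim + 3) layout
  // used by PullCopy: [show, clk, lr, mf_0 ... mf_{mf_dim-1}].
  struct PulledRowView {
    const float* base;  // dest[x] + y * (mf_dim + 3)
    int mf_dim;
    float show() const { return base[0]; }
    float clk() const { return base[1]; }
    float lr() const { return base[2]; }
    float mf(int j) const { return base[3 + j]; }  // j in [0, mf_dim)
  };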
@@ -270,125 +93,26 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
  cudaStreamSynchronize(stream);
}

void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
                               const std::vector<const float*>& grad_values,
                               FeaturePushValue* total_grad_values_gpu,
                               const std::vector<int64_t>& slot_lengths,
                               const int hidden_size,
                               const int64_t total_length,
                               const int batch_size) {
  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                    platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  auto slot_lengths_lod = slot_lengths;
  for (int i = 1; i < slot_lengths_lod.size(); i++) {
    slot_lengths_lod[i] += slot_lengths_lod[i - 1];
  }
  auto buf_grad_value =
      memory::Alloc(place, grad_values.size() * sizeof(float*));
  auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
  auto buf_slot_vector =
      memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
  float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
  int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
  int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
  cudaMemcpy(gpu_values, grad_values.data(),
             grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_len, slot_lengths_lod.data(),
             slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
  cudaMemcpy(d_slot_vector, slot_vector_.data(),
             slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);
  PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
      total_grad_values_gpu, gpu_values, gpu_len, hidden_size,
      slot_lengths.size(), total_length, batch_size, d_slot_vector);
  cudaStreamSynchronize(stream);
}

void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
                               const std::vector<const float*>& grad_values,
                               FeaturePushValue* total_grad_values_gpu,
                               const std::vector<int64_t>& slot_lengths,
                               const uint64_t total_length,
                               const int batch_size,
                               size_t grad_value_size) {
  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                    platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  auto slot_lengths_lod = slot_lengths;
  for (int i = 1; i < slot_lengths_lod.size(); i++) {
    slot_lengths_lod[i] += slot_lengths_lod[i - 1];
  }
  auto buf_grad_value =
      memory::Alloc(place, grad_values.size() * sizeof(float*));
  auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
  auto buf_slot_vector =
      memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
  auto buf_mf_dim_vector =
      memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
  float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
  int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
  int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
  int* d_mf_dim_vector = reinterpret_cast<int*>(buf_mf_dim_vector->ptr());
  cudaMemcpy(gpu_values, grad_values.data(),
             grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_len, slot_lengths_lod.data(),
             slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
  cudaMemcpy(d_slot_vector, slot_vector_.data(),
             slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector_.data(),
             slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);
  PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
      total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(),
      total_length, batch_size, d_slot_vector, d_mf_dim_vector,
      grad_value_size);
  cudaStreamSynchronize(stream);
}

void PSGPUWrapper::SetSparseSGD(float nonclk_coeff,
                                float clk_coeff,
                                float min_bound,
                                float max_bound,
                                float learning_rate,
                                float initial_g2sum,
                                float initial_range) {
  OptimizerConfig optimizer_config;
  optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound,
                                  learning_rate, initial_g2sum, initial_range);
  HeterPs_->set_sparse_sgd(optimizer_config);
                                float initial_range,
                                float beta1_decay_rate,
                                float beta2_decay_rate,
                                float ada_epsilon) {
  optimizer_config_.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound,
                                   max_bound, learning_rate, initial_g2sum,
                                   initial_range, beta1_decay_rate,
                                   beta2_decay_rate, ada_epsilon);
}

void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
...
@@ -396,15 +120,19 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
                                float mf_initial_g2sum,
                                float mf_initial_range,
                                float mf_min_bound,
                                float mf_max_bound) {
  OptimizerConfig optimizer_config;
  optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate,
                                  mf_initial_g2sum, mf_initial_range,
                                  mf_min_bound, mf_max_bound);
  HeterPs_->set_embedx_sgd(optimizer_config);
                                float mf_max_bound,
                                float mf_beta1_decay_rate,
                                float mf_beta2_decay_rate,
                                float mf_ada_epsilon) {
  optimizer_config_.set_embedx_sgd(mf_create_thresholds, mf_learning_rate,
                                   mf_initial_g2sum, mf_initial_range,
                                   mf_min_bound, mf_max_bound,
                                   mf_beta1_decay_rate, mf_beta2_decay_rate,
                                   mf_ada_epsilon);
}

}  // end namespace framework
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
View file @ b8d106e1
...
@@ -51,7 +51,10 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/distributed/the_one_ps.pb.h"
#endif
#ifdef PADDLE_WITH_PSLIB
#include "afs_api.h"
...
@@ -64,9 +67,6 @@ limitations under the License. */
namespace paddle {
namespace framework {

#define TYPEALIGN(ALIGNVAL, LEN) \
  (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))

class Dataset;

#ifdef PADDLE_WITH_PSLIB
...
@@ -98,7 +98,7 @@ class AfsWrapper {
class PSGPUWrapper {
 public:
  virtual ~PSGPUWrapper();
  ~PSGPUWrapper();

  PSGPUWrapper() {
    HeterPs_ = NULL;
...
@@ -139,37 +139,6 @@ class PSGPUWrapper {
                const int64_t* gpu_len,
                int slot_num,
                int total_len);
  void CopyForPull(const paddle::platform::Place& place,
                   uint64_t** gpu_keys,
                   const std::vector<float*>& values,
                   const FeatureValue* total_values_gpu,
                   const int64_t* gpu_len,
                   const int slot_num,
                   const int hidden_size,
                   const int64_t total_length);
  void CopyForPull(const paddle::platform::Place& place,
                   uint64_t** gpu_keys,
                   const std::vector<float*>& values,
                   const FeatureValue* total_values_gpu,
                   const int64_t* gpu_len,
                   const int slot_num,
                   const int hidden_size,
                   const int64_t total_length,
                   int* gpu_dim);
  void CopyForPush(const paddle::platform::Place& place,
                   const std::vector<const float*>& grad_values,
                   FeaturePushValue* total_grad_values_gpu,
                   const std::vector<int64_t>& slot_lengths,
                   const int hidden_size,
                   const int64_t total_length,
                   const int batch_size);
  void CopyForPush(const paddle::platform::Place& place,
                   const std::vector<const float*>& grad_values,
                   FeaturePushValue* total_grad_values_gpu,
                   const std::vector<int64_t>& slot_lengths,
                   const uint64_t total_length,
                   const int batch_size,
                   size_t grad_value_size);

  void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task);
  void PreBuildTask(std::shared_ptr<HeterContext> gpu_task);
...
@@ -274,13 +243,96 @@ class PSGPUWrapper {
                    float max_bound,
                    float learning_rate,
                    float initial_g2sum,
                    float initial_range);
                    float initial_range,
                    float beta1_decay_rate,
                    float beta2_decay_rate,
                    float ada_epsilon);
  void SetEmbedxSGD(float mf_create_thresholds,
                    float mf_learning_rate,
                    float mf_initial_g2sum,
                    float mf_initial_range,
                    float mf_min_bound,
                    float mf_max_bound);
                    float mf_max_bound,
                    float mf_beta1_decay_rate,
                    float mf_beta2_decay_rate,
                    float mf_ada_epsilon);

#ifdef PADDLE_WITH_PSCORE
  void add_sparse_optimizer(
      std::unordered_map<std::string, float>& config,  // NOLINT
      const ::paddle::distributed::SparseCommonSGDRuleParameter& sgd_param,
      const std::string& prefix = "") {
    auto optimizer_name = sgd_param.name();
    if (optimizer_name == "SparseNaiveSGDRule") {
      config[prefix + "optimizer_type"] = 0;
      config[prefix + "learning_rate"] = sgd_param.naive().learning_rate();
      config[prefix + "initial_range"] = sgd_param.naive().initial_range();
      config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0];
      config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1];
    } else if (optimizer_name == "SparseAdaGradSGDRule") {
      config[prefix + "optimizer_type"] = 1;
      config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate();
      config[prefix + "initial_range"] = sgd_param.adagrad().initial_range();
      config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum();
      config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0];
      config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1];
    } else if (optimizer_name == "StdAdaGradSGDRule") {
      config[prefix + "optimizer_type"] = 2;
      config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate();
      config[prefix + "initial_range"] = sgd_param.adagrad().initial_range();
      config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum();
      config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0];
      config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1];
    } else if (optimizer_name == "SparseAdamSGDRule") {
      config[prefix + "optimizer_type"] = 3;
      config[prefix + "learning_rate"] = sgd_param.adam().learning_rate();
      config[prefix + "initial_range"] = sgd_param.adam().initial_range();
      config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate();
      config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate();
      config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon();
      config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0];
      config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1];
    } else if (optimizer_name == "SparseSharedAdamSGDRule") {
      config[prefix + "optimizer_type"] = 4;
      config[prefix + "learning_rate"] = sgd_param.adam().learning_rate();
      config[prefix + "initial_range"] = sgd_param.adam().initial_range();
      config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate();
      config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate();
      config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon();
      config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0];
      config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1];
    }
  }
  void InitializeGPUServer(paddle::distributed::PSParameter ps_param) {
    auto sparse_table =
        ps_param.server_param().downpour_server_param().downpour_table_param(0);
    auto sparse_table_accessor = sparse_table.accessor();
    auto sparse_table_accessor_parameter =
        sparse_table_accessor.ctr_accessor_param();
    accessor_class_ = sparse_table_accessor.accessor_class();
    std::unordered_map<std::string, float> config;
    config["embedx_dim"] = sparse_table_accessor.embedx_dim();
    config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
    config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
    config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
    if (accessor_class_ == "CtrDymfAccessor") {
      // optimizer config for embed_w and embedx
      add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param());
      add_sparse_optimizer(
          config, sparse_table_accessor.embedx_sgd_param(), "mf_");
    }
    fleet_config_ = config;
    GlobalAccessorTransfor::GetInstance().Init(accessor_class_);
    GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure(
        config);
    InitializeGPUServer(config);
  }
#endif
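As a concrete illustration of the flattening done above, the map handed to Configure() and InitializeGPUServer(config) for a CtrDymfAccessor table whose embedx rule is SparseSharedAdamSGDRule would look roughly like the sketch below. The function name is hypothetical and the values are the shared_adam defaults added to distributed_strategy.py later in this commit, not values read from a real table.

#include <string>
#include <unordered_map>

// Hypothetical helper, only to show the key naming produced by
// add_sparse_optimizer(config, embedx_sgd_param(), "mf_").
std::unordered_map<std::string, float> ExampleSharedAdamEmbedxConfig() {
  std::unordered_map<std::string, float> config;
  config["mf_optimizer_type"] = 4;        // 4 == SparseSharedAdamSGDRule
  config["mf_learning_rate"] = 0.001f;
  config["mf_initial_range"] = 1e-4f;
  config["mf_beta1_decay_rate"] = 0.9f;
  config["mf_beta2_decay_rate"] = 0.999f;
  config["mf_ada_epsilon"] = 1e-8f;
  config["mf_min_bound"] = -10.0f;        // weight_bounds()[0]
  config["mf_max_bound"] = 10.0f;         // weight_bounds()[1]
  return config;
}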
  void InitializeGPUServer(std::unordered_map<std::string, float> config) {
    float nonclk_coeff = (config.find("nonclk_coeff") == config.end())
                             ? 1.0
...
...
@@ -288,54 +340,83 @@ class PSGPUWrapper {
    float clk_coeff =
        (config.find("clk_coeff") == config.end()) ? 1.0 : config["clk_coeff"];
    float min_bound = (config.find("min_bound") == config.end())
-                         ? -10000.0
+                         ? -10.0
                          : config["min_bound"];
-   float max_bound = (config.find("max_bound") == config.end())
-                         ? 10000.0
-                         : config["max_bound"];
+   float max_bound = (config.find("max_bound") == config.end())
+                         ? 10.0
+                         : config["max_bound"];
    float learning_rate = (config.find("learning_rate") == config.end())
-                             ? 1.0
+                             ? 0.05
                              : config["learning_rate"];
    float initial_g2sum = (config.find("initial_g2sum") == config.end())
-                             ? 1.0
+                             ? 3.0
                              : config["initial_g2sum"];
    float initial_range = (config.find("initial_range") == config.end())
-                             ? 1.0
+                             ? 1e-4
                              : config["initial_range"];
    float beta1_decay_rate = (config.find("beta1_decay_rate") == config.end())
                                 ? 0.9
                                 : config["beta1_decay_rate"];
    float beta2_decay_rate = (config.find("beta2_decay_rate") == config.end())
                                 ? 0.999
                                 : config["beta2_decay_rate"];
    float ada_epsilon = (config.find("ada_epsilon") == config.end())
                            ? 1e-8
                            : config["ada_epsilon"];
    // mf config settings
    float mf_create_thresholds =
        (config.find("mf_create_thresholds") == config.end())
            ? static_cast<float>(1.0)
            : config["mf_create_thresholds"];
    float mf_learning_rate = (config.find("mf_learning_rate") == config.end())
-                                ? 1.0
+                                ? 0.05
                                 : config["mf_learning_rate"];
    float mf_initial_g2sum = (config.find("mf_initial_g2sum") == config.end())
-                                ? 1.0
+                                ? 3.0
                                 : config["mf_initial_g2sum"];
    float mf_initial_range = (config.find("mf_initial_range") == config.end())
-                                ? 1.0
+                                ? 1e-4
                                 : config["mf_initial_range"];
    float mf_min_bound = (config.find("mf_min_bound") == config.end())
-                            ? 1.0
+                            ? -10.0
                             : config["mf_min_bound"];
    float mf_max_bound = (config.find("mf_max_bound") == config.end())
-                            ? 1.0
+                            ? 10.0
                             : config["mf_max_bound"];
    float mf_beta1_decay_rate =
        (config.find("mf_beta1_decay_rate") == config.end())
            ? 0.9
            : config["mf_beta1_decay_rate"];
    float mf_beta2_decay_rate =
        (config.find("mf_beta2_decay_rate") == config.end())
            ? 0.999
            : config["mf_beta2_decay_rate"];
    float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end())
                               ? 1e-8
                               : config["mf_ada_epsilon"];
    this->SetSparseSGD(nonclk_coeff,
                       clk_coeff,
                       min_bound,
                       max_bound,
                       learning_rate,
                       initial_g2sum,
-                      initial_range);
+                      initial_range,
+                      beta1_decay_rate,
+                      beta2_decay_rate,
+                      ada_epsilon);
    this->SetEmbedxSGD(mf_create_thresholds,
                       mf_learning_rate,
                       mf_initial_g2sum,
                       mf_initial_range,
                       mf_min_bound,
-                      mf_max_bound);
+                      mf_max_bound,
+                      mf_beta1_decay_rate,
+                      mf_beta2_decay_rate,
+                      mf_ada_epsilon);
    // set optimizer type(naive,adagrad,std_adagrad,adam,share_adam)
    optimizer_type_ = (config.find("optimizer_type") == config.end())
                          ? 1
                          : static_cast<int>(config["optimizer_type"]);
  }
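The three new hyper-parameters (beta1_decay_rate, beta2_decay_rate, ada_epsilon) feed the Adam-style update added for optimizer types 3 (adam) and 4 (shared_adam); judging by its name, SparseSharedAdamSGDRule shares the moment statistics across the embedx dimensions instead of keeping one pair per dimension. Below is a minimal, self-contained sketch of that update, assuming the standard Adam formulation with bias correction and clipping to [min_bound, max_bound]; the names are made up and this is not the actual GPUPS kernel.

#include <algorithm>
#include <cmath>

struct ToyAdamState {
  float gsum = 0.f;       // first moment  (m)
  float g2sum = 0.f;      // second moment (v)
  float beta1_pow = 1.f;  // running beta1^t
  float beta2_pow = 1.f;  // running beta2^t
};

// One Adam step for a single weight, clipped to [min_bound, max_bound].
inline void ToyAdamUpdate(float* w, float g, ToyAdamState* s,
                          float learning_rate, float beta1_decay_rate,
                          float beta2_decay_rate, float ada_epsilon,
                          float min_bound, float max_bound) {
  s->gsum = beta1_decay_rate * s->gsum + (1.f - beta1_decay_rate) * g;
  s->g2sum = beta2_decay_rate * s->g2sum + (1.f - beta2_decay_rate) * g * g;
  s->beta1_pow *= beta1_decay_rate;
  s->beta2_pow *= beta2_decay_rate;
  float m_hat = s->gsum / (1.f - s->beta1_pow);   // bias-corrected moments
  float v_hat = s->g2sum / (1.f - s->beta2_pow);
  *w -= learning_rate * m_hat / (std::sqrt(v_hat) + ada_epsilon);
  *w = std::min(std::max(*w, min_bound), max_bound);
}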
  void SetDate(int year, int month, int day) {
...
...
@@ -380,7 +461,7 @@ class PSGPUWrapper {
    if (slot_info_initialized_) {
      return;
    }
-   SlotRecordDataset* dataset = dynamic_cast<SlotRecordDataset*>(dataset_);
+   SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
    auto slots_vec = dataset->GetSlots();
    slot_offset_vector_.clear();
    for (auto& slot : slot_vector_) {
...
...
@@ -421,10 +502,13 @@ class PSGPUWrapper {
    for (size_t i = 0; i < slot_index_vec_.size(); i++) {
      slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]];
    }
-   val_type_size_ =
-       TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1));
-   grad_type_size_ =
-       TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float)));
+   auto accessor_wrapper_ptr =
+       GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
+   val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
+   grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
    VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_
            << " grad_type_size_:" << grad_type_size_;
    slot_info_initialized_ = true;
  }
#endif
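The old inline computation rounded the raw byte size of a feature value up to an 8-byte boundary with TYPEALIGN; the new code asks the accessor wrapper for the sizes, so the byte layout can follow the configured accessor and optimizer rather than the fixed FeatureValue struct. A small standalone illustration of the old rounding (the raw sizes used here are invented for the example):

#include <cstdint>
#include <cstdio>

// Same rounding as TYPEALIGN(8, len): round len up to a multiple of 8.
static inline uint64_t TypeAlign8(uint64_t len) {
  return (len + 7ULL) & ~7ULL;
}

int main() {
  // A value of 80 raw bytes stays 80, while 83 raw bytes is padded to 88.
  std::printf("%llu %llu\n",
              (unsigned long long)TypeAlign8(80),
              (unsigned long long)TypeAlign8(83));
  return 0;
}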
...
...
@@ -445,6 +529,12 @@ class PSGPUWrapper {
                     const std::string& conf);
#endif

#ifdef PADDLE_WITH_PSCORE
  void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) {
    cpu_table_accessor_ = accessor;
  }
#endif

 private:
  static std::shared_ptr<PSGPUWrapper> s_instance_;
  Dataset* dataset_;
...
...
@@ -497,6 +587,12 @@ class PSGPUWrapper {
  int day_;
  bool slot_info_initialized_ = false;
  int use_afs_api_ = 0;
  int optimizer_type_ = 1;
  std::string accessor_class_;
  std::unordered_map<std::string, float> fleet_config_;
#ifdef PADDLE_WITH_PSCORE
  paddle::distributed::ValueAccessor* cpu_table_accessor_;
#endif

#ifdef PADDLE_WITH_CUDA
  std::vector<MemoryPool*> mem_pools_;
...
...
@@ -521,6 +617,7 @@ class PSGPUWrapper {
  bool running_ = false;
  std::vector<std::shared_ptr<ThreadPool>> pull_thread_pool_;
  std::vector<std::shared_ptr<ThreadPool>> hbm_thread_pool_;
  OptimizerConfig optimizer_config_;

 protected:
  static bool is_initialized_;
...
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.kps
View file @ b8d106e1
...
...
@@ -28,9 +28,13 @@ limitations under the License. */
namespace paddle {
namespace framework {
__global__ void PullCopy(float* dest, const FeatureValue* src,
const long long* len, int hidden, int slot_num,
int total_len, unsigned long long* keys) {
__global__ void PullCopy(float* dest,
const FeatureValue* src,
const long long* len,
int hidden,
int slot_num,
int total_len,
unsigned long long* keys) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
...
...
@@ -42,8 +46,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src,
GM2LM(len, local_len, slot_num * sizeof(int64_t));
__global_ptr__ unsigned long long* local_keys[slot_num];
GM2LM(
keys, local_keys,
slot_num * sizeof(__global_ptr__ unsigned long long*));
GM2LM(
keys, local_keys,
slot_num * sizeof(__global_ptr__ unsigned long long*));
__global_ptr__ float* local_dest[slot_num];
GM2LM(dest, local_dest, slot_num * sizeof(__global_ptr__ float*));
...
...
@@ -64,10 +68,11 @@ __global__ void PullCopy(float* dest, const FeatureValue* src,
// copy read_len (length) of slots' val to LM
for (int k = 0; k < slot_len; k += read_len) {
int real_read_len = min(read_len, slot_len - k);
GM2LM(src + dest_len + k, local_slot_vals,
GM2LM(src + dest_len + k,
local_slot_vals,
real_read_len * sizeof(FeatureValue));
GM2LM(
local_keys[i] + k, local_slot_keys,
real_read_len * sizeof(uint64_t));
GM2LM(
local_keys[i] + k, local_slot_keys,
real_read_len * sizeof(uint64_t));
for (int j = 0; j < real_read_len; j++) {
if (local_slot_keys[j] == 0) {
local_dest_vals[j * hidden] = 0;
...
...
@@ -89,7 +94,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src,
}
}
}
LM2GM(local_dest_vals, local_dest[i] + k * hidden,
LM2GM(local_dest_vals,
local_dest[i] + k * hidden,
real_read_len * hidden * sizeof(float));
}
}
...
...
@@ -97,7 +103,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src,
__global__ void CopyKeysKernel(unsigned long long* src_keys,
unsigned long long* dest_total_keys,
const long long* len, int slot_num,
const long long* len,
int slot_num,
int total_len) {
int cid = core_id();
int ncores = core_num();
...
...
@@ -110,7 +117,8 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys,
GM2LM(len, local_len, slot_num * sizeof(long long));
__global_ptr__ unsigned long long* local_keys[slot_num];
GM2LM(src_keys, local_keys,
GM2LM(src_keys,
local_keys,
slot_num * sizeof(__global_ptr__ unsigned long long*));
for (int i = thread_id; i < slot_num; i += nthreads) {
...
...
@@ -123,16 +131,23 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys,
for (int k = 0; k < slot_len; k += read_len) {
int real_read_len = min(read_len, slot_len - k);
GM2LM(local_keys[i] + k, local_slot_keys,
GM2LM(local_keys[i] + k,
local_slot_keys,
real_read_len * sizeof(unsigned long long));
LM2GM(local_slot_keys, dest_total_keys + dest_len + k,
LM2GM(local_slot_keys,
dest_total_keys + dest_len + k,
real_read_len * sizeof(unsigned long long));
}
}
}
__global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len,
int hidden, int slot_num, int total_len, int bs,
__global__ void PushCopy(FeaturePushValue* dest,
float* src,
long long* len,
int hidden,
int slot_num,
int total_len,
int bs,
int* slot_vector) {
int cid = core_id();
int ncores = core_num();
...
...
@@ -163,7 +178,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len,
// copy read_len(length) of slots' grad to LM
for (int k = 0; k < slot_len; k += read_len) {
int real_read_len = min(read_len, slot_len - k);
GM2LM(local_src[i] + k * hidden, local_slot_grads,
GM2LM(local_src[i] + k * hidden,
local_slot_grads,
real_read_len * hidden * sizeof(float));
// copy from slots' grad to total grad
for (int j = 0; j < real_read_len; j++) {
...
...
@@ -176,7 +192,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len,
local_slot_grads[j * hidden + 3 + m] * -1. * bs;
}
}
LM2GM(local_dest_grads, dest + dest_len + k,
LM2GM(local_dest_grads,
dest + dest_len + k,
real_read_len * sizeof(FeaturePushValue));
}
}
...
...
@@ -184,40 +201,11 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len,
PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; }
void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len, const int slot_num,
const int hidden_size,
const int64_t total_length) {
XPUStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::XPUDeviceContext*>(dev_ctx)
->x_context()
->xpu_stream;
// float* buf_value = nullptr;
// xpu_malloc(reinterpret_cast<void**>(&buf_value),
// values.size() * sizeof(float*));
// float** gpu_values = reinterpret_cast<float**>(&buf_value);
float* gpu_values = nullptr;
xpu_malloc(reinterpret_cast<void**>(&gpu_values),
values.size() * sizeof(float*));
xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*),
XPU_HOST_TO_DEVICE);
// unsigned long long** c_keys = (unsigned long long**)gpu_keys;
unsigned long long* c_keys = reinterpret_cast<unsigned long long*>(gpu_keys);
const long long* c_len = (const long long*)gpu_len;
PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size,
slot_num, total_length, c_keys);
xpu_wait(stream);
}
void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys, uint64_t* total_keys,
const int64_t* gpu_len, int slot_num,
uint64_t** origin_keys,
uint64_t* total_keys,
const int64_t* gpu_len,
int slot_num,
int total_len) {
XPUStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
...
...
@@ -232,66 +220,49 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
xpu_wait(stream);
}
void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
FeaturePushValue* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const int hidden_size,
const int64_t total_length,
const int batch_size) {
XPUStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::XPUDeviceContext*>(dev_ctx)
->x_context()
->xpu_stream;
auto slot_lengths_lod = slot_lengths;
for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
}
float* gpu_values = nullptr;
int64_t* gpu_len = nullptr;
int* d_slot_vector = nullptr;
xpu_malloc(reinterpret_cast<void**>(&gpu_values),
grad_values.size() * sizeof(float*));
xpu_malloc(reinterpret_cast<void**>(&gpu_len),
slot_lengths.size() * sizeof(int64_t));
xpu_malloc(reinterpret_cast<void**>(&d_slot_vector),
slot_lengths_lod.size() * sizeof(int));
xpu_memcpy(gpu_values, grad_values.data(),
grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE);
xpu_memcpy(gpu_len, slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE);
xpu_memcpy(d_slot_vector, slot_vector_.data(),
slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE);
long long* c_len = (long long*)gpu_len;
PushCopy<<<2, 64, stream>>>(total_grad_values_gpu, gpu_values, c_len,
hidden_size, slot_lengths.size(), total_length,
batch_size, d_slot_vector);
xpu_wait(stream);
}
void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff,
float min_bound, float max_bound,
float learning_rate, float initial_g2sum,
float initial_range) {
void PSGPUWrapper::SetSparseSGD(float nonclk_coeff,
float clk_coeff,
float min_bound,
float max_bound,
float learning_rate,
float initial_g2sum,
float initial_range,
float beta1_decay_rate,
float beta2_decay_rate,
float ada_epsilon) {
OptimizerConfig optimizer_config;
optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound,
learning_rate, initial_g2sum, initial_range);
optimizer_config.set_sparse_sgd(nonclk_coeff,
clk_coeff,
min_bound,
max_bound,
learning_rate,
initial_g2sum,
initial_range,
beta1_decay_rate,
beta2_decay_rate,
ada_epsilon);
HeterPs_->set_sparse_sgd(optimizer_config);
}
void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_learning_rate, float mf_initial_g2sum,
float mf_initial_range, float mf_min_bound,
float mf_max_bound) {
float mf_learning_rate,
float mf_initial_g2sum,
float mf_initial_range,
float mf_min_bound,
float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon) {
OptimizerConfig optimizer_config;
optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate,
mf_initial_g2sum, mf_initial_range,
mf_min_bound, mf_max_bound);
optimizer_config.set_embedx_sgd(mf_create_thresholds,
mf_learning_rate,
mf_initial_g2sum,
mf_initial_range,
mf_min_bound,
mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
HeterPs_->set_embedx_sgd(optimizer_config);
}
...
...
python/paddle/distributed/fleet/base/distributed_strategy.py
View file @ b8d106e1
...
...
@@ -594,6 +594,21 @@ class DistributedStrategy(object):
                bounds = strategy.get(prefix + 'sparse_weight_bounds',
                                      [-10, 10])
                sgd.adam.weight_bounds.extend(bounds)
            elif optimizer_name == "shared_adam":
                sgd.name = 'SparseSharedAdamSGDRule'
                sgd.adam.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.001)
                sgd.adam.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4)
                sgd.adam.beta1_decay_rate = strategy.get(
                    prefix + 'sparse_beta1_decay_rate', 0.9)
                sgd.adam.beta2_decay_rate = strategy.get(
                    prefix + 'sparse_beta2_decay_rate', 0.999)
                sgd.adam.ada_epsilon = strategy.get(
                    prefix + 'sparse_ada_epsilon', 1e-8)
                bounds = strategy.get(prefix + 'sparse_weight_bounds',
                                      [-10, 10])
                sgd.adam.weight_bounds.extend(bounds)

        def set_sparse_table_config(table_data, config):
            for key in config:
...
...
python/paddle/distributed/ps/the_one_ps.py
View file @ b8d106e1
...
...
@@ -195,7 +195,7 @@ class Accessor:
            sgd_param.naive.initial_range = 0.0001
            if len(sgd_param.naive.weight_bounds) == 0:
                sgd_param.naive.weight_bounds.extend([-10.0, 10.0])

-       if sgd_param.name == "SparseAdamSGDRule":
+       if sgd_param.name == "SparseAdamSGDRule" or sgd_param.name == "SparseSharedAdamSGDRule":
            if not sgd_param.adam.HasField("learning_rate"):
                sgd_param.adam.learning_rate = 0.001
            if not sgd_param.adam.HasField("initial_range"):
...
...
python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py
0 → 100644
View file @ b8d106e1
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import os

os.environ["WITH_DISTRIBUTE"] = "ON"

import unittest
import tempfile
import shutil
import paddle
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet

paddle.enable_static()

# For Net
base_lr = 0.2
emb_lr = base_lr * 3
dict_dim = 1500
emb_dim = 128
hid_dim = 128
margin = 0.1
sample_rate = 1
batch_size = 4


# this unittest is tested for SparseSharedAdamSGDRule
class TestPSPassWithBow(unittest.TestCase):

    def net(self):

        def get_acc(cos_q_nt, cos_q_pt, batch_size):
            cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
            cond = fluid.layers.cast(cond, dtype='float64')
            cond_3 = fluid.layers.reduce_sum(cond)
            acc = fluid.layers.elementwise_div(
                cond_3,
                fluid.layers.fill_constant(shape=[1],
                                           value=batch_size * 1.0,
                                           dtype='float64'),
                name="simnet_acc")
            return acc

        def get_loss(cos_q_pt, cos_q_nt):
            loss_op1 = fluid.layers.elementwise_sub(
                fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
                                                           shape=[-1, 1],
                                                           value=margin,
                                                           dtype='float32'),
                cos_q_pt)
            loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
            loss_op3 = fluid.layers.elementwise_max(
                fluid.layers.fill_constant_batch_size_like(input=loss_op2,
                                                           shape=[-1, 1],
                                                           value=0.0,
                                                           dtype='float32'),
                loss_op2)
            avg_cost = fluid.layers.mean(loss_op3)
            return avg_cost

        is_distributed = False
        is_sparse = True

        # query
        q = fluid.layers.data(name="query_ids",
                              shape=[1],
                              dtype="int64",
                              lod_level=1)
        # embedding
        q_emb = fluid.contrib.layers.sparse_embedding(
            input=q,
            size=[dict_dim, emb_dim],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.01),
                name="__emb__",
                learning_rate=emb_lr))
        q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
        # vsum
        q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
        q_ss = fluid.layers.softsign(q_sum)
        # fc layer after conv
        q_fc = fluid.layers.fc(
            input=q_ss,
            size=hid_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.01),
                name="__q_fc__",
                learning_rate=base_lr))
        # label data
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        # pt
        pt = fluid.layers.data(name="pos_title_ids",
                               shape=[1],
                               dtype="int64",
                               lod_level=1)
        # embedding
        pt_emb = fluid.contrib.layers.sparse_embedding(
            input=pt,
            size=[dict_dim, emb_dim],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.01),
                name="__emb__",
                learning_rate=emb_lr))
        pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
        # vsum
        pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
        pt_ss = fluid.layers.softsign(pt_sum)
        # fc layer
        pt_fc = fluid.layers.fc(
            input=pt_ss,
            size=hid_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.01),
                name="__fc__",
                learning_rate=base_lr),
            bias_attr=fluid.ParamAttr(name="__fc_b__"))
        # nt
        nt = fluid.layers.data(name="neg_title_ids",
                               shape=[1],
                               dtype="int64",
                               lod_level=1)
        # embedding
        nt_emb = fluid.contrib.layers.sparse_embedding(
            input=nt,
            size=[dict_dim, emb_dim],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.01),
                name="__emb__",
                learning_rate=emb_lr))
        nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
        # vsum
        nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
        nt_ss = fluid.layers.softsign(nt_sum)
        # fc layer
        nt_fc = fluid.layers.fc(
            input=nt_ss,
            size=hid_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.01),
                name="__fc__",
                learning_rate=base_lr),
            bias_attr=fluid.ParamAttr(name="__fc_b__"))
        cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
        cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
        # loss
        avg_cost = get_loss(cos_q_pt, cos_q_nt)
        # acc
        acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
        return [avg_cost, acc, cos_q_pt]

    def test(self):
        os.environ["PADDLE_PSERVER_NUMS"] = "2"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
            "127.0.0.1:36001,127.0.0.2:36001"
        os.environ["TRAINING_ROLE"] = "PSERVER"
        role = role_maker.PaddleCloudRoleMaker()
        fleet.init(role)
        loss, acc, _ = self.net()

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        configs = {}
        configs['__emb__'] = {
            "table_parameters.__emb__.accessor.embed_sgd_param.name":
            "SparseSharedAdamSGDRule",
            "table_parameters.__emb__.accessor.embedx_sgd_param.name":
            "SparseSharedAdamSGDRule",
        }
        strategy.sparse_table_configs = configs
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(loss)

        fleet.init_server()


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
View file @ b8d106e1
...
...
@@ -334,6 +334,14 @@ class TestStrategyConfig(unittest.TestCase):
            strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad.
            initial_range, 0.0001)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        configs = {}
        configs['emb'] = {"sparse_optimizer": "shared_adam"}
        strategy.fleet_desc_configs = configs
        self.assertEqual(
            strategy.sparse_table_configs[0].accessor.embed_sgd_param.adam.
            beta1_decay_rate, 0.9)

    def test_trainer_desc_configs(self):
        strategy = paddle.distributed.fleet.DistributedStrategy()
        configs = {
...
...
tools/parallel_UT_rule.py
View file @ b8d106e1
...
...
@@ -671,7 +671,8 @@ HIGH_PARALLEL_JOB_NEW = [
    'test_trt_convert_reduce_sum',
    'save_quant2_model_lstm',
    'test_trt_convert_slice',
-   'test_quant2_int8_lstm_mkldnn'
+   'test_quant2_int8_lstm_mkldnn',
+   'test_dist_fleet_ps13'
]
# mem=0 but always timeout or failed : It run 15 job each time in Single cases;
...
...