Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
3f619290
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3f619290
编写于
5月 20, 2022
作者:
Y
yaoxuefeng
提交者:
GitHub
5月 20, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
merge dymf branch (#42714)
merge dymf branch
上级
e726960a
变更
18
展开全部
隐藏空白更改
内联
并排
Showing
18 changed file
with
991 addition
and
344 deletion
+991
-344
paddle/fluid/framework/fleet/heter_context.h
paddle/fluid/framework/fleet/heter_context.h
+0
-5
paddle/fluid/framework/fleet/heter_ps/feature_value.h
paddle/fluid/framework/fleet/heter_ps/feature_value.h
+31
-16
paddle/fluid/framework/fleet/heter_ps/hashtable.h
paddle/fluid/framework/fleet/heter_ps/hashtable.h
+2
-2
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
+33
-8
paddle/fluid/framework/fleet/heter_ps/heter_comm.h
paddle/fluid/framework/fleet/heter_ps/heter_comm.h
+26
-2
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
+282
-116
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
+97
-0
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h
+54
-0
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
+11
-0
paddle/fluid/framework/fleet/heter_ps/heter_ps.h
paddle/fluid/framework/fleet/heter_ps/heter_ps.h
+4
-1
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
+4
-0
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
+2
-0
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
+5
-4
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
+285
-187
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
+127
-0
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
+25
-1
paddle/fluid/operators/pull_gpups_sparse_op.h
paddle/fluid/operators/pull_gpups_sparse_op.h
+2
-1
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+1
-1
未找到文件。
paddle/fluid/framework/fleet/heter_context.h
浏览文件 @
3f619290
...
...
@@ -129,11 +129,6 @@ class HeterContext {
for
(
size_t
i
=
0
;
i
<
feature_dim_keys_
.
size
();
i
++
)
{
feature_dim_keys_
[
i
].
resize
(
dim_num
);
value_dim_ptr_
[
i
].
resize
(
dim_num
);
if
(
i
==
0
)
{
for
(
int
j
=
0
;
j
<
dim_num
;
j
++
)
{
feature_dim_keys_
[
i
][
j
].
push_back
(
0
);
}
}
}
device_values_
.
resize
(
device_num
);
device_dim_values_
.
resize
(
device_num
);
...
...
paddle/fluid/framework/fleet/heter_ps/feature_value.h
浏览文件 @
3f619290
...
...
@@ -32,17 +32,33 @@ struct FeatureValue {
float
lr
;
float
lr_g2sum
;
int
mf_size
;
float
mf
[
MF_DIM
+
1
]
;
int
mf_dim
;
uint64_t
cpu_ptr
;
float
mf
[
0
];
friend
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
FeatureValue
&
val
)
{
out
<<
"show: "
<<
val
.
show
<<
" clk: "
<<
val
.
clk
<<
" slot: "
<<
val
.
slot
<<
" lr: "
<<
val
.
lr
<<
" mf_size: "
<<
val
.
mf_size
<<
" mf:"
;
for
(
int
i
=
0
;
i
<
val
.
mf_size
;
++
i
)
{
<<
" lr: "
<<
val
.
lr
<<
" mf_dim: "
<<
val
.
mf_dim
<<
"cpuptr: "
<<
val
.
cpu_ptr
<<
" mf_size: "
<<
val
.
mf_size
<<
" mf:"
;
for
(
int
i
=
0
;
i
<
val
.
mf_dim
+
1
;
++
i
)
{
out
<<
" "
<<
val
.
mf
[
i
];
}
return
out
;
}
__device__
__forceinline__
void
operator
=
(
const
FeatureValue
&
in
)
{
delta_score
=
in
.
delta_score
;
show
=
in
.
show
;
clk
=
in
.
clk
;
slot
=
in
.
slot
;
lr
=
in
.
lr
;
lr_g2sum
=
in
.
lr_g2sum
;
mf_size
=
in
.
mf_size
;
mf_dim
=
in
.
mf_dim
;
cpu_ptr
=
in
.
cpu_ptr
;
for
(
int
i
=
0
;
i
<
mf_dim
+
1
;
i
++
)
{
mf
[
i
]
=
in
.
mf
[
i
];
}
}
};
struct
FeaturePushValue
{
...
...
@@ -50,20 +66,19 @@ struct FeaturePushValue {
float
clk
;
int
slot
;
float
lr_g
;
float
mf_g
[
MF_DIM
];
int
mf_dim
;
float
mf_g
[
0
];
// __device__ __forceinline__ FeaturePushValue
// operator+(const FeaturePushValue& a) const {
// FeaturePushValue out;
// out.slot = a.slot;
// out.show = a.show + show;
// out.clk = a.clk + clk;
// out.lr_g = a.lr_g + lr_g;
// for (int i = 0; i < MF_DIM; ++i) {
// out.mf_g[i] = a.mf_g[i] + mf_g[i];
// }
// return out;
// }
__device__
__forceinline__
void
operator
=
(
const
FeaturePushValue
&
in
)
{
show
=
in
.
show
;
clk
=
in
.
clk
;
slot
=
in
.
slot
;
lr_g
=
in
.
lr_g
;
mf_dim
=
in
.
mf_dim
;
for
(
int
i
=
0
;
i
<
mf_dim
;
i
++
)
{
mf_g
[
i
]
=
in
.
mf_g
[
i
];
}
}
};
}
// end namespace framework
...
...
paddle/fluid/framework/fleet/heter_ps/hashtable.h
浏览文件 @
3f619290
...
...
@@ -118,8 +118,8 @@ class HashTable {
StreamType
stream
);
template
<
typename
StreamType
>
void
insert
(
const
KeyType
*
d_keys
,
size_t
len
,
char
*
pool
,
size_t
start_index
,
StreamType
stream
);
void
insert
(
const
KeyType
*
d_keys
,
size_t
len
,
char
*
pool
,
size_t
feature_value_size
,
size_t
start_index
,
StreamType
stream
);
template
<
typename
StreamType
>
void
get
(
const
KeyType
*
d_keys
,
ValType
*
d_vals
,
size_t
len
,
...
...
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
浏览文件 @
3f619290
...
...
@@ -50,7 +50,8 @@ __global__ void insert_kernel(Table* table,
template
<
typename
Table
>
__global__
void
insert_kernel
(
Table
*
table
,
const
typename
Table
::
key_type
*
const
keys
,
size_t
len
,
char
*
pool
,
int
start_index
)
{
size_t
len
,
char
*
pool
,
size_t
feature_value_size
,
int
start_index
)
{
ReplaceOp
<
typename
Table
::
mapped_type
>
op
;
thrust
::
pair
<
typename
Table
::
key_type
,
typename
Table
::
mapped_type
>
kv
;
...
...
@@ -58,7 +59,8 @@ __global__ void insert_kernel(Table* table,
if
(
i
<
len
)
{
kv
.
first
=
keys
[
i
];
kv
.
second
=
(
Table
::
mapped_type
)(
pool
+
(
start_index
+
i
)
*
80
);
uint64_t
offset
=
uint64_t
(
start_index
+
i
)
*
feature_value_size
;
kv
.
second
=
(
Table
::
mapped_type
)(
pool
+
offset
);
auto
it
=
table
->
insert
(
kv
,
op
);
assert
(
it
!=
table
->
end
()
&&
"error: insert fails: table is full"
);
}
...
...
@@ -81,14 +83,16 @@ __global__ void search_kernel(Table* table,
template
<
typename
Table
>
__global__
void
dy_mf_search_kernel
(
Table
*
table
,
const
typename
Table
::
key_type
*
const
keys
,
char
*
const
vals
,
size_t
len
,
char
*
vals
,
size_t
len
,
size_t
pull_feature_value_size
)
{
const
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
len
)
{
auto
it
=
table
->
find
(
keys
[
i
]);
if
(
it
!=
table
->
end
())
{
*
(
FeatureValue
*
)(
vals
+
i
*
pull_feature_value_size
)
=
*
(
it
->
second
);
uint64_t
offset
=
i
*
pull_feature_value_size
;
FeatureValue
&
cur
=
*
(
FeatureValue
*
)(
vals
+
offset
);
FeatureValue
&
input
=
*
(
FeatureValue
*
)(
it
->
second
);
}
}
}
...
...
@@ -121,7 +125,7 @@ __global__ void dy_mf_update_kernel(Table* table,
FeaturePushValue
*
cur
=
(
FeaturePushValue
*
)(
grads
+
i
*
grad_value_size
);
sgd
.
dy_mf_update_value
(
optimizer_config
,
(
it
.
getter
())
->
second
,
*
cur
);
}
else
{
printf
(
"
yxf::
push miss key: %d"
,
keys
[
i
]);
printf
(
"
warning:
push miss key: %d"
,
keys
[
i
]);
}
}
}
...
...
@@ -201,7 +205,8 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
template
<
typename
KeyType
,
typename
ValType
>
template
<
typename
StreamType
>
void
HashTable
<
KeyType
,
ValType
>::
insert
(
const
KeyType
*
d_keys
,
size_t
len
,
char
*
pool
,
size_t
start_index
,
char
*
pool
,
size_t
feature_value_size
,
size_t
start_index
,
StreamType
stream
)
{
if
(
len
==
0
)
{
return
;
...
...
@@ -210,8 +215,8 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len,
return
;
}
const
int
grid_size
=
(
len
-
1
)
/
BLOCK_SIZE_
+
1
;
insert_kernel
<<<
grid_size
,
BLOCK_SIZE_
,
0
,
stream
>>>
(
container_
,
d_keys
,
len
,
pool
,
start_index
);
insert_kernel
<<<
grid_size
,
BLOCK_SIZE_
,
0
,
stream
>>>
(
container_
,
d_keys
,
len
,
pool
,
feature_value_size
,
start_index
);
}
template
<
typename
KeyType
,
typename
ValType
>
...
...
@@ -319,6 +324,7 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
}
template
class
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
>;
template
class
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
*
>;
template
class
HashTable
<
long
,
int
>;
template
class
HashTable
<
unsigned
long
,
int
>;
template
class
HashTable
<
unsigned
long
,
unsigned
long
>;
...
...
@@ -331,6 +337,10 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::get<
paddle
::
framework
::
FeatureValue
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
template
void
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
*
>
::
get
<
cudaStream_t
>
(
const
unsigned
long
*
d_keys
,
char
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
template
void
HashTable
<
long
,
int
>
::
get
<
cudaStream_t
>
(
const
long
*
d_keys
,
int
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
...
...
@@ -354,6 +364,11 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::insert<
const
paddle
::
framework
::
FeatureValue
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
template
void
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
*
>
::
insert
<
cudaStream_t
>
(
const
unsigned
long
*
d_keys
,
size_t
len
,
char
*
pool
,
size_t
feature_value_size
,
size_t
start_index
,
cudaStream_t
stream
);
template
void
HashTable
<
long
,
int
>
::
insert
<
cudaStream_t
>
(
const
long
*
d_keys
,
const
int
*
d_vals
,
size_t
len
,
...
...
@@ -393,6 +408,16 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::update<
sgd
,
cudaStream_t
stream
);
template
void
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
*
>
::
update
<
Optimizer
<
paddle
::
framework
::
FeatureValue
,
paddle
::
framework
::
FeaturePushValue
>
,
cudaStream_t
>
(
const
unsigned
long
*
d_keys
,
const
char
*
d_grads
,
size_t
len
,
Optimizer
<
paddle
::
framework
::
FeatureValue
,
paddle
::
framework
::
FeaturePushValue
>
sgd
,
cudaStream_t
stream
);
// template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::update<
// Optimizer<paddle::framework::FeatureValue,
...
...
paddle/fluid/framework/fleet/heter_ps/heter_comm.h
浏览文件 @
3f619290
...
...
@@ -15,10 +15,13 @@ limitations under the License. */
#pragma once
#include <thread>
#include <vector>
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#if defined(PADDLE_WITH_CUDA)
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/timer.h"
#include "thrust/pair.h"
#elif defined(PADDLE_WITH_XPU_KP)
// #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
...
...
@@ -38,6 +41,9 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
#define TYPEALIGN(ALIGNVAL, LEN) \
(((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))
template
<
typename
KeyType
,
typename
ValType
,
typename
GradType
>
class
HeterComm
{
public:
...
...
@@ -50,9 +56,13 @@ class HeterComm {
int
*
left
,
int
*
right
,
int
gpu_num
);
void
merge_grad
(
int
gpu_num
,
KeyType
*
d_keys
,
GradType
*
d_grads
,
size_t
len
,
int
&
uniq_len
);
// NOLINT
void
dynamic_merge_grad
(
int
gpu_num
,
KeyType
*
d_keys
,
GradType
*
d_grads
,
size_t
len
,
int
&
uniq_len
);
void
pull_sparse
(
int
num
,
KeyType
*
d_keys
,
ValType
*
d_vals
,
size_t
len
);
void
build_ps
(
int
num
,
KeyType
*
h_keys
,
ValType
*
h_vals
,
size_t
len
,
size_t
chunk_size
,
int
stream_num
);
void
build_ps
(
int
num
,
KeyType
*
h_keys
,
char
*
pool
,
size_t
len
,
size_t
feature_value_size
,
size_t
chunk_size
,
int
stream_num
);
void
dump
();
void
show_one_table
(
int
gpu_num
);
int
get_index_by_devid
(
int
devid
);
...
...
@@ -96,6 +106,11 @@ class HeterComm {
nccl_inter_comms_
=
inter_comms
;
node_size_
=
comm_size
;
}
void
set_multi_mf_dim
(
int
multi_mf_dim
,
int
max_mf_dim
)
{
multi_mf_dim_
=
multi_mf_dim
;
max_mf_dim_
=
max_mf_dim
;
}
#endif
bool
need_transfer
(
int
send_id
,
int
receive_id
)
{
...
...
@@ -114,8 +129,8 @@ class HeterComm {
char
*
key_storage
;
char
*
val_storage
;
int
sync
;
in
t
key_bytes_len
;
in
t
val_bytes_len
;
size_
t
key_bytes_len
;
size_
t
val_bytes_len
;
int
dev_num
;
};
...
...
@@ -206,12 +221,18 @@ class HeterComm {
void
destroy_storage
(
int
start_index
,
int
end_index
);
void
walk_to_dest
(
int
start_index
,
int
gpu_num
,
int
*
h_left
,
int
*
h_right
,
KeyType
*
src_key
,
GradType
*
src_val
);
void
walk_to_dest
(
int
start_index
,
int
gpu_num
,
int
*
h_left
,
int
*
h_right
,
KeyType
*
src_key
,
char
*
src_val
,
size_t
val_size
);
void
walk_to_src
(
int
start_index
,
int
gpu_num
,
int
*
h_left
,
int
*
h_right
,
ValType
*
src_val
);
void
walk_to_src
(
int
start_index
,
int
gpu_num
,
int
*
h_left
,
int
*
h_right
,
char
*
src_val
,
size_t
val_size
);
protected:
using
Table
=
HashTable
<
KeyType
,
ValType
>
;
using
PtrTable
=
HashTable
<
KeyType
,
ValType
*>
;
std
::
vector
<
Table
*>
tables_
;
std
::
vector
<
PtrTable
*>
ptr_tables_
;
std
::
shared_ptr
<
HeterPsResource
>
resource_
;
std
::
vector
<
std
::
vector
<
Path
>>
path_
;
float
load_factor_
{
0.75
};
...
...
@@ -221,6 +242,7 @@ class HeterComm {
private:
int
topo_aware_
{
0
};
std
::
vector
<
LocalStorage
>
storage_
;
DynamicGradMerger
merger_
;
int
feanum_
{
1800
*
2048
};
int
multi_node_
{
0
};
int
node_size_
;
...
...
@@ -228,6 +250,8 @@ class HeterComm {
#if defined(PADDLE_WITH_CUDA)
std
::
vector
<
ncclComm_t
>
nccl_inner_comms_
;
std
::
vector
<
ncclComm_t
>
nccl_inter_comms_
;
int
multi_mf_dim_
{
8
};
int
max_mf_dim_
=
8
;
std
::
vector
<
std
::
shared_ptr
<
cub
::
CachingDeviceAllocator
>>
allocators_
;
#endif
};
...
...
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
浏览文件 @
3f619290
此差异已折叠。
点击以展开。
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
浏览文件 @
3f619290
...
...
@@ -117,6 +117,52 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals,
}
}
template
<
typename
KeyType
,
typename
GradType
,
typename
T
>
__global__
void
dy_mf_fill_shard_grads_kernel
(
KeyType
*
d_shard_keys
,
KeyType
*
d_keys
,
GradType
*
d_shard_grads
,
GradType
*
d_grads
,
T
*
idx
,
size_t
len
,
size_t
grad_value_size
)
{
const
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
len
)
{
d_shard_keys
[
i
]
=
d_keys
[
idx
[
i
]];
*
(
GradType
*
)((
char
*
)
d_shard_grads
+
i
*
grad_value_size
)
=
*
(
GradType
*
)((
char
*
)
d_grads
+
uint64_t
(
idx
[
i
])
*
grad_value_size
);
}
}
__global__
void
merge_gradients_kernel
(
const
uint32_t
*
offset
,
const
uint32_t
*
fea_num
,
const
uint32_t
*
index
,
const
char
*
input
,
char
*
output
,
int
n
,
size_t
grad_value_size
,
DynamicGradMerger
&
merger_
)
{
const
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
n
)
{
uint32_t
start
=
offset
[
i
];
uint32_t
num
=
fea_num
[
i
];
int
ori_index
=
index
[
start
];
FeaturePushValue
&
out
=
*
(
FeaturePushValue
*
)(
output
+
i
*
grad_value_size
);
FeaturePushValue
&
in
=
*
(
FeaturePushValue
*
)(
input
+
size_t
(
ori_index
)
*
grad_value_size
);
merger_
.
update_one
(
out
,
in
);
for
(
int
j
=
1
;
j
<
num
;
++
j
)
{
ori_index
=
index
[
start
+
j
];
in
=
*
(
FeaturePushValue
*
)(
input
+
size_t
(
ori_index
)
*
grad_value_size
);
merger_
.
merge_one
(
out
,
in
);
}
}
}
template
<
typename
ValType
,
typename
T
>
__global__
void
dy_mf_fill_dvals_kernel
(
ValType
*
d_shard_vals
,
ValType
*
d_vals
,
T
*
idx
,
size_t
len
,
size_t
val_size
)
{
const
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
len
)
{
uint64_t
new_offset
=
uint64_t
(
idx
[
i
])
*
val_size
;
*
(
ValType
*
)((
char
*
)
d_vals
+
new_offset
)
=
*
(
ValType
*
)((
char
*
)
d_shard_vals
+
i
*
val_size
);
}
}
// cuda implemention of heter_comm_kernel.h
template
<
typename
T
,
typename
StreamType
>
void
HeterCommKernel
::
fill_idx
(
T
*
idx
,
long
long
len
,
...
...
@@ -207,8 +253,42 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage,
debug_synchronous
));
}
template
<
typename
KeyType
,
typename
GradType
,
typename
T
,
typename
StreamType
>
void
HeterCommKernel
::
dy_mf_fill_shard_grads
(
KeyType
*
d_shard_keys
,
KeyType
*
d_keys
,
GradType
*
d_shard_grads
,
GradType
*
d_grads
,
T
*
idx
,
long
long
len
,
size_t
grad_value_size
,
const
StreamType
&
stream
)
{
int
grid_size
=
(
len
-
1
)
/
block_size_
+
1
;
size_t
c_len
=
(
size_t
)
len
;
dy_mf_fill_shard_grads_kernel
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_shard_keys
,
d_keys
,
d_shard_grads
,
d_grads
,
idx
,
c_len
,
grad_value_size
);
}
template
<
typename
StreamType
>
void
HeterCommKernel
::
merge_gradient
(
const
uint32_t
*
offset
,
const
uint32_t
*
fea_num
,
const
uint32_t
*
index
,
const
char
*
input
,
char
*
output
,
int
n
,
size_t
grad_value_size
,
DynamicGradMerger
&
merger_
,
const
StreamType
&
stream
)
{
int
grid_size
=
(
n
-
1
)
/
block_size_
+
1
;
merge_gradients_kernel
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
offset
,
fea_num
,
index
,
input
,
output
,
n
,
grad_value_size
,
merger_
);
}
template
<
typename
ValType
,
typename
T
,
typename
StreamType
>
void
HeterCommKernel
::
dy_mf_fill_dvals
(
ValType
*
d_shard_vals
,
ValType
*
d_vals
,
T
*
idx
,
long
long
len
,
size_t
val_size
,
const
StreamType
&
stream
)
{
int
grid_size
=
(
len
-
1
)
/
block_size_
+
1
;
size_t
c_len
=
(
size_t
)
len
;
dy_mf_fill_dvals_kernel
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_shard_vals
,
d_vals
,
idx
,
c_len
,
val_size
);
}
template
void
HeterCommKernel
::
fill_idx
<
int
,
cudaStream_t
>(
int
*
idx
,
long
long
len
,
const
cudaStream_t
&
stream
);
template
void
HeterCommKernel
::
fill_idx
<
uint32_t
,
cudaStream_t
>(
uint32_t
*
idx
,
long
long
len
,
const
cudaStream_t
&
stream
);
template
void
HeterCommKernel
::
calc_shard_offset
<
int
,
cudaStream_t
>(
int
*
idx
,
int
*
left
,
int
*
right
,
long
long
len
,
int
total_devs
,
...
...
@@ -270,6 +350,23 @@ template void HeterCommKernel::reduce_by_key<
paddle
::
framework
::
FeaturePushValue
*
d_aggregates_out
,
int
*
d_num_runs_out
,
int
num_items
,
cudaStream_t
stream
,
bool
debug_synchronous
);
template
void
HeterCommKernel
::
dy_mf_fill_shard_grads
<
unsigned
long
,
paddle
::
framework
::
FeaturePushValue
,
int
,
cudaStream_t
>(
unsigned
long
*
d_shard_keys
,
unsigned
long
*
d_keys
,
paddle
::
framework
::
FeaturePushValue
*
d_shard_grads
,
paddle
::
framework
::
FeaturePushValue
*
d_grads
,
int
*
idx
,
long
long
len
,
size_t
grad_value_size
,
const
cudaStream_t
&
stream
);
template
void
HeterCommKernel
::
merge_gradient
<
cudaStream_t
>(
const
uint32_t
*
offset
,
const
uint32_t
*
fea_num
,
const
uint32_t
*
index
,
const
char
*
input
,
char
*
output
,
int
n
,
size_t
grad_value_size
,
DynamicGradMerger
&
merger_
,
const
cudaStream_t
&
stream
);
template
void
HeterCommKernel
::
dy_mf_fill_dvals
<
paddle
::
framework
::
FeatureValue
,
int
,
cudaStream_t
>(
paddle
::
framework
::
FeatureValue
*
d_shard_vals
,
paddle
::
framework
::
FeatureValue
*
d_vals
,
int
*
idx
,
long
long
len
,
size_t
val_size
,
const
cudaStream_t
&
stream
);
#endif
}
// namespace framework
...
...
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h
浏览文件 @
3f619290
...
...
@@ -27,6 +27,42 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
struct
DynamicGradMerger
{
template
<
typename
T
>
CUB_RUNTIME_FUNCTION
__forceinline__
__device__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
T
out
;
out
.
slot
=
a
.
slot
;
out
.
mf_dim
=
a
.
mf_dim
;
out
.
show
=
a
.
show
+
b
.
show
;
out
.
clk
=
a
.
clk
+
b
.
clk
;
out
.
lr_g
=
a
.
lr_g
+
b
.
lr_g
;
return
out
;
}
template
<
typename
T
>
__device__
__forceinline__
void
update_one
(
T
&
output
,
const
T
&
input
)
{
output
.
slot
=
input
.
slot
;
output
.
show
=
input
.
show
;
output
.
clk
=
input
.
clk
;
output
.
mf_dim
=
input
.
mf_dim
;
output
.
lr_g
=
input
.
lr_g
;
for
(
int
i
=
0
;
i
<
output
.
mf_dim
;
++
i
)
{
output
.
mf_g
[
i
]
=
input
.
mf_g
[
i
];
}
}
template
<
typename
T
>
__device__
__forceinline__
void
merge_one
(
T
&
output
,
const
T
&
input
)
{
output
.
show
+=
input
.
show
;
output
.
clk
+=
input
.
clk
;
output
.
lr_g
+=
input
.
lr_g
;
for
(
int
i
=
0
;
i
<
input
.
mf_dim
;
++
i
)
{
output
.
mf_g
[
i
]
+=
input
.
mf_g
[
i
];
}
}
};
class
HeterCommKernel
{
public:
HeterCommKernel
()
{}
...
...
@@ -80,6 +116,24 @@ class HeterCommKernel {
StreamType
stream
=
NULL
,
bool
debug_synchronous
=
false
);
template
<
typename
KeyType
,
typename
GradType
,
typename
T
,
typename
StreamType
>
void
dy_mf_fill_shard_grads
(
KeyType
*
d_shard_keys
,
KeyType
*
d_keys
,
GradType
*
d_shard_grads
,
GradType
*
d_grads
,
T
*
idx
,
long
long
len
,
size_t
grad_value_size
,
const
StreamType
&
stream
);
template
<
typename
StreamType
>
void
merge_gradient
(
const
uint32_t
*
offset
,
const
uint32_t
*
fea_num
,
const
uint32_t
*
index
,
const
char
*
input
,
char
*
output
,
int
n
,
size_t
grad_value_size
,
DynamicGradMerger
&
merger_
,
const
StreamType
&
stream
);
template
<
typename
ValType
,
typename
T
,
typename
StreamType
>
void
dy_mf_fill_dvals
(
ValType
*
d_shard_vals
,
ValType
*
d_vals
,
T
*
idx
,
long
long
len
,
size_t
val_size
,
const
StreamType
&
stream
);
private:
int
block_size_
{
256
};
};
...
...
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
浏览文件 @
3f619290
...
...
@@ -44,6 +44,13 @@ void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
comm_
->
build_ps
(
num
,
h_keys
,
h_vals
,
len
,
chunk_size
,
stream_num
);
}
void
HeterPs
::
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
char
*
pool
,
size_t
len
,
size_t
feature_value_size
,
size_t
chunk_size
,
int
stream_num
)
{
comm_
->
build_ps
(
num
,
h_keys
,
pool
,
len
,
feature_value_size
,
chunk_size
,
stream_num
);
}
int
HeterPs
::
get_index_by_devid
(
int
devid
)
{
return
comm_
->
get_index_by_devid
(
devid
);
}
...
...
@@ -72,6 +79,10 @@ void HeterPs::set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms,
comm_
->
set_nccl_comm_and_size
(
inner_comms
,
inter_comms
,
comm_size
);
}
void
HeterPs
::
set_multi_mf_dim
(
int
multi_mf_dim
,
int
max_mf_dim
)
{
comm_
->
set_multi_mf_dim
(
multi_mf_dim
,
max_mf_dim
);
}
}
// end namespace framework
}
// end namespace paddle
#endif
paddle/fluid/framework/fleet/heter_ps/heter_ps.h
浏览文件 @
3f619290
...
...
@@ -37,11 +37,14 @@ class HeterPs : public HeterPsBase {
size_t
len
)
override
;
void
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
FeatureValue
*
h_vals
,
size_t
len
,
size_t
chunk_size
,
int
stream_num
)
override
;
void
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
char
*
pool
,
size_t
len
,
size_t
feature_value_size
,
size_t
chunk_size
,
int
stream_num
)
override
;
#if defined(PADDLE_WITH_CUDA)
void
set_nccl_comm_and_size
(
const
std
::
vector
<
ncclComm_t
>&
inner_comms
,
const
std
::
vector
<
ncclComm_t
>&
inter_comms
,
int
comm_size
)
override
;
void
set_multi_mf_dim
(
int
multi_mf_dim
,
int
max_mf_dim
)
override
;
#endif
void
set_sparse_sgd
(
const
OptimizerConfig
&
optimizer_config
)
override
;
...
...
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
浏览文件 @
3f619290
...
...
@@ -35,11 +35,15 @@ class HeterPsBase {
size_t
len
)
=
0
;
virtual
void
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
FeatureValue
*
h_vals
,
size_t
len
,
size_t
chunk_size
,
int
stream_num
)
=
0
;
virtual
void
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
char
*
pool
,
size_t
len
,
size_t
feature_value_size
,
size_t
chunk_size
,
int
stream_num
)
=
0
;
virtual
int
get_index_by_devid
(
int
devid
)
=
0
;
#if defined(PADDLE_WITH_CUDA)
virtual
void
set_nccl_comm_and_size
(
const
std
::
vector
<
ncclComm_t
>&
inner_comms
,
const
std
::
vector
<
ncclComm_t
>&
inter_comms
,
int
comm_size
)
=
0
;
virtual
void
set_multi_mf_dim
(
int
multi_mf_dim
,
int
max_mf_dim
)
=
0
;
#endif
virtual
void
end_pass
()
=
0
;
virtual
void
show_one_table
(
int
gpu_num
)
=
0
;
...
...
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
浏览文件 @
3f619290
...
...
@@ -107,6 +107,8 @@ class HeterPsResource {
int
get_index_by_devid
(
int
devid
);
int
dev_id
(
int
num
);
void
set_multi_mf
(
int
multi_mf_dim
,
int
max_mf_dim
);
int
multi_mf
()
{
return
multi_mf_dim_
;
}
int
max_mf_dim
()
{
return
max_mf_dim_
;
}
ppStream
local_stream
(
int
dev_num
,
int
stream_num
);
ppStream
remote_stream
(
int
dev_num
,
int
stream_num
);
...
...
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
浏览文件 @
3f619290
...
...
@@ -125,20 +125,21 @@ class Optimizer {
if
(
optimizer_config
.
mf_create_thresholds
<=
optimizer_config
.
nonclk_coeff
*
(
ptr
->
show
-
ptr
->
clk
)
+
optimizer_config
.
clk_coeff
*
ptr
->
clk
)
{
//
ptr->mf_size = ptr->mf_dim + 1;
ptr
->
mf_size
=
ptr
->
mf_dim
+
1
;
ptr
->
mf_size
=
MF_DIM
+
1
;
//
ptr->mf_size = MF_DIM + 1;
ptr
->
mf
[
0
]
=
0
;
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
curandState
state
;
curand_init
(
clock64
(),
tid_x
,
0
,
&
state
);
for
(
int
i
=
0
;
i
<
MF_DIM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
ptr
->
mf_dim
;
++
i
)
{
ptr
->
mf
[
i
+
1
]
=
(
curand_uniform
(
&
state
))
*
optimizer_config
.
mf_initial_range
;
}
}
}
else
{
update_mf
(
optimizer_config
,
MF_DIM
,
&
(
ptr
->
mf
[
1
]),
ptr
->
mf
[
0
],
grad
.
mf_g
,
update_mf
(
optimizer_config
,
ptr
->
mf_dim
,
&
(
ptr
->
mf
[
1
]),
ptr
->
mf
[
0
],
grad
.
mf_g
,
grad
.
show
);
// for local test
}
}
...
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
浏览文件 @
3f619290
此差异已折叠。
点击以展开。
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
浏览文件 @
3f619290
...
...
@@ -61,6 +61,45 @@ __global__ void PullCopy(float** dest, const FeatureValue* src,
}
}
__global__
void
PullCopy
(
float
**
dest
,
const
FeatureValue
*
src
,
const
int64_t
*
len
,
int
slot_num
,
int
total_len
,
uint64_t
**
keys
,
uint64_t
max_val_size
,
int
*
gpu_dim
)
{
CUDA_KERNEL_LOOP
(
i
,
total_len
)
{
int
low
=
0
;
int
high
=
slot_num
-
1
;
while
(
low
<
high
)
{
int
mid
=
(
low
+
high
)
/
2
;
if
(
i
<
len
[
mid
])
high
=
mid
;
else
low
=
mid
+
1
;
}
int
x
=
low
;
int
y
=
i
-
(
x
?
len
[
x
-
1
]
:
0
);
FeatureValue
*
feature_value_ptr
=
(
FeatureValue
*
)((
char
*
)
src
+
uint64_t
(
i
)
*
uint64_t
(
max_val_size
));
int
mf_dim
=
gpu_dim
[
x
]
-
3
;
if
(
*
(
keys
[
x
]
+
y
)
==
0
)
{
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
))
=
0
;
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
1
)
=
0
;
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
2
)
=
0
;
}
else
{
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
))
=
feature_value_ptr
->
show
;
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
1
)
=
feature_value_ptr
->
clk
;
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
2
)
=
feature_value_ptr
->
lr
;
}
if
((
feature_value_ptr
)
->
mf_size
==
0
||
*
(
keys
[
x
]
+
y
)
==
0
)
{
for
(
int
j
=
0
;
j
<
mf_dim
;
j
++
)
{
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
3
+
j
)
=
0
;
}
}
else
{
for
(
int
j
=
0
;
j
<
mf_dim
;
j
++
)
{
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
3
+
j
)
=
feature_value_ptr
->
mf
[
1
+
j
];
}
}
}
}
__global__
void
CopyKeysKernel
(
uint64_t
**
src_keys
,
uint64_t
*
dest_total_keys
,
const
int64_t
*
len
,
int
slot_num
,
int
total_len
)
{
...
...
@@ -105,6 +144,35 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len,
}
}
__global__
void
PushCopyWithPool
(
FeaturePushValue
*
dest
,
float
**
src
,
int64_t
*
len
,
int
slot_num
,
uint64_t
total_len
,
int
bs
,
int
*
slot_vector
,
int
*
mf_dim_vector
,
size_t
grad_value_size
)
{
CUDA_KERNEL_LOOP
(
i
,
total_len
)
{
int
low
=
0
;
int
high
=
slot_num
-
1
;
while
(
low
<
high
)
{
int
mid
=
(
low
+
high
)
/
2
;
if
(
i
<
len
[
mid
])
high
=
mid
;
else
low
=
mid
+
1
;
}
int
x
=
low
;
int
y
=
i
-
(
x
?
len
[
low
-
1
]
:
0
);
FeaturePushValue
*
cur
=
(
FeaturePushValue
*
)((
char
*
)
dest
+
i
*
grad_value_size
);
cur
->
slot
=
slot_vector
[
x
];
int
mf_dim
=
mf_dim_vector
[
x
];
cur
->
mf_dim
=
mf_dim
;
cur
->
show
=
*
(
src
[
x
]
+
y
*
(
mf_dim
+
3
));
cur
->
clk
=
*
(
src
[
x
]
+
y
*
(
mf_dim
+
3
)
+
1
);
cur
->
lr_g
=
*
(
src
[
x
]
+
y
*
(
mf_dim
+
3
)
+
2
)
*
-
1.
*
bs
;
for
(
int
j
=
0
;
j
<
cur
->
mf_dim
;
j
++
)
{
cur
->
mf_g
[
j
]
=
*
(
src
[
x
]
+
y
*
(
mf_dim
+
3
)
+
3
+
j
)
*
-
1.
*
bs
;
}
}
}
PSGPUWrapper
::~
PSGPUWrapper
()
{
delete
HeterPs_
;
}
void
PSGPUWrapper
::
CopyForPull
(
const
paddle
::
platform
::
Place
&
place
,
...
...
@@ -128,6 +196,26 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
cudaStreamSynchronize
(
stream
);
}
void
PSGPUWrapper
::
CopyForPull
(
const
paddle
::
platform
::
Place
&
place
,
uint64_t
**
gpu_keys
,
const
std
::
vector
<
float
*>&
values
,
const
FeatureValue
*
total_values_gpu
,
const
int64_t
*
gpu_len
,
const
int
slot_num
,
const
int
hidden_size
,
const
int64_t
total_length
,
int
*
gpu_dim
)
{
auto
stream
=
dynamic_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
auto
buf_value
=
memory
::
Alloc
(
place
,
values
.
size
()
*
sizeof
(
float
*
));
float
**
gpu_values
=
reinterpret_cast
<
float
**>
(
buf_value
->
ptr
());
cudaMemcpy
(
gpu_values
,
values
.
data
(),
values
.
size
()
*
sizeof
(
float
*
),
cudaMemcpyHostToDevice
);
PullCopy
<<<
(
total_length
+
1024
-
1
)
/
1024
,
1024
,
0
,
stream
>>>
(
gpu_values
,
total_values_gpu
,
gpu_len
,
slot_num
,
total_length
,
gpu_keys
,
val_type_size_
,
gpu_dim
);
cudaStreamSynchronize
(
stream
);
}
void
PSGPUWrapper
::
CopyKeys
(
const
paddle
::
platform
::
Place
&
place
,
uint64_t
**
origin_keys
,
uint64_t
*
total_keys
,
const
int64_t
*
gpu_len
,
int
slot_num
,
...
...
@@ -177,6 +265,45 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
cudaStreamSynchronize
(
stream
);
}
void
PSGPUWrapper
::
CopyForPush
(
const
paddle
::
platform
::
Place
&
place
,
const
std
::
vector
<
const
float
*>&
grad_values
,
FeaturePushValue
*
total_grad_values_gpu
,
const
std
::
vector
<
int64_t
>&
slot_lengths
,
const
uint64_t
total_length
,
const
int
batch_size
,
size_t
grad_value_size
)
{
auto
stream
=
dynamic_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
auto
slot_lengths_lod
=
slot_lengths
;
for
(
int
i
=
1
;
i
<
slot_lengths_lod
.
size
();
i
++
)
{
slot_lengths_lod
[
i
]
+=
slot_lengths_lod
[
i
-
1
];
}
auto
buf_grad_value
=
memory
::
Alloc
(
place
,
grad_values
.
size
()
*
sizeof
(
float
*
));
auto
buf_length
=
memory
::
Alloc
(
place
,
slot_lengths
.
size
()
*
sizeof
(
int64_t
));
auto
buf_slot_vector
=
memory
::
Alloc
(
place
,
slot_lengths_lod
.
size
()
*
sizeof
(
int
));
auto
buf_mf_dim_vector
=
memory
::
Alloc
(
place
,
slot_lengths_lod
.
size
()
*
sizeof
(
int
));
float
**
gpu_values
=
reinterpret_cast
<
float
**>
(
buf_grad_value
->
ptr
());
int64_t
*
gpu_len
=
reinterpret_cast
<
int64_t
*>
(
buf_length
->
ptr
());
int
*
d_slot_vector
=
reinterpret_cast
<
int
*>
(
buf_slot_vector
->
ptr
());
int
*
d_mf_dim_vector
=
reinterpret_cast
<
int
*>
(
buf_mf_dim_vector
->
ptr
());
cudaMemcpy
(
gpu_values
,
grad_values
.
data
(),
grad_values
.
size
()
*
sizeof
(
float
*
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
gpu_len
,
slot_lengths_lod
.
data
(),
slot_lengths
.
size
()
*
sizeof
(
int64_t
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_slot_vector
,
slot_vector_
.
data
(),
slot_lengths_lod
.
size
()
*
sizeof
(
int
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_mf_dim_vector
,
slot_mf_dim_vector_
.
data
(),
slot_lengths_lod
.
size
()
*
sizeof
(
int
),
cudaMemcpyHostToDevice
);
PushCopyWithPool
<<<
(
total_length
+
1024
-
1
)
/
1024
,
1024
,
0
,
stream
>>>
(
total_grad_values_gpu
,
gpu_values
,
gpu_len
,
slot_lengths
.
size
(),
total_length
,
batch_size
,
d_slot_vector
,
d_mf_dim_vector
,
grad_value_size
);
cudaStreamSynchronize
(
stream
);
}
void
PSGPUWrapper
::
SetSparseSGD
(
float
nonclk_coeff
,
float
clk_coeff
,
float
min_bound
,
float
max_bound
,
float
learning_rate
,
float
initial_g2sum
,
...
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
浏览文件 @
3f619290
...
...
@@ -27,6 +27,7 @@ limitations under the License. */
#include <vector>
#ifdef PADDLE_WITH_GLOO
#include <gloo/broadcast.h>
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h"
...
...
@@ -54,6 +55,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_PSLIB
#include "afs_api.h"
#endif
#ifdef PADDLE_WITH_PSLIB
#include "downpour_accessor.h" // NOLINT
#endif
namespace
paddle
{
namespace
framework
{
...
...
@@ -95,12 +99,21 @@ class PSGPUWrapper {
// Constructs the wrapper in its idle state: no HeterPs instance yet
// (created later, once a GPU task is built — TODO confirm creation site),
// and one single-worker ThreadPool per key shard for pull work and for
// HBM build work, so tasks queued to the same shard run in order.
PSGPUWrapper() {
  HeterPs_ = nullptr;  // prefer nullptr over NULL for pointer init
  sleep_seconds_before_fail_exit_ = 300;
  pull_thread_pool_.resize(thread_keys_shard_num_);
  for (auto& pool : pull_thread_pool_) {
    pool.reset(new ::ThreadPool(1));  // one dedicated worker per shard
  }
  hbm_thread_pool_.resize(thread_keys_shard_num_);
  for (auto& pool : hbm_thread_pool_) {
    pool.reset(new ::ThreadPool(1));
  }
}
// Pulls sparse embeddings for `keys` from GPU-PS table `table_id` into the
// per-slot output buffers in `values`. `slot_lengths[i]` is the number of
// keys in slot i; `slot_dim` carries a per-slot embedding dimension
// (presumably for dynamic mf sizes — verify against the implementation).
// NOTE(review): assumes the key/value pointers are usable on `place`.
void PullSparse(const paddle::platform::Place& place, const int table_id,
                const std::vector<const uint64_t*>& keys,
                const std::vector<float*>& values,
                const std::vector<int64_t>& slot_lengths,
                const std::vector<int>& slot_dim, const int hidden_size);
void
PullSparse
(
const
paddle
::
platform
::
Place
&
place
,
const
int
table_id
,
const
std
::
vector
<
const
uint64_t
*>&
keys
,
const
std
::
vector
<
float
*>&
values
,
...
...
@@ -119,13 +132,23 @@ class PSGPUWrapper {
const
FeatureValue
*
total_values_gpu
,
const
int64_t
*
gpu_len
,
const
int
slot_num
,
const
int
hidden_size
,
const
int64_t
total_length
);
// Scatters pulled values from the contiguous device buffer
// `total_values_gpu` (`total_length` entries) into the per-slot output
// tensors in `values`. `gpu_len` is a device array of slot boundaries and
// `gpu_dim` a device array of per-slot dims (presumably the dynamic mf
// dimension per slot — confirm against the consuming kernel).
void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys,
                 const std::vector<float*>& values,
                 const FeatureValue* total_values_gpu, const int64_t* gpu_len,
                 const int slot_num, const int hidden_size,
                 const int64_t total_length, int* gpu_dim);
// Gathers per-slot gradient buffers (`grad_values`) into the single
// contiguous device buffer `total_grad_values_gpu`, fixed-size variant:
// every slot uses the same `hidden_size`.
void CopyForPush(const paddle::platform::Place& place,
                 const std::vector<const float*>& grad_values,
                 FeaturePushValue* total_grad_values_gpu,
                 const std::vector<int64_t>& slot_lengths,
                 const int hidden_size, const int64_t total_length,
                 const int batch_size);
// Variable-size overload of CopyForPush for per-slot mf dimensions: the
// implementation prefix-sums `slot_lengths`, copies the grad pointers plus
// slot_vector_ / slot_mf_dim_vector_ to device, and launches
// PushCopyWithPool with `grad_value_size` as the byte stride of one
// FeaturePushValue record.
void CopyForPush(const paddle::platform::Place& place,
                 const std::vector<const float*>& grad_values,
                 FeaturePushValue* total_grad_values_gpu,
                 const std::vector<int64_t>& slot_lengths,
                 const uint64_t total_length, const int batch_size,
                 size_t grad_value_size);
// Builds the GPU-side structures for a prepared task (presumably loads the
// task's keys/values into the HeterPs tables — confirm in the .cc).
void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task);
// Prepares a task before the GPU build step (presumably CPU-side key
// dedup/sharding — confirm in the .cc).
void PreBuildTask(std::shared_ptr<HeterContext> gpu_task);
...
...
@@ -428,6 +451,7 @@ class PSGPUWrapper {
// Task currently in flight; nullptr when none.
std::shared_ptr<HeterContext> current_task_ = nullptr;
// Thread that pre-builds the next task (name is singular despite the
// plural spelling — it is a single std::thread).
std::thread pre_build_threads_;
bool running_ = false;
// Per-shard single-worker pools for pull work and HBM build work.
std::vector<std::shared_ptr<ThreadPool>> pull_thread_pool_;
std::vector<std::shared_ptr<ThreadPool>> hbm_thread_pool_;
protected:
...
...
paddle/fluid/operators/pull_gpups_sparse_op.h
浏览文件 @
3f619290
...
...
@@ -26,6 +26,7 @@ template <typename T>
static
void
PullGpuPSSparseFunctor
(
const
framework
::
ExecutionContext
&
ctx
)
{
auto
inputs
=
ctx
.
MultiInput
<
framework
::
Tensor
>
(
"Ids"
);
auto
outputs
=
ctx
.
MultiOutput
<
framework
::
Tensor
>
(
"Out"
);
auto
embedding_size_vec
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"size"
);
const
auto
slot_size
=
inputs
.
size
();
std
::
vector
<
const
uint64_t
*>
all_keys
(
slot_size
);
// GpuPSPS only supports float now
...
...
@@ -44,7 +45,7 @@ static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) {
#ifdef PADDLE_WITH_HETERPS
auto
gpu_ps_ptr
=
paddle
::
framework
::
PSGPUWrapper
::
GetInstance
();
gpu_ps_ptr
->
PullSparse
(
ctx
.
GetPlace
(),
0
,
all_keys
,
all_values
,
slot_lengths
,
0
);
embedding_size_vec
,
0
);
#endif
}
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
3f619290
...
...
@@ -737,7 +737,7 @@ def _pull_gpups_sparse(input,
for i in range(len(inputs))
]
w = helper.create_parameter(
attr=helper.param_attr, shape=[
11
], dtype=dtype, is_bias=False)
attr=helper.param_attr, shape=[
size[0]
], dtype=dtype, is_bias=False)
helper.append_op(
type='pull_gpups_sparse',
inputs={'Ids': inputs,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录