Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
3f619290
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2299
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3f619290
编写于
5月 20, 2022
作者:
Y
yaoxuefeng
提交者:
GitHub
5月 20, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
merge dymf branch (#42714)
merge dymf branch
上级
e726960a
变更
18
展开全部
隐藏空白更改
内联
并排
Showing
18 changed file
with
991 addition
and
344 deletion
+991
-344
paddle/fluid/framework/fleet/heter_context.h
paddle/fluid/framework/fleet/heter_context.h
+0
-5
paddle/fluid/framework/fleet/heter_ps/feature_value.h
paddle/fluid/framework/fleet/heter_ps/feature_value.h
+31
-16
paddle/fluid/framework/fleet/heter_ps/hashtable.h
paddle/fluid/framework/fleet/heter_ps/hashtable.h
+2
-2
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
+33
-8
paddle/fluid/framework/fleet/heter_ps/heter_comm.h
paddle/fluid/framework/fleet/heter_ps/heter_comm.h
+26
-2
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
+282
-116
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
+97
-0
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h
+54
-0
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
+11
-0
paddle/fluid/framework/fleet/heter_ps/heter_ps.h
paddle/fluid/framework/fleet/heter_ps/heter_ps.h
+4
-1
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
+4
-0
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
+2
-0
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
+5
-4
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
+285
-187
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
+127
-0
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
+25
-1
paddle/fluid/operators/pull_gpups_sparse_op.h
paddle/fluid/operators/pull_gpups_sparse_op.h
+2
-1
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+1
-1
未找到文件。
paddle/fluid/framework/fleet/heter_context.h
浏览文件 @
3f619290
...
...
@@ -129,11 +129,6 @@ class HeterContext {
for
(
size_t
i
=
0
;
i
<
feature_dim_keys_
.
size
();
i
++
)
{
feature_dim_keys_
[
i
].
resize
(
dim_num
);
value_dim_ptr_
[
i
].
resize
(
dim_num
);
if
(
i
==
0
)
{
for
(
int
j
=
0
;
j
<
dim_num
;
j
++
)
{
feature_dim_keys_
[
i
][
j
].
push_back
(
0
);
}
}
}
device_values_
.
resize
(
device_num
);
device_dim_values_
.
resize
(
device_num
);
...
...
paddle/fluid/framework/fleet/heter_ps/feature_value.h
浏览文件 @
3f619290
...
...
@@ -32,17 +32,33 @@ struct FeatureValue {
float
lr
;
float
lr_g2sum
;
int
mf_size
;
float
mf
[
MF_DIM
+
1
]
;
int
mf_dim
;
uint64_t
cpu_ptr
;
float
mf
[
0
];
friend
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
FeatureValue
&
val
)
{
out
<<
"show: "
<<
val
.
show
<<
" clk: "
<<
val
.
clk
<<
" slot: "
<<
val
.
slot
<<
" lr: "
<<
val
.
lr
<<
" mf_size: "
<<
val
.
mf_size
<<
" mf:"
;
for
(
int
i
=
0
;
i
<
val
.
mf_size
;
++
i
)
{
<<
" lr: "
<<
val
.
lr
<<
" mf_dim: "
<<
val
.
mf_dim
<<
"cpuptr: "
<<
val
.
cpu_ptr
<<
" mf_size: "
<<
val
.
mf_size
<<
" mf:"
;
for
(
int
i
=
0
;
i
<
val
.
mf_dim
+
1
;
++
i
)
{
out
<<
" "
<<
val
.
mf
[
i
];
}
return
out
;
}
__device__
__forceinline__
void
operator
=
(
const
FeatureValue
&
in
)
{
delta_score
=
in
.
delta_score
;
show
=
in
.
show
;
clk
=
in
.
clk
;
slot
=
in
.
slot
;
lr
=
in
.
lr
;
lr_g2sum
=
in
.
lr_g2sum
;
mf_size
=
in
.
mf_size
;
mf_dim
=
in
.
mf_dim
;
cpu_ptr
=
in
.
cpu_ptr
;
for
(
int
i
=
0
;
i
<
mf_dim
+
1
;
i
++
)
{
mf
[
i
]
=
in
.
mf
[
i
];
}
}
};
struct
FeaturePushValue
{
...
...
@@ -50,20 +66,19 @@ struct FeaturePushValue {
float
clk
;
int
slot
;
float
lr_g
;
float
mf_g
[
MF_DIM
];
int
mf_dim
;
float
mf_g
[
0
];
// __device__ __forceinline__ FeaturePushValue
// operator+(const FeaturePushValue& a) const {
// FeaturePushValue out;
// out.slot = a.slot;
// out.show = a.show + show;
// out.clk = a.clk + clk;
// out.lr_g = a.lr_g + lr_g;
// for (int i = 0; i < MF_DIM; ++i) {
// out.mf_g[i] = a.mf_g[i] + mf_g[i];
// }
// return out;
// }
__device__
__forceinline__
void
operator
=
(
const
FeaturePushValue
&
in
)
{
show
=
in
.
show
;
clk
=
in
.
clk
;
slot
=
in
.
slot
;
lr_g
=
in
.
lr_g
;
mf_dim
=
in
.
mf_dim
;
for
(
int
i
=
0
;
i
<
mf_dim
;
i
++
)
{
mf_g
[
i
]
=
in
.
mf_g
[
i
];
}
}
};
}
// end namespace framework
...
...
paddle/fluid/framework/fleet/heter_ps/hashtable.h
浏览文件 @
3f619290
...
...
@@ -118,8 +118,8 @@ class HashTable {
StreamType
stream
);
template
<
typename
StreamType
>
void
insert
(
const
KeyType
*
d_keys
,
size_t
len
,
char
*
pool
,
size_t
start_index
,
StreamType
stream
);
void
insert
(
const
KeyType
*
d_keys
,
size_t
len
,
char
*
pool
,
size_t
feature_value_size
,
size_t
start_index
,
StreamType
stream
);
template
<
typename
StreamType
>
void
get
(
const
KeyType
*
d_keys
,
ValType
*
d_vals
,
size_t
len
,
...
...
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
浏览文件 @
3f619290
...
...
@@ -50,7 +50,8 @@ __global__ void insert_kernel(Table* table,
template
<
typename
Table
>
__global__
void
insert_kernel
(
Table
*
table
,
const
typename
Table
::
key_type
*
const
keys
,
size_t
len
,
char
*
pool
,
int
start_index
)
{
size_t
len
,
char
*
pool
,
size_t
feature_value_size
,
int
start_index
)
{
ReplaceOp
<
typename
Table
::
mapped_type
>
op
;
thrust
::
pair
<
typename
Table
::
key_type
,
typename
Table
::
mapped_type
>
kv
;
...
...
@@ -58,7 +59,8 @@ __global__ void insert_kernel(Table* table,
if
(
i
<
len
)
{
kv
.
first
=
keys
[
i
];
kv
.
second
=
(
Table
::
mapped_type
)(
pool
+
(
start_index
+
i
)
*
80
);
uint64_t
offset
=
uint64_t
(
start_index
+
i
)
*
feature_value_size
;
kv
.
second
=
(
Table
::
mapped_type
)(
pool
+
offset
);
auto
it
=
table
->
insert
(
kv
,
op
);
assert
(
it
!=
table
->
end
()
&&
"error: insert fails: table is full"
);
}
...
...
@@ -81,14 +83,16 @@ __global__ void search_kernel(Table* table,
template
<
typename
Table
>
__global__
void
dy_mf_search_kernel
(
Table
*
table
,
const
typename
Table
::
key_type
*
const
keys
,
char
*
const
vals
,
size_t
len
,
char
*
vals
,
size_t
len
,
size_t
pull_feature_value_size
)
{
const
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
len
)
{
auto
it
=
table
->
find
(
keys
[
i
]);
if
(
it
!=
table
->
end
())
{
*
(
FeatureValue
*
)(
vals
+
i
*
pull_feature_value_size
)
=
*
(
it
->
second
);
uint64_t
offset
=
i
*
pull_feature_value_size
;
FeatureValue
&
cur
=
*
(
FeatureValue
*
)(
vals
+
offset
);
FeatureValue
&
input
=
*
(
FeatureValue
*
)(
it
->
second
);
}
}
}
...
...
@@ -121,7 +125,7 @@ __global__ void dy_mf_update_kernel(Table* table,
FeaturePushValue
*
cur
=
(
FeaturePushValue
*
)(
grads
+
i
*
grad_value_size
);
sgd
.
dy_mf_update_value
(
optimizer_config
,
(
it
.
getter
())
->
second
,
*
cur
);
}
else
{
printf
(
"
yxf::
push miss key: %d"
,
keys
[
i
]);
printf
(
"
warning:
push miss key: %d"
,
keys
[
i
]);
}
}
}
...
...
@@ -201,7 +205,8 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
template
<
typename
KeyType
,
typename
ValType
>
template
<
typename
StreamType
>
void
HashTable
<
KeyType
,
ValType
>::
insert
(
const
KeyType
*
d_keys
,
size_t
len
,
char
*
pool
,
size_t
start_index
,
char
*
pool
,
size_t
feature_value_size
,
size_t
start_index
,
StreamType
stream
)
{
if
(
len
==
0
)
{
return
;
...
...
@@ -210,8 +215,8 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len,
return
;
}
const
int
grid_size
=
(
len
-
1
)
/
BLOCK_SIZE_
+
1
;
insert_kernel
<<<
grid_size
,
BLOCK_SIZE_
,
0
,
stream
>>>
(
container_
,
d_keys
,
len
,
pool
,
start_index
);
insert_kernel
<<<
grid_size
,
BLOCK_SIZE_
,
0
,
stream
>>>
(
container_
,
d_keys
,
len
,
pool
,
feature_value_size
,
start_index
);
}
template
<
typename
KeyType
,
typename
ValType
>
...
...
@@ -319,6 +324,7 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
}
template
class
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
>;
template
class
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
*
>;
template
class
HashTable
<
long
,
int
>;
template
class
HashTable
<
unsigned
long
,
int
>;
template
class
HashTable
<
unsigned
long
,
unsigned
long
>;
...
...
@@ -331,6 +337,10 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::get<
paddle
::
framework
::
FeatureValue
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
template
void
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
*
>
::
get
<
cudaStream_t
>
(
const
unsigned
long
*
d_keys
,
char
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
template
void
HashTable
<
long
,
int
>
::
get
<
cudaStream_t
>
(
const
long
*
d_keys
,
int
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
...
...
@@ -354,6 +364,11 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::insert<
const
paddle
::
framework
::
FeatureValue
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
template
void
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
*
>
::
insert
<
cudaStream_t
>
(
const
unsigned
long
*
d_keys
,
size_t
len
,
char
*
pool
,
size_t
feature_value_size
,
size_t
start_index
,
cudaStream_t
stream
);
template
void
HashTable
<
long
,
int
>
::
insert
<
cudaStream_t
>
(
const
long
*
d_keys
,
const
int
*
d_vals
,
size_t
len
,
...
...
@@ -393,6 +408,16 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::update<
sgd
,
cudaStream_t
stream
);
template
void
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
*
>
::
update
<
Optimizer
<
paddle
::
framework
::
FeatureValue
,
paddle
::
framework
::
FeaturePushValue
>
,
cudaStream_t
>
(
const
unsigned
long
*
d_keys
,
const
char
*
d_grads
,
size_t
len
,
Optimizer
<
paddle
::
framework
::
FeatureValue
,
paddle
::
framework
::
FeaturePushValue
>
sgd
,
cudaStream_t
stream
);
// template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::update<
// Optimizer<paddle::framework::FeatureValue,
...
...
paddle/fluid/framework/fleet/heter_ps/heter_comm.h
浏览文件 @
3f619290
...
...
@@ -15,10 +15,13 @@ limitations under the License. */
#pragma once
#include <thread>
#include <vector>
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#if defined(PADDLE_WITH_CUDA)
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/timer.h"
#include "thrust/pair.h"
#elif defined(PADDLE_WITH_XPU_KP)
// #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
...
...
@@ -38,6 +41,9 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
#define TYPEALIGN(ALIGNVAL, LEN) \
(((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))
template
<
typename
KeyType
,
typename
ValType
,
typename
GradType
>
class
HeterComm
{
public:
...
...
@@ -50,9 +56,13 @@ class HeterComm {
int
*
left
,
int
*
right
,
int
gpu_num
);
void
merge_grad
(
int
gpu_num
,
KeyType
*
d_keys
,
GradType
*
d_grads
,
size_t
len
,
int
&
uniq_len
);
// NOLINT
void
dynamic_merge_grad
(
int
gpu_num
,
KeyType
*
d_keys
,
GradType
*
d_grads
,
size_t
len
,
int
&
uniq_len
);
void
pull_sparse
(
int
num
,
KeyType
*
d_keys
,
ValType
*
d_vals
,
size_t
len
);
void
build_ps
(
int
num
,
KeyType
*
h_keys
,
ValType
*
h_vals
,
size_t
len
,
size_t
chunk_size
,
int
stream_num
);
void
build_ps
(
int
num
,
KeyType
*
h_keys
,
char
*
pool
,
size_t
len
,
size_t
feature_value_size
,
size_t
chunk_size
,
int
stream_num
);
void
dump
();
void
show_one_table
(
int
gpu_num
);
int
get_index_by_devid
(
int
devid
);
...
...
@@ -96,6 +106,11 @@ class HeterComm {
nccl_inter_comms_
=
inter_comms
;
node_size_
=
comm_size
;
}
void
set_multi_mf_dim
(
int
multi_mf_dim
,
int
max_mf_dim
)
{
multi_mf_dim_
=
multi_mf_dim
;
max_mf_dim_
=
max_mf_dim
;
}
#endif
bool
need_transfer
(
int
send_id
,
int
receive_id
)
{
...
...
@@ -114,8 +129,8 @@ class HeterComm {
char
*
key_storage
;
char
*
val_storage
;
int
sync
;
in
t
key_bytes_len
;
in
t
val_bytes_len
;
size_
t
key_bytes_len
;
size_
t
val_bytes_len
;
int
dev_num
;
};
...
...
@@ -206,12 +221,18 @@ class HeterComm {
void
destroy_storage
(
int
start_index
,
int
end_index
);
void
walk_to_dest
(
int
start_index
,
int
gpu_num
,
int
*
h_left
,
int
*
h_right
,
KeyType
*
src_key
,
GradType
*
src_val
);
void
walk_to_dest
(
int
start_index
,
int
gpu_num
,
int
*
h_left
,
int
*
h_right
,
KeyType
*
src_key
,
char
*
src_val
,
size_t
val_size
);
void
walk_to_src
(
int
start_index
,
int
gpu_num
,
int
*
h_left
,
int
*
h_right
,
ValType
*
src_val
);
void
walk_to_src
(
int
start_index
,
int
gpu_num
,
int
*
h_left
,
int
*
h_right
,
char
*
src_val
,
size_t
val_size
);
protected:
using
Table
=
HashTable
<
KeyType
,
ValType
>
;
using
PtrTable
=
HashTable
<
KeyType
,
ValType
*>
;
std
::
vector
<
Table
*>
tables_
;
std
::
vector
<
PtrTable
*>
ptr_tables_
;
std
::
shared_ptr
<
HeterPsResource
>
resource_
;
std
::
vector
<
std
::
vector
<
Path
>>
path_
;
float
load_factor_
{
0.75
};
...
...
@@ -221,6 +242,7 @@ class HeterComm {
private:
int
topo_aware_
{
0
};
std
::
vector
<
LocalStorage
>
storage_
;
DynamicGradMerger
merger_
;
int
feanum_
{
1800
*
2048
};
int
multi_node_
{
0
};
int
node_size_
;
...
...
@@ -228,6 +250,8 @@ class HeterComm {
#if defined(PADDLE_WITH_CUDA)
std
::
vector
<
ncclComm_t
>
nccl_inner_comms_
;
std
::
vector
<
ncclComm_t
>
nccl_inter_comms_
;
int
multi_mf_dim_
{
8
};
int
max_mf_dim_
=
8
;
std
::
vector
<
std
::
shared_ptr
<
cub
::
CachingDeviceAllocator
>>
allocators_
;
#endif
};
...
...
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
浏览文件 @
3f619290
此差异已折叠。
点击以展开。
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
浏览文件 @
3f619290
...
...
@@ -117,6 +117,52 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals,
}
}
template
<
typename
KeyType
,
typename
GradType
,
typename
T
>
__global__
void
dy_mf_fill_shard_grads_kernel
(
KeyType
*
d_shard_keys
,
KeyType
*
d_keys
,
GradType
*
d_shard_grads
,
GradType
*
d_grads
,
T
*
idx
,
size_t
len
,
size_t
grad_value_size
)
{
const
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
len
)
{
d_shard_keys
[
i
]
=
d_keys
[
idx
[
i
]];
*
(
GradType
*
)((
char
*
)
d_shard_grads
+
i
*
grad_value_size
)
=
*
(
GradType
*
)((
char
*
)
d_grads
+
uint64_t
(
idx
[
i
])
*
grad_value_size
);
}
}
__global__
void
merge_gradients_kernel
(
const
uint32_t
*
offset
,
const
uint32_t
*
fea_num
,
const
uint32_t
*
index
,
const
char
*
input
,
char
*
output
,
int
n
,
size_t
grad_value_size
,
DynamicGradMerger
&
merger_
)
{
const
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
n
)
{
uint32_t
start
=
offset
[
i
];
uint32_t
num
=
fea_num
[
i
];
int
ori_index
=
index
[
start
];
FeaturePushValue
&
out
=
*
(
FeaturePushValue
*
)(
output
+
i
*
grad_value_size
);
FeaturePushValue
&
in
=
*
(
FeaturePushValue
*
)(
input
+
size_t
(
ori_index
)
*
grad_value_size
);
merger_
.
update_one
(
out
,
in
);
for
(
int
j
=
1
;
j
<
num
;
++
j
)
{
ori_index
=
index
[
start
+
j
];
in
=
*
(
FeaturePushValue
*
)(
input
+
size_t
(
ori_index
)
*
grad_value_size
);
merger_
.
merge_one
(
out
,
in
);
}
}
}
template
<
typename
ValType
,
typename
T
>
__global__
void
dy_mf_fill_dvals_kernel
(
ValType
*
d_shard_vals
,
ValType
*
d_vals
,
T
*
idx
,
size_t
len
,
size_t
val_size
)
{
const
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
len
)
{
uint64_t
new_offset
=
uint64_t
(
idx
[
i
])
*
val_size
;
*
(
ValType
*
)((
char
*
)
d_vals
+
new_offset
)
=
*
(
ValType
*
)((
char
*
)
d_shard_vals
+
i
*
val_size
);
}
}
// cuda implemention of heter_comm_kernel.h
template
<
typename
T
,
typename
StreamType
>
void
HeterCommKernel
::
fill_idx
(
T
*
idx
,
long
long
len
,
...
...
@@ -207,8 +253,42 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage,
debug_synchronous
));
}
template
<
typename
KeyType
,
typename
GradType
,
typename
T
,
typename
StreamType
>
void
HeterCommKernel
::
dy_mf_fill_shard_grads
(
KeyType
*
d_shard_keys
,
KeyType
*
d_keys
,
GradType
*
d_shard_grads
,
GradType
*
d_grads
,
T
*
idx
,
long
long
len
,
size_t
grad_value_size
,
const
StreamType
&
stream
)
{
int
grid_size
=
(
len
-
1
)
/
block_size_
+
1
;
size_t
c_len
=
(
size_t
)
len
;
dy_mf_fill_shard_grads_kernel
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_shard_keys
,
d_keys
,
d_shard_grads
,
d_grads
,
idx
,
c_len
,
grad_value_size
);
}
template
<
typename
StreamType
>
void
HeterCommKernel
::
merge_gradient
(
const
uint32_t
*
offset
,
const
uint32_t
*
fea_num
,
const
uint32_t
*
index
,
const
char
*
input
,
char
*
output
,
int
n
,
size_t
grad_value_size
,
DynamicGradMerger
&
merger_
,
const
StreamType
&
stream
)
{
int
grid_size
=
(
n
-
1
)
/
block_size_
+
1
;
merge_gradients_kernel
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
offset
,
fea_num
,
index
,
input
,
output
,
n
,
grad_value_size
,
merger_
);
}
template
<
typename
ValType
,
typename
T
,
typename
StreamType
>
void
HeterCommKernel
::
dy_mf_fill_dvals
(
ValType
*
d_shard_vals
,
ValType
*
d_vals
,
T
*
idx
,
long
long
len
,
size_t
val_size
,
const
StreamType
&
stream
)
{
int
grid_size
=
(
len
-
1
)
/
block_size_
+
1
;
size_t
c_len
=
(
size_t
)
len
;
dy_mf_fill_dvals_kernel
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_shard_vals
,
d_vals
,
idx
,
c_len
,
val_size
);
}
template
void
HeterCommKernel
::
fill_idx
<
int
,
cudaStream_t
>(
int
*
idx
,
long
long
len
,
const
cudaStream_t
&
stream
);
template
void
HeterCommKernel
::
fill_idx
<
uint32_t
,
cudaStream_t
>(
uint32_t
*
idx
,
long
long
len
,
const
cudaStream_t
&
stream
);
template
void
HeterCommKernel
::
calc_shard_offset
<
int
,
cudaStream_t
>(
int
*
idx
,
int
*
left
,
int
*
right
,
long
long
len
,
int
total_devs
,
...
...
@@ -270,6 +350,23 @@ template void HeterCommKernel::reduce_by_key<
paddle
::
framework
::
FeaturePushValue
*
d_aggregates_out
,
int
*
d_num_runs_out
,
int
num_items
,
cudaStream_t
stream
,
bool
debug_synchronous
);
template
void
HeterCommKernel
::
dy_mf_fill_shard_grads
<
unsigned
long
,
paddle
::
framework
::
FeaturePushValue
,
int
,
cudaStream_t
>(
unsigned
long
*
d_shard_keys
,
unsigned
long
*
d_keys
,
paddle
::
framework
::
FeaturePushValue
*
d_shard_grads
,
paddle
::
framework
::
FeaturePushValue
*
d_grads
,
int
*
idx
,
long
long
len
,
size_t
grad_value_size
,
const
cudaStream_t
&
stream
);
template
void
HeterCommKernel
::
merge_gradient
<
cudaStream_t
>(
const
uint32_t
*
offset
,
const
uint32_t
*
fea_num
,
const
uint32_t
*
index
,
const
char
*
input
,
char
*
output
,
int
n
,
size_t
grad_value_size
,
DynamicGradMerger
&
merger_
,
const
cudaStream_t
&
stream
);
template
void
HeterCommKernel
::
dy_mf_fill_dvals
<
paddle
::
framework
::
FeatureValue
,
int
,
cudaStream_t
>(
paddle
::
framework
::
FeatureValue
*
d_shard_vals
,
paddle
::
framework
::
FeatureValue
*
d_vals
,
int
*
idx
,
long
long
len
,
size_t
val_size
,
const
cudaStream_t
&
stream
);
#endif
}
// namespace framework
...
...
paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h
浏览文件 @
3f619290
...
...
@@ -27,6 +27,42 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
struct
DynamicGradMerger
{
template
<
typename
T
>
CUB_RUNTIME_FUNCTION
__forceinline__
__device__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
T
out
;
out
.
slot
=
a
.
slot
;
out
.
mf_dim
=
a
.
mf_dim
;
out
.
show
=
a
.
show
+
b
.
show
;
out
.
clk
=
a
.
clk
+
b
.
clk
;
out
.
lr_g
=
a
.
lr_g
+
b
.
lr_g
;
return
out
;
}
template
<
typename
T
>
__device__
__forceinline__
void
update_one
(
T
&
output
,
const
T
&
input
)
{
output
.
slot
=
input
.
slot
;
output
.
show
=
input
.
show
;
output
.
clk
=
input
.
clk
;
output
.
mf_dim
=
input
.
mf_dim
;
output
.
lr_g
=
input
.
lr_g
;
for
(
int
i
=
0
;
i
<
output
.
mf_dim
;
++
i
)
{
output
.
mf_g
[
i
]
=
input
.
mf_g
[
i
];
}
}
template
<
typename
T
>
__device__
__forceinline__
void
merge_one
(
T
&
output
,
const
T
&
input
)
{
output
.
show
+=
input
.
show
;
output
.
clk
+=
input
.
clk
;
output
.
lr_g
+=
input
.
lr_g
;
for
(
int
i
=
0
;
i
<
input
.
mf_dim
;
++
i
)
{
output
.
mf_g
[
i
]
+=
input
.
mf_g
[
i
];
}
}
};
class
HeterCommKernel
{
public:
HeterCommKernel
()
{}
...
...
@@ -80,6 +116,24 @@ class HeterCommKernel {
StreamType
stream
=
NULL
,
bool
debug_synchronous
=
false
);
template
<
typename
KeyType
,
typename
GradType
,
typename
T
,
typename
StreamType
>
void
dy_mf_fill_shard_grads
(
KeyType
*
d_shard_keys
,
KeyType
*
d_keys
,
GradType
*
d_shard_grads
,
GradType
*
d_grads
,
T
*
idx
,
long
long
len
,
size_t
grad_value_size
,
const
StreamType
&
stream
);
template
<
typename
StreamType
>
void
merge_gradient
(
const
uint32_t
*
offset
,
const
uint32_t
*
fea_num
,
const
uint32_t
*
index
,
const
char
*
input
,
char
*
output
,
int
n
,
size_t
grad_value_size
,
DynamicGradMerger
&
merger_
,
const
StreamType
&
stream
);
template
<
typename
ValType
,
typename
T
,
typename
StreamType
>
void
dy_mf_fill_dvals
(
ValType
*
d_shard_vals
,
ValType
*
d_vals
,
T
*
idx
,
long
long
len
,
size_t
val_size
,
const
StreamType
&
stream
);
private:
int
block_size_
{
256
};
};
...
...
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
浏览文件 @
3f619290
...
...
@@ -44,6 +44,13 @@ void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
comm_
->
build_ps
(
num
,
h_keys
,
h_vals
,
len
,
chunk_size
,
stream_num
);
}
void
HeterPs
::
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
char
*
pool
,
size_t
len
,
size_t
feature_value_size
,
size_t
chunk_size
,
int
stream_num
)
{
comm_
->
build_ps
(
num
,
h_keys
,
pool
,
len
,
feature_value_size
,
chunk_size
,
stream_num
);
}
int
HeterPs
::
get_index_by_devid
(
int
devid
)
{
return
comm_
->
get_index_by_devid
(
devid
);
}
...
...
@@ -72,6 +79,10 @@ void HeterPs::set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms,
comm_
->
set_nccl_comm_and_size
(
inner_comms
,
inter_comms
,
comm_size
);
}
void
HeterPs
::
set_multi_mf_dim
(
int
multi_mf_dim
,
int
max_mf_dim
)
{
comm_
->
set_multi_mf_dim
(
multi_mf_dim
,
max_mf_dim
);
}
}
// end namespace framework
}
// end namespace paddle
#endif
paddle/fluid/framework/fleet/heter_ps/heter_ps.h
浏览文件 @
3f619290
...
...
@@ -37,11 +37,14 @@ class HeterPs : public HeterPsBase {
size_t
len
)
override
;
void
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
FeatureValue
*
h_vals
,
size_t
len
,
size_t
chunk_size
,
int
stream_num
)
override
;
void
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
char
*
pool
,
size_t
len
,
size_t
feature_value_size
,
size_t
chunk_size
,
int
stream_num
)
override
;
#if defined(PADDLE_WITH_CUDA)
void
set_nccl_comm_and_size
(
const
std
::
vector
<
ncclComm_t
>&
inner_comms
,
const
std
::
vector
<
ncclComm_t
>&
inter_comms
,
int
comm_size
)
override
;
void
set_multi_mf_dim
(
int
multi_mf_dim
,
int
max_mf_dim
)
override
;
#endif
void
set_sparse_sgd
(
const
OptimizerConfig
&
optimizer_config
)
override
;
...
...
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
浏览文件 @
3f619290
...
...
@@ -35,11 +35,15 @@ class HeterPsBase {
size_t
len
)
=
0
;
virtual
void
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
FeatureValue
*
h_vals
,
size_t
len
,
size_t
chunk_size
,
int
stream_num
)
=
0
;
virtual
void
build_ps
(
int
num
,
FeatureKey
*
h_keys
,
char
*
pool
,
size_t
len
,
size_t
feature_value_size
,
size_t
chunk_size
,
int
stream_num
)
=
0
;
virtual
int
get_index_by_devid
(
int
devid
)
=
0
;
#if defined(PADDLE_WITH_CUDA)
virtual
void
set_nccl_comm_and_size
(
const
std
::
vector
<
ncclComm_t
>&
inner_comms
,
const
std
::
vector
<
ncclComm_t
>&
inter_comms
,
int
comm_size
)
=
0
;
virtual
void
set_multi_mf_dim
(
int
multi_mf_dim
,
int
max_mf_dim
)
=
0
;
#endif
virtual
void
end_pass
()
=
0
;
virtual
void
show_one_table
(
int
gpu_num
)
=
0
;
...
...
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
浏览文件 @
3f619290
...
...
@@ -107,6 +107,8 @@ class HeterPsResource {
int
get_index_by_devid
(
int
devid
);
int
dev_id
(
int
num
);
void
set_multi_mf
(
int
multi_mf_dim
,
int
max_mf_dim
);
int
multi_mf
()
{
return
multi_mf_dim_
;
}
int
max_mf_dim
()
{
return
max_mf_dim_
;
}
ppStream
local_stream
(
int
dev_num
,
int
stream_num
);
ppStream
remote_stream
(
int
dev_num
,
int
stream_num
);
...
...
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
浏览文件 @
3f619290
...
...
@@ -125,20 +125,21 @@ class Optimizer {
if
(
optimizer_config
.
mf_create_thresholds
<=
optimizer_config
.
nonclk_coeff
*
(
ptr
->
show
-
ptr
->
clk
)
+
optimizer_config
.
clk_coeff
*
ptr
->
clk
)
{
//
ptr->mf_size = ptr->mf_dim + 1;
ptr
->
mf_size
=
ptr
->
mf_dim
+
1
;
ptr
->
mf_size
=
MF_DIM
+
1
;
//
ptr->mf_size = MF_DIM + 1;
ptr
->
mf
[
0
]
=
0
;
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
curandState
state
;
curand_init
(
clock64
(),
tid_x
,
0
,
&
state
);
for
(
int
i
=
0
;
i
<
MF_DIM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
ptr
->
mf_dim
;
++
i
)
{
ptr
->
mf
[
i
+
1
]
=
(
curand_uniform
(
&
state
))
*
optimizer_config
.
mf_initial_range
;
}
}
}
else
{
update_mf
(
optimizer_config
,
MF_DIM
,
&
(
ptr
->
mf
[
1
]),
ptr
->
mf
[
0
],
grad
.
mf_g
,
update_mf
(
optimizer_config
,
ptr
->
mf_dim
,
&
(
ptr
->
mf
[
1
]),
ptr
->
mf
[
0
],
grad
.
mf_g
,
grad
.
show
);
// for local test
}
}
...
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
浏览文件 @
3f619290
此差异已折叠。
点击以展开。
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
浏览文件 @
3f619290
...
...
@@ -61,6 +61,45 @@ __global__ void PullCopy(float** dest, const FeatureValue* src,
}
}
__global__
void
PullCopy
(
float
**
dest
,
const
FeatureValue
*
src
,
const
int64_t
*
len
,
int
slot_num
,
int
total_len
,
uint64_t
**
keys
,
uint64_t
max_val_size
,
int
*
gpu_dim
)
{
CUDA_KERNEL_LOOP
(
i
,
total_len
)
{
int
low
=
0
;
int
high
=
slot_num
-
1
;
while
(
low
<
high
)
{
int
mid
=
(
low
+
high
)
/
2
;
if
(
i
<
len
[
mid
])
high
=
mid
;
else
low
=
mid
+
1
;
}
int
x
=
low
;
int
y
=
i
-
(
x
?
len
[
x
-
1
]
:
0
);
FeatureValue
*
feature_value_ptr
=
(
FeatureValue
*
)((
char
*
)
src
+
uint64_t
(
i
)
*
uint64_t
(
max_val_size
));
int
mf_dim
=
gpu_dim
[
x
]
-
3
;
if
(
*
(
keys
[
x
]
+
y
)
==
0
)
{
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
))
=
0
;
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
1
)
=
0
;
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
2
)
=
0
;
}
else
{
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
))
=
feature_value_ptr
->
show
;
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
1
)
=
feature_value_ptr
->
clk
;
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
2
)
=
feature_value_ptr
->
lr
;
}
if
((
feature_value_ptr
)
->
mf_size
==
0
||
*
(
keys
[
x
]
+
y
)
==
0
)
{
for
(
int
j
=
0
;
j
<
mf_dim
;
j
++
)
{
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
3
+
j
)
=
0
;
}
}
else
{
for
(
int
j
=
0
;
j
<
mf_dim
;
j
++
)
{
*
(
dest
[
x
]
+
y
*
(
mf_dim
+
3
)
+
3
+
j
)
=
feature_value_ptr
->
mf
[
1
+
j
];
}
}
}
}
__global__
void
CopyKeysKernel
(
uint64_t
**
src_keys
,
uint64_t
*
dest_total_keys
,
const
int64_t
*
len
,
int
slot_num
,
int
total_len
)
{
...
...
@@ -105,6 +144,35 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len,
}
}
__global__
void
PushCopyWithPool
(
FeaturePushValue
*
dest
,
float
**
src
,
int64_t
*
len
,
int
slot_num
,
uint64_t
total_len
,
int
bs
,
int
*
slot_vector
,
int
*
mf_dim_vector
,
size_t
grad_value_size
)
{
CUDA_KERNEL_LOOP
(
i
,
total_len
)
{
int
low
=
0
;
int
high
=
slot_num
-
1
;
while
(
low
<
high
)
{
int
mid
=
(
low
+
high
)
/
2
;
if
(
i
<
len
[
mid
])
high
=
mid
;
else
low
=
mid
+
1
;
}
int
x
=
low
;
int
y
=
i
-
(
x
?
len
[
low
-
1
]
:
0
);
FeaturePushValue
*
cur
=
(
FeaturePushValue
*
)((
char
*
)
dest
+
i
*
grad_value_size
);
cur
->
slot
=
slot_vector
[
x
];
int
mf_dim
=
mf_dim_vector
[
x
];
cur
->
mf_dim
=
mf_dim
;
cur
->
show
=
*
(
src
[
x
]
+
y
*
(
mf_dim
+
3
));
cur
->
clk
=
*
(
src
[
x
]
+
y
*
(
mf_dim
+
3
)
+
1
);
cur
->
lr_g
=
*
(
src
[
x
]
+
y
*
(
mf_dim
+
3
)
+
2
)
*
-
1.
*
bs
;
for
(
int
j
=
0
;
j
<
cur
->
mf_dim
;
j
++
)
{
cur
->
mf_g
[
j
]
=
*
(
src
[
x
]
+
y
*
(
mf_dim
+
3
)
+
3
+
j
)
*
-
1.
*
bs
;
}
}
}
PSGPUWrapper
::~
PSGPUWrapper
()
{
delete
HeterPs_
;
}
void
PSGPUWrapper
::
CopyForPull
(
const
paddle
::
platform
::
Place
&
place
,
...
...
@@ -128,6 +196,26 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
cudaStreamSynchronize
(
stream
);
}
void
PSGPUWrapper
::
CopyForPull
(
const
paddle
::
platform
::
Place
&
place
,
uint64_t
**
gpu_keys
,
const
std
::
vector
<
float
*>&
values
,
const
FeatureValue
*
total_values_gpu
,
const
int64_t
*
gpu_len
,
const
int
slot_num
,
const
int
hidden_size
,
const
int64_t
total_length
,
int
*
gpu_dim
)
{
auto
stream
=
dynamic_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
auto
buf_value
=
memory
::
Alloc
(
place
,
values
.
size
()
*
sizeof
(
float
*
));
float
**
gpu_values
=
reinterpret_cast
<
float
**>
(
buf_value
->
ptr
());
cudaMemcpy
(
gpu_values
,
values
.
data
(),
values
.
size
()
*
sizeof
(
float
*
),
cudaMemcpyHostToDevice
);
PullCopy
<<<
(
total_length
+
1024
-
1
)
/
1024
,
1024
,
0
,
stream
>>>
(
gpu_values
,
total_values_gpu
,
gpu_len
,
slot_num
,
total_length
,
gpu_keys
,
val_type_size_
,
gpu_dim
);
cudaStreamSynchronize
(
stream
);
}
void
PSGPUWrapper
::
CopyKeys
(
const
paddle
::
platform
::
Place
&
place
,
uint64_t
**
origin_keys
,
uint64_t
*
total_keys
,
const
int64_t
*
gpu_len
,
int
slot_num
,
...
...
@@ -177,6 +265,45 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
cudaStreamSynchronize
(
stream
);
}
void
PSGPUWrapper
::
CopyForPush
(
const
paddle
::
platform
::
Place
&
place
,
const
std
::
vector
<
const
float
*>&
grad_values
,
FeaturePushValue
*
total_grad_values_gpu
,
const
std
::
vector
<
int64_t
>&
slot_lengths
,
const
uint64_t
total_length
,
const
int
batch_size
,
size_t
grad_value_size
)
{
auto
stream
=
dynamic_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
auto
slot_lengths_lod
=
slot_lengths
;
for
(
int
i
=
1
;
i
<
slot_lengths_lod
.
size
();
i
++
)
{
slot_lengths_lod
[
i
]
+=
slot_lengths_lod
[
i
-
1
];
}
auto
buf_grad_value
=
memory
::
Alloc
(
place
,
grad_values
.
size
()
*
sizeof
(
float
*
));
auto
buf_length
=
memory
::
Alloc
(
place
,
slot_lengths
.
size
()
*
sizeof
(
int64_t
));
auto
buf_slot_vector
=
memory
::
Alloc
(
place
,
slot_lengths_lod
.
size
()
*
sizeof
(
int
));
auto
buf_mf_dim_vector
=
memory
::
Alloc
(
place
,
slot_lengths_lod
.
size
()
*
sizeof
(
int
));
float
**
gpu_values
=
reinterpret_cast
<
float
**>
(
buf_grad_value
->
ptr
());
int64_t
*
gpu_len
=
reinterpret_cast
<
int64_t
*>
(
buf_length
->
ptr
());
int
*
d_slot_vector
=
reinterpret_cast
<
int
*>
(
buf_slot_vector
->
ptr
());
int
*
d_mf_dim_vector
=
reinterpret_cast
<
int
*>
(
buf_mf_dim_vector
->
ptr
());
cudaMemcpy
(
gpu_values
,
grad_values
.
data
(),
grad_values
.
size
()
*
sizeof
(
float
*
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
gpu_len
,
slot_lengths_lod
.
data
(),
slot_lengths
.
size
()
*
sizeof
(
int64_t
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_slot_vector
,
slot_vector_
.
data
(),
slot_lengths_lod
.
size
()
*
sizeof
(
int
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_mf_dim_vector
,
slot_mf_dim_vector_
.
data
(),
slot_lengths_lod
.
size
()
*
sizeof
(
int
),
cudaMemcpyHostToDevice
);
PushCopyWithPool
<<<
(
total_length
+
1024
-
1
)
/
1024
,
1024
,
0
,
stream
>>>
(
total_grad_values_gpu
,
gpu_values
,
gpu_len
,
slot_lengths
.
size
(),
total_length
,
batch_size
,
d_slot_vector
,
d_mf_dim_vector
,
grad_value_size
);
cudaStreamSynchronize
(
stream
);
}
void
PSGPUWrapper
::
SetSparseSGD
(
float
nonclk_coeff
,
float
clk_coeff
,
float
min_bound
,
float
max_bound
,
float
learning_rate
,
float
initial_g2sum
,
...
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
浏览文件 @
3f619290
...
...
@@ -27,6 +27,7 @@ limitations under the License. */
#include <vector>
#ifdef PADDLE_WITH_GLOO
#include <gloo/broadcast.h>
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h"
...
...
@@ -54,6 +55,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_PSLIB
#include "afs_api.h"
#endif
#ifdef PADDLE_WITH_PSLIB
#include "downpour_accessor.h" // NOLINT
#endif
namespace
paddle
{
namespace
framework
{
...
...
@@ -95,12 +99,21 @@ class PSGPUWrapper {
// Default-construct the wrapper: no HeterPs instance yet, and one
// single-threaded worker pool per key shard for pull work and for
// HBM build work, so tasks submitted to a given shard run in order.
PSGPUWrapper() {
  HeterPs_ = nullptr;  // ES.47: nullptr, not NULL, for pointer members
  sleep_seconds_before_fail_exit_ = 300;
  // One ::ThreadPool(1) per shard — presumably serializes per-shard
  // work; TODO(review): confirm single-thread pools are intentional.
  pull_thread_pool_.resize(thread_keys_shard_num_);
  for (size_t i = 0; i < pull_thread_pool_.size(); i++) {
    pull_thread_pool_[i].reset(new ::ThreadPool(1));
  }
  hbm_thread_pool_.resize(thread_keys_shard_num_);
  for (size_t i = 0; i < hbm_thread_pool_.size(); i++) {
    hbm_thread_pool_[i].reset(new ::ThreadPool(1));
  }
}
void
PullSparse
(
const
paddle
::
platform
::
Place
&
place
,
const
int
table_id
,
const
std
::
vector
<
const
uint64_t
*>&
keys
,
const
std
::
vector
<
float
*>&
values
,
const
std
::
vector
<
int64_t
>&
slot_lengths
,
const
std
::
vector
<
int
>&
slot_dim
,
const
int
hidden_size
);
void
PullSparse
(
const
paddle
::
platform
::
Place
&
place
,
const
int
table_id
,
const
std
::
vector
<
const
uint64_t
*>&
keys
,
const
std
::
vector
<
float
*>&
values
,
...
...
@@ -119,13 +132,23 @@ class PSGPUWrapper {
const
FeatureValue
*
total_values_gpu
,
const
int64_t
*
gpu_len
,
const
int
slot_num
,
const
int
hidden_size
,
const
int64_t
total_length
);
void
CopyForPull
(
const
paddle
::
platform
::
Place
&
place
,
uint64_t
**
gpu_keys
,
const
std
::
vector
<
float
*>&
values
,
const
FeatureValue
*
total_values_gpu
,
const
int64_t
*
gpu_len
,
const
int
slot_num
,
const
int
hidden_size
,
const
int64_t
total_length
,
int
*
gpu_dim
);
void
CopyForPush
(
const
paddle
::
platform
::
Place
&
place
,
const
std
::
vector
<
const
float
*>&
grad_values
,
FeaturePushValue
*
total_grad_values_gpu
,
const
std
::
vector
<
int64_t
>&
slot_lengths
,
const
int
hidden_size
,
const
int64_t
total_length
,
const
int
batch_size
);
void
CopyForPush
(
const
paddle
::
platform
::
Place
&
place
,
const
std
::
vector
<
const
float
*>&
grad_values
,
FeaturePushValue
*
total_grad_values_gpu
,
const
std
::
vector
<
int64_t
>&
slot_lengths
,
const
uint64_t
total_length
,
const
int
batch_size
,
size_t
grad_value_size
);
void
BuildGPUTask
(
std
::
shared_ptr
<
HeterContext
>
gpu_task
);
void
PreBuildTask
(
std
::
shared_ptr
<
HeterContext
>
gpu_task
);
...
...
@@ -428,6 +451,7 @@ class PSGPUWrapper {
std
::
shared_ptr
<
HeterContext
>
current_task_
=
nullptr
;
std
::
thread
pre_build_threads_
;
bool
running_
=
false
;
std
::
vector
<
std
::
shared_ptr
<
ThreadPool
>>
pull_thread_pool_
;
std
::
vector
<
std
::
shared_ptr
<
ThreadPool
>>
hbm_thread_pool_
;
protected:
...
...
paddle/fluid/operators/pull_gpups_sparse_op.h
浏览文件 @
3f619290
...
...
@@ -26,6 +26,7 @@ template <typename T>
static
void
PullGpuPSSparseFunctor
(
const
framework
::
ExecutionContext
&
ctx
)
{
auto
inputs
=
ctx
.
MultiInput
<
framework
::
Tensor
>
(
"Ids"
);
auto
outputs
=
ctx
.
MultiOutput
<
framework
::
Tensor
>
(
"Out"
);
auto
embedding_size_vec
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"size"
);
const
auto
slot_size
=
inputs
.
size
();
std
::
vector
<
const
uint64_t
*>
all_keys
(
slot_size
);
// GpuPSPS only supports float now
...
...
@@ -44,7 +45,7 @@ static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) {
#ifdef PADDLE_WITH_HETERPS
auto
gpu_ps_ptr
=
paddle
::
framework
::
PSGPUWrapper
::
GetInstance
();
gpu_ps_ptr
->
PullSparse
(
ctx
.
GetPlace
(),
0
,
all_keys
,
all_values
,
slot_lengths
,
0
);
embedding_size_vec
,
0
);
#endif
}
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
3f619290
...
...
@@ -737,7 +737,7 @@ def _pull_gpups_sparse(input,
for i in range(len(inputs))
]
w = helper.create_parameter(
attr=helper.param_attr, shape=[
11
], dtype=dtype, is_bias=False)
attr=helper.param_attr, shape=[
size[0]
], dtype=dtype, is_bias=False)
helper.append_op(
type='pull_gpups_sparse',
inputs={'Ids': inputs,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录