Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
1bebc092
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
1bebc092
编写于
1月 21, 2021
作者:
T
Thunderbrook
提交者:
GitHub
1月 21, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
solve build gpu task core (#30626)
* build gpu task core * format
上级
33bf6eb7
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
113 addition
and
55 deletion
+113
-55
paddle/fluid/framework/fleet/heter_context.h
paddle/fluid/framework/fleet/heter_context.h
+22
-5
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
+85
-47
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
+2
-1
paddle/fluid/framework/ps_gpu_trainer.cc
paddle/fluid/framework/ps_gpu_trainer.cc
+0
-2
paddle/fluid/pybind/ps_gpu_wrapper_py.cc
paddle/fluid/pybind/ps_gpu_wrapper_py.cc
+4
-0
未找到文件。
paddle/fluid/framework/fleet/heter_context.h
浏览文件 @
1bebc092
...
...
@@ -30,11 +30,19 @@ namespace framework {
class
HeterContext
{
public:
~
HeterContext
()
{
for
(
size_t
i
=
0
;
i
<
mutex_
.
size
();
++
i
)
{
delete
mutex_
[
i
];
}
mutex_
.
clear
();
}
Scope
*
scope_
{
nullptr
};
std
::
vector
<
std
::
vector
<
FeatureKey
>>
feature_keys_
;
std
::
vector
<
std
::
vector
<
paddle
::
ps
::
DownpourFixedFeatureValue
*>>
value_ptr_
;
std
::
vector
<
std
::
vector
<
FeatureValue
>>
feature_values_
;
std
::
vector
<
std
::
mutex
*>
mutex_lock_
;
std
::
vector
<
std
::
vector
<
FeatureValue
>>
device_values_
;
std
::
vector
<
std
::
vector
<
FeatureKey
>>
device_keys_
;
std
::
vector
<
std
::
mutex
*>
mutex_
;
uint32_t
shard_num_
=
37
;
uint64_t
size
()
{
uint64_t
total_size
=
0
;
...
...
@@ -45,19 +53,28 @@ class HeterContext {
}
void
SetShardNum
(
uint32_t
shard_num
)
{
shard_num_
=
shard_num
;
}
uint32_t
ShardNum
()
{
return
shard_num_
;
}
void
init
()
{
feature_keys_
.
resize
(
shard_num_
);
}
void
init
(
int
shard_num
,
int
device_num
)
{
shard_num_
=
shard_num
;
feature_keys_
.
resize
(
shard_num_
);
value_ptr_
.
resize
(
shard_num_
);
device_values_
.
resize
(
device_num
);
device_keys_
.
resize
(
device_num
);
mutex_
.
resize
(
device_num
);
for
(
size_t
i
=
0
;
i
<
mutex_
.
size
();
++
i
)
{
mutex_
[
i
]
=
new
std
::
mutex
();
}
}
void
batch_add_keys
(
const
std
::
vector
<
std
::
vector
<
uint64_t
>>&
thread_keys
)
{
assert
(
thread_keys
.
size
()
==
feature_keys_
.
size
());
for
(
uint32_t
i
=
0
;
i
<
shard_num_
;
i
++
)
{
int
idx
=
0
;
// mutex_lock_[i]->lock();
idx
=
feature_keys_
[
i
].
size
();
feature_keys_
[
i
].
resize
(
feature_keys_
[
i
].
size
()
+
thread_keys
[
i
].
size
());
for
(
uint64_t
j
=
0
;
j
<
thread_keys
[
i
].
size
();
j
++
)
{
feature_keys_
[
i
][
idx
+
j
]
=
thread_keys
[
i
][
j
];
}
// mutex_lock_[i]->unlock();
}
}
void
UniqueKeys
()
{
...
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
浏览文件 @
1bebc092
...
...
@@ -40,16 +40,22 @@ namespace framework {
std
::
shared_ptr
<
PSGPUWrapper
>
PSGPUWrapper
::
s_instance_
=
NULL
;
bool
PSGPUWrapper
::
is_initialized_
=
false
;
void
PSGPUWrapper
::
BuildTask
(
uint64_t
table_id
,
int
feature_dim
)
{
void
PSGPUWrapper
::
BuildTask
(
std
::
shared_ptr
<
HeterContext
>
gpu_task
,
uint64_t
table_id
,
int
feature_dim
)
{
VLOG
(
3
)
<<
"PSGPUWrapper::BuildGPUPSTask begin"
;
platform
::
Timer
timeline
;
timeline
.
Start
();
int
device_num
=
heter_devices_
.
size
();
MultiSlotDataset
*
dataset
=
dynamic_cast
<
MultiSlotDataset
*>
(
dataset_
);
std
::
shared_ptr
<
HeterContext
>
gpu_task
=
gpu_task_pool_
.
Get
(
);
gpu_task
->
init
(
thread_keys_shard_num_
,
device_num
);
auto
input_channel
=
dataset
->
GetInputChannel
();
auto
&
local_keys
=
gpu_task
->
feature_keys_
;
auto
&
local_values
=
gpu_task
->
feature_values_
;
auto
&
local_ptr
=
gpu_task
->
value_ptr_
;
auto
&
device_keys
=
gpu_task
->
device_keys_
;
auto
&
device_vals
=
gpu_task
->
device_values_
;
auto
&
device_mutex
=
gpu_task
->
mutex_
;
std
::
vector
<
std
::
thread
>
threads
;
auto
fleet_ptr
=
FleetWrapper
::
GetInstance
();
...
...
@@ -91,12 +97,11 @@ void PSGPUWrapper::BuildTask(uint64_t table_id, int feature_dim) {
t
.
join
();
}
timeline
.
Pause
();
VLOG
(
0
)
<<
"GpuPs build task cost "
<<
timeline
.
ElapsedSec
()
<<
" seconds."
;
VLOG
(
1
)
<<
"GpuPs build task cost "
<<
timeline
.
ElapsedSec
()
<<
" seconds."
;
timeline
.
Start
();
// merge thread_keys to shard_keys
gpu_task
->
init
();
for
(
size_t
i
=
0
;
i
<
thread_keys_
.
size
();
i
++
)
{
gpu_task
->
batch_add_keys
(
thread_keys_
[
i
]);
for
(
int
j
=
0
;
j
<
thread_keys_thread_num_
;
j
++
)
{
...
...
@@ -105,21 +110,20 @@ void PSGPUWrapper::BuildTask(uint64_t table_id, int feature_dim) {
}
timeline
.
Pause
();
VLOG
(
0
)
<<
"GpuPs task unique11111 cost "
<<
timeline
.
ElapsedSec
()
VLOG
(
1
)
<<
"GpuPs task unique11111 cost "
<<
timeline
.
ElapsedSec
()
<<
" seconds."
;
VLOG
(
0
)
<<
"FK1"
;
timeline
.
Start
();
gpu_task
->
UniqueKeys
();
timeline
.
Pause
();
VLOG
(
0
)
<<
"GpuPs task unique cost "
<<
timeline
.
ElapsedSec
()
<<
" seconds."
;
VLOG
(
1
)
<<
"GpuPs task unique cost "
<<
timeline
.
ElapsedSec
()
<<
" seconds."
;
for
(
int
i
=
0
;
i
<
thread_keys_shard_num_
;
i
++
)
{
local_values
[
i
].
resize
(
local_keys
[
i
].
size
()
);
VLOG
(
3
)
<<
"GpuPs shard: "
<<
i
<<
" key len: "
<<
local_keys
[
i
].
size
(
);
local_ptr
[
i
].
resize
(
local_keys
[
i
].
size
());
}
auto
ptl_func
=
[
this
,
&
local_keys
,
&
local_
values
,
&
local_
ptr
,
&
table_id
,
auto
ptl_func
=
[
this
,
&
local_keys
,
&
local_ptr
,
&
table_id
,
&
fleet_ptr
](
int
i
)
{
size_t
key_size
=
local_keys
[
i
].
size
();
auto
tt
=
fleet_ptr
->
pslib_ptr_
->
_worker_ptr
->
pull_sparse_ptr
(
...
...
@@ -136,68 +140,102 @@ void PSGPUWrapper::BuildTask(uint64_t table_id, int feature_dim) {
VLOG
(
3
)
<<
"FleetWrapper Pull sparse to local done with table size: "
<<
local_keys
[
i
].
size
();
}
for
(
size_t
num
=
0
;
num
<
local_ptr
[
i
].
size
();
++
num
)
{
float
*
ptr_val
=
local_ptr
[
i
][
num
]
->
data
();
FeatureValue
&
val
=
local_values
[
i
][
num
];
size_t
dim
=
local_ptr
[
i
][
num
]
->
size
();
val
.
delta_score
=
ptr_val
[
1
];
val
.
show
=
ptr_val
[
2
];
val
.
clk
=
ptr_val
[
3
];
val
.
slot
=
ptr_val
[
6
];
val
.
lr
=
ptr_val
[
4
];
val
.
lr_g2sum
=
ptr_val
[
5
];
if
(
dim
>
7
)
{
val
.
mf_size
=
MF_DIM
+
1
;
for
(
int
x
=
0
;
x
<
val
.
mf_size
;
x
++
)
{
val
.
mf
[
x
]
=
ptr_val
[
x
+
7
];
}
}
else
{
val
.
mf_size
=
0
;
for
(
int
x
=
0
;
x
<
MF_DIM
+
1
;
x
++
)
{
val
.
mf
[
x
]
=
0
;
};
for
(
size_t
i
=
0
;
i
<
threads
.
size
();
i
++
)
{
threads
[
i
]
=
std
::
thread
(
ptl_func
,
i
);
}
for
(
std
::
thread
&
t
:
threads
)
{
t
.
join
();
}
timeline
.
Pause
();
VLOG
(
1
)
<<
"GpuPs pull sparse cost "
<<
timeline
.
ElapsedSec
()
<<
" seconds."
;
timeline
.
Start
();
auto
build_func
=
[
device_num
,
&
local_keys
,
&
local_ptr
,
&
device_keys
,
&
device_vals
,
&
device_mutex
](
int
i
)
{
std
::
vector
<
std
::
vector
<
FeatureKey
>>
task_keys
(
device_num
);
std
::
vector
<
std
::
vector
<
paddle
::
ps
::
DownpourFixedFeatureValue
*>>
task_ptrs
(
device_num
);
for
(
size_t
j
=
0
;
j
<
local_keys
[
i
].
size
();
j
++
)
{
int
shard
=
local_keys
[
i
][
j
]
%
device_num
;
task_keys
[
shard
].
push_back
(
local_keys
[
i
][
j
]);
task_ptrs
[
shard
].
push_back
(
local_ptr
[
i
][
j
]);
}
for
(
int
dev
=
0
;
dev
<
device_num
;
dev
++
)
{
device_mutex
[
dev
]
->
lock
();
int
len
=
task_keys
[
dev
].
size
();
int
cur
=
device_keys
[
dev
].
size
();
device_keys
[
dev
].
resize
(
device_keys
[
dev
].
size
()
+
len
);
device_vals
[
dev
].
resize
(
device_vals
[
dev
].
size
()
+
len
);
for
(
int
j
=
0
;
j
<
len
;
++
j
)
{
device_keys
[
dev
][
cur
+
j
]
=
task_keys
[
dev
][
j
];
float
*
ptr_val
=
task_ptrs
[
dev
][
j
]
->
data
();
FeatureValue
&
val
=
device_vals
[
dev
][
cur
+
j
];
size_t
dim
=
task_ptrs
[
dev
][
j
]
->
size
();
val
.
delta_score
=
ptr_val
[
1
];
val
.
show
=
ptr_val
[
2
];
val
.
clk
=
ptr_val
[
3
];
val
.
slot
=
ptr_val
[
6
];
val
.
lr
=
ptr_val
[
4
];
val
.
lr_g2sum
=
ptr_val
[
5
];
if
(
dim
>
7
)
{
val
.
mf_size
=
MF_DIM
+
1
;
for
(
int
x
=
0
;
x
<
val
.
mf_size
;
x
++
)
{
val
.
mf
[
x
]
=
ptr_val
[
x
+
7
];
}
}
else
{
val
.
mf_size
=
0
;
for
(
int
x
=
0
;
x
<
MF_DIM
+
1
;
x
++
)
{
val
.
mf
[
x
]
=
0
;
}
}
}
device_mutex
[
dev
]
->
unlock
();
}
};
for
(
size_t
i
=
0
;
i
<
threads
.
size
();
i
++
)
{
threads
[
i
]
=
std
::
thread
(
ptl
_func
,
i
);
threads
[
i
]
=
std
::
thread
(
build
_func
,
i
);
}
for
(
std
::
thread
&
t
:
threads
)
{
t
.
join
();
}
timeline
.
Pause
();
VLOG
(
0
)
<<
"GpuPs pull sparse cost "
<<
timeline
.
ElapsedSec
()
<<
" seconds."
;
VLOG
(
1
)
<<
"GpuPs prepare for build hbm cost "
<<
timeline
.
ElapsedSec
()
<<
" seconds."
;
}
void
PSGPUWrapper
::
BuildGPUPS
(
uint64_t
table_id
,
int
feature_dim
)
{
BuildTask
(
table_id
,
feature_dim
);
int
device_num
=
heter_devices_
.
size
();
std
::
shared_ptr
<
HeterContext
>
gpu_task
=
gpu_task_pool_
.
Get
();
BuildTask
(
gpu_task
,
table_id
,
feature_dim
);
platform
::
Timer
timeline
;
timeline
.
Start
();
std
::
shared_ptr
<
HeterContext
>
gpu_task
=
gpu_task_pool_
.
Get
();
int
shard_num
=
gpu_task
->
feature_keys_
.
size
();
if
(
shard_num
==
0
)
{
return
;
}
std
::
vector
<
size_t
>
feature_keys_count
(
shard
_num
);
std
::
vector
<
size_t
>
feature_keys_count
(
device
_num
);
size_t
size_max
=
0
;
for
(
int
i
=
0
;
i
<
shard
_num
;
i
++
)
{
feature_keys_count
[
i
]
=
gpu_task
->
featur
e_keys_
[
i
].
size
();
for
(
int
i
=
0
;
i
<
device
_num
;
i
++
)
{
feature_keys_count
[
i
]
=
gpu_task
->
devic
e_keys_
[
i
].
size
();
size_max
=
std
::
max
(
size_max
,
feature_keys_count
[
i
]);
}
if
(
HeterPs_
)
{
HeterPs_
->
show_one_table
(
0
);
return
;
}
std
::
vector
<
std
::
thread
>
threads
(
shard
_num
);
std
::
vector
<
std
::
thread
>
threads
(
device
_num
);
HeterPs_
=
HeterPsBase
::
get_instance
(
size_max
,
resource_
);
auto
build_func
=
[
this
,
&
gpu_task
,
&
feature_keys_count
](
int
i
)
{
std
::
cout
<<
"building table: "
<<
i
<<
std
::
endl
;
this
->
HeterPs_
->
build_ps
(
i
,
gpu_task
->
featur
e_keys_
[
i
].
data
(),
gpu_task
->
featur
e_values_
[
i
].
data
(),
feature_keys_count
[
i
],
1
0000
,
2
);
this
->
HeterPs_
->
build_ps
(
i
,
gpu_task
->
devic
e_keys_
[
i
].
data
(),
gpu_task
->
devic
e_values_
[
i
].
data
(),
feature_keys_count
[
i
],
50
0000
,
2
);
HeterPs_
->
show_one_table
(
i
);
};
for
(
size_t
i
=
0
;
i
<
threads
.
size
();
i
++
)
{
...
...
@@ -207,7 +245,7 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) {
t
.
join
();
}
timeline
.
Pause
();
VLOG
(
0
)
<<
"GpuPs build table total costs: "
<<
timeline
.
ElapsedSec
()
VLOG
(
1
)
<<
"GpuPs build table total costs: "
<<
timeline
.
ElapsedSec
()
<<
" s."
;
}
...
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
浏览文件 @
1bebc092
...
...
@@ -76,7 +76,8 @@ class PSGPUWrapper {
const
int
batch_size
);
void
BuildGPUPS
(
const
uint64_t
table_id
,
int
feature_dim
);
void
BuildTask
(
uint64_t
table_id
,
int
feature_dim
);
void
BuildTask
(
std
::
shared_ptr
<
HeterContext
>
gpu_task
,
uint64_t
table_id
,
int
feature_dim
);
void
InitializeGPU
(
const
std
::
vector
<
int
>&
dev_ids
)
{
if
(
s_instance_
!=
NULL
)
{
VLOG
(
3
)
<<
"PSGPUWrapper Begin InitializeGPU"
;
...
...
paddle/fluid/framework/ps_gpu_trainer.cc
浏览文件 @
1bebc092
...
...
@@ -74,8 +74,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
workers_
[
i
]
->
Initialize
(
trainer_desc
);
workers_
[
i
]
->
SetWorkerNum
(
place_num
);
}
auto
gpu_ps_wrapper
=
PSGPUWrapper
::
GetInstance
();
gpu_ps_wrapper
->
InitializeGPU
(
dev_ids
);
return
;
}
...
...
paddle/fluid/pybind/ps_gpu_wrapper_py.cc
浏览文件 @
1bebc092
...
...
@@ -41,6 +41,10 @@ void BindPSGPUWrapper(py::module* m) {
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"init_GPU_server"
,
&
framework
::
PSGPUWrapper
::
InitializeGPUServer
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"set_dataset"
,
&
framework
::
PSGPUWrapper
::
SetDataset
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"init_gpu_ps"
,
&
framework
::
PSGPUWrapper
::
InitializeGPU
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"build_gpu_ps"
,
&
framework
::
PSGPUWrapper
::
BuildGPUPS
,
py
::
call_guard
<
py
::
gil_scoped_release
>
());
}
// end PSGPUWrapper
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录