Commit 45d7a3ea (unverified)
Authored May 25, 2022 by danleifeng; committed via GitHub on May 25, 2022.

[GPUPS]fix gpups pscore (#42967)

Parent: b6859054

Showing 5 changed files with 55 additions and 18 deletions (+55 −18):
- paddle/fluid/framework/hogwild_worker.cc (+8 −1)
- paddle/fluid/framework/multi_trainer.cc (+30 −11)
- paddle/fluid/framework/trainer.h (+1 −0)
- python/paddle/distributed/passes/ps_trainer_pass.py (+11 −2)
- python/paddle/distributed/ps/the_one_ps.py (+5 −4)
paddle/fluid/framework/hogwild_worker.cc

```diff
@@ -219,6 +219,10 @@ void HogwildWorker::TrainFiles() {
   device_reader_->Start();
   int cur_batch;
   int batch_cnt = 0;
+
+#if defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_CUDA)
+  platform::SetDeviceId(thread_id_);
+#endif
   while ((cur_batch = device_reader_->Next()) > 0) {
     for (auto& op : ops_) {
       bool need_skip = false;
@@ -244,9 +248,12 @@ void HogwildWorker::TrainFiles() {
     ++batch_cnt;
     PrintFetchVars();
     thread_scope_->DropKids();
+#ifdef PADDLE_WITH_HETERPS
+    dev_ctx_->Wait();
+#endif
   }
   timeline.Pause();
-  VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
+  VLOG(1) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
           << " seconds, ins_num: " << total_ins_num;
   if (need_dump_field_ || need_dump_param_) {
```
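The HETERPS additions follow a common multi-GPU threading pattern: the current device is a per-host-thread property, so each hogwild worker must bind to its own GPU before its batch loop, and drain the device before scope cleanup each iteration. A minimal, Paddle-free Python sketch of that pattern; `set_device` and `device_sync` are hypothetical stand-ins for `platform::SetDeviceId` and `dev_ctx_->Wait()`:

```python
import threading

_device = threading.local()  # "current device" is per host thread, as in CUDA

def set_device(dev_id):
    """Hypothetical stand-in for platform::SetDeviceId(thread_id_)."""
    _device.id = dev_id

def device_sync():
    """Hypothetical stand-in for dev_ctx_->Wait(): drain queued device work."""
    pass

def train_files(thread_id, batches):
    set_device(thread_id)      # bind this worker thread to its own GPU first
    for batch in batches:
        _ = batch              # run this batch's ops on the bound device
        device_sync()          # wait before scope cleanup reuses the memory

workers = [threading.Thread(target=train_files, args=(i, range(3)))
           for i in range(2)]
for w in workers:
    w.start()
for w in workers:
    w.join()
```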
paddle/fluid/framework/multi_trainer.cc

```diff
@@ -148,6 +148,17 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
     }
   }
 #endif
+  for (auto& var : main_program.Block(0).AllVars()) {
+    if (var->Persistable()) {
+      auto it = std::find(need_merge_var_names_.begin(),
+                          need_merge_var_names_.end(), var->Name());
+      if (it == need_merge_var_names_.end() &&
+          var->GetType() != proto::VarType::SELECTED_ROWS) {
+        VLOG(2) << "train param: " << var->Name();
+        trainable_param_.push_back(var->Name());
+      }
+    }
+  }
 }

 void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) {
@@ -192,18 +203,30 @@ void MultiTrainer::Run() {
 #ifdef PADDLE_WITH_HETERPS
 void MultiTrainer::MergeDenseParam() {
-#ifdef PADDLE_WTIH_PSCORE
+#ifdef PADDLE_WITH_PSCORE
   auto communicator = paddle::distributed::Communicator::GetInstance();
-  auto& recv_ctx = communicator->GetRecvCtxMap();
-  Scope* thread_scope = workers_[0]->GetThreadScope();
-  for (auto& iter : recv_ctx) {
-    auto& varnames = iter.second;
-    for (auto& name : varnames) {
-      VLOG(2) << "merge var " << name << " to root scope";
-      Variable* root_var = root_scope_->FindVar(name);
-      LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
-      Variable* var = thread_scope->FindVar(name);
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      TensorCopy((*tensor), root_tensor->place(), root_tensor);
+  auto thread_scope = workers_[0]->GetThreadScope();
+  if (communicator == nullptr) {
+    for (auto& name : trainable_param_) {
+      VLOG(2) << "merge var " << name << " to root scope";
+      Variable* root_var = root_scope_->FindVar(name);
+      LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
+      Variable* var = thread_scope->FindVar(name);
+      LoDTensor* tensor = var->GetMutable<LoDTensor>();
+      TensorCopySync((*tensor), root_tensor->place(), root_tensor);
+    }
+  } else {
+    auto& recv_ctx = communicator->GetRecvCtxMap();
+    for (auto& iter : recv_ctx) {
+      auto& varnames = iter.second;
+      for (auto& name : varnames) {
+        VLOG(2) << "merge var " << name << " to root scope";
+        Variable* root_var = root_scope_->FindVar(name);
+        LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
+        Variable* var = thread_scope->FindVar(name);
+        LoDTensor* tensor = var->GetMutable<LoDTensor>();
+        TensorCopySync((*tensor), root_tensor->place(), root_tensor);
+      }
+    }
+  }
 #endif
@@ -236,11 +259,7 @@ void MultiTrainer::Finalize() {
     }
     LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
-#ifdef PADDLE_WITH_HETERPS
-    for (size_t j = 0; j < places_.size(); j++) {
-#else
     for (int j = 1; j < thread_num_; j++) {
-#endif
       Scope* cur_thread_scope = workers_[j]->GetThreadScope();
       Variable* thread_var =
           cur_thread_scope->FindVar(need_merge_var_names_[i]);
```
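The patched `MergeDenseParam` now has two paths: with a pscore communicator it merges the dense variables registered in the recv context map, and without one (the GPUPS case, which previously dereferenced a null communicator) it falls back to the `trainable_param_` list that `InitTrainerEnv` now collects. It also switches to the synchronous `TensorCopySync`, so the copy has finished before the function returns. A dict-based Python sketch of that branch structure, with plain dicts standing in for Paddle's `Scope`/`LoDTensor`:

```python
def merge_dense_param(root_scope, thread_scope, communicator, trainable_param):
    """Mock of the patched branch logic; scopes are plain dicts here."""
    if communicator is None:
        # GPUPS fallback: use the persistable params collected at init time.
        names = list(trainable_param)
    else:
        # pscore path: use the varnames registered in the recv context map.
        names = [name for varnames in communicator["recv_ctx"].values()
                 for name in varnames]
    for name in names:
        # Stand-in for TensorCopySync(*tensor, root_tensor->place(), root_tensor).
        root_scope[name] = thread_scope[name]

root, worker0 = {}, {"fc_0.w_0": [1.0, 2.0], "fc_0.b_0": [0.1]}
merge_dense_param(root, worker0, None, ["fc_0.w_0", "fc_0.b_0"])
print(root)  # both params merged via the communicator-less fallback
```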
paddle/fluid/framework/trainer.h

```diff
@@ -129,6 +129,7 @@ class MultiTrainer : public TrainerBase {
   std::vector<DataFeed*> readers_;
   std::vector<std::shared_ptr<DeviceWorker>> workers_;
   std::vector<std::string> need_merge_var_names_;
+  std::vector<std::string> trainable_param_;
 #ifdef PADDLE_WITH_HETERPS
   std::vector<platform::Place> places_;
 #endif
```
python/paddle/distributed/passes/ps_trainer_pass.py

```diff
@@ -614,15 +614,24 @@ class PsGpuPass(PassBase):
         return True

     def _add_push_box_sparse_op(self, program):
+        insert_index = -1
+        for idx, op in list(enumerate(program.global_block().ops)):
+            if op.type == "lookup_table_grad":
+                insert_index = idx
         for op in program.global_block().ops:
-            if op.type != "pull_box_sparse":
+            if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse":
                 continue
             grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
                 op.desc, cpt.to_text(set()), [])
             for op_desc in grad_op_desc:
-                new_op_desc = program.global_block().desc.append_op()
+                new_op_desc = program.global_block().desc._insert_op(
+                    insert_index + 1)
                 new_op_desc.copy_from(op_desc)
                 new_op_desc._set_attr(op_role_attr_name, backward)
+                new_op = paddle.fluid.framework.Operator(program.global_block(),
+                                                         new_op_desc)
+                program.global_block().ops.insert(insert_index + 1, new_op)
+        program.global_block()._sync_with_cpp()

     def _remove_optimizer_var(self, program):
         embedding_w = {}
```
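This pass change does two things: the generated backward ops now also cover `pull_gpups_sparse` (not just `pull_box_sparse`), and instead of appending them at the end of the block they are spliced in directly after the last `lookup_table_grad` op, with a final `_sync_with_cpp()` so the Python-side op list matches the C++ block desc. The splice itself is plain index tracking; a list-based sketch with illustrative op names (strings stand in for `Operator` objects):

```python
# Ops as plain strings; in the real pass these are Operator objects.
ops = ["pull_gpups_sparse", "mul", "mul_grad", "lookup_table_grad"]

insert_index = -1
for idx, op in enumerate(ops):
    if op == "lookup_table_grad":
        insert_index = idx               # remember the *last* occurrence

generated = ["push_gpups_sparse"]        # stand-in for get_grad_op_desc output
for new_op in generated:
    ops.insert(insert_index + 1, new_op) # mirrors desc._insert_op(insert_index + 1)

print(ops)
# ['pull_gpups_sparse', 'mul', 'mul_grad', 'lookup_table_grad', 'push_gpups_sparse']
```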
python/paddle/distributed/ps/the_one_ps.py

```diff
@@ -1013,12 +1013,13 @@ class TheOnePSRuntime(RuntimeBase):
         if self.context['ps_mode'] == DistributedMode.GEO:
             self._communicator.init_params(init_params)
         else:
-            if role_id == 0:
-                self._init_all_params(scopes, send_ctx, dense_map)
+            if not self.context['use_ps_gpu']:
+                if role_id == 0:
+                    self._init_all_params(scopes, send_ctx, dense_map)
         fleet.util.barrier()
-        self._pull_all_dense(scopes, send_ctx, dense_map)
+        if not self.context['use_ps_gpu']:
+            self._pull_all_dense(scopes, send_ctx, dense_map)
         fleet.util.barrier()

         if self.context['ps_mode'] == DistributedMode.GEO:
```
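In GPUPS mode the dense tables are managed by the GPU parameter server, so the CPU-side dense init and pull are skipped, while both `fleet.util.barrier()` calls still run unconditionally so every role stays in step. A mock sketch of the resulting control flow, assuming injected placeholder callables rather than the real `TheOnePSRuntime` helpers:

```python
def init_dense(context, role_id, init_all_params, pull_all_dense, barrier):
    """Mock of the patched flow; the callables are injected placeholders."""
    if not context["use_ps_gpu"]:
        if role_id == 0:
            init_all_params()   # only rank 0 seeds the dense tables
    barrier()                   # every role synchronizes in both modes
    if not context["use_ps_gpu"]:
        pull_all_dense()        # each worker then fetches the dense tables
    barrier()

init_dense({"use_ps_gpu": True}, role_id=0,
           init_all_params=lambda: print("init dense"),
           pull_all_dense=lambda: print("pull dense"),
           barrier=lambda: None)
# No output: with use_ps_gpu=True the CPU dense init/pull are skipped.
```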