Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
3789a699
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3789a699
编写于
3月 11, 2021
作者:
T
Thunderbrook
提交者:
GitHub
3月 11, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
solve bug in heter mode (#31531)
* heter bug * format * format
上级
6148b87f
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
55 addition
and
27 deletion
+55
-27
paddle/fluid/framework/device_worker.h
paddle/fluid/framework/device_worker.h
+1
-0
paddle/fluid/framework/dist_multi_trainer.cc
paddle/fluid/framework/dist_multi_trainer.cc
+2
-3
paddle/fluid/framework/fleet/fleet_wrapper.cc
paddle/fluid/framework/fleet/fleet_wrapper.cc
+0
-2
python/paddle/fluid/incubate/fleet/base/role_maker.py
python/paddle/fluid/incubate/fleet/base/role_maker.py
+52
-22
未找到文件。
paddle/fluid/framework/device_worker.h
浏览文件 @
3789a699
...
...
@@ -168,6 +168,7 @@ class DeviceWorker {
virtual
void
CacheProgram
(
const
ProgramDesc
&
main_program
)
{}
virtual
void
ProduceTasks
()
{}
virtual
void
GetXpuOpIndex
()
{}
virtual
void
Schedule
(
int
taskid
)
{}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
virtual
void
SetStream
(
const
gpuStream_t
stream
)
{}
virtual
void
SetEvent
(
const
gpuEvent_t
event
)
{}
...
...
paddle/fluid/framework/dist_multi_trainer.cc
浏览文件 @
3789a699
...
...
@@ -62,9 +62,8 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
void
DistMultiTrainer
::
RegisterHeterCallback
()
{
auto
fleet_ptr
=
FleetWrapper
::
GetInstance
();
fleet_ptr
->
RegisterHeterCallback
([
this
](
int
worker
,
int
taskid
)
{
// workers_[worker]->Schedule(taskid);
});
fleet_ptr
->
RegisterHeterCallback
(
[
this
](
int
worker
,
int
taskid
)
{
workers_
[
worker
]
->
Schedule
(
taskid
);
});
}
void
DistMultiTrainer
::
InitDumpEnv
()
{
...
...
paddle/fluid/framework/fleet/fleet_wrapper.cc
浏览文件 @
3789a699
...
...
@@ -193,7 +193,6 @@ void FleetWrapper::HeterPullSparseVars(
for
(
auto
&
t
:
fea_values
)
{
pull_result_ptr
.
push_back
(
t
.
data
());
}
/*
auto
status
=
pslib_ptr_
->
_worker_ptr
->
heter_pull_sparse
(
workerid
,
pull_result_ptr
.
data
(),
table_id
,
fea_keys
.
data
(),
fea_keys
.
size
(),
task
->
taskid_
);
...
...
@@ -207,7 +206,6 @@ void FleetWrapper::HeterPullSparseVars(
exit
(
-
1
);
}
}
*/
}
void
FleetWrapper
::
HeterPushSparseVars
(
...
...
python/paddle/fluid/incubate/fleet/base/role_maker.py
浏览文件 @
3789a699
...
...
@@ -1039,11 +1039,17 @@ class HeterRoleMaker(GeneralRoleMaker):
self
.
_node_type
=
1
self
.
_cur_endpoint
=
worker_endpoints
[
current_id
]
gloo
=
fluid
.
core
.
Gloo
()
gloo
.
init
(
current_id
,
len
(
worker_endpoints
),
self
.
_hdfs_path
.
rstrip
(
"/"
)
+
"/trainer"
,
self
.
_hdfs_name
,
self
.
_hdfs_ugi
,
self
.
_iface
,
self
.
_prefix
)
gloo
.
set_rank
(
current_id
)
gloo
.
set_size
(
len
(
worker_endpoints
))
gloo
.
set_prefix
(
self
.
_prefix
)
gloo
.
set_iface
(
self
.
_iface
)
gloo
.
set_timeout_seconds
(
self
.
_init_timeout_seconds
,
self
.
_run_timeout_seconds
)
gloo
.
set_hdfs_store
(
self
.
_hdfs_path
.
rstrip
(
"/"
)
+
"/trainer"
,
self
.
_hdfs_name
,
self
.
_hdfs_ugi
)
gloo
.
init
()
self
.
_node_type_comm
=
gloo
elif
training_role
==
"XPU"
:
role
=
Role
.
XPU
...
...
@@ -1051,10 +1057,17 @@ class HeterRoleMaker(GeneralRoleMaker):
self
.
_node_type
=
2
self
.
_cur_endpoint
=
xpu_endpoints
[
current_id
]
gloo
=
fluid
.
core
.
Gloo
()
gloo
.
init
(
current_id
,
len
(
xpu_endpoints
),
self
.
_hdfs_path
.
rstrip
(
"/"
)
+
"/xpu"
,
self
.
_hdfs_name
,
self
.
_hdfs_ugi
,
self
.
_iface
,
self
.
_prefix
)
gloo
.
set_rank
(
current_id
)
gloo
.
set_size
(
len
(
xpu_endpoints
))
gloo
.
set_prefix
(
self
.
_prefix
)
gloo
.
set_iface
(
self
.
_iface
)
gloo
.
set_timeout_seconds
(
self
.
_init_timeout_seconds
,
self
.
_run_timeout_seconds
)
gloo
.
set_hdfs_store
(
self
.
_hdfs_path
.
rstrip
(
"/"
)
+
"/xpu"
,
self
.
_hdfs_name
,
self
.
_hdfs_ugi
)
gloo
.
init
()
self
.
_node_type_comm
=
gloo
elif
training_role
==
"PSERVER"
:
role
=
Role
.
SERVER
...
...
@@ -1070,30 +1083,47 @@ class HeterRoleMaker(GeneralRoleMaker):
self
.
_node_type
=
0
self
.
_cur_endpoint
=
cur_endpoint
gloo
=
fluid
.
core
.
Gloo
()
gloo
.
init
(
current_id
,
len
(
eplist
),
self
.
_hdfs_path
.
rstrip
(
"/"
)
+
"/pserver"
,
self
.
_hdfs_name
,
self
.
_hdfs_ugi
,
self
.
_iface
,
self
.
_prefix
)
gloo
.
set_rank
(
current_id
)
gloo
.
set_size
(
len
(
eplist
))
gloo
.
set_prefix
(
self
.
_prefix
)
gloo
.
set_iface
(
self
.
_iface
)
gloo
.
set_timeout_seconds
(
self
.
_init_timeout_seconds
,
self
.
_run_timeout_seconds
)
gloo
.
set_hdfs_store
(
self
.
_hdfs_path
.
rstrip
(
"/"
)
+
"/pserver"
,
self
.
_hdfs_name
,
self
.
_hdfs_ugi
)
gloo
.
init
()
self
.
_node_type_comm
=
gloo
if
training_role
==
"TRAINER"
or
training_role
==
"XPU"
:
gloo
=
fluid
.
core
.
Gloo
()
heter_list
=
worker_endpoints
+
xpu_endpoints
gloo
.
init
(
heter_list
.
index
(
self
.
_cur_endpoint
),
len
(
heter_list
),
gloo
.
set_rank
(
heter_list
.
index
(
self
.
_cur_endpoint
))
gloo
.
set_size
(
len
(
heter_list
))
gloo
.
set_prefix
(
self
.
_prefix
)
gloo
.
set_iface
(
self
.
_iface
)
gloo
.
set_timeout_seconds
(
self
.
_init_timeout_seconds
,
self
.
_run_timeout_seconds
)
gloo
.
set_hdfs_store
(
self
.
_hdfs_path
.
rstrip
(
"/"
)
+
"/heter"
,
self
.
_hdfs_name
,
self
.
_hdfs_ugi
,
self
.
_iface
,
self
.
_prefix
)
self
.
_hdfs_ugi
)
gloo
.
init
()
self
.
_heter_comm
=
gloo
gloo
=
fluid
.
core
.
Gloo
()
all_list
=
worker_endpoints
+
eplist
+
xpu_endpoints
gloo
.
init
(
all_list
.
index
(
self
.
_cur_endpoint
),
len
(
all_list
),
gloo
.
set_rank
(
all_list
.
index
(
self
.
_cur_endpoint
))
gloo
.
set_size
(
len
(
all_list
))
gloo
.
set_prefix
(
self
.
_prefix
)
gloo
.
set_iface
(
self
.
_iface
)
gloo
.
set_timeout_seconds
(
self
.
_init_timeout_seconds
,
self
.
_run_timeout_seconds
)
gloo
.
set_hdfs_store
(
self
.
_hdfs_path
.
rstrip
(
"/"
)
+
"/all"
,
self
.
_hdfs_name
,
self
.
_hdfs_ugi
,
self
.
_iface
,
self
.
_prefix
)
self
.
_hdfs_ugi
)
gloo
.
init
()
self
.
_all_comm
=
gloo
self
.
_trainers_num
=
trainers_num
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录