BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit d3cda7f7
Authored on Sep 25, 2020 by chengmo

fix

Parent: dbbcc43c
Showing 5 changed files with 65 additions and 55 deletions (+65 -55)
paddle/fluid/framework/distributed_strategy.proto                    +1  -1
python/paddle/distributed/fleet/base/role_maker.py                   +1  -15
python/paddle/distributed/fleet/launch.py                            +40 -32
python/paddle/distributed/fleet/launch_utils.py                      +14 -3
python/paddle/distributed/fleet/runtime/parameter_server_runtime.py  +9  -4
paddle/fluid/framework/distributed_strategy.proto

@@ -97,7 +97,7 @@ message AsyncConfig {
   optional int32 thread_pool_size = 6 [ default = 1 ];
   optional int32 send_wait_times = 7 [ default = 1 ];
   optional bool runtime_split_send_recv = 8 [ default = false ];
-  optional string worker_device = 9 [ default = 'cpu' ];
+  optional string heter_worker_device = 9 [ default = 'cpu' ];
 }
 
 message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
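The only change in this file renames the AsyncConfig field worker_device to heter_worker_device; the field number (9) and the default ('cpu') are unchanged. As a hedged sketch of how this setting is meant to reach the runtime (which reads a_sync_configs["heter_worker_device"] in parameter_server_runtime.py below), one might set it through fleet's DistributedStrategy; the exact set of accepted keys depends on the Paddle version:

# Sketch only: route the heter worker device through the async strategy.
# Assumes fleet.DistributedStrategy exposes the AsyncConfig fields by name
# in the a_sync_configs dict (key names follow the proto fields above).
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.a_sync = True                                     # async parameter-server mode
strategy.a_sync_configs = {"heter_worker_device": "gpu"}   # renamed field from this commit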
python/paddle/distributed/fleet/base/role_maker.py

@@ -511,13 +511,6 @@ class RoleMakerBase(object):
         return self._heter_trainer_endpoints[(self._current_id) %
                                              self._heter_worker_num()]
 
-    def _get_heter_worker_device(self):
-        """
-        Returns:
-            string: heter_trainer's device of current node, e.g: CPU/GPU/XPU
-        """
-        return self._heter_trainer_device.upper()
-
 
 class PaddleCloudRoleMaker(RoleMakerBase):
     def __init__(self, is_collective=False, **kwargs):

@@ -696,8 +689,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             # For heter parameter server env setting
             heter_trainer_eplist = os.getenv("PADDLE_HETER_TRAINER_IP_PORT_LIST", "")
-            heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE", "")
-            if heter_trainer_eplist != "" and heter_trainer_device != "":
+            if heter_trainer_eplist != "":
                 try:
                     heter_trainer_eplist = os.environ[
                         "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",")

@@ -708,12 +700,6 @@ class PaddleCloudRoleMaker(RoleMakerBase):
                 self._is_heter_parameter_server_mode = True
                 heter_trainers_num = len(heter_trainer_eplist)
-                current_node_device = heter_trainer_device.upper()
-                if current_node_device not in ["CPU", "GPU", "XPU"]:
-                    raise ValueError(
-                        "Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)".
-                        format(heter_trainer_device))
-                self._heter_trainer_device = current_node_device
             else:
                 self._is_heter_parameter_server_mode = False
                 heter_trainers_num = 0
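With _get_heter_worker_device() and the PADDLE_HETER_TRAINER_DEVICE variable removed, heter parameter-server mode is inferred from the endpoint list alone, and the device now comes from the strategy instead. A rough sketch of the environment a node would set before building the role maker (the endpoint values below are placeholders, not from the commit):

import os

# Placeholder endpoints: only the endpoint list is still read by the role maker.
os.environ["PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "10.0.0.1:6170,10.0.0.2:6170"
# os.environ["PADDLE_HETER_TRAINER_DEVICE"] = "gpu"  # removed: no longer consulted

from paddle.distributed.fleet.base.role_maker import PaddleCloudRoleMaker

role = PaddleCloudRoleMaker(is_collective=False)  # heter PS mode detected from the env var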
python/paddle/distributed/fleet/launch.py

@@ -91,14 +91,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
     ''')
 
     base_group = parser.add_argument_group("Base Parameters")
 
-    base_group.add_argument(
-        "-d",
-        "--distributed_mode",
-        type=str,
-        choices=["collective", "ps", "ps_heter", "ps_gpu", ""],
-        default="",
-        help="Distributed running mode: collective/ps/ps_gpu/ps_heter")
-
     base_group.add_argument(
         "--log_dir",
         type=str,

@@ -150,13 +142,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
     ps_group.add_argument(
         "--heter_worker_num", type=int, help="number of heter_workers")
-    ps_group.add_argument(
-        "--heter_worker_device",
-        type=str,
-        default="gpu",
-        choices=["gpu", "xpu"],
-        help="heter worker device")
-
     return parser.parse_args()

@@ -244,34 +229,37 @@ def launch_collective(args):
         shutil.rmtree(gloo_rendezvous_dir)
 
 
-def launch_ps(args):
+def launch_ps(args, distribute_mode):
     cloud_flag = cloud_utils.use_paddlecloud()
 
     # for ps-cpu on paddlecloud
-    direct_start_mode = ["ps", ""]
-    if cloud_flag and (args.distributed_mode in direct_start_mode):
+    if cloud_flag and distribute_mode == DistributeMode.PS:
         direct_start(args)
         return
-    elif cloud_flag and args.distributed_mode == "ps_heter":
+    elif cloud_flag and distribute_mode == DistributeMode.PS_HETER:
         cloud_ps_heter_env_set(args)
         args.trainers = os.getenv("PADDLE_TRAINER_ENDPOINTS")
         args.workers = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST")
         args.heter_workers = os.getenv("PADDLE_HETER_TRAINER_IP_PORT_LIST")
 
-    ps_launcher = ParameterServerLauncher(args)
-    ps_launcher.start_ps(args)
+    ps_launcher = ParameterServerLauncher(args, distribute_mode)
+    ps_launcher.start_ps()
     return
 
 
-def launch():
-    args = _parse_args()
-    logger = get_logger()
-    _print_arguments(args)
-
+def which_distributed_mode(args):
     ps_args = [
-        '--worker_num', '--server_num', '--heter_worker_num', '--servers',
-        '--workers', '--heter_worrkers', 'heter_worker_device'
+        '--worker_num',
+        '--server_num',
+        '--heter_worker_num',
+        '--servers',
+        '--workers',
+        '--heter_workers',
     ]
-    collective_args = ['--ips', '--gpus']
+    collective_args = ['--ips']
+
+    ps_heter_args = ["--heter_worker_num", "--heter_workers"]
+
     has_ps_args = [
         ps_arg for ps_arg in ps_args if ps_arg in " ".join(sys.argv[1:-1])
     ]

@@ -279,25 +267,45 @@ def launch():
         co_arg for co_arg in collective_args
         if co_arg in " ".join(sys.argv[1:-1])
     ]
+
+    assert (
+        len(has_ps_args) > 1 and len(has_collective_args) > 1
+    ), "Only one mode(Collective or Parameter-Server ) can be selected at the same time, but more than one configuration was received."
+
     if fluid.core.is_compiled_with_cuda():
         cuda_device_num = fluid.core.get_cuda_device_count()
     else:
         cuda_device_num = 0
 
-    ps_mode = ['ps', 'ps_gpu', 'ps_heter']
-    if len(has_ps_args) > 0 or args.distributed_mode in ps_mode:
+    if len(has_ps_args) > 0:
         logger.info(
             "Run parameter-sever mode. pserver arguments:{}, cuda count:{}".
             format(has_ps_args, cuda_device_num))
-        launch_ps(args)
+        has_ps_heter_args = list(set(has_ps_args) & set(ps_heter_args))
+        if len(has_ps_heter_args) > 0:
+            return DistributeMode.PS_HETER
+        else:
+            return DistributeMode.PS
     elif len(has_collective_args) > 0:
         logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}".
                     format(has_collective_args, cuda_device_num))
-        launch_collective(args)
+        return DistributeMode.COLLECTIVE
     else:
         logger.warning(
             "Not found distinct arguments. Default use gpu collective mode")
+        return DistributeMode.COLLECTIVE
+
+
+def launch():
+    args = _parse_args()
+    logger = get_logger()
+    _print_arguments(args)
+
+    distribute_mode = which_distributed_mode(args)
+
+    if distribute_mode == DistributeMode.COLLECTIVE:
+        launch_collective(args)
+    else:
+        launch_ps(args, distribute_mode)
 
 
 if __name__ == "__main__":
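The net effect on the CLI is that fleetrun no longer needs -d/--distributed_mode: which_distributed_mode() infers the mode from the flags that are present. The following is a simplified, self-contained sketch of that selection rule, not the launcher itself; Mode and infer_mode are stand-ins for launch_utils.DistributeMode and which_distributed_mode, and the real code matches flags as substrings of the joined argv rather than by exact membership:

# Simplified stand-in for which_distributed_mode(): infer the run mode
# from the presence of parameter-server / collective flags.
from enum import Enum

class Mode(Enum):          # stand-in for launch_utils.DistributeMode
    COLLECTIVE = 0
    PS = 1
    PS_HETER = 2

PS_FLAGS = {"--worker_num", "--server_num", "--servers", "--workers",
            "--heter_worker_num", "--heter_workers"}
HETER_FLAGS = {"--heter_worker_num", "--heter_workers"}
COLLECTIVE_FLAGS = {"--ips"}

def infer_mode(argv):
    given = set(argv)
    if given & PS_FLAGS:
        return Mode.PS_HETER if given & HETER_FLAGS else Mode.PS
    if given & COLLECTIVE_FLAGS:
        return Mode.COLLECTIVE
    return Mode.COLLECTIVE   # default, as in the warning branch above

# e.g. infer_mode(["--servers", "a:1", "--heter_workers", "b:2"]) -> Mode.PS_HETER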
python/paddle/distributed/fleet/launch_utils.py

@@ -32,6 +32,15 @@ logger = logging.getLogger("root")
 logger.propagate = False
 
 
+class DistributeMode:
+    """
+    There are various mode for fleetrun, each of them is designed for different model.
+    """
+    COLLECTIVE = 0
+    PS = 1
+    PS_HETER = 2
+
+
 class Cluster(object):
     def __init__(self, hdfs):
         self.job_server = None

@@ -616,7 +625,9 @@ def cloud_ps_heter_env_set(args):
 
 
 class ParameterServerLauncher(object):
-    def __init__(self, args):
+    def __init__(self, args, distribute_mode):
         self.args = args
+        self.distribute_mode = distribute_mode
         self.server_num = 0
         self.worker_num = 0
         self.heter_worker_num = 0

@@ -677,7 +688,7 @@ class ParameterServerLauncher(object):
             self.worker_num = len(self.worker_endpoints.split(","))
 
         # get heter worker envs
-        if args.distributed_mode == "ps_heter":
+        if self.distribute_mode == DistributeMode.PS_HETER:
             if args.heter_worker_num:
                 self.heter_worker_num = args.heter_worker_num
             if args.heter_workers:

@@ -713,7 +724,7 @@ class ParameterServerLauncher(object):
         ]
         self.node_ips = list(
             set(self.server_endpoints_ips + self.worker_endpoints_ips))
-        if args.distributed_mode == "ps_heter":
+        if self.distribute_mode == DistributeMode.PS_HETER:
             self.heter_worker_endpoints_ips = [
                 x.strip().split(":")[0]
                 for x in self.heter_worker_endpoints.split(",")
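DistributeMode is a plain class holding integer constants rather than an enum.Enum, so callers compare directly against the class attributes; this replaces the old string comparisons such as args.distributed_mode == "ps_heter". A minimal illustration (the print statements stand in for the real launch paths):

# Minimal illustration: mode constants from launch_utils replace string modes.
from paddle.distributed.fleet.launch_utils import DistributeMode

mode = DistributeMode.PS_HETER

if mode == DistributeMode.PS_HETER:
    print("launch servers, workers and heter workers")
elif mode == DistributeMode.PS:
    print("launch servers and workers only")
else:
    print("collective launch")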
python/paddle/distributed/fleet/runtime/parameter_server_runtime.py

@@ -198,16 +198,21 @@ class ParameterServerRuntime(RuntimeBase):
             warnings.warn("communicator has been initialized, skip")
 
     def _get_executor(self):
         if self.role_maker._is_heter_worker():
-            if self.role_maker._get_heter_worker_device() == "GPU":
+            heter_worker_device = self.context["valid_strategy"].a_sync_configs[
+                "heter_worker_device"].upper()
+            if heter_worker_device == "GPU":
                 gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
                 executor = Executor(fluid.CUDAPlace(gpu_id))
-            elif self.role_maker._get_heter_worker_device() == "XPU":
+            elif heter_worker_device == "XPU":
                 xpu_id = int(os.getenv("FLAGS_selected_xpus", "0"))
                 executor = Executor(fluid.XPUPlace(xpu_id))
+            elif heter_worker_device == "CPU":
+                fluid.Executor(fluid.CPUPlace())
             else:
-                raise ValueError("Not Support Device {}".format(
-                    self.role_maker._get_heter_worker_device()))
+                raise ValueError("Heter Worker Not Support Device {}".format(
+                    heter_worker_device))
         else:
             executor = fluid.Executor(fluid.CPUPlace())
         return executor
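The executor's place is now chosen from the strategy value a_sync_configs["heter_worker_device"] instead of asking the role maker. Below is a self-contained sketch of the same device-to-place mapping, written against paddle.fluid directly; executor_for_device is a hypothetical helper, while FLAGS_selected_gpus / FLAGS_selected_xpus are the environment variables the runtime itself reads:

import os
import paddle.fluid as fluid

def executor_for_device(heter_worker_device):
    """Sketch of the place selection in _get_executor(); not a Paddle API."""
    device = heter_worker_device.upper()
    if device == "GPU":
        gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        return fluid.Executor(fluid.CUDAPlace(gpu_id))
    elif device == "XPU":
        xpu_id = int(os.getenv("FLAGS_selected_xpus", "0"))
        return fluid.Executor(fluid.XPUPlace(xpu_id))
    elif device == "CPU":
        return fluid.Executor(fluid.CPUPlace())
    else:
        raise ValueError("Heter Worker Not Support Device {}".format(device))

# e.g. executor_for_device("gpu") on a CUDA build returns a GPU-placed executor.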