Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
7f9869d3
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7f9869d3
编写于
7月 14, 2020
作者:
C
Chengmo
提交者:
GitHub
7月 14, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update paddlecloud train (#142)
* update * fix * delete ps-memory * fix * fix
上级
9b89d8f7
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
316 addition
and
48 deletion
+316
-48
core/engine/cluster/cloud/before_hook_cpu.sh.template
core/engine/cluster/cloud/before_hook_cpu.sh.template
+2
-2
core/engine/cluster/cloud/before_hook_gpu.sh.template
core/engine/cluster/cloud/before_hook_gpu.sh.template
+1
-1
core/engine/cluster/cloud/cluster.sh
core/engine/cluster/cloud/cluster.sh
+25
-2
core/engine/cluster/cloud/k8s_cpu_job.sh.template
core/engine/cluster/cloud/k8s_cpu_job.sh.template
+40
-0
core/engine/cluster/cloud/k8s_job.sh.template
core/engine/cluster/cloud/k8s_job.sh.template
+18
-4
core/engine/cluster/cloud/mpi_job.sh.template
core/engine/cluster/cloud/mpi_job.sh.template
+1
-1
core/engine/cluster/cluster.py
core/engine/cluster/cluster.py
+25
-9
doc/distributed_train.md
doc/distributed_train.md
+194
-24
models/rank/dnn/backend.yaml
models/rank/dnn/backend.yaml
+8
-5
setup.cfg
setup.cfg
+2
-0
未找到文件。
core/engine/cluster/cloud/before_hook_cpu.sh.template
浏览文件 @
7f9869d3
echo "Run before_hook.sh ..."
wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
--no-check-certificate
tar -xf PaddleRec.tar.gz
...
...
@@ -10,6 +10,6 @@ python setup.py install
pip uninstall -y paddlepaddle
pip install paddlepaddle
-gpu
==<$ PADDLEPADDLE_VERSION $> --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
pip install paddlepaddle==<$ PADDLEPADDLE_VERSION $> --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
echo "End before_hook.sh ..."
core/engine/cluster/cloud/before_hook_gpu.sh.template
浏览文件 @
7f9869d3
echo "Run before_hook.sh ..."
wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
--no-check-certificate
tar -xf PaddleRec.tar.gz
...
...
core/engine/cluster/cloud/cluster.sh
浏览文件 @
7f9869d3
...
...
@@ -39,7 +39,12 @@ function _before_submit() {
elif
[
${
DISTRIBUTE_MODE
}
==
"COLLECTIVE_GPU_K8S"
]
;
then
_gen_gpu_before_hook
_gen_k8s_config
_gen_k8s_job
_gen_k8s_gpu_job
_gen_end_hook
elif
[
${
DISTRIBUTE_MODE
}
==
"PS_CPU_K8S"
]
;
then
_gen_cpu_before_hook
_gen_k8s_config
_gen_k8s_cpu_job
_gen_end_hook
fi
...
...
@@ -101,6 +106,7 @@ function _gen_end_hook() {
function
_gen_mpi_job
()
{
echo
"gen mpi_job.sh"
sed
-e
"s#<
$
GROUP_NAME
$>
#
$GROUP_NAME
#g"
\
-e
"s#<
$
JOB_NAME
$>
#
$OLD_JOB_NAME
#g"
\
-e
"s#<
$
AK
$>
#
$AK
#g"
\
-e
"s#<
$
SK
$>
#
$SK
#g"
\
-e
"s#<
$
MPI_PRIORITY
$>
#
$PRIORITY
#g"
\
...
...
@@ -109,18 +115,34 @@ function _gen_mpi_job() {
${
abs_dir
}
/cloud/mpi_job.sh.template
>
${
PWD
}
/job.sh
}
function
_gen_k8s_job
()
{
function
_gen_k8s_
gpu_
job
()
{
echo
"gen k8s_job.sh"
sed
-e
"s#<
$
GROUP_NAME
$>
#
$GROUP_NAME
#g"
\
-e
"s#<
$
JOB_NAME
$>
#
$OLD_JOB_NAME
#g"
\
-e
"s#<
$
AK
$>
#
$AK
#g"
\
-e
"s#<
$
SK
$>
#
$SK
#g"
\
-e
"s#<
$
K8S_PRIORITY
$>
#
$PRIORITY
#g"
\
-e
"s#<
$
K8S_TRAINERS
$>
#
$K8S_TRAINERS
#g"
\
-e
"s#<
$
K8S_CPU_CORES
$>
#
$K8S_CPU_CORES
#g"
\
-e
"s#<
$
K8S_GPU_CARD
$>
#
$K8S_GPU_CARD
#g"
\
-e
"s#<
$
START_CMD
$>
#
$START_CMD
#g"
\
${
abs_dir
}
/cloud/k8s_job.sh.template
>
${
PWD
}
/job.sh
}
function
_gen_k8s_cpu_job
()
{
echo
"gen k8s_job.sh"
sed
-e
"s#<
$
GROUP_NAME
$>
#
$GROUP_NAME
#g"
\
-e
"s#<
$
JOB_NAME
$>
#
$OLD_JOB_NAME
#g"
\
-e
"s#<
$
AK
$>
#
$AK
#g"
\
-e
"s#<
$
SK
$>
#
$SK
#g"
\
-e
"s#<
$
K8S_PRIORITY
$>
#
$PRIORITY
#g"
\
-e
"s#<
$
K8S_TRAINERS
$>
#
$K8S_TRAINERS
#g"
\
-e
"s#<
$
K8S_PS_NUM
$>
#
$K8S_PS_NUM
#g"
\
-e
"s#<
$
K8S_PS_CORES
$>
#
$K8S_PS_CORES
#g"
\
-e
"s#<
$
K8S_CPU_CORES
$>
#
$K8S_CPU_CORES
#g"
\
-e
"s#<
$
START_CMD
$>
#
$START_CMD
#g"
\
${
abs_dir
}
/cloud/k8s_cpu_job.sh.template
>
${
PWD
}
/job.sh
}
#-----------------------------------------------------------------------------------------------------------------
...
...
@@ -145,6 +167,7 @@ function _submit() {
function
package_hook
()
{
cur_time
=
`
date
+
"%Y%m%d%H%M"
`
new_job_name
=
"
${
JOB_NAME
}
_
${
cur_time
}
"
export
OLD_JOB_NAME
=
${
JOB_NAME
}
export
JOB_NAME
=
${
new_job_name
}
export
job_file_path
=
"
${
PWD
}
/
${
new_job_name
}
"
mkdir
${
job_file_path
}
...
...
core/engine/cluster/cloud/k8s_cpu_job.sh.template
0 → 100644
浏览文件 @
7f9869d3
#!/bin/bash
###############################################################
## 注意-- 注意--注意 ##
## K8S PS-CPU多机作业作业示例 ##
###############################################################
job_name
=
<
$
JOB_NAME
$>
# 作业参数
group_name
=
"<
$
GROUP_NAME
$>
"
job_version
=
"paddle-fluid-v1.7.1"
start_cmd
=
"<
$
START_CMD
$>
"
wall_time
=
"10:00:00"
k8s_priority
=
<
$
K8S_PRIORITY
$>
k8s_trainers
=
<
$
K8S_TRAINERS
$>
k8s_cpu_cores
=
<
$
K8S_CPU_CORES
$>
k8s_ps_num
=
<
$
K8S_PS_NUM
$>
k8s_ps_cores
=
<
$
K8S_PS_CORES
$>
# 你的ak/sk(可在paddlecloud web页面【个人中心】处获取)
ak
=
<
$
AK
$>
sk
=
<
$
SK
$>
paddlecloud job
--ak
${
ak
}
--sk
${
sk
}
\
train
--job-name
${
job_name
}
\
--group-name
${
group_name
}
\
--job-conf
config.ini
\
--start-cmd
"
${
start_cmd
}
"
\
--files
./
*
\
--job-version
${
job_version
}
\
--k8s-priority
${
k8s_priority
}
\
--wall-time
${
wall_time
}
\
--k8s-trainers
${
k8s_trainers
}
\
--k8s-cpu-cores
${
k8s_cpu_cores
}
\
--k8s-ps-num
${
k8s_ps_num
}
\
--k8s-ps-cores
${
k8s_ps_cores
}
\
--is-standalone
0
\
--distribute-job-type
"PSERVER"
\
--json
\ No newline at end of file
core/engine/cluster/cloud/k8s_job.sh.template
浏览文件 @
7f9869d3
...
...
@@ -3,7 +3,7 @@
## 注意-- 注意--注意 ##
## K8S NCCL2多机作业作业示例 ##
###############################################################
job_name
=
${
JOB_NAME
}
job_name
=
<
$
JOB_NAME
$>
# 作业参数
group_name
=
"<
$
GROUP_NAME
$>
"
...
...
@@ -13,8 +13,20 @@ wall_time="10:00:00"
k8s_priority
=
<
$
K8S_PRIORITY
$>
k8s_trainers
=
<
$
K8S_TRAINERS
$>
k8s_cpu_cores
=
<
$
K8S_CPU_CORES
$>
k8s_gpu_cards
=
<
$
K8S_GPU_CARD
$>
is_stand_alone
=
0
nccl
=
"--distribute-job-type "
NCCL2
""
if
[
${
k8s_trainers
}
==
1
]
;
then
is_stand_alone
=
1
nccl
=
"--job-remark single-trainer"
if
[
${
k8s_gpu_cards
}
==
1]
;
then
nccl
=
"--job-remark single-gpu"
echo
"Attention: Use single GPU card for PaddleRec distributed training, please set runner class from 'cluster_train' to 'train' in config.yaml."
fi
fi
# 你的ak/sk(可在paddlecloud web页面【个人中心】处获取)
ak
=
<
$
AK
$>
sk
=
<
$
SK
$>
...
...
@@ -27,9 +39,11 @@ paddlecloud job --ak ${ak} --sk ${sk} \
--files
./
*
\
--job-version
${
job_version
}
\
--k8s-trainers
${
k8s_trainers
}
\
--k8s-cpu-cores
${
k8s_cpu_cores
}
\
--k8s-gpu-cards
${
k8s_gpu_cards
}
\
--k8s-priority
${
k8s_priority
}
\
--wall-time
${
wall_time
}
\
--is-standalone
0
\
--distribute-job-type
"NCCL2"
\
--json
\ No newline at end of file
--is-standalone
${
is_stand_alone
}
\
--json
\
${
nccl
}
\ No newline at end of file
core/engine/cluster/cloud/mpi_job.sh.template
浏览文件 @
7f9869d3
...
...
@@ -3,7 +3,7 @@
## 注意--注意--注意 ##
## MPI 类型作业演示 ##
###############################################################
job_name
=
${
JOB_NAME
}
job_name
=
<
$
JOB_NAME
$>
# 作业参数
group_name
=
<
$
GROUP_NAME
$>
...
...
core/engine/cluster/cluster.py
浏览文件 @
7f9869d3
...
...
@@ -67,10 +67,10 @@ class ClusterEngine(Engine):
@
staticmethod
def
workspace_replace
():
workspace
=
envs
.
get_runtime_environ
(
"
workspace"
)
remote_workspace
=
envs
.
get_runtime_environ
(
"remote_
workspace"
)
for
k
,
v
in
os
.
environ
.
items
():
v
=
v
.
replace
(
"{workspace}"
,
workspace
)
v
=
v
.
replace
(
"{workspace}"
,
remote_
workspace
)
os
.
environ
[
k
]
=
str
(
v
)
def
run
(
self
):
...
...
@@ -98,14 +98,12 @@ class ClusterEngine(Engine):
cluster_env_check_tool
=
PaddleCloudMpiEnv
()
else
:
raise
ValueError
(
"Paddlecloud with Mpi don't support GPU training, check your config"
"Paddlecloud with Mpi don't support GPU training, check your config
.yaml & backend.yaml
"
)
elif
cluster_type
.
upper
()
==
"K8S"
:
if
fleet_mode
==
"PS"
:
if
device
==
"CPU"
:
raise
ValueError
(
"PS-CPU on paddlecloud is not supported at this time, comming soon"
)
cluster_env_check_tool
=
CloudPsCpuEnv
()
elif
device
==
"GPU"
:
raise
ValueError
(
"PS-GPU on paddlecloud is not supported at this time, comming soon"
...
...
@@ -115,7 +113,7 @@ class ClusterEngine(Engine):
cluster_env_check_tool
=
CloudCollectiveEnv
()
elif
device
==
"CPU"
:
raise
ValueError
(
"Unexpected config -> device: CPU with fleet_mode: Collective, check your config"
"Unexpected config -> device: CPU with fleet_mode: Collective, check your config
.yaml
"
)
else
:
raise
ValueError
(
"cluster_type {} error, must in MPI/K8S"
.
format
(
...
...
@@ -234,7 +232,7 @@ class PaddleCloudMpiEnv(ClusterEnvBase):
"config.train_data_path"
,
""
)
if
self
.
cluster_env
[
"TRAIN_DATA_PATH"
]
==
""
:
raise
ValueError
(
"No -- TRAIN_DATA_PATH -- found in your backend.yaml, please
check
."
"No -- TRAIN_DATA_PATH -- found in your backend.yaml, please
add train_data_path in your backend yaml
."
)
# test_data_path
self
.
cluster_env
[
"TEST_DATA_PATH"
]
=
self
.
backend_env
.
get
(
...
...
@@ -274,7 +272,7 @@ class PaddleCloudK8sEnv(ClusterEnvBase):
category
=
UserWarning
,
stacklevel
=
2
)
warnings
.
warn
(
"The remote
mount point
will be mounted to the ./afs/"
,
"The remote
afs path
will be mounted to the ./afs/"
,
category
=
UserWarning
,
stacklevel
=
2
)
...
...
@@ -293,3 +291,21 @@ class CloudCollectiveEnv(PaddleCloudK8sEnv):
"submit.k8s_gpu_card"
,
1
)
self
.
cluster_env
[
"K8S_CPU_CORES"
]
=
self
.
backend_env
.
get
(
"submit.k8s_cpu_cores"
,
1
)
class
CloudPsCpuEnv
(
PaddleCloudK8sEnv
):
def
__init__
(
self
):
super
(
CloudPsCpuEnv
,
self
).
__init__
()
def
env_check
(
self
):
super
(
CloudPsCpuEnv
,
self
).
env_check
()
self
.
cluster_env
[
"DISTRIBUTE_MODE"
]
=
"PS_CPU_K8S"
self
.
cluster_env
[
"K8S_TRAINERS"
]
=
self
.
backend_env
.
get
(
"submit.k8s_trainers"
,
1
)
self
.
cluster_env
[
"K8S_CPU_CORES"
]
=
self
.
backend_env
.
get
(
"submit.k8s_cpu_cores"
,
2
)
self
.
cluster_env
[
"K8S_PS_NUM"
]
=
self
.
backend_env
.
get
(
"submit.k8s_ps_num"
,
1
)
self
.
cluster_env
[
"K8S_PS_CORES"
]
=
self
.
backend_env
.
get
(
"submit.k8s_ps_cores"
,
2
)
doc/distributed_train.md
浏览文件 @
7f9869d3
...
...
@@ -9,6 +9,7 @@
-
[
第三步:增加集群运行`backend.yaml`配置
](
#第三步增加集群运行backendyaml配置
)
-
[
MPI集群的Parameter Server模式配置
](
#mpi集群的parameter-server模式配置
)
-
[
K8S集群的Collective模式配置
](
#k8s集群的collective模式配置
)
-
[
K8S集群的PS-CPU模式配置
](
#k8s集群的ps-cpu模式配置
)
-
[
第四步:任务提交
](
#第四步任务提交
)
-
[
使用PaddleCloud Client提交
](
#使用paddlecloud-client提交
)
-
[
第一步:在`before_hook.sh`里手动安装PaddleRec
](
#第一步在before_hooksh里手动安装paddlerec
)
...
...
@@ -34,10 +35,10 @@
分布式运行首先需要更改
`config.yaml`
,主要调整以下内容:
-
workspace: 调整为在
节点运行时的工作目录
-
runner_class: 从单机的"train"调整为"cluster_train"
-
fleet_mode: 选择参数服务器模式
,抑或GPU Collective模式
-
distribute_strategy: 可选项,选择分布式训练的策略
-
workspace: 调整为在
远程节点运行时的工作目录,一般设置为
`"./"`
即可
-
runner_class: 从单机的"train"调整为"cluster_train"
,单机训练->分布式训练(例外情况,k8s上单机单卡训练仍然为train)
-
fleet_mode: 选择参数服务器模式
(ps),抑或GPU的all-reduce模式(collective)
-
distribute_strategy: 可选项,选择分布式训练的策略
,目前只在参数服务器模式下生效,可选项:
`sync、async、half_async、geo`
配置选项具体参数,可以参考
[
yaml配置说明
](
./yaml.md
)
...
...
@@ -50,47 +51,56 @@
workspace
:
"
paddlerec.models.rank.dnn"
mode
:
[
single_cpu_train
]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner
:
-
name
:
single_cpu_train
class
:
train
# num of epochs
epochs
:
4
# device to run training or infer
device
:
cpu
save_checkpoint_interval
:
2
# save model interval of epochs
save_checkpoint_path
:
"
increment_dnn"
# save checkpoint path
init_model_path
:
"
"
# load model path
save_checkpoint_interval
:
2
save_checkpoint_path
:
"
increment_dnn"
init_model_path
:
"
"
print_interval
:
10
phases
:
[
phase1
]
dataset
:
-
name
:
dataloader_train
batch_size
:
2
type
:
DataLoader
data_path
:
"
{workspace}/data/sample_data/train"
sparse_slots
:
"
click
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26"
dense_slots
:
"
dense_var:13"
```
分布式的训练配置可以改为:
```
yaml
# workspace
# 改变一:代码上传至节点后,与运行shell同在一个默认目录下
# 改变一:代码上传至节点后,在默认目录下
workspace
:
"
./"
mode
:
[
ps_cluster
]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner
:
-
name
:
ps_cluster
# 改变二:调整runner的class
class
:
cluster_train
# num of epochs
epochs
:
4
# device to run training or infer
device
:
cpu
# 改变三 & 四: 指定fleet_mode 与 distribute_strategy
fleet_mode
:
ps
distribute_strategy
:
async
save_checkpoint_interval
:
2
# save model interval of epochs
save_checkpoint_path
:
"
increment_dnn"
# save checkpoint path
init_model_path
:
"
"
# load model path
save_checkpoint_interval
:
2
save_checkpoint_path
:
"
increment_dnn"
init_model_path
:
"
"
print_interval
:
10
phases
:
[
phase1
]
dataset
:
-
name
:
dataloader_train
batch_size
:
2
type
:
DataLoader
# 改变五: 改变数据的读取目录
# 通常而言,mpi模式下,数据会下载到远程节点执行目录的'./train_data'下, k8s则与挂载位置有关
data_path
:
"
{workspace}/train_data"
sparse_slots
:
"
click
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26"
dense_slots
:
"
dense_var:13"
```
除此之外,还需关注数据及模型加载的路径,一般而言:
...
...
@@ -165,7 +175,14 @@ submit:
# for k8s gpu
# k8s gpu 模式下,训练节点数,及每个节点上的GPU卡数
k8s_trainers
:
2
k8s_cpu_cores
:
4
k8s_gpu_card
:
1
# for k8s ps-cpu
k8s_trainers
:
2
k8s_cpu_cores
:
4
k8s_ps_num
:
2
k8s_ps_cores
:
4
```
...
...
@@ -173,18 +190,51 @@ submit:
除此之外,我们还需要关注上传到工作目录的文件(
`files选项`
)的路径问题,在示例中是
`./*.py`
,说明我们执行任务提交时,与这些py文件在同一目录。若不在同一目录,则需要适当调整files路径,或改为这些文件的绝对路径。
不建议利用
`files`
上传
数据文件,可以通过指定
`train_data_path`
自动下载,或
指定
`afs_remote_mount_point`
挂载实现数据到节点的转移。
不建议利用
`files`
上传
过大的数据文件,可以通过指定
`train_data_path`
自动下载,或在k8s模式下
指定
`afs_remote_mount_point`
挂载实现数据到节点的转移。
#### MPI集群的Parameter Server模式配置
下面是一个利用PaddleCloud提交MPI参数服务器模式任务的
`backend.yaml`
示例
首先调整
`config.yaml`
:
```
yaml
workspace
:
"
./"
mode
:
[
ps_cluster
]
dataset
:
-
name
:
dataloader_train
batch_size
:
2
type
:
DataLoader
data_path
:
"
{workspace}/train_data"
sparse_slots
:
"
click
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26"
dense_slots
:
"
dense_var:13"
runner
:
-
name
:
ps_cluster
class
:
cluster_train
epochs
:
2
device
:
cpu
fleet_mode
:
ps
save_checkpoint_interval
:
1
save_checkpoint_path
:
"
increment_dnn"
init_model_path
:
"
"
print_interval
:
1
phases
:
[
phase1
]
phase
:
-
name
:
phase1
model
:
"
{workspace}/model.py"
dataset_name
:
dataloader_train
thread_num
:
1
```
再新增
`backend.yaml`
```
yaml
backend
:
"
PaddleCloud"
cluster_type
:
mpi
# k8s 可选
cluster_type
:
mpi
config
:
# 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2
paddle_version
:
"
1.7.2"
# hdfs/afs的配置信息填写
...
...
@@ -229,9 +279,45 @@ submit:
下面是一个利用PaddleCloud提交K8S集群进行GPU训练的
`backend.yaml`
示例
首先调整
`config.yaml`
```
yaml
workspace
:
"
./"
mode
:
[
collective_cluster
]
dataset
:
-
name
:
dataloader_train
batch_size
:
2
type
:
DataLoader
data_path
:
"
{workspace}/train_data"
sparse_slots
:
"
click
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26"
dense_slots
:
"
dense_var:13"
runner
:
-
name
:
collective_cluster
class
:
cluster_train
epochs
:
2
device
:
gpu
fleet_mode
:
collective
save_checkpoint_interval
:
1
# save model interval of epochs
save_checkpoint_path
:
"
increment_dnn"
# save checkpoint path
init_model_path
:
"
"
# load model path
print_interval
:
1
phases
:
[
phase1
]
phase
:
-
name
:
phase1
model
:
"
{workspace}/model.py"
dataset_name
:
dataloader_train
thread_num
:
1
```
再增加
`backend.yaml`
```
yaml
backend
:
"
PaddleCloud"
cluster_type
:
mpi
# k8s 可选
cluster_type
:
k8s
# mpi 可选
config
:
# 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2
...
...
@@ -271,9 +357,93 @@ submit:
# for k8s gpu
# k8s gpu 模式下,训练节点数,及每个节点上的GPU卡数
k8s_trainers
:
2
k8s_cpu_cores
:
4
k8s_gpu_card
:
1
```
#### K8S集群的PS-CPU模式配置
下面是一个利用PaddleCloud提交K8S集群进行参数服务器CPU训练的
`backend.yaml`
示例
首先调整
`config.yaml`
:
```
yaml
workspace
:
"
./"
mode
:
[
ps_cluster
]
dataset
:
-
name
:
dataloader_train
batch_size
:
2
type
:
DataLoader
data_path
:
"
{workspace}/train_data"
sparse_slots
:
"
click
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26"
dense_slots
:
"
dense_var:13"
runner
:
-
name
:
ps_cluster
class
:
cluster_train
epochs
:
2
device
:
cpu
fleet_mode
:
ps
save_checkpoint_interval
:
1
save_checkpoint_path
:
"
increment_dnn"
init_model_path
:
"
"
print_interval
:
1
phases
:
[
phase1
]
phase
:
-
name
:
phase1
model
:
"
{workspace}/model.py"
dataset_name
:
dataloader_train
thread_num
:
1
```
再新增
`backend.yaml`
```
yaml
backend
:
"
PaddleCloud"
cluster_type
:
k8s
# mpi 可选
config
:
# 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2
paddle_version
:
"
1.7.2"
# hdfs/afs的配置信息填写
fs_name
:
"
afs://xxx.com"
fs_ugi
:
"
usr,pwd"
# 填任务输出目录的远程地址,如afs:/user/your/path/ 则此处填 /user/your/path
output_path
:
"
"
# for k8s
# 填远程挂载地址,如afs:/user/your/path/ 则此处填 /user/your/path
afs_remote_mount_point
:
"
"
submit
:
# PaddleCloud 个人信息 AK 及 SK
ak
:
"
"
sk
:
"
"
# 任务运行优先级,默认high
priority
:
"
high"
# 任务名称
job_name
:
"
PaddleRec_CTR"
# 训练资源所在组
group
:
"
"
# 节点上的任务启动命令
start_cmd
:
"
python
-m
paddlerec.run
-m
./config.yaml"
# 本地需要上传到节点工作目录的文件
files
:
./*.py ./*.yaml
# for k8s ps-cpu
# k8s ps-cpu 模式下,训练节点数,参数服务器节点数,及每个节点上的cpu核心数及内存限制
k8s_trainers
:
2
k8s-cpu-cores
:
4
k8s_ps_num
:
2
k8s_ps_cores
:
4
```
### 第四步:任务提交
当我们准备好
`config.yaml`
与
`backend.yaml`
,便可以进行一键任务提交,命令为:
...
...
models/rank/dnn/backend.yaml
浏览文件 @
7f9869d3
...
...
@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
./"
backend
:
"
PaddleCloud"
cluster_type
:
k8s
#
k8s
可选
cluster_type
:
k8s
#
mpi
可选
config
:
fs_name
:
"
afs://xxx.com"
...
...
@@ -56,5 +52,12 @@ submit:
# for k8s gpu
k8s_trainers
:
2
k8s_cpu_cores
:
2
k8s_gpu_card
:
1
# for k8s ps-cpu
k8s_trainers
:
2
k8s_cpu_cores
:
4
k8s_ps_num
:
2
k8s_ps_cores
:
4
setup.cfg
0 → 100644
浏览文件 @
7f9869d3
[easy_install]
index_url=http://pip.baidu.com/pypi/simple
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录