未验证 提交 7f9869d3 编写于 作者: C Chengmo 提交者: GitHub

update paddlecloud train (#142)

* update

* fix

* delete ps-memory

* fix

* fix
上级 9b89d8f7
echo "Run before_hook.sh ..." echo "Run before_hook.sh ..."
wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz --no-check-certificate
tar -xf PaddleRec.tar.gz tar -xf PaddleRec.tar.gz
...@@ -10,6 +10,6 @@ python setup.py install ...@@ -10,6 +10,6 @@ python setup.py install
pip uninstall -y paddlepaddle pip uninstall -y paddlepaddle
pip install paddlepaddle-gpu==<$ PADDLEPADDLE_VERSION $> --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com pip install paddlepaddle==<$ PADDLEPADDLE_VERSION $> --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
echo "End before_hook.sh ..." echo "End before_hook.sh ..."
echo "Run before_hook.sh ..." echo "Run before_hook.sh ..."
wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz --no-check-certificate
tar -xf PaddleRec.tar.gz tar -xf PaddleRec.tar.gz
......
...@@ -39,7 +39,12 @@ function _before_submit() { ...@@ -39,7 +39,12 @@ function _before_submit() {
elif [ ${DISTRIBUTE_MODE} == "COLLECTIVE_GPU_K8S" ]; then elif [ ${DISTRIBUTE_MODE} == "COLLECTIVE_GPU_K8S" ]; then
_gen_gpu_before_hook _gen_gpu_before_hook
_gen_k8s_config _gen_k8s_config
_gen_k8s_job _gen_k8s_gpu_job
_gen_end_hook
elif [ ${DISTRIBUTE_MODE} == "PS_CPU_K8S" ]; then
_gen_cpu_before_hook
_gen_k8s_config
_gen_k8s_cpu_job
_gen_end_hook _gen_end_hook
fi fi
...@@ -101,6 +106,7 @@ function _gen_end_hook() { ...@@ -101,6 +106,7 @@ function _gen_end_hook() {
function _gen_mpi_job() { function _gen_mpi_job() {
echo "gen mpi_job.sh" echo "gen mpi_job.sh"
sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \ sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
-e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \
-e "s#<$ AK $>#$AK#g" \ -e "s#<$ AK $>#$AK#g" \
-e "s#<$ SK $>#$SK#g" \ -e "s#<$ SK $>#$SK#g" \
-e "s#<$ MPI_PRIORITY $>#$PRIORITY#g" \ -e "s#<$ MPI_PRIORITY $>#$PRIORITY#g" \
...@@ -109,18 +115,34 @@ function _gen_mpi_job() { ...@@ -109,18 +115,34 @@ function _gen_mpi_job() {
${abs_dir}/cloud/mpi_job.sh.template >${PWD}/job.sh ${abs_dir}/cloud/mpi_job.sh.template >${PWD}/job.sh
} }
function _gen_k8s_job() { function _gen_k8s_gpu_job() {
echo "gen k8s_job.sh" echo "gen k8s_job.sh"
sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \ sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
-e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \
-e "s#<$ AK $>#$AK#g" \ -e "s#<$ AK $>#$AK#g" \
-e "s#<$ SK $>#$SK#g" \ -e "s#<$ SK $>#$SK#g" \
-e "s#<$ K8S_PRIORITY $>#$PRIORITY#g" \ -e "s#<$ K8S_PRIORITY $>#$PRIORITY#g" \
-e "s#<$ K8S_TRAINERS $>#$K8S_TRAINERS#g" \ -e "s#<$ K8S_TRAINERS $>#$K8S_TRAINERS#g" \
-e "s#<$ K8S_CPU_CORES $>#$K8S_CPU_CORES#g" \
-e "s#<$ K8S_GPU_CARD $>#$K8S_GPU_CARD#g" \ -e "s#<$ K8S_GPU_CARD $>#$K8S_GPU_CARD#g" \
-e "s#<$ START_CMD $>#$START_CMD#g" \ -e "s#<$ START_CMD $>#$START_CMD#g" \
${abs_dir}/cloud/k8s_job.sh.template >${PWD}/job.sh ${abs_dir}/cloud/k8s_job.sh.template >${PWD}/job.sh
} }
#######################################
# Print one sed "s" command that replaces the template placeholder
# "<$ NAME $>" with VALUE.
# Backslash, "&" and the "#" delimiter are escaped in VALUE so sed inserts
# it literally (an unescaped "&" would expand to the matched placeholder,
# and an unescaped "#" would end the command early).
# NOTE(review): values containing newlines are not handled.
# Arguments: $1 - placeholder name, $2 - replacement value
# Outputs:   the sed expression on stdout
#######################################
function _k8s_cpu_job_subst() {
  local escaped
  escaped=$(printf '%s' "$2" | sed -e 's/[\\&#]/\\&/g') || return
  printf 's#<$ %s $>#%s#g' "$1" "${escaped}"
}

#######################################
# Render cloud/k8s_cpu_job.sh.template into ${PWD}/job.sh for a K8S PS-CPU
# submission by filling in every "<$ NAME $>" placeholder.
# Globals (read): abs_dir, GROUP_NAME, OLD_JOB_NAME, AK, SK, PRIORITY,
#                 K8S_TRAINERS, K8S_PS_NUM, K8S_PS_CORES, K8S_CPU_CORES,
#                 START_CMD
# Outputs: progress message on stdout; writes ${PWD}/job.sh
# Returns: exit status of sed
#######################################
function _gen_k8s_cpu_job() {
  echo "gen k8s_job.sh"
  # Paths are quoted so directories containing spaces keep working.
  sed -e "$(_k8s_cpu_job_subst GROUP_NAME "${GROUP_NAME}")" \
    -e "$(_k8s_cpu_job_subst JOB_NAME "${OLD_JOB_NAME}")" \
    -e "$(_k8s_cpu_job_subst AK "${AK}")" \
    -e "$(_k8s_cpu_job_subst SK "${SK}")" \
    -e "$(_k8s_cpu_job_subst K8S_PRIORITY "${PRIORITY}")" \
    -e "$(_k8s_cpu_job_subst K8S_TRAINERS "${K8S_TRAINERS}")" \
    -e "$(_k8s_cpu_job_subst K8S_PS_NUM "${K8S_PS_NUM}")" \
    -e "$(_k8s_cpu_job_subst K8S_PS_CORES "${K8S_PS_CORES}")" \
    -e "$(_k8s_cpu_job_subst K8S_CPU_CORES "${K8S_CPU_CORES}")" \
    -e "$(_k8s_cpu_job_subst START_CMD "${START_CMD}")" \
    "${abs_dir}/cloud/k8s_cpu_job.sh.template" >"${PWD}/job.sh"
}
#----------------------------------------------------------------------------------------------------------------- #-----------------------------------------------------------------------------------------------------------------
...@@ -145,6 +167,7 @@ function _submit() { ...@@ -145,6 +167,7 @@ function _submit() {
function package_hook() { function package_hook() {
cur_time=`date +"%Y%m%d%H%M"` cur_time=`date +"%Y%m%d%H%M"`
new_job_name="${JOB_NAME}_${cur_time}" new_job_name="${JOB_NAME}_${cur_time}"
export OLD_JOB_NAME=${JOB_NAME}
export JOB_NAME=${new_job_name} export JOB_NAME=${new_job_name}
export job_file_path="${PWD}/${new_job_name}" export job_file_path="${PWD}/${new_job_name}"
mkdir ${job_file_path} mkdir ${job_file_path}
......
#!/bin/bash
###############################################################
##            ATTENTION -- ATTENTION -- ATTENTION            ##
##            Example: K8S PS-CPU multi-node job             ##
###############################################################
# This file is a template: every "<$ NAME $>" marker is a placeholder that
# must be replaced with a concrete value before the script is executed.
# Substituted values are quoted so that a value containing spaces or glob
# characters stays a single argument.
job_name="<$ JOB_NAME $>"

# Job parameters
group_name="<$ GROUP_NAME $>"
job_version="paddle-fluid-v1.7.1"
start_cmd="<$ START_CMD $>"
wall_time="10:00:00"
k8s_priority="<$ K8S_PRIORITY $>"
k8s_trainers="<$ K8S_TRAINERS $>"
k8s_cpu_cores="<$ K8S_CPU_CORES $>"
k8s_ps_num="<$ K8S_PS_NUM $>"
k8s_ps_cores="<$ K8S_PS_CORES $>"

# Your AK/SK (available under "Personal Center" on the PaddleCloud web page)
ak="<$ AK $>"
sk="<$ SK $>"

# Submit the job as a parameter-server ("PSERVER") distributed job.
# `--files ./*` is left unquoted on purpose: the shell must expand the glob
# to the files in the current directory.
paddlecloud job --ak "${ak}" --sk "${sk}" \
  train --job-name "${job_name}" \
  --group-name "${group_name}" \
  --job-conf config.ini \
  --start-cmd "${start_cmd}" \
  --files ./* \
  --job-version "${job_version}" \
  --k8s-priority "${k8s_priority}" \
  --wall-time "${wall_time}" \
  --k8s-trainers "${k8s_trainers}" \
  --k8s-cpu-cores "${k8s_cpu_cores}" \
  --k8s-ps-num "${k8s_ps_num}" \
  --k8s-ps-cores "${k8s_ps_cores}" \
  --is-standalone 0 \
  --distribute-job-type "PSERVER" \
  --json
\ No newline at end of file
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## 注意-- 注意--注意 ## ## 注意-- 注意--注意 ##
## K8S NCCL2多机作业作业示例 ## ## K8S NCCL2多机作业作业示例 ##
############################################################### ###############################################################
job_name=${JOB_NAME} job_name=<$ JOB_NAME $>
# 作业参数 # 作业参数
group_name="<$ GROUP_NAME $>" group_name="<$ GROUP_NAME $>"
...@@ -13,8 +13,20 @@ wall_time="10:00:00" ...@@ -13,8 +13,20 @@ wall_time="10:00:00"
k8s_priority=<$ K8S_PRIORITY $> k8s_priority=<$ K8S_PRIORITY $>
k8s_trainers=<$ K8S_TRAINERS $> k8s_trainers=<$ K8S_TRAINERS $>
k8s_cpu_cores=<$ K8S_CPU_CORES $>
k8s_gpu_cards=<$ K8S_GPU_CARD $> k8s_gpu_cards=<$ K8S_GPU_CARD $>
# Choose between a distributed NCCL2 submission and a stand-alone one.
# Default: a multi-trainer job. ${nccl} holds extra command-line words and is
# expanded unquoted on the paddlecloud command line so it splits into a flag
# and its value.
is_stand_alone=0
nccl='--distribute-job-type NCCL2'

# With exactly one trainer the job is submitted as stand-alone and the
# job-type flag is replaced by a remark.
if [ "${k8s_trainers}" = 1 ]; then
  is_stand_alone=1
  nccl="--job-remark single-trainer"
  # `]` must be a separate word; without the space before it the test
  # fails with "missing ]" and this branch is never taken.
  if [ "${k8s_gpu_cards}" = 1 ]; then
    nccl="--job-remark single-gpu"
    echo "Attention: Use single GPU card for PaddleRec distributed training, please set runner class from 'cluster_train' to 'train' in config.yaml."
  fi
fi
# 你的ak/sk(可在paddlecloud web页面【个人中心】处获取) # 你的ak/sk(可在paddlecloud web页面【个人中心】处获取)
ak=<$ AK $> ak=<$ AK $>
sk=<$ SK $> sk=<$ SK $>
...@@ -27,9 +39,11 @@ paddlecloud job --ak ${ak} --sk ${sk} \ ...@@ -27,9 +39,11 @@ paddlecloud job --ak ${ak} --sk ${sk} \
--files ./* \ --files ./* \
--job-version ${job_version} \ --job-version ${job_version} \
--k8s-trainers ${k8s_trainers} \ --k8s-trainers ${k8s_trainers} \
--k8s-cpu-cores ${k8s_cpu_cores} \
--k8s-gpu-cards ${k8s_gpu_cards} \ --k8s-gpu-cards ${k8s_gpu_cards} \
--k8s-priority ${k8s_priority} \ --k8s-priority ${k8s_priority} \
--wall-time ${wall_time} \ --wall-time ${wall_time} \
--is-standalone 0 \ --is-standalone ${is_stand_alone} \
--distribute-job-type "NCCL2" \ --json \
--json ${nccl}
\ No newline at end of file
\ No newline at end of file
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## 注意--注意--注意 ## ## 注意--注意--注意 ##
## MPI 类型作业演示 ## ## MPI 类型作业演示 ##
############################################################### ###############################################################
job_name=${JOB_NAME} job_name=<$ JOB_NAME $>
# 作业参数 # 作业参数
group_name=<$ GROUP_NAME $> group_name=<$ GROUP_NAME $>
......
...@@ -67,10 +67,10 @@ class ClusterEngine(Engine): ...@@ -67,10 +67,10 @@ class ClusterEngine(Engine):
@staticmethod @staticmethod
def workspace_replace(): def workspace_replace():
workspace = envs.get_runtime_environ("workspace") remote_workspace = envs.get_runtime_environ("remote_workspace")
for k, v in os.environ.items(): for k, v in os.environ.items():
v = v.replace("{workspace}", workspace) v = v.replace("{workspace}", remote_workspace)
os.environ[k] = str(v) os.environ[k] = str(v)
def run(self): def run(self):
...@@ -98,14 +98,12 @@ class ClusterEngine(Engine): ...@@ -98,14 +98,12 @@ class ClusterEngine(Engine):
cluster_env_check_tool = PaddleCloudMpiEnv() cluster_env_check_tool = PaddleCloudMpiEnv()
else: else:
raise ValueError( raise ValueError(
"Paddlecloud with Mpi don't support GPU training, check your config" "Paddlecloud with Mpi don't support GPU training, check your config.yaml & backend.yaml"
) )
elif cluster_type.upper() == "K8S": elif cluster_type.upper() == "K8S":
if fleet_mode == "PS": if fleet_mode == "PS":
if device == "CPU": if device == "CPU":
raise ValueError( cluster_env_check_tool = CloudPsCpuEnv()
"PS-CPU on paddlecloud is not supported at this time, comming soon"
)
elif device == "GPU": elif device == "GPU":
raise ValueError( raise ValueError(
"PS-GPU on paddlecloud is not supported at this time, comming soon" "PS-GPU on paddlecloud is not supported at this time, comming soon"
...@@ -115,7 +113,7 @@ class ClusterEngine(Engine): ...@@ -115,7 +113,7 @@ class ClusterEngine(Engine):
cluster_env_check_tool = CloudCollectiveEnv() cluster_env_check_tool = CloudCollectiveEnv()
elif device == "CPU": elif device == "CPU":
raise ValueError( raise ValueError(
"Unexpected config -> device: CPU with fleet_mode: Collective, check your config" "Unexpected config -> device: CPU with fleet_mode: Collective, check your config.yaml"
) )
else: else:
raise ValueError("cluster_type {} error, must in MPI/K8S".format( raise ValueError("cluster_type {} error, must in MPI/K8S".format(
...@@ -234,7 +232,7 @@ class PaddleCloudMpiEnv(ClusterEnvBase): ...@@ -234,7 +232,7 @@ class PaddleCloudMpiEnv(ClusterEnvBase):
"config.train_data_path", "") "config.train_data_path", "")
if self.cluster_env["TRAIN_DATA_PATH"] == "": if self.cluster_env["TRAIN_DATA_PATH"] == "":
raise ValueError( raise ValueError(
"No -- TRAIN_DATA_PATH -- found in your backend.yaml, please check." "No -- TRAIN_DATA_PATH -- found in your backend.yaml, please add train_data_path in your backend yaml."
) )
# test_data_path # test_data_path
self.cluster_env["TEST_DATA_PATH"] = self.backend_env.get( self.cluster_env["TEST_DATA_PATH"] = self.backend_env.get(
...@@ -274,7 +272,7 @@ class PaddleCloudK8sEnv(ClusterEnvBase): ...@@ -274,7 +272,7 @@ class PaddleCloudK8sEnv(ClusterEnvBase):
category=UserWarning, category=UserWarning,
stacklevel=2) stacklevel=2)
warnings.warn( warnings.warn(
"The remote mount point will be mounted to the ./afs/", "The remote afs path will be mounted to the ./afs/",
category=UserWarning, category=UserWarning,
stacklevel=2) stacklevel=2)
...@@ -293,3 +291,21 @@ class CloudCollectiveEnv(PaddleCloudK8sEnv): ...@@ -293,3 +291,21 @@ class CloudCollectiveEnv(PaddleCloudK8sEnv):
"submit.k8s_gpu_card", 1) "submit.k8s_gpu_card", 1)
self.cluster_env["K8S_CPU_CORES"] = self.backend_env.get( self.cluster_env["K8S_CPU_CORES"] = self.backend_env.get(
"submit.k8s_cpu_cores", 1) "submit.k8s_cpu_cores", 1)
class CloudPsCpuEnv(PaddleCloudK8sEnv):
    """Cluster environment for a PS-CPU job on a PaddleCloud K8S cluster."""

    def __init__(self):
        super(CloudPsCpuEnv, self).__init__()

    def env_check(self):
        """Run the parent checks, then record the PS-CPU submit settings.

        Sets DISTRIBUTE_MODE to "PS_CPU_K8S" and copies the trainer / server
        resource settings from the backend configuration into
        ``self.cluster_env``, using a fallback when a setting is absent.
        NOTE(review): ``self.backend_env`` is assumed to be a dict-like object
        whose ``get(key, default)`` returns ``default`` for a missing key --
        confirm against the base class.
        """
        super(CloudPsCpuEnv, self).env_check()
        self.cluster_env["DISTRIBUTE_MODE"] = "PS_CPU_K8S"

        # (cluster_env key, backend configuration key, fallback value)
        submit_settings = (
            ("K8S_TRAINERS", "submit.k8s_trainers", 1),
            ("K8S_CPU_CORES", "submit.k8s_cpu_cores", 2),
            ("K8S_PS_NUM", "submit.k8s_ps_num", 1),
            ("K8S_PS_CORES", "submit.k8s_ps_cores", 2),
        )
        for env_key, config_key, fallback in submit_settings:
            self.cluster_env[env_key] = self.backend_env.get(config_key,
                                                             fallback)
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
- [第三步:增加集群运行`backend.yaml`配置](#第三步增加集群运行backendyaml配置) - [第三步:增加集群运行`backend.yaml`配置](#第三步增加集群运行backendyaml配置)
- [MPI集群的Parameter Server模式配置](#mpi集群的parameter-server模式配置) - [MPI集群的Parameter Server模式配置](#mpi集群的parameter-server模式配置)
- [K8S集群的Collective模式配置](#k8s集群的collective模式配置) - [K8S集群的Collective模式配置](#k8s集群的collective模式配置)
- [K8S集群的PS-CPU模式配置](#k8s集群的ps-cpu模式配置)
- [第四步:任务提交](#第四步任务提交) - [第四步:任务提交](#第四步任务提交)
- [使用PaddleCloud Client提交](#使用paddlecloud-client提交) - [使用PaddleCloud Client提交](#使用paddlecloud-client提交)
- [第一步:在`before_hook.sh`里手动安装PaddleRec](#第一步在before_hooksh里手动安装paddlerec) - [第一步:在`before_hook.sh`里手动安装PaddleRec](#第一步在before_hooksh里手动安装paddlerec)
...@@ -34,10 +35,10 @@ ...@@ -34,10 +35,10 @@
分布式运行首先需要更改`config.yaml`,主要调整以下内容: 分布式运行首先需要更改`config.yaml`,主要调整以下内容:
- workspace: 调整为在节点运行时的工作目录 - workspace: 调整为在远程节点运行时的工作目录,一般设置为`"./"`即可
- runner_class: 从单机的"train"调整为"cluster_train" - runner_class: 从单机的"train"调整为"cluster_train",单机训练->分布式训练(例外情况,k8s上单机单卡训练仍然为train)
- fleet_mode: 选则参数服务器模式,抑或GPU Collective模式 - fleet_mode: 选择参数服务器模式(ps),抑或GPU的all-reduce模式(collective)
- distribute_strategy: 可选项,选择分布式训练的策略 - distribute_strategy: 可选项,选择分布式训练的策略,目前只在参数服务器模式下生效,可选项:`sync、async、half_async、geo`
配置选项具体参数,可以参考[yaml配置说明](./yaml.md) 配置选项具体参数,可以参考[yaml配置说明](./yaml.md)
...@@ -50,47 +51,56 @@ ...@@ -50,47 +51,56 @@
workspace: "paddlerec.models.rank.dnn" workspace: "paddlerec.models.rank.dnn"
mode: [single_cpu_train] mode: [single_cpu_train]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner: runner:
- name: single_cpu_train - name: single_cpu_train
class: train class: train
# num of epochs
epochs: 4 epochs: 4
# device to run training or infer
device: cpu device: cpu
save_checkpoint_interval: 2 # save model interval of epochs save_checkpoint_interval: 2
save_checkpoint_path: "increment_dnn" # save checkpoint path save_checkpoint_path: "increment_dnn"
init_model_path: "" # load model path init_model_path: ""
print_interval: 10 print_interval: 10
phases: [phase1] phases: [phase1]
dataset:
- name: dataloader_train
batch_size: 2
type: DataLoader
data_path: "{workspace}/data/sample_data/train"
sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
dense_slots: "dense_var:13"
``` ```
分布式的训练配置可以改为: 分布式的训练配置可以改为:
```yaml ```yaml
# workspace # 改变一:代码上传至节点后,在默认目录下
# 改变一:代码上传至节点后,与运行shell同在一个默认目录下
workspace: "./" workspace: "./"
mode: [ps_cluster] mode: [ps_cluster]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner: runner:
- name: ps_cluster - name: ps_cluster
# 改变二:调整runner的class # 改变二:调整runner的class
class: cluster_train class: cluster_train
# num of epochs
epochs: 4 epochs: 4
# device to run training or infer
device: cpu device: cpu
# 改变三 & 四: 指定fleet_mode 与 distribute_strategy # 改变三 & 四: 指定fleet_mode 与 distribute_strategy
fleet_mode: ps fleet_mode: ps
distribute_strategy: async distribute_strategy: async
save_checkpoint_interval: 2 # save model interval of epochs save_checkpoint_interval: 2
save_checkpoint_path: "increment_dnn" # save checkpoint path save_checkpoint_path: "increment_dnn"
init_model_path: "" # load model path init_model_path: ""
print_interval: 10 print_interval: 10
phases: [phase1] phases: [phase1]
dataset:
- name: dataloader_train
batch_size: 2
type: DataLoader
# 改变五: 改变数据的读取目录
# 通常而言,mpi模式下,数据会下载到远程节点执行目录的'./train_data'下, k8s则与挂载位置有关
data_path: "{workspace}/train_data"
sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
dense_slots: "dense_var:13"
``` ```
除此之外,还需关注数据及模型加载的路径,一般而言: 除此之外,还需关注数据及模型加载的路径,一般而言:
...@@ -165,26 +175,66 @@ submit: ...@@ -165,26 +175,66 @@ submit:
# for k8s gpu # for k8s gpu
# k8s gpu 模式下,训练节点数,及每个节点上的GPU卡数 # k8s gpu 模式下,训练节点数,及每个节点上的GPU卡数
k8s_trainers: 2 k8s_trainers: 2
k8s_cpu_cores: 4
k8s_gpu_card: 1 k8s_gpu_card: 1
# for k8s ps-cpu
k8s_trainers: 2
k8s_cpu_cores: 4
k8s_ps_num: 2
k8s_ps_cores: 4
``` ```
更多backend.yaml配置选项信息,可以查看[yaml配置说明](./yaml.md) 更多backend.yaml配置选项信息,可以查看[yaml配置说明](./yaml.md)
除此之外,我们还需要关注上传到工作目录的文件(`files选项`)的路径问题,在示例中是`./*.py`,说明我们执行任务提交时,与这些py文件在同一目录。若不在同一目录,则需要适当调整files路径,或改为这些文件的绝对路径。 除此之外,我们还需要关注上传到工作目录的文件(`files选项`)的路径问题,在示例中是`./*.py`,说明我们执行任务提交时,与这些py文件在同一目录。若不在同一目录,则需要适当调整files路径,或改为这些文件的绝对路径。
不建议利用`files`上传数据文件,可以通过指定`train_data_path`自动下载,或指定`afs_remote_mount_point`挂载实现数据到节点的转移。 不建议利用`files`上传过大的数据文件,可以通过指定`train_data_path`自动下载,或在k8s模式下指定`afs_remote_mount_point`挂载实现数据到节点的转移。
#### MPI集群的Parameter Server模式配置 #### MPI集群的Parameter Server模式配置
下面是一个利用PaddleCloud提交MPI参数服务器模式任务的`backend.yaml`示例 下面是一个利用PaddleCloud提交MPI参数服务器模式任务的`backend.yaml`示例
首先调整`config.yaml`:
```yaml
workspace: "./"
mode: [ps_cluster]
dataset:
- name: dataloader_train
batch_size: 2
type: DataLoader
data_path: "{workspace}/train_data"
sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
dense_slots: "dense_var:13"
runner:
- name: ps_cluster
class: cluster_train
epochs: 2
device: cpu
fleet_mode: ps
save_checkpoint_interval: 1
save_checkpoint_path: "increment_dnn"
init_model_path: ""
print_interval: 1
phases: [phase1]
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: dataloader_train
thread_num: 1
```
再新增`backend.yaml`
```yaml ```yaml
backend: "PaddleCloud" backend: "PaddleCloud"
cluster_type: mpi # k8s 可选 cluster_type: mpi
config: config:
# 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2
paddle_version: "1.7.2" paddle_version: "1.7.2"
# hdfs/afs的配置信息填写 # hdfs/afs的配置信息填写
...@@ -229,9 +279,45 @@ submit: ...@@ -229,9 +279,45 @@ submit:
下面是一个利用PaddleCloud提交K8S集群进行GPU训练的`backend.yaml`示例 下面是一个利用PaddleCloud提交K8S集群进行GPU训练的`backend.yaml`示例
首先调整`config.yaml`
```yaml
workspace: "./"
mode: [collective_cluster]
dataset:
- name: dataloader_train
batch_size: 2
type: DataLoader
data_path: "{workspace}/train_data"
sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
dense_slots: "dense_var:13"
runner:
- name: collective_cluster
class: cluster_train
epochs: 2
device: gpu
fleet_mode: collective
save_checkpoint_interval: 1 # save model interval of epochs
save_checkpoint_path: "increment_dnn" # save checkpoint path
init_model_path: "" # load model path
print_interval: 1
phases: [phase1]
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: dataloader_train
thread_num: 1
```
再增加`backend.yaml`
```yaml ```yaml
backend: "PaddleCloud" backend: "PaddleCloud"
cluster_type: mpi # k8s 可选 cluster_type: k8s # k8s 可选
config: config:
# 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2 # 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2
...@@ -271,9 +357,93 @@ submit: ...@@ -271,9 +357,93 @@ submit:
# for k8s gpu # for k8s gpu
# k8s gpu 模式下,训练节点数,及每个节点上的GPU卡数 # k8s gpu 模式下,训练节点数,及每个节点上的GPU卡数
k8s_trainers: 2 k8s_trainers: 2
k8s_cpu_cores: 4
k8s_gpu_card: 1 k8s_gpu_card: 1
``` ```
#### K8S集群的PS-CPU模式配置
下面是一个利用PaddleCloud提交K8S集群进行参数服务器CPU训练的`backend.yaml`示例
首先调整`config.yaml`:
```yaml
workspace: "./"
mode: [ps_cluster]
dataset:
- name: dataloader_train
batch_size: 2
type: DataLoader
data_path: "{workspace}/train_data"
sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
dense_slots: "dense_var:13"
runner:
- name: ps_cluster
class: cluster_train
epochs: 2
device: cpu
fleet_mode: ps
save_checkpoint_interval: 1
save_checkpoint_path: "increment_dnn"
init_model_path: ""
print_interval: 1
phases: [phase1]
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: dataloader_train
thread_num: 1
```
再新增`backend.yaml`
```yaml
backend: "PaddleCloud"
cluster_type: k8s # mpi 可选
config:
# 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2
paddle_version: "1.7.2"
# hdfs/afs的配置信息填写
fs_name: "afs://xxx.com"
fs_ugi: "usr,pwd"
# 填任务输出目录的远程地址,如afs:/user/your/path/ 则此处填 /user/your/path
output_path: ""
# for k8s
# 填远程挂载地址,如afs:/user/your/path/ 则此处填 /user/your/path
afs_remote_mount_point: ""
submit:
# PaddleCloud 个人信息 AK 及 SK
ak: ""
sk: ""
# 任务运行优先级,默认high
priority: "high"
# 任务名称
job_name: "PaddleRec_CTR"
# 训练资源所在组
group: ""
# 节点上的任务启动命令
start_cmd: "python -m paddlerec.run -m ./config.yaml"
# 本地需要上传到节点工作目录的文件
files: ./*.py ./*.yaml
# for k8s ps-cpu
# k8s ps-cpu 模式下,训练节点数,参数服务器节点数,及每个节点上的cpu核心数及内存限制
k8s_trainers: 2
k8s_cpu_cores: 4
k8s_ps_num: 2
k8s_ps_cores: 4
```
### 第四步:任务提交 ### 第四步:任务提交
当我们准备好`config.yaml``backend.yaml`,便可以进行一键任务提交,命令为: 当我们准备好`config.yaml``backend.yaml`,便可以进行一键任务提交,命令为:
......
...@@ -11,12 +11,8 @@ ...@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
workspace: "./"
backend: "PaddleCloud" backend: "PaddleCloud"
cluster_type: k8s # k8s 可选 cluster_type: k8s # mpi 可选
config: config:
fs_name: "afs://xxx.com" fs_name: "afs://xxx.com"
...@@ -56,5 +52,12 @@ submit: ...@@ -56,5 +52,12 @@ submit:
# for k8s gpu # for k8s gpu
k8s_trainers: 2 k8s_trainers: 2
k8s_cpu_cores: 2
k8s_gpu_card: 1 k8s_gpu_card: 1
# for k8s ps-cpu
k8s_trainers: 2
k8s_cpu_cores: 4
k8s_ps_num: 2
k8s_ps_cores: 4
[easy_install]
index_url=http://pip.baidu.com/pypi/simple
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册