k8s_job.sh.template 1.6 KB
Newer Older
C
Chengmo 已提交
1 2 3 4 5
#!/bin/bash
###############################################################
##                   NOTE -- NOTE -- NOTE                    ##
##             Example K8S NCCL2 multi-node job              ##
###############################################################
C
Chengmo 已提交
6
# Job name, later passed to `--job-name`. `<$ JOB_NAME $>` is a template
# placeholder (presumably substituted by the template renderer before this
# script is run -- confirm against the renderer).
job_name=<$ JOB_NAME $>
C
Chengmo 已提交
7 8 9 10 11

# Job parameters
# group_name  -> passed to `--group-name`
# job_version -> passed to `--job-version`
# start_cmd   -> passed to `--start-cmd`
# `<$ ... $>` values are template placeholders.
group_name="<$ GROUP_NAME $>"               
job_version="paddle-fluid-v1.7.1"
start_cmd="<$ START_CMD $>"
C
chengmo 已提交
12
# Value passed to `--wall-time` (presumably an HH:MM:SS limit -- confirm
# against the paddlecloud CLI documentation).
wall_time="2000:00:00"
C
Chengmo 已提交
13 14 15

# Passed to `--k8s-priority` and `--k8s-trainers` respectively.
# k8s_trainers also decides whether the job is submitted as stand-alone
# (it is compared against 1 further down).
k8s_priority=<$ K8S_PRIORITY $>
k8s_trainers=<$ K8S_TRAINERS $>
C
Chengmo 已提交
16
# Passed to `--k8s-cpu-cores`.
k8s_cpu_cores=<$ K8S_CPU_CORES $>
C
Chengmo 已提交
17 18
# Passed to `--k8s-gpu-cards`; also compared against 1 to detect the
# single-GPU case.
# NOTE(review): the placeholder is K8S_GPU_CARD (singular) while the variable
# is k8s_gpu_cards -- confirm it matches the key the template renderer supplies.
k8s_gpu_cards=<$ K8S_GPU_CARD $>

C
Chengmo 已提交
19 20 21 22 23 24 25 26 27 28 29
#######################################
# Decide how the job is described to the submit command, based on the
# requested number of trainers and GPU cards.
# Globals:
#   is_stand_alone (written) - 1 when exactly one trainer is requested, else 0.
#   nccl           (written) - extra CLI words for the submit command. It is
#                              expanded unquoted later in this script, so it
#                              is kept as a plain space-separated string.
# Arguments:
#   $1 - number of trainers
#   $2 - number of GPU cards
# Outputs:
#   Prints a hint to stdout when one trainer with one GPU card is requested.
#######################################
_select_job_mode() {
    local trainers="${1:-}"
    local gpu_cards="${2:-}"

    # Default: multi-trainer NCCL2 job.
    is_stand_alone=0
    nccl="--distribute-job-type NCCL2"

    # Operands are quoted and `[[ ]]` is used so that an empty value simply
    # compares unequal instead of producing a `test` syntax error.
    if [[ "${trainers}" == 1 ]]; then
        is_stand_alone=1
        nccl="--job-remark single-trainer"
        # The previous form `[ ... == 1]` lacked the space before `]`, so the
        # test command failed with "missing ]" and this branch never ran.
        if [[ "${gpu_cards}" == 1 ]]; then
            nccl="--job-remark single-gpu"
            echo "Attention: Use single GPU card for PaddleRec distributed training, please set runner class from 'cluster_train' to 'train' in config.yaml."
        fi
    fi
}

_select_job_mode "${k8s_trainers}" "${k8s_gpu_cards}"

C
Chengmo 已提交
30 31 32 33 34 35 36 37 38 39 40 41
# Your ak/sk (available from the "Personal Center" page of the PaddleCloud
# web console). Both are passed to the submit command via `--ak` / `--sk`.
ak=<$ AK $>
sk=<$ SK $>

# Submit the training job through the paddlecloud CLI.
#
# Every scalar value is double-quoted so that a name or start command that
# contains whitespace or glob characters reaches the CLI as one argument.
# Two expansions are deliberately left unquoted:
#   ./*      - the shell must expand the glob so that each non-hidden entry
#              of the current directory is passed to --files.
#   ${nccl}  - holds several CLI words (e.g. "--job-remark single-gpu") that
#              must be word-split into separate arguments.
# shellcheck disable=SC2086
paddlecloud job --ak "${ak}" --sk "${sk}" \
        train --job-name "${job_name}" \
        --group-name "${group_name}" \
        --job-conf config.ini \
        --start-cmd "${start_cmd}" \
        --files ./*  \
        --job-version "${job_version}"  \
        --k8s-trainers "${k8s_trainers}" \
        --k8s-cpu-cores "${k8s_cpu_cores}" \
        --k8s-gpu-cards "${k8s_gpu_cards}" \
        --k8s-priority "${k8s_priority}" \
        --wall-time "${wall_time}" \
        --is-standalone "${is_stand_alone}" \
        --json \
        ${nccl}