#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

###################################################
# Usage: submit.sh
# Description: client-side implementation of PaddleCloud job submission
###################################################

# ---------------------------------------------------------------------------- #
#                            variable define                                   #
# ---------------------------------------------------------------------------- #

#-----------------------------------------------------------------------------------------------------------------
#fun : pre-submit hook; generates the hook/config/job files for the selected DISTRIBUTE_MODE
#globals : DISTRIBUTE_MODE (read) -- one of PS_CPU_MPI, COLLECTIVE_GPU_K8S, PS_CPU_K8S
#param : N/A
#return : 0 -- success (also when the mode is not recognized and nothing is generated);
#         not 0 -- status of the first generator step that failed
#-----------------------------------------------------------------------------------------------------------------
function _before_submit() {
  echo "before_submit"

  # `case` matches the quoted value as a single word, so an empty, unset or
  # whitespace-containing DISTRIBUTE_MODE simply matches no branch instead of
  # producing a `[` syntax error.
  case "${DISTRIBUTE_MODE:-}" in
    PS_CPU_MPI)
      _gen_cpu_before_hook || return
      _gen_mpi_config || return
      _gen_mpi_job || return
      _gen_end_hook || return
      ;;
    COLLECTIVE_GPU_K8S)
      _gen_gpu_before_hook || return
      _gen_k8s_config || return
      _gen_k8s_gpu_job || return
      _gen_end_hook || return
      ;;
    PS_CPU_K8S)
      _gen_cpu_before_hook || return
      _gen_k8s_config || return
      _gen_k8s_cpu_job || return
      _gen_end_hook || return
      ;;
  esac
}

#-----------------------------------------------------------------------------------------------------------------
#fun : render ${abs_dir}/cloud/mpi_config.ini.template into ${PWD}/config.ini by
#      replacing every "<$ NAME $>" placeholder with the value of a shell variable
#globals : abs_dir, FS_NAME, FS_UGI, TRAIN_DATA_PATH, TEST_DATA_PATH, OUTPUT_PATH,
#          THIRDPARTY_PATH, max_thread_num (fills CPU_NUM), USE_PYTHON3,
#          FLAGS_communicator_*, FLAGS_rpc_retry_times (all read)
#param : N/A
#return : exit status of sed
#note : values are placed verbatim in the sed replacement, so a value containing
#       '#', '&' or a backslash is interpreted by sed rather than copied literally
#-----------------------------------------------------------------------------------------------------------------
function _gen_mpi_config() {
  echo "gen mpi_config.ini"
  # Template path and output path are quoted so directories containing
  # whitespace do not split into several arguments / an ambiguous redirect.
  sed -e "s#<$ FS_NAME $>#$FS_NAME#g" \
      -e "s#<$ FS_UGI $>#$FS_UGI#g" \
      -e "s#<$ TRAIN_DATA_PATH $>#$TRAIN_DATA_PATH#g" \
      -e "s#<$ TEST_DATA_PATH $>#$TEST_DATA_PATH#g" \
      -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \
      -e "s#<$ THIRDPARTY_PATH $>#$THIRDPARTY_PATH#g" \
      -e "s#<$ CPU_NUM $>#$max_thread_num#g" \
      -e "s#<$ USE_PYTHON3 $>#$USE_PYTHON3#g" \
      -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \
      -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \
      -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \
      -e "s#<$ FLAGS_communicator_max_merge_var_num $>#$FLAGS_communicator_max_merge_var_num#g" \
      -e "s#<$ FLAGS_communicator_max_send_grad_num_before_recv $>#$FLAGS_communicator_max_send_grad_num_before_recv#g" \
      -e "s#<$ FLAGS_communicator_fake_rpc $>#$FLAGS_communicator_fake_rpc#g" \
      -e "s#<$ FLAGS_rpc_retry_times $>#$FLAGS_rpc_retry_times#g" \
      "${abs_dir}/cloud/mpi_config.ini.template" >"${PWD}/config.ini"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : render ${abs_dir}/cloud/k8s_config.ini.template into ${PWD}/config.ini by
#      replacing every "<$ NAME $>" placeholder with the value of a shell variable
#globals : abs_dir, FS_NAME, FS_UGI, AFS_REMOTE_MOUNT_POINT, OUTPUT_PATH,
#          max_thread_num (fills CPU_NUM), USE_PYTHON3, FLAGS_communicator_*,
#          FLAGS_rpc_retry_times (all read)
#param : N/A
#return : exit status of sed
#note : values are placed verbatim in the sed replacement, so a value containing
#       '#', '&' or a backslash is interpreted by sed rather than copied literally
#-----------------------------------------------------------------------------------------------------------------
function _gen_k8s_config() {
  echo "gen k8s_config.ini"
  # Template path and output path are quoted so directories containing
  # whitespace do not split into several arguments / an ambiguous redirect.
  sed -e "s#<$ FS_NAME $>#$FS_NAME#g" \
      -e "s#<$ FS_UGI $>#$FS_UGI#g" \
      -e "s#<$ AFS_REMOTE_MOUNT_POINT $>#$AFS_REMOTE_MOUNT_POINT#g" \
      -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \
      -e "s#<$ CPU_NUM $>#$max_thread_num#g" \
      -e "s#<$ USE_PYTHON3 $>#$USE_PYTHON3#g" \
      -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \
      -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \
      -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \
      -e "s#<$ FLAGS_communicator_max_merge_var_num $>#$FLAGS_communicator_max_merge_var_num#g" \
      -e "s#<$ FLAGS_communicator_max_send_grad_num_before_recv $>#$FLAGS_communicator_max_send_grad_num_before_recv#g" \
      -e "s#<$ FLAGS_communicator_fake_rpc $>#$FLAGS_communicator_fake_rpc#g" \
      -e "s#<$ FLAGS_rpc_retry_times $>#$FLAGS_rpc_retry_times#g" \
      "${abs_dir}/cloud/k8s_config.ini.template" >"${PWD}/config.ini"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : render ${abs_dir}/cloud/before_hook_cpu.sh.template into ${PWD}/before_hook.sh,
#      replacing the "<$ PADDLEPADDLE_VERSION $>" placeholder with $PADDLE_VERSION
#globals : abs_dir, PADDLE_VERSION (read)
#param : N/A
#return : exit status of sed
#-----------------------------------------------------------------------------------------------------------------
function _gen_cpu_before_hook() {
  echo "gen cpu before_hook.sh"
  # Paths are quoted so directories containing whitespace stay a single word.
  sed -e "s#<$ PADDLEPADDLE_VERSION $>#$PADDLE_VERSION#g" \
    "${abs_dir}/cloud/before_hook_cpu.sh.template" >"${PWD}/before_hook.sh"
}

C
Chengmo 已提交
97 98 99 100 101 102 103 104 105 106 107 108 109 110
#-----------------------------------------------------------------------------------------------------------------
#fun : render ${abs_dir}/cloud/before_hook_gpu.sh.template into ${PWD}/before_hook.sh,
#      replacing the "<$ PADDLEPADDLE_VERSION $>" placeholder with $PADDLE_VERSION
#globals : abs_dir, PADDLE_VERSION (read)
#param : N/A
#return : exit status of sed
#-----------------------------------------------------------------------------------------------------------------
function _gen_gpu_before_hook() {
  echo "gen gpu before_hook.sh"
  # Paths are quoted so directories containing whitespace stay a single word.
  sed -e "s#<$ PADDLEPADDLE_VERSION $>#$PADDLE_VERSION#g" \
    "${abs_dir}/cloud/before_hook_gpu.sh.template" >"${PWD}/before_hook.sh"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : copy ${abs_dir}/cloud/end_hook.sh.template to ${PWD}/end_hook.sh unchanged
#globals : abs_dir (read)
#param : N/A
#return : exit status of cp
#-----------------------------------------------------------------------------------------------------------------
function _gen_end_hook() {
  echo "gen end_hook.sh"
  # Both paths are quoted so directories containing whitespace stay a single word.
  cp "${abs_dir}/cloud/end_hook.sh.template" "${PWD}/end_hook.sh"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : render ${abs_dir}/cloud/mpi_job.sh.template into ${PWD}/job.sh by replacing
#      every "<$ NAME $>" placeholder with the value of a shell variable
#globals : abs_dir, GROUP_NAME, OLD_JOB_NAME (fills JOB_NAME), AK, SK,
#          PRIORITY (fills MPI_PRIORITY), MPI_NODES, START_CMD (all read)
#param : N/A
#return : exit status of sed
#note : values are placed verbatim in the sed replacement, so a value containing
#       '#', '&' or a backslash is interpreted by sed rather than copied literally
#-----------------------------------------------------------------------------------------------------------------
function _gen_mpi_job() {
  echo "gen mpi_job.sh"
  # Template path and output path are quoted so directories containing
  # whitespace do not split into several arguments / an ambiguous redirect.
  sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
      -e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \
      -e "s#<$ AK $>#$AK#g" \
      -e "s#<$ SK $>#$SK#g" \
      -e "s#<$ MPI_PRIORITY $>#$PRIORITY#g" \
      -e "s#<$ MPI_NODES $>#$MPI_NODES#g" \
      -e "s#<$ START_CMD $>#$START_CMD#g" \
      "${abs_dir}/cloud/mpi_job.sh.template" >"${PWD}/job.sh"
}

C
Chengmo 已提交
120
#-----------------------------------------------------------------------------------------------------------------
#fun : render ${abs_dir}/cloud/k8s_job.sh.template into ${PWD}/job.sh by replacing
#      every "<$ NAME $>" placeholder with the value of a shell variable
#globals : abs_dir, GROUP_NAME, OLD_JOB_NAME (fills JOB_NAME), AK, SK,
#          PRIORITY (fills K8S_PRIORITY), K8S_TRAINERS, K8S_CPU_CORES, K8S_GPU_CARD,
#          START_CMD (all read)
#param : N/A
#return : exit status of sed
#note : values are placed verbatim in the sed replacement, so a value containing
#       '#', '&' or a backslash is interpreted by sed rather than copied literally
#-----------------------------------------------------------------------------------------------------------------
function _gen_k8s_gpu_job() {
  echo "gen k8s_job.sh"
  # Template path and output path are quoted so directories containing
  # whitespace do not split into several arguments / an ambiguous redirect.
  sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
      -e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \
      -e "s#<$ AK $>#$AK#g" \
      -e "s#<$ SK $>#$SK#g" \
      -e "s#<$ K8S_PRIORITY $>#$PRIORITY#g" \
      -e "s#<$ K8S_TRAINERS $>#$K8S_TRAINERS#g" \
      -e "s#<$ K8S_CPU_CORES $>#$K8S_CPU_CORES#g" \
      -e "s#<$ K8S_GPU_CARD $>#$K8S_GPU_CARD#g" \
      -e "s#<$ START_CMD $>#$START_CMD#g" \
      "${abs_dir}/cloud/k8s_job.sh.template" >"${PWD}/job.sh"
}

C
Chengmo 已提交
134 135 136 137 138 139 140 141 142 143 144 145 146 147
#-----------------------------------------------------------------------------------------------------------------
#fun : render ${abs_dir}/cloud/k8s_cpu_job.sh.template into ${PWD}/job.sh by replacing
#      every "<$ NAME $>" placeholder with the value of a shell variable
#globals : abs_dir, GROUP_NAME, OLD_JOB_NAME (fills JOB_NAME), AK, SK,
#          PRIORITY (fills K8S_PRIORITY), K8S_TRAINERS, K8S_PS_NUM, K8S_PS_CORES,
#          K8S_CPU_CORES, START_CMD (all read)
#param : N/A
#return : exit status of sed
#note : values are placed verbatim in the sed replacement, so a value containing
#       '#', '&' or a backslash is interpreted by sed rather than copied literally
#-----------------------------------------------------------------------------------------------------------------
function _gen_k8s_cpu_job() {
  echo "gen k8s_job.sh"
  # Template path and output path are quoted so directories containing
  # whitespace do not split into several arguments / an ambiguous redirect.
  sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
      -e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \
      -e "s#<$ AK $>#$AK#g" \
      -e "s#<$ SK $>#$SK#g" \
      -e "s#<$ K8S_PRIORITY $>#$PRIORITY#g" \
      -e "s#<$ K8S_TRAINERS $>#$K8S_TRAINERS#g" \
      -e "s#<$ K8S_PS_NUM $>#$K8S_PS_NUM#g" \
      -e "s#<$ K8S_PS_CORES $>#$K8S_PS_CORES#g" \
      -e "s#<$ K8S_CPU_CORES $>#$K8S_CPU_CORES#g" \
      -e "s#<$ START_CMD $>#$START_CMD#g" \
      "${abs_dir}/cloud/k8s_cpu_job.sh.template" >"${PWD}/job.sh"
}
C
Chengmo 已提交
148 149


T
tangwei 已提交
150 151 152 153 154 155
#-----------------------------------------------------------------------------------------------------------------
#fun : post-submit hook; prints the "end submit" marker line
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function _after_submit() {
  printf '%s\n' "end submit"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : submit step; records the stage name and runs job.sh from the current directory
#globals : g_run_stage (written)
#param : N/A
#return : exit status of `sh job.sh` (0 -- success; not 0 -- failure)
#-----------------------------------------------------------------------------------------------------------------
function _submit() {
  # Mark the current stage in the global before launching the job script.
  g_run_stage=submit
  sh job.sh
}
T
tangwei 已提交
168

C
Chengmo 已提交
169 170 171
#-----------------------------------------------------------------------------------------------------------------
#fun : create a timestamped submission directory under the current directory,
#      copy $FILES into it and cd into it
#globals : JOB_NAME (read, then re-exported as "<JOB_NAME>_<YYYYmmddHHMM>"),
#          OLD_JOB_NAME (exported; the JOB_NAME value on entry),
#          job_file_path (exported; absolute path of the new directory),
#          FILES (read; whitespace-separated list of files to copy),
#          cur_time, new_job_name (written; left global as in the original)
#param : N/A
#return : 0 -- success; not 0 -- the directory could not be created or entered
#-----------------------------------------------------------------------------------------------------------------
function package_hook() {
  cur_time=$(date +"%Y%m%d%H%M")
  new_job_name="${JOB_NAME}_${cur_time}"
  export OLD_JOB_NAME=${JOB_NAME}
  export JOB_NAME=${new_job_name}
  export job_file_path="${PWD}/${new_job_name}"

  # Stop here if the directory cannot be created or entered; otherwise the
  # later generation steps would write their files into the wrong directory.
  mkdir -p "${job_file_path}" || return
  # FILES is deliberately left unquoted: it is split on whitespace into the
  # individual files to copy. A failed copy is reported by cp but, as before,
  # does not abort packaging.
  cp $FILES "${job_file_path}/"
  cd "${job_file_path}" || return
  echo "The task submission folder is generated at ${job_file_path}"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : run the submission pipeline: _before_submit, _submit, _after_submit
#param : N/A
#return : 0 -- success; not 0 -- status of the step that failed
#-----------------------------------------------------------------------------------------------------------------
function submit_hook() {
  local submit_status=0

  # Do not submit when the pre-submit step reports a failure.
  _before_submit || return

  # _after_submit still runs after a failed submission (as it always did), but
  # the submission's status is remembered so the failure is not masked.
  _submit || submit_status=$?
  _after_submit || return
  return "${submit_status}"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : entry point; package the job files, then run the submission pipeline
#param : N/A
#return : 0 -- success; not 0 -- status of package_hook or submit_hook
#-----------------------------------------------------------------------------------------------------------------
function main() {
  # Skip submission when packaging fails, so nothing is generated or launched
  # from a directory that was not prepared.
  package_hook || return
  submit_hook
}

T
bug fix  
tangwei12 已提交
192
# Script entry point: invoke main (defined above) with no arguments.
main