#!/bin/bash

###################################################
# Usage: submit.sh
# Description: run mpi submit client implementation
###################################################

# ---------------------------------------------------------------------------- #
#                            variable define                                   #
# ---------------------------------------------------------------------------- #

#-----------------------------------------------------------------------------------------------------------------
# package_hook: entry point of the packaging stage.
#   Records "package" in the global g_run_stage, then delegates to package().
#   package() is not defined in this file -- presumably it comes from the
#   script sourced by main(); confirm against the caller.
# Arguments: none
# Returns:   exit status of package() (0 -- success; not 0 -- failure)
#-----------------------------------------------------------------------------------------------------------------
package_hook() {
  g_run_stage=package
  package
}

#-----------------------------------------------------------------------------------------------------------------
# _before_submit: step executed ahead of the cluster submission.
#   Prints the marker line "before_submit" on stdout, then calls
#   before_submit_hook(), which is not defined in this file (presumably
#   provided by the sourced submit script -- confirm).
# Arguments: none
# Returns:   exit status of before_submit_hook() (0 -- success; not 0 -- failure)
#-----------------------------------------------------------------------------------------------------------------
_before_submit() {
  printf '%s\n' "before_submit"
  before_submit_hook
}

#-----------------------------------------------------------------------------------------------------------------
# _after_submit: step executed once the cluster submission has been attempted.
#   Prints the marker line "after_submit" on stdout, then calls
#   after_submit_hook(), which is not defined in this file (presumably
#   provided by the sourced submit script -- confirm).
# Arguments: none
# Returns:   exit status of after_submit_hook() (0 -- success; not 0 -- failure)
#-----------------------------------------------------------------------------------------------------------------
_after_submit() {
  printf '%s\n' "after_submit"
  after_submit_hook
}

#-----------------------------------------------------------------------------------------------------------------
# _submit: submit the training job to the cluster through the paddlecloud CLI.
#   Sets g_run_stage to "submit", changes into ${engine_temp_path}, runs
#   `paddlecloud job ... train ...` with the engine_submit_* settings, and then
#   changes back to the previous working directory.
# Globals read:    engine_temp_path, engine_submit_ak, engine_submit_sk,
#                  engine_submit_cluster, engine_submit_version,
#                  engine_submit_priority, engine_submit_nodes,
#                  engine_submit_jobname, engine_submit_group,
#                  engine_submit_config, g_run_cmd, g_submitfiles
# Globals written: g_run_stage
# Arguments: none
# Returns:   0 -- success; not 0 -- failure (the paddlecloud exit status, or 1
#            when the working directory cannot be entered or restored)
#-----------------------------------------------------------------------------------------------------------------
_submit() {
  g_run_stage="submit"

  # Never submit from an unintended directory: stop if the temp path is unusable.
  cd "${engine_temp_path}" || return 1

  # Remember the CLI status so the trailing `cd -` cannot mask a failed submit.
  local submit_status=0

  # ${g_submitfiles} is left unquoted on purpose to keep the original word
  # splitting. NOTE(review): presumably it may list several files -- confirm.
  # shellcheck disable=SC2086
  paddlecloud job --ak "${engine_submit_ak}" --sk "${engine_submit_sk}" train \
    --cluster-name "${engine_submit_cluster}" \
    --job-version "${engine_submit_version}" \
    --mpi-priority "${engine_submit_priority}" \
    --mpi-wall-time 300:59:00 \
    --mpi-nodes "${engine_submit_nodes}" --is-standalone 0 \
    --mpi-memory 110Gi \
    --job-name "${engine_submit_jobname}" \
    --start-cmd "${g_run_cmd}" \
    --group-name "${engine_submit_group}" \
    --job-conf "${engine_submit_config}" \
    --files ${g_submitfiles} \
    --json || submit_status=$?

  # Return to the caller's directory (prints it, as `cd -` always does).
  cd - || return 1

  return "${submit_status}"
}

#-----------------------------------------------------------------------------------------------------------------
# submit_hook: run the submit stage: _before_submit, _submit, _after_submit.
#   All three steps always run, in that order. The status of _before_submit is
#   not inspected.
# Arguments: none
# Returns:   the status of _submit when it failed; otherwise the status of
#            _after_submit (0 -- success; not 0 -- failure)
#-----------------------------------------------------------------------------------------------------------------
submit_hook() {
  local submit_status=0
  local after_status=0

  _before_submit
  _submit || submit_status=$?
  # The after hook still runs after a failed submit.
  _after_submit || after_status=$?

  # A failed submission must not be hidden by a successful after hook.
  if (( submit_status != 0 )); then
    return "${submit_status}"
  fi
  return "${after_status}"
}

#-----------------------------------------------------------------------------------------------------------------
# main: load the submit script, then run the package and submit stages.
#   Sources the file named by ${engine_submit_scrpit} into the current shell
#   and calls package_hook followed by submit_hook.
#   The variable name is spelled "scrpit" because it is set outside this file;
#   it must stay as is here.
# Arguments: none
# Returns:   exit status of submit_hook (0 -- success; not 0 -- failure)
#-----------------------------------------------------------------------------------------------------------------
main() {
  # Quoted so that a path containing whitespace is sourced as a single file.
  source "${engine_submit_scrpit}"

  package_hook
  submit_hook
}

# Script entry point: run the package and submit stages via main.
main "$@"