submit.sh 3.0 KB
Newer Older
T
tangwei 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
#!/bin/bash

###################################################
# Usage: submit.sh
# Description: run the MPI submit client implementation
###################################################

#-----------------------------------------------------------------------------------------------------------------
#fun : placeholder for reading arguments from the environment; at present it only prints "xx" to stdout
#param : N/A
#return : 0 -- success; not 0 -- failure (the exit status of echo)
#-----------------------------------------------------------------------------------------------------------------
function vars_get_from_env() {
  echo "xx"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : package -- stage the job script, the qsub conf, the python tree and the paddlerec package
#      under ${engine_temp_path} (python tree in package/, paddlerec package in package/whl/)
#param : N/A
#        reads : engine_temp_path, engine_job_scrpit, engine_submit_qconf, engine_package_python,
#                engine_package_paddlerec, engine_worker (log message only)
#        sets  : g_run_stage, temp
#return : 0 -- success; not 0 -- failure (empty staging path, or any mkdir/cp step failed)
#-----------------------------------------------------------------------------------------------------------------
function package() {
  g_run_stage="package"

  # Refuse an empty staging path: "${temp}/package" would otherwise resolve to /package.
  if [[ -z "${engine_temp_path}" ]]; then
    echo "package: engine_temp_path is empty" >&2
    return 1
  fi

  # temp is assigned globally on purpose, exactly as before, in case other code reads it.
  temp="${engine_temp_path}"
  echo "package temp dir: " "${temp}"

  # Create the staging dir when missing, so cp copies INTO it instead of creating a file with that name.
  mkdir -p "${temp}" || return 1

  cp "${engine_job_scrpit}" "${temp}" || return 1
  cp "${engine_submit_qconf}" "${temp}" || return 1
  echo "copy job.sh from " "${engine_worker}" " to " "${temp}"

  mkdir -p "${temp}/package" || return 1
  cp -r "${engine_package_python}" "${temp}/package/" || return 1
  echo "copy python from " "${engine_package_python}" " to " "${temp}"

  # -p keeps a second run over the same staging dir from failing on the existing whl dir.
  mkdir -p "${temp}/package/whl" || return 1
  cp "${engine_package_paddlerec}" "${temp}/package/whl/" || return 1
  # NOTE(review): the message omits the "package/" path component of the real destination.
  echo "copy " "${engine_package_paddlerec}" " to " "${temp}/whl/"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : hook run before the job is submitted to the cluster; at present it only prints "before_submit"
#param : N/A
#return : 0 -- success; not 0 -- failure (the exit status of echo)
#-----------------------------------------------------------------------------------------------------------------
function before_submit() {
  echo "before_submit"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : hook run after the job is submitted to the cluster; at present it only prints "after_submit"
#param : N/A
#return : 0 -- success; not 0 -- failure (the exit status of echo)
#-----------------------------------------------------------------------------------------------------------------
function after_submit() {
  echo "after_submit"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : submit to cluster -- change into ${engine_temp_path} and run ${engine_submit_hpc}/bin/qsub_f
#      with the staged ./package directory and worker.sh as the job entry
#param : N/A
#        reads : engine_temp_path, engine_submit_hpc, engine_submit_qconf, engine_hdfs_name,
#                engine_hdfs_ugi, engine_hdfs_output, engine_submit_nodes
#        sets  : g_run_stage, g_job_name, g_job_entry, engine_hdfs_output (a timestamp is appended)
#        note  : the working directory of the calling shell is changed to ${engine_temp_path}
#return : 0 -- success; not 0 -- failure (staging dir cannot be entered, or the exit status of qsub_f)
#-----------------------------------------------------------------------------------------------------------------
function submit() {
  g_run_stage="submit"
  g_job_name="paddle_rec_mpi"
  # Self-assignment kept from the original; it leaves the value unchanged.
  g_hdfs_path=$g_hdfs_path
  g_job_entry="worker.sh"

  # Each call appends a fresh YYYYmmddHHMMSS component to the output path.
  engine_hdfs_output="${engine_hdfs_output}/$(date +%Y%m%d%H%M%S)"

  # "--files ./package" is relative to the current directory, so never submit from the
  # wrong place: an empty path would make cd go to $HOME, a bad path would leave us where we were.
  if [[ -z "${engine_temp_path}" ]]; then
    echo "submit: engine_temp_path is empty" >&2
    return 1
  fi
  cd "${engine_temp_path}" || return 1

  "${engine_submit_hpc}/bin/qsub_f" \
    -N "${g_job_name}" \
    --conf "${engine_submit_qconf}" \
    --hdfs "${engine_hdfs_name}" \
    --ugi "${engine_hdfs_ugi}" \
    --hout "${engine_hdfs_output}" \
    --files ./package \
    -l "nodes=${engine_submit_nodes},walltime=1000:00:00,resource=full" "${g_job_entry}"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : run the whole flow in order: package, before_submit, submit, after_submit
#param : N/A
#return : the exit status of after_submit (the status of the earlier steps is not checked here)
#-----------------------------------------------------------------------------------------------------------------
function main() {
  package

  before_submit
  submit
  after_submit
}