worker.sh 2.8 KB
Newer Older
T
tangwei 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
#!/bin/bash

###################################################
# Usage: worker.sh
# Description: run job on mpi per node
###################################################

# ---------------------------------------------------------------------------- #
#                            variable define                                   #
# ---------------------------------------------------------------------------- #
# Script-wide state; every entry starts out as an empty string.
declare g_curPath="" g_scriptName="" g_workPath=""
# Name of the stage currently executing; check_error prints it on failure.
declare g_run_stage=""

# ---------------------------------------------------------------------------- #
#                             const define                                     #
# ---------------------------------------------------------------------------- #
T
bug fix  
tangwei12 已提交
19 20 21 22
# Communicator tuning flags, exported so that child processes inherit them.
# NOTE(review): FLAGS_communicator_thread_pool_size used to be exported twice
# (5, then 20). The second export always won, so 20 is kept as the effective
# value; confirm that no other flag was meant to receive one of those values.
export FLAGS_communicator_thread_pool_size=20
export FLAGS_communicator_send_queue_size=18
export FLAGS_communicator_max_merge_var_num=18
T
tangwei 已提交
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
################################################################################

#-----------------------------------------------------------------------------------------------------------------
#fun : abort the script when the command run immediately before this call failed
#param : N/A (reads the caller's last exit status from $? and the stage name from g_run_stage)
#return : 0 -- previous command succeeded; otherwise prints a diagnostic and exits the script with status 1
#-----------------------------------------------------------------------------------------------------------------
function check_error() {
  # Must be the very first statement: any other command would overwrite $?.
  local -r last_status=$?

  if (( last_status != 0 )); then
    # The stage name is interpolated directly (shell has no "+" string
    # concatenation operator) and the diagnostic is sent to stderr.
    echo "execute ${g_run_stage:-unknown stage} raise exception! please check ..." >&2
    exit 1
  fi
}

#-----------------------------------------------------------------------------------------------------------------
#fun : unpack the bundled python runtime, put it on the search paths and install the fleet-rec wheel via mpirun
#param : N/A
#return : 0 -- success; exits the script through check_error when the wheel installation fails
#-----------------------------------------------------------------------------------------------------------------
function env_prepare() {
  g_run_stage="env_prepare"

  # WORKDIR stays a global variable, exactly as before.
  WORKDIR=$(pwd)
  # Runtime root without a trailing slash, so derived paths contain no "//".
  local -r py_home="${WORKDIR}/python"

  # The exit status of this step is left unchecked, as before.
  mpirun -npernode 1 mv package/* ./
  echo "current:${WORKDIR}"

  # tar's verbose listing is discarded; its exit status is left unchecked, as before.
  mpirun -npernode 1 tar -zxvf python.tar.gz > /dev/null

  export PYTHONPATH="${WORKDIR}/python/"
  export PYTHONROOT="${WORKDIR}/python/"
  # ${VAR:+:...} appends the previous value only when there is one. A bare
  # trailing ':' would add an empty entry, which the dynamic loader treats as
  # the current working directory. The lib directory is added once; PYTHONPATH
  # and PYTHONROOT name the same directory, so the old second prepend to
  # LIBRARY_PATH was a duplicate.
  export LIBRARY_PATH="${py_home}/lib${LIBRARY_PATH:+:${LIBRARY_PATH}}"
  export LD_LIBRARY_PATH="${py_home}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  export PATH="${py_home}/bin:${PATH}"

  # Runs whichever python is now first on PATH; the result is not checked.
  python -c "print('heheda')"

  # The uninstall result is left unchecked, as before; only the install is verified.
  mpirun -npernode 1 python/bin/python -m pip uninstall -y fleet-rec
  mpirun -npernode 1 python/bin/python -m pip install whl/fleet_rec-0.0.2-py2-none-any.whl --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
  # Must directly follow the install: check_error inspects its exit status.
  check_error
}

T
tangwei 已提交
64 65 66 67
#-----------------------------------------------------------------------------------------------------------------
#fun : start fleetrec.run with the worker role through mpirun, using PBS_NODEFILE as the machine file
#param : N/A (reads the PBS_NODEFILE environment variable)
#return : exit status of mpirun; aborts the script when PBS_NODEFILE is unset or empty
#-----------------------------------------------------------------------------------------------------------------
function run() {
  echo "run"
  g_run_stage="run"

  # Fail fast: with an empty value the unquoted expansion used to vanish and
  # -machinefile consumed "python/bin/python" as its argument instead.
  : "${PBS_NODEFILE:?PBS_NODEFILE must point to the MPI machine file}"

  # Quoted so that a path containing whitespace stays a single argument.
  mpirun -npernode 2 -timestamp-output -tag-output -machinefile "${PBS_NODEFILE}" \
    python/bin/python -u -m fleetrec.run -m fleetrec.models.rank.dnn --engine cluster --role worker
}

# Entry point: calls env_prepare first, then run.
main() {
  env_prepare
  run
}

main