#!/bin/bash
# Launch one node of a distributed PaddlePaddle training job under Open MPI.
#
# Each MPI rank runs this script: it starts a parameter server in the
# background, runs the trainer (train.py) in the foreground, and kills the
# pserver when training finishes.
#
# Expects:
#   - a "machines" file in the CWD listing one pserver host per line
#   - Open MPI to provide OMPI_COMM_WORLD_SIZE / OMPI_COMM_WORLD_RANK
#   - "paddle" and "train.py" reachable from the CWD

# General training configuration.
NICS=eth0
PADDLE_INIT_PORT=7164
PADDLE_INIT_PORTS_NUM=1
PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
# Join all lines of "machines" into one comma-separated host list.
PADDLE_INIT_PSERVERS=$(paste -sd, machines)
PADDLE_INIT_USE_GPU=False

PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
PADDLE_CLUSTER_TRAIN=True

# Export the configuration so the trainer (a child python process) can read
# it from its environment — plain assignments are shell-local and would be
# invisible to train.py.
export NICS PADDLE_INIT_PORT PADDLE_INIT_PORTS_NUM \
  PADDLE_INIT_PORTS_NUM_FOR_SPARSE PADDLE_INIT_PSERVERS \
  PADDLE_INIT_USE_GPU PADDLE_INIT_NUM_GRADIENT_SERVERS \
  PADDLE_INIT_TRAINER_ID PADDLE_CLUSTER_TRAIN

# Dump the environment for debugging (after export, so PADDLE_* show up).
env

# Log directory for both pserver and trainer output.
mkdir -p logs

# Start the pserver in the background. stdbuf -oL line-buffers its stdout so
# the log stays readable while it runs; nohup detaches it from this TTY.
stdbuf -oL nohup paddle pserver \
  --port="$PADDLE_INIT_PORT" \
  --ports_num="$PADDLE_INIT_PORTS_NUM" \
  --ports_num_for_sparse="$PADDLE_INIT_PORTS_NUM_FOR_SPARSE" \
  --nics="$NICS" \
  --comment=paddle_cluster_pserver \
  --num_gradient_servers="$PADDLE_INIT_NUM_GRADIENT_SERVERS" \
  &> logs/pserver.log &

# Start the trainer.
# NOTE: train.py reads the exported PADDLE_* variables as its configuration.
python train.py &> logs/train.log

# Kill the background pserver when training finishes. pkill -f matches the
# full command line and, unlike "ps | grep pserver", never matches itself.
# "|| true" keeps a clean exit even if the pserver already died.
pkill -f pserver || true