#!/bin/bash

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Make sure ./log exists before any process redirects its output there.
# Abort early if it cannot be created (e.g. ./log exists as a regular file or
# the directory is not writable); otherwise every later redirection would fail.
if [ ! -d "./log" ]; then
  # -p: do not fail if something else creates the directory in the meantime.
  if ! mkdir -p ./log; then
    echo "Failed to create ./log" >&2
    exit 1
  fi
  echo "Create log floder for store running log"
fi

# Local single-node topology: one parameter server and one trainer on
# 127.0.0.1. Values are assigned first and exported together below.
FLAGS_LAUNCH_BARRIER=0
PADDLE_TRAINER_ID=0
PADDLE_PSERVER_NUMS=1
PADDLE_TRAINERS=1
PADDLE_TRAINERS_NUM="${PADDLE_TRAINERS}"
POD_IP=127.0.0.1
export FLAGS_LAUNCH_BARRIER PADDLE_TRAINER_ID PADDLE_PSERVER_NUMS \
  PADDLE_TRAINERS PADDLE_TRAINERS_NUM POD_IP

# Parameter-server endpoint. If port 29011 is already in use, pick a free
# port and change it in BOTH the endpoint list and the port array.
PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:29011"
PADDLE_PSERVER_PORT_ARRAY=(29011)
export PADDLE_PSERVERS_IP_PORT_LIST PADDLE_PSERVER_PORT_ARRAY

# set gpu numbers according to your device
#export FLAGS_selected_gpus="0,1,2,3,4,5,6,7"
export FLAGS_selected_gpus="0,1"

# set the script that python runs for both the pservers and the trainers
#SC="gpubox_ps_trainer.py"
SC="static_gpubox_trainer.py"

#######################################
# Launch one parameter-server process per configured port, in the background.
# Globals:
#   PADDLE_PSERVER_NUMS (read)       - number of pservers to start
#   PADDLE_PSERVER_PORT_ARRAY (read) - port for each pserver index
#   SC (read)                        - script handed to python
#   PADDLE_PORT (exported)           - set to the current pserver's port
# Outputs:
#   One progress line per pserver on stdout; each process's stdout and stderr
#   go to ./log/pserver.<index>.log
#######################################
start_pservers() {
  local idx cur_port
  for ((idx = 0; idx < PADDLE_PSERVER_NUMS; idx++)); do
    cur_port=${PADDLE_PSERVER_PORT_ARRAY[idx]}
    echo "PADDLE WILL START PSERVER ${cur_port}"
    # Exported before the launch so the child process sees its own port.
    export PADDLE_PORT=${cur_port}
    # Backgrounded: the script moves on without waiting for the pserver.
    python -u "$SC" &> "./log/pserver.${idx}.log" &
  done
}

# run pserver
export TRAINING_ROLE=PSERVER
start_pservers

#######################################
# Run every trainer in the foreground, one after another.
# Globals:
#   PADDLE_TRAINERS (read)       - number of trainers to run
#   SC (read)                    - script handed to python
#   PADDLE_TRAINER_ID (exported) - set to the current trainer's index
# Outputs:
#   One progress line per trainer on stdout; each trainer's stdout and stderr
#   go to ./log/worker.<index>.log
# Returns:
#   0 if every trainer exited with status 0, 1 otherwise
#######################################
run_trainers() {
  local trainer_id status=0
  for ((trainer_id = 0; trainer_id < PADDLE_TRAINERS; trainer_id++)); do
    echo "PADDLE WILL START Trainer ${trainer_id}"
    export PADDLE_TRAINER_ID=${trainer_id}
    # Record the failure right away instead of relying on $? surviving until
    # after the loop; a later successful trainer must not hide it.
    python -u "$SC" &> "./log/worker.${trainer_id}.log" || status=1
  done
  return "$status"
}

# run trainer
export TRAINING_ROLE=TRAINER
if run_trainers; then
  echo "Training log stored in ./log/"
else
  echo "Training failed, see ./log/ for details" >&2
  exit 1
fi