#!/bin/bash
# Launcher for resnet50_distributed_training.py: one pytest process per device.
#
# Usage: bash run.sh DATA_PATH RANK_SIZE
#   DATA_PATH  exported unchanged into the environment of the launched
#              processes.
#   RANK_SIZE  number of devices; selects the test_dist_<RANK_SIZE>pcs
#              configuration function (2 and 8 are defined in this script).

DATA_PATH=$1
export DATA_PATH
RANK_SIZE=$2

# Directory the script is started from; the rank table JSON files are
# resolved relative to it.
EXEC_PATH=$(pwd)

# Configure the environment for an 8-device run.
# Globals:
#   EXEC_PATH                   (read)     directory containing the rank table.
#   MINDSPORE_HCCL_CONFIG_PATH  (exported) ${EXEC_PATH}/rank_table_8pcs.json
#   RANK_SIZE                   (exported) set to 8
test_dist_8pcs() {
    export MINDSPORE_HCCL_CONFIG_PATH="${EXEC_PATH}/rank_table_8pcs.json"
    export RANK_SIZE=8
}

# Configure the environment for a 2-device run.
# Globals:
#   EXEC_PATH                   (read)     directory containing the rank table.
#   MINDSPORE_HCCL_CONFIG_PATH  (exported) ${EXEC_PATH}/rank_table_2pcs.json
#   RANK_SIZE                   (exported) set to 2
test_dist_2pcs() {
    export MINDSPORE_HCCL_CONFIG_PATH="${EXEC_PATH}/rank_table_2pcs.json"
    export RANK_SIZE=2
}

# Apply the configuration that matches the requested device count.
# Fail early when no test_dist_<RANK_SIZE>pcs function exists (missing or
# unsupported RANK_SIZE) instead of continuing after a "command not found".
if ! declare -F "test_dist_${RANK_SIZE}pcs" > /dev/null; then
    echo "unsupported RANK_SIZE '${RANK_SIZE}': no test_dist_${RANK_SIZE}pcs configuration is defined" >&2
    echo "Usage: bash run.sh DATA_PATH RANK_SIZE" >&2
    exit 1
fi
"test_dist_${RANK_SIZE}pcs"

# Start ranks 1 .. RANK_SIZE-1 in the background. Each rank gets a fresh
# device<i> working directory holding copies of the training scripts, its own
# DEVICE_ID/RANK_ID, an environment dump (env<i>.log) and a log (train.log<i>).
for ((i = 1; i < RANK_SIZE; i++)); do
    rm -rf "device$i"
    mkdir "device$i"
    cp ./resnet50_distributed_training.py ./resnet.py "./device$i"
    # Stop if the directory is unusable: otherwise pytest would run from the
    # wrong place and the "cd ../" below would leave the launch directory.
    cd "./device$i" || { echo "cannot enter device$i" >&2; exit 1; }
    export DEVICE_ID=$i
    export RANK_ID=$i
    echo "start training for device $i"
    env > "env$i.log"
    pytest -s -v ./resnet50_distributed_training.py > "train.log$i" 2>&1 &
    cd ../
done
# Rank 0 runs in the foreground from a fresh device0 directory; its pytest
# exit status alone decides the reported result. The ranks started in the
# background are not waited for in the visible part of this script.
rm -rf device0
mkdir device0
cp ./resnet50_distributed_training.py ./resnet.py ./device0
# Stop if the directory is unusable rather than running pytest elsewhere.
cd ./device0 || { echo "cannot enter device0" >&2; exit 1; }
export DEVICE_ID=0
export RANK_ID=0
echo "start training for device 0"
env > env0.log
# Test pytest's status directly so no intervening command can overwrite $?.
if pytest -s -v ./resnet50_distributed_training.py > train.log0 2>&1; then
    echo "training success"
else
    echo "training failed"
    exit 2
fi
cd ../