#!/bin/bash
#
# Launch a multi-device distributed training run of
# resnet50_distributed_training.py, one background pytest process per device.
#
# Usage: bash run.sh DATA_PATH RANK_SIZE
#   DATA_PATH  exported unchanged to the environment of the training processes
#   RANK_SIZE  number of devices to start; selects the test_dist_<N>pcs
#              configuration function defined below (2 and 8 are provided)

# Make the dataset location visible to the child training processes.
export DATA_PATH="$1"
RANK_SIZE="$2"

# Directory the script is started from; the rank table files are looked up
# relative to it.
EXEC_PATH=$(pwd)

#######################################
# Configure the environment for an 8-device run.
# Globals:
#   EXEC_PATH (read)  - directory holding rank_table_8pcs.json; falls back to
#                       the current working directory when unset or empty
#   MINDSPORE_HCCL_CONFIG_PATH (exported) - path to rank_table_8pcs.json
#   RANK_SIZE (exported) - set to 8
# Arguments: none
#######################################
test_dist_8pcs()
{
    export MINDSPORE_HCCL_CONFIG_PATH="${EXEC_PATH:-${PWD}}/rank_table_8pcs.json"
    export RANK_SIZE=8
}

#######################################
# Configure the environment for a 2-device run.
# Globals:
#   EXEC_PATH (read)  - directory holding rank_table_2pcs.json; falls back to
#                       the current working directory when unset or empty
#   MINDSPORE_HCCL_CONFIG_PATH (exported) - path to rank_table_2pcs.json
#   RANK_SIZE (exported) - set to 2
# Arguments: none
#######################################
test_dist_2pcs()
{
    export MINDSPORE_HCCL_CONFIG_PATH="${EXEC_PATH:-${PWD}}/rank_table_2pcs.json"
    export RANK_SIZE=2
}

# Pick the configuration function that matches the requested device count.
# Checking that it exists first turns an unsupported or missing RANK_SIZE into
# a clear error instead of "command not found" followed by a broken loop bound.
config_fn="test_dist_${RANK_SIZE}pcs"
if ! declare -F "${config_fn}" > /dev/null; then
    echo "error: unsupported RANK_SIZE '${RANK_SIZE}' (no ${config_fn} function defined)" >&2
    exit 1
fi
"${config_fn}"

# Start one training process per device, each in its own fresh device<i>
# directory with DEVICE_ID and RANK_ID both set to the loop index.
for ((i = 0; i < RANK_SIZE; i++))
do
    device_dir="device${i}"

    # Recreate the per-device working directory from scratch.
    rm -rf -- "./${device_dir}"
    mkdir -- "./${device_dir}" || exit 1
    cp ./resnet50_distributed_training.py ./resnet.py "./${device_dir}"

    # Abort if the directory cannot be entered: continuing would make the
    # matching "cd ../" climb out of the working directory, and the next
    # iteration's rm -rf would then run in the wrong place.
    cd "./${device_dir}" || exit 1
    export DEVICE_ID=$i
    export RANK_ID=$i
    echo "start training for device $i"
    # Snapshot the environment this device's process is started with.
    env > "env$i.log"
    # Run in the background so all devices start without waiting on each
    # other; stdout and stderr both go to train.log<i>.
    pytest -s -v ./resnet50_distributed_training.py > "train.log$i" 2>&1 &
    cd ../ || exit 1
done