#!/usr/bin/env bash
#
# of_e2e.sh - launch cnn_e2e/of_cnn_train_val.py (model "resnet50") against
# the OFRecord dataset under DATA_ROOT.
#
# Usage: run from the directory that contains cnn_e2e/ and output/, since all
# paths other than DATA_ROOT are relative to the current working directory.
#
# Steps:
#   1. Remove leftover core.* files from the working directory
#      (presumably core dumps of earlier runs).
#   2. Empty ./output/snapshots/ so the run starts without old snapshots.
#   3. Start the train/validation script with the flags below.

# Best-effort cleanup: with -f, rm does not complain when nothing matches.
rm -rf -- core.*
rm -rf ./output/snapshots/*

# Dataset root; the script reads $DATA_ROOT/train and $DATA_ROOT/validation.
#DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/ofrecord
DATA_ROOT=/dataset/ImageNet/ofrecord
#DATA_ROOT=/dataset/imagenet-mxnet

# Optional alternatives, kept commented out: a different entry script, and
# nvprof / gdb wrappers that can be uncommented to prefix the python3 command.
# NOTE(review): --num_nodes=1 is passed together with two --node_ips entries;
# confirm whether a single-node or a two-node run is intended.
  #python3 cnn_benchmark/of_cnn_train_val.py \
#nvprof -f -o resnet.nvvp \
#gdb --args \
  python3 cnn_e2e/of_cnn_train_val.py \
    --train_data_dir="$DATA_ROOT/train" \
    --train_data_part_num=256 \
    --val_data_dir="$DATA_ROOT/validation" \
    --val_data_part_num=256 \
    --num_nodes=1 \
    --node_ips='11.11.1.12,11.11.1.14' \
    --gpu_num_per_node=4 \
    --optimizer="momentum-cosine-decay" \
    --learning_rate=0.256 \
    --loss_print_every_n_iter=20 \
    --batch_size_per_device=32 \
    --val_batch_size_per_device=125 \
    --model="resnet50"
    # Extra flags that are currently disabled. To enable one, move it above
    # the --model line (or add a trailing backslash to --model) so it stays
    # part of the same command.
    #--use_fp16 true \
    #--weight_l2=3.0517578125e-05 \
    #--num_examples=1024 \
    #--optimizer="momentum-decay" \
    #--data_dir="/mnt/13_nfs/xuan/ImageNet/ofrecord/train"
    #--data_dir="/mnt/dataset/xuan/ImageNet/ofrecord/train"
    #--warmup_iter_num=10000 \