run.sh 5.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
#!/usr/bin/env bash
# Bootstrap section: resolves the directory this script lives in, switches
# to the directory three levels above it, and loads the configuration and
# helper files that the rest of the script relies on.

# -e: abort on the first failing command
# -u: treat expansion of an unset variable as an error
# -x: trace every command as it is executed
set -eux

# Absolute path of the directory containing this script.
#  * every expansion is quoted so a checkout path with spaces still works;
#  * `&&` makes a failed `cd` fail the whole substitution (and thus the
#    script, because of `set -e`) instead of silently yielding the
#    caller's working directory;
#  * `CDPATH=` keeps `cd` from printing the resolved directory on stdout,
#    which would otherwise end up inside MYDIR.
R_DIR=$(dirname -- "$0")
MYDIR=$(CDPATH= cd -- "$R_DIR" && pwd)
cd -- "${MYDIR}/../../../"

# config env
source "${MYDIR}/model_conf"

# Both files are resolved relative to the directory entered above.
source ./env.sh
source ./utils.sh

timestamp=$(date "+%Y%m%d-%H%M%S")
echo "$timestamp"

# check
# NOTE(review): check_iplist is not defined in this script; presumably it is
# provided by one of the files sourced above -- confirm.
check_iplist

# Run settings: output locations, executor/fuse switches, exported paths,
# launcher arguments and step intervals used by the training loop.
set -eu

# Checkpoints and logs live next to (not inside) the working directory.
output_dir=../output-coco
log_dir=../log-coco
mkdir -p "$output_dir" "$log_dir"

# Lower-cased copy of use_experimental_executor; 'True' is used only when
# the variable is unset (a set-but-empty value is kept as is).
e_executor=${use_experimental_executor-'True'}
e_executor=$(echo ${e_executor} | tr '[A-Z]' '[a-z]')

# Same normalisation for use_fuse, with 'False' as the unset fallback.
use_fuse=${use_fuse-'False'}
use_fuse=$(echo ${use_fuse} | tr '[A-Z]' '[a-z]')
case "${use_fuse}" in
    true)
        # value is in MB
        FLAGS_fuse_parameter_memory_size=64
        export FLAGS_fuse_parameter_memory_size
        ;;
esac

# Paths handed to child processes through the environment.
EVAL_SCRIPT_LOG=${MYDIR}/../../../${output_dir}/eval.log
TASK_DATA_PATH=${data_path}
export EVAL_SCRIPT_LOG TASK_DATA_PATH

# Options for the launcher, kept as one flat string that is later expanded
# unquoted, i.e. split on whitespace into individual arguments.
distributed_args="--node_ips ${PADDLE_TRAINERS}"
distributed_args="${distributed_args} --node_id ${PADDLE_TRAINER_ID}"
distributed_args="${distributed_args} --current_node_ip ${POD_IP}"
distributed_args="${distributed_args} --selected_gpus 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15"
distributed_args="${distributed_args} --split_log_path $log_dir"
distributed_args="${distributed_args} --nproc_per_node 16"

# Step intervals. These are unconditional assignments, so they replace any
# value the same variables may have had before this point.
validation_steps=10000
save_steps=10000
skip_steps=10

# Hyper-parameter sweep: launches one training run for every combination of
# random seed, batch size, warm-up proportion and learning rate, appending
# the output of each run to lanch.log, and finally runs the result
# extraction script on the log directory.
#
# DD_RAND_SEED, BATCH_SIZE, WARMUP_PROP and LR_RATE are arrays that are not
# defined in this section (presumably they come from the sourced
# configuration -- confirm). The script runs under `set -e`, so the first
# failing run aborts the whole sweep.
for random_seed in "${DD_RAND_SEED[@]}"; do
    echo "random_seed ${random_seed}"
    for batch_size in "${BATCH_SIZE[@]}"; do
        echo "batch_size ${batch_size}"
        for warmup_proportion in "${WARMUP_PROP[@]}"; do
            echo "warmup_proportion ${warmup_proportion}"
            for learning_rate in "${LR_RATE[@]}"; do
                echo "learning rate ${learning_rate}"

                # Arguments for run_img2txt.py. An array with quoted values
                # guarantees that every value is passed as exactly one
                # argument, even when it is empty or contains whitespace.
                # Variables expanded without a `:-` default are mandatory:
                # with `set -u` the script stops if one of them is unset.
                train_args=(
                    # --- runtime / distributed execution ---
                    --use_cuda "True"
                    --is_distributed "True"
                    --use_multi_gpu_test "${use_multi_gpu_test:-True}"
                    --use_fp16 "${use_fp16:-False}"
                    # Follows use_fp16. The default is required here as
                    # well: use_fp16 is optional one line above, and a bare
                    # expansion would abort with "unbound variable".
                    --use_dynamic_loss_scaling "${use_fp16:-False}"
                    --init_loss_scaling "${loss_scaling:-128}"
                    --use_fast_executor "${e_executor:-True}"
                    --use_fuse "${use_fuse:-False}"
                    --nccl_comm_num "${nccl_comm_num:-1}"
                    --use_hierarchical_allreduce "${use_hierarchical_allreduce:-False}"

                    # --- phases to run ---
                    --do_train "${do_train:-true}"
                    --do_val "${do_val:-false}"
                    --do_test "${do_test:-true}"
                    --do_pred "${do_pred:-false}"
                    --do_decode "${do_decode:-True}"

                    # --- data files, all located below data_path ---
                    --train_filelist "${data_path}/${train_filelist:-}"
                    --valid_filelist "${data_path}/${valid_filelist:-}"
                    --test_filelist "${data_path}/${test_filelist:-}"
                    --object_file "${data_path}/${object_file_local_path:-}"

                    # --- task and sequence lengths ---
                    --epoch "${epoch}"
                    --task_type "${task_type:-img2txt}"
                    --max_seq_len "${max_seq_len}"
                    --max_img_len "${max_img_len}"
                    --max_obj_len "${max_obj_len}"
                    --max_tgt_len "${max_tgt_len}"
                    --max_out_len "${max_out_len}"
                    --min_out_len "${min_out_len}"

                    # --- decoding ---
                    --block_trigram "${block_trigram:-True}"
                    --beam_size "${beam_size:-5}"
                    --length_penalty "${length_penalty:-0.6}"

                    # --- model regularisation and optimizer ---
                    --hidden_dropout_prob "${hidden_dropout_prob:-0.1}"
                    --attention_probs_dropout_prob "${attention_probs_dropout_prob:-0.1}"
                    --beta1 "${beta1:-0.9}"
                    --beta2 "${beta2:-0.98}"
                    --epsilon "${epsilon:-1e-06}"
                    --tgt_type_id "${tgt_type_id:-1}"
                    --batch_size "${batch_size}"
                    --pred_batch_size "${pred_batch_size}"
                    --learning_rate "${learning_rate}"
                    --lr_scheduler "${lr_scheduler:-linear_warmup_decay}"
                    --warmup_proportion "${warmup_proportion:-0.02}"
                    --weight_decay "${weight_decay:-0.01}"
                    --weight_sharing "${weight_sharing:-True}"
                    --label_smooth "${label_smooth:-0.1}"

                    # --- model, vocabulary and checkpoint locations ---
                    --init_pretraining_params "${init_model:-}"
                    --unimo_vocab_file "${vocab_file}"
                    --encoder_json_file "${bpe_json}"
                    --vocab_bpe_file "${bpe_file}"
                    --unimo_config_path "${config_path}"
                    --checkpoints "$output_dir"

                    # --- adv_* options ---
                    --adv_step "${adv_step:-2}"
                    --adv_lr "${adv_lr:-0.05}"
                    --adv_type "${adv_type:-None}"
                    --norm_type "${norm_type:-l2}"
                    --adv_max_norm "${adv_max_norm:-0.4}"
                    --adv_init_mag "${adv_init_mag:-0.4}"
                    --adv_kl_weight "${adv_kl_weight:-1.5}"

                    # --- saving, validation and evaluation ---
                    --save_steps "${save_steps:-10000}"
                    --validation_steps "${validation_steps:-10000}"
                    --skip_steps "${skip_steps:-10}"
                    --save_and_valid_by_epoch "${save_and_valid_by_epoch:-False}"
                    --eval_script "${eval_script:-}"
                    # Flag spelling kept exactly as in the original command.
                    # NOTE(review): presumably it matches the option name
                    # declared by run_img2txt.py -- confirm before renaming.
                    --eval_mertrics "${eval_mertrics:-}"
                    --random_seed "${random_seed:-1}"
                )

                # distributed_args is a flat option string and is left
                # unquoted on purpose so that it splits into arguments.
                # The log file name ("lanch.log") is kept unchanged because
                # other tooling may look for exactly this name.
                # shellcheck disable=SC2086
                python -u ./src/launch.py ${distributed_args} \
                    ./src/run_img2txt.py "${train_args[@]}" \
                    >> "$log_dir/lanch.log" 2>&1
            done
        done
    done
done

# Runs after all training runs have finished; receives the log directory.
python ./src/utils/extract_eval_res.py --log_dir="$log_dir"
exit 0