run.sh 3.9 KB
Newer Older
X
xfcygaocan 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
#!/usr/bin/env bash
# Setup section: resolve the script location, load configuration, and create
# the output directories used by the rest of this script.
#
# Variables read here but not assigned in this file (task, seed, epoch, lr,
# batch_size, save_checkpoints) must come from the environment or from the
# sourced files below. With `set -u`, a missing one aborts the script, except
# save_checkpoints, which falls back to "True".
set -eux

# Directory of this script as invoked, and its absolute form. Quoted so that
# paths containing spaces work; `&&` makes a failed cd surface as an empty
# result instead of silently printing the caller's directory.
R_DIR="$(dirname -- "$0")"
MYDIR="$(cd -- "$R_DIR" && pwd)"

# All relative paths below are resolved from three levels above the script.
cd -- "${MYDIR}/../../../"
source "${MYDIR}/model_conf"

source ./env.sh
source ./utils.sh

# NOTE(review): check_iplist is not defined in this file; presumably it is
# provided by utils.sh -- confirm.
check_iplist
export FLAGS_fuse_parameter_memory_size=64

output_dir="./output/${task}"
log_dir="${output_dir}/log"
save_model_base_dir="${output_dir}/save_model"
mkdir -p "$output_dir" "$log_dir" "$save_model_base_dir"

# Per-run identifiers built from the hyper-parameters, so that runs with
# different settings write to different log files and eval directories.
log_prefix="${seed}_${epoch}_${lr}_${batch_size}."
eval_dir="${output_dir}/tmp/params.${seed}.${epoch}.${lr}.${batch_size}"
mkdir -p "$eval_dir"

# save_model_dir is only defined when checkpoint saving is enabled. The
# default mirrors the one applied where --save_checkpoints is passed to the
# training script, so an unset variable no longer trips `set -u` here.
if [[ "${save_checkpoints:-True}" == "True" ]]; then
  save_model_dir="${save_model_base_dir}/params.${seed}.${epoch}.${lr}.${batch_size}"
  mkdir -p "$save_model_dir"
fi

# Launch section: run ./src/run_retrieval.py through ./src/launch.py and
# report failure.
#
# Arguments are kept in arrays and every expansion is quoted, so a value that
# is empty (for example init_model or save_model_dir when unset) is passed as
# an explicit empty argument instead of vanishing and letting the following
# flag be consumed as its value.

# Options consumed by the launcher itself.
distributed_args=(
  --node_ips "${PADDLE_TRAINERS}"
  --node_id "${PADDLE_TRAINER_ID}"
  --current_node_ip "${POD_IP}"
  --selected_gpus 0,1,2,3,4,5,6,7
  --split_log_path "$log_dir"
  --log_prefix "$log_prefix"
  --nproc_per_node 8
)

# Options forwarded to the training script. Each `${name:-default}` uses the
# configured value when set and non-empty, otherwise the default shown.
train_args=(
  --use_cuda "True"
  --is_distributed "${is_distributed:-True}"
  --weight_sharing "${weight_sharing:-True}"
  --use_fuse "${use_fuse:-True}"
  --use_fast_executor "${e_executor:-true}"
  --use_fp16 "${use_fp16:-false}"
  --nccl_comm_num "${nccl_comm_num:-1}"
  --use_hierarchical_allreduce "${use_hierarchical_allreduce:-False}"
  # Intentionally kept as in the original: this flag follows use_fp16.
  --use_dynamic_loss_scaling "${use_fp16:-False}"
  --use_sigmoid "${use_sigmoid:-False}"
  --init_loss_scaling "${loss_scaling:-12800}"
  --beta1 "${beta1:-0.9}"
  --beta2 "${beta2:-0.98}"
  --epsilon "${epsilon:-1e-06}"
  --scale_circle "${scale_circle:-1.0}"
  --margin "${margin:-0.2}"
  --verbose true
  --samples_num "${samples_num:-20}"
  --run_random "${run_random:-False}"
  --do_train "${do_train:-True}"
  --do_val "${do_val:-True}"
  --do_test "${do_test:-True}"
  --batch_size "${batch_size:-16}"
  --test_batch_size "${test_batch_size:-96}"
  --init_pretraining_params "${init_model:-}"
  --train_image_caption ./data/Flickr30k/flickr30k-textids/train.ids
  --train_image_feature_dir "./data/Flickr30k/flickr30k-features/${bbox}/train"
  --dev_image_caption ./data/Flickr30k/flickr30k-textids/val.all.ids
  --dev_image_feature_dir "./data/Flickr30k/flickr30k-features/${bbox}/dev"
  --test_image_caption ./data/Flickr30k/flickr30k-textids/test.all.ids
  --test_image_feature_dir "./data/Flickr30k/flickr30k-features/${bbox}/test"
  --img_id_path ./data/Flickr30k/flickr30k-textids/dataset_flickr30k_name_id.txt
  --checkpoints "${save_model_dir:-}"
  --save_checkpoints "${save_checkpoints:-True}"
  --save_steps "${save_steps:-1000}"
  --weight_decay "${weight_decay:-0.1}"
  --warmup_step "${warmup_step:-1}"
  --validation_steps "${validation_steps:-100}"
  --epoch "$epoch"
  --max_seq_len "${max_len:-512}"
  --max_img_len "${max_img_len:-37}"
  --learning_rate "${lr:-5e-6}"
  --learning_rate_scale "${learning_rate_scale:-0.1}"
  --learning_rate_decay_epoch1 "${learning_rate_decay_epoch1:-24}"
  --learning_rate_decay_epoch2 "${learning_rate_decay_epoch2:-32}"
  --lr_scheduler "${lr_scheduler:-scale_by_epoch_decay}"
  --skip_steps "${skip_steps:-50}"
  --num_iteration_per_drop_scope 10
  --unimo_vocab_file "${vocab_file}"
  --encoder_json_file "${bpe_json}"
  --vocab_bpe_file "${bpe_file}"
  --unimo_config_path "${config_path}"
  --eval_mertrics "${eval_mertrics:-recall@k}"
  --eval_dir "$eval_dir"
  --random_seed "${seed:-1}"
)

# The command is tested directly in the `if`: with `set -e` active, a failing
# command followed by a separate `$?` check would terminate the script before
# the check (and the "run failed" message) could ever run.
# stdout and stderr of the launch are appended to the per-run log file.
if ! python -u ./src/launch.py "${distributed_args[@]}" \
    ./src/run_retrieval.py "${train_args[@]}" \
    >> "$log_dir/${log_prefix}lanch.log" 2>&1; then
  echo "run failed"
  exit 1
fi
exit 0