run_distill.sh

#!/bin/bash

export FLAGS_sync_nccl_allreduce=0
export FLAGS_eager_delete_tensor_gb=1

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
if  [ ! "$CUDA_VISIBLE_DEVICES" ]
then
    export CPU_NUM=1
    use_cuda=false
else
    use_cuda=true
fi

# path of pre_train model
INPUT_PATH="data/input"
PRETRAIN_MODEL_PATH="data/pretrain_model/squad2_model"
# path to save checkpoint
CHECKPOINT_PATH="data/output/output_mrqa"
mkdir -p $CHECKPOINT_PATH

python -u train.py --use_cuda ${use_cuda}\
        --batch_size 8 \
        --in_tokens false \
        --init_pretraining_params ${PRETRAIN_MODEL_PATH}/params \
        --checkpoints $CHECKPOINT_PATH \
        --vocab_path ${PRETRAIN_MODEL_PATH}/vocab.txt \
        --do_distill true \
        --do_train true \
        --do_predict true \
        --save_steps 10000 \
        --warmup_proportion 0.1 \
        --weight_decay  0.01 \
        --sample_rate 0.02 \
        --epoch 2 \
        --max_seq_len 512 \
        --bert_config_path ${PRETRAIN_MODEL_PATH}/bert_config.json \
        --predict_file ${INPUT_PATH}/mrqa_distill_data/mrqa-combined.all_dev.raw.json \
        --do_lower_case false \
        --doc_stride 128 \
        --train_file ${INPUT_PATH}/mrqa_distill_data/mrqa_distill.json \
        --mlm_path ${INPUT_PATH}/mlm_data \
        --mix_ratio 2.0 \
        --learning_rate 3e-5 \
        --lr_scheduler linear_warmup_decay \
        --skip_steps 100