Commit f8658874 authored by Y Yang An, committed by Yibing Liu

Add the workspace of ACL2019-KTNET into PaddleNLP Research Version (#3244)

* add readme for KTNET

* update readme

* update readme

* update readme

* update readme of KTNET

* update readme of KTNET

* add source files for KTNET

* update files for KTNET

* update files for KTNET

* update draft of readme for KTNET

* modified scripts for KTNET

* fix typos in readme.md for KTNET

* update scripts for KTNET

* update scripts for KTNET

* update readme for KTNET

* edit two-staged training scripts for KTNET

* add details in the readme of KTNET

* fix typos in the readme of KTNET

* added eval scripts for KTNET

* rename folders for KTNET

* add copyright in the code and add links in readme for KTNET

* add the remaining download link for KTNET

* add md5sum for KTNET

* final version for KTNET
Parent d6c65111
ad550852cf26241b20e8364e40340a99 train.json
60c70c4a7e8190483f9899a1c9bc4178 dev.json
df45d93b87ca3c47b54a33e03fabf719 record_official_evaluate.py
981b29407e0affa3b1b156f72073b945 train-v1.1.json
3e85deb501d4e538b6bc56f786231552 dev-v1.1.json
afb04912d18ff20696f7f88eed49bea9 squad_v1_official_evaluate.py
64010b964ae2ebf00148b3519a4aafc8 KTNET_preprocess_squad_tagging_output.tar.gz
e9352221127b7620427c18e39bfae7fc KTNET_preprocess_tokenize_result_record.tar.gz
e52da2b1d096e889d32df267b82f9c77 KTNET_preprocess_tokenize_result_squad.tar.gz
89db2f5cfb07f0c44998d7f49098eb90 KTNET_preprocess_wordnet_concepts.tar.gz
fb62db2fe82d88480ec853f3c6fa237a NELL.08m.1115.esv.csv.gz
a68e68f9dcf4524b356163369c7f9f50 KTNET_preprocess_nell_concepts.tar.gz
d9b62183c6367ffac3ee6f864c9425a5 wn_concept2vec.txt
1f69c3d092089b0a0652616b72d61bd8 nell_concept2vec.txt
5405c050e64fee4ffec17ee50f079b64 cased_L-24_H-1024_A-16.tar.gz
4bd6e911cdad39c543ba8922a70580cd KTNET_fine-tuned-model_record_both.tar.gz
43fa464d6aeabe6dc7a15315d4ea8288 KTNET_fine-tuned-model_record_nell.tar.gz
20aaefead331f64e435a94ac8a7b58aa KTNET_fine-tuned-model_record_wordnet.tar.gz
3abdb7be3fc5e3b98633c918acc25af4 KTNET_fine-tuned-model_squad_both.tar.gz
9232cf27adda9d64265ccb315e1b9c81 KTNET_fine-tuned-model_squad_nell.tar.gz
a36fdd6d5c88e3e931bb3b28f9aeb4e2 KTNET_fine-tuned-model_squad_wordnet.tar.gz
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (NELL concepts) on the ReCoRD dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories. The original if/else
# used `rm -r dir/*`, which errors when the directory exists but is
# already empty (the glob does not match); mkdir -p + rm -rf covers
# every case quietly.
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1

python3 src/run_record.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (WordNet + NELL, two-memory variant)
# on the ReCoRD dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1

python3 src/run_record_twomemory.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (WordNet concepts) on the ReCoRD dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
CKPT_DIR=$1

python3 src/run_record.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (NELL concepts) on the SQuAD v1.1 dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1

python3 src/run_squad.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (WordNet + NELL, two-memory variant)
# on the SQuAD v1.1 dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1

python3 src/run_squad_twomemory.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (WordNet concepts) on the SQuAD v1.1
# dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
CKPT_DIR=$1

python3 src/run_squad.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Train KTNET (NELL concepts) on ReCoRD, starting from pretrained BERT
# parameters, then predict on the dev set.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate log/ and output/ as empty directories. The original if/else
# used `rm -r dir/*`, which errors when the directory exists but is
# already empty (the glob does not match); mkdir -p + rm -rf covers
# every case quietly.
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Second-stage KTNET training (NELL concepts) on ReCoRD: fine-tune all
# parameters starting from the first-stage checkpoint.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

FIRST_STAGE_CKPT=record_nell_first_stage_output/step_41970

# Fail fast if the first-stage checkpoint is missing — run the
# first-stage script before this one.
if [ ! -d "$FIRST_STAGE_CKPT" ]; then
    echo "First-stage checkpoint $FIRST_STAGE_CKPT not found" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_checkpoint "$FIRST_STAGE_CKPT" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# First-stage KTNET training (NELL concepts) on ReCoRD: BERT parameters
# are frozen (--freeze true); only the remaining layers are trained, at
# a higher learning rate and for more epochs.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate the stage-specific log/output directories as empty;
# `rm -r dir/*` in the original errored on an existing-but-empty
# directory (unmatched glob).
mkdir -p record_nell_first_stage_log record_nell_first_stage_output
rm -rf record_nell_first_stage_log/* record_nell_first_stage_output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze true \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.0 \
    --learning_rate 3e-4 \
    --epoch 10 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints record_nell_first_stage_output/ 1>"$PWD_DIR/record_nell_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Train KTNET (WordNet + NELL, two-memory variant) on ReCoRD, starting
# from pretrained BERT parameters, then predict on the dev set.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record_twomemory.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Second-stage KTNET training (WordNet + NELL, two-memory variant) on
# ReCoRD: fine-tune all parameters from the first-stage checkpoint.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

FIRST_STAGE_CKPT=record_both_first_stage_output/step_41970

# Fail fast if the first-stage checkpoint is missing — run the
# first-stage script before this one.
if [ ! -d "$FIRST_STAGE_CKPT" ]; then
    echo "First-stage checkpoint $FIRST_STAGE_CKPT not found" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record_twomemory.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_checkpoint "$FIRST_STAGE_CKPT" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# First-stage KTNET training (WordNet + NELL, two-memory variant) on
# ReCoRD: BERT parameters are frozen (--freeze true); only the remaining
# layers are trained, at a higher learning rate and for more epochs.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate the stage-specific log/output directories as empty;
# `rm -r dir/*` in the original errored on an existing-but-empty
# directory (unmatched glob).
mkdir -p record_both_first_stage_log record_both_first_stage_output
rm -rf record_both_first_stage_log/* record_both_first_stage_output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record_twomemory.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze true \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.0 \
    --learning_rate 3e-4 \
    --epoch 10 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints record_both_first_stage_output/ 1>"$PWD_DIR/record_both_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Train KTNET (WordNet concepts) on ReCoRD, starting from pretrained
# BERT parameters, then predict on the dev set.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Second-stage KTNET training (WordNet concepts) on ReCoRD: fine-tune
# all parameters starting from the first-stage checkpoint.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

FIRST_STAGE_CKPT=record_wn_first_stage_output/step_41970

# Fail fast if the first-stage checkpoint is missing — run the
# first-stage script before this one.
if [ ! -d "$FIRST_STAGE_CKPT" ]; then
    echo "First-stage checkpoint $FIRST_STAGE_CKPT not found" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_checkpoint "$FIRST_STAGE_CKPT" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# First-stage KTNET training (WordNet concepts) on ReCoRD: BERT
# parameters are frozen (--freeze true); only the remaining layers are
# trained, at a higher learning rate and for more epochs.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate the stage-specific log/output directories as empty;
# `rm -r dir/*` in the original errored on an existing-but-empty
# directory (unmatched glob).
mkdir -p record_wn_first_stage_log record_wn_first_stage_output
rm -rf record_wn_first_stage_log/* record_wn_first_stage_output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze true \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.0 \
    --learning_rate 3e-4 \
    --epoch 10 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints record_wn_first_stage_output/ 1>"$PWD_DIR/record_wn_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Train KTNET (NELL concepts) on SQuAD v1.1, starting from pretrained
# BERT parameters, then predict on the dev set.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_squad.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 3 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Second-stage KTNET training (NELL concepts) on SQuAD v1.1: fine-tune
# all parameters starting from the first-stage checkpoint.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

FIRST_STAGE_CKPT=sqd_nell_first_stage_output/step_3649

# Fail fast if the first-stage checkpoint is missing — run the
# first-stage script before this one.
if [ ! -d "$FIRST_STAGE_CKPT" ]; then
    echo "First-stage checkpoint $FIRST_STAGE_CKPT not found" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_squad.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_checkpoint "$FIRST_STAGE_CKPT" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 3 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty stage-specific log/output directories. mkdir -p is
# idempotent and rm -rf stays silent when the glob matches nothing (the
# previous `rm -r dir/*` errored out on an empty directory).
mkdir -p sqd_nell_first_stage_log sqd_nell_first_stage_output
rm -rf sqd_nell_first_stage_log/* sqd_nell_first_stage_output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
# First-stage KT-NET training with NELL concepts: one epoch with
# --freeze true (presumably freezing the BERT encoder — see run_squad.py),
# no warmup and no EMA, producing the checkpoint consumed by stage two.
python3 src/run_squad.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --use_ema false \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze true \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.0 \
  --learning_rate 3e-5 \
  --epoch 1 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --concept_embedding_path "$CPT_EMBEDDING_PATH" \
  --use_nell true \
  --random_seed 45 \
  --checkpoints sqd_nell_first_stage_output/ 1>"$PWD_DIR/sqd_nell_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty log/ and output/ directories. mkdir -p is idempotent and
# rm -rf stays silent when the glob matches nothing (the previous
# `rm -r dir/*` errored out with a non-zero status on an empty directory).
mkdir -p log output
rm -rf log/* output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
# Single-stage KT-NET training on SQuAD v1.1 using BOTH knowledge bases
# (WordNet + NELL) via the two-memory model.
python3 src/run_squad_twomemory.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze false \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.1 \
  --learning_rate 3e-5 \
  --epoch 3 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
  --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --use_nell true \
  --random_seed 45 \
  --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty log/ and output/ directories. mkdir -p is idempotent and
# rm -rf stays silent when the glob matches nothing (the previous
# `rm -r dir/*` errored out with a non-zero status on an empty directory).
mkdir -p log output
rm -rf log/* output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
# Second-stage two-memory (WordNet + NELL) training: resumes from the
# first-stage checkpoint (step_3649) and fine-tunes the whole model.
python3 src/run_squad_twomemory.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --do_lower_case false \
  --init_checkpoint sqd_both_first_stage_output/step_3649 \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze false \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.1 \
  --learning_rate 3e-5 \
  --epoch 3 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
  --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --use_nell true \
  --random_seed 45 \
  --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty stage-specific log/output directories. mkdir -p is
# idempotent and rm -rf stays silent when the glob matches nothing (the
# previous `rm -r dir/*` errored out on an empty directory).
mkdir -p sqd_both_first_stage_log sqd_both_first_stage_output
rm -rf sqd_both_first_stage_log/* sqd_both_first_stage_output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
# First-stage two-memory (WordNet + NELL) training: one epoch with
# --freeze true (presumably freezing the BERT encoder — see
# run_squad_twomemory.py), no warmup and no EMA, producing the checkpoint
# consumed by stage two.
python3 src/run_squad_twomemory.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --use_ema false \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze true \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.0 \
  --learning_rate 3e-5 \
  --epoch 1 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
  --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --use_nell true \
  --random_seed 45 \
  --checkpoints sqd_both_first_stage_output/ 1>"$PWD_DIR/sqd_both_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty log/ and output/ directories. mkdir -p is idempotent and
# rm -rf stays silent when the glob matches nothing (the previous
# `rm -r dir/*` errored out with a non-zero status on an empty directory).
mkdir -p log output
rm -rf log/* output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
# Single-stage KT-NET training on SQuAD v1.1 with WordNet concept
# embeddings, starting from the pre-trained cased BERT-large parameters.
python3 src/run_squad.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze false \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.1 \
  --learning_rate 3e-5 \
  --epoch 3 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --concept_embedding_path "$CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --random_seed 45 \
  --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty log/ and output/ directories. mkdir -p is idempotent and
# rm -rf stays silent when the glob matches nothing (the previous
# `rm -r dir/*` errored out with a non-zero status on an empty directory).
mkdir -p log output
rm -rf log/* output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
# Second-stage KT-NET training with WordNet concepts: resumes from the
# first-stage checkpoint (step_3649) and fine-tunes the whole model.
python3 src/run_squad.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --do_lower_case false \
  --init_checkpoint sqd_wn_first_stage_output/step_3649 \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze false \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.1 \
  --learning_rate 3e-5 \
  --epoch 3 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --concept_embedding_path "$CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --random_seed 45 \
  --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty stage-specific log/output directories. mkdir -p is
# idempotent and rm -rf stays silent when the glob matches nothing (the
# previous `rm -r dir/*` errored out on an empty directory).
mkdir -p sqd_wn_first_stage_log sqd_wn_first_stage_output
rm -rf sqd_wn_first_stage_log/* sqd_wn_first_stage_output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
# First-stage KT-NET training with WordNet concepts: one epoch with
# --freeze true (presumably freezing the BERT encoder — see run_squad.py),
# no warmup and no EMA, producing the checkpoint consumed by stage two.
python3 src/run_squad.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --use_ema false \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze true \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.0 \
  --learning_rate 3e-5 \
  --epoch 1 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --concept_embedding_path "$CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --random_seed 45 \
  --checkpoints sqd_wn_first_stage_output/ 1>"$PWD_DIR/sqd_wn_first_stage_log/train.log" 2>&1
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
    """
    Apply masked-LM corruption to a batch of token-id sentences, IN PLACE.

    For every token, one pre-drawn uniform probability decides its fate:
      * prob > 0.15                  -> untouched, not predicted
      * 0.03  < prob <= 0.15  (12%)  -> replaced by MASK, predicted
      * 0.015 < prob <= 0.03  (1.5%) -> replaced by a random id, predicted
      * prob <= 0.015         (1.5%) -> kept unchanged but still predicted
    [CLS] and [SEP] tokens are never corrupted. If a sentence ends up with
    no corrupted token at all, one random non-special position is
    force-masked.

    Args:
        batch_tokens: list of token-id lists; MUTATED in place.
        total_token_num: total token count across the batch; sizes the
            pre-drawn probability and replacement-id arrays.
        vocab_size: exclusive upper bound for random replacement ids.
        CLS, SEP, MASK: ids of the special tokens.

    Returns:
        (batch_tokens, mask_label, mask_pos): the mutated batch, the
        original ids of all predicted tokens as int64 [-1, 1], and their
        flattened positions sent_index * max_len + token_index, i.e.
        positions valid AFTER the batch is padded to its max length.
    """
    max_len = max([len(sent) for sent in batch_tokens])
    mask_label = []
    mask_pos = []
    # One uniform draw per token of the whole batch, drawn up front.
    prob_mask = np.random.rand(total_token_num)
    # Note: the first token is [CLS], so [low=1]
    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
    pre_sent_len = 0
    prob_index = 0
    for sent_index, sent in enumerate(batch_tokens):
        mask_flag = False
        # Advance to this sentence's slice of the pre-drawn arrays.
        prob_index += pre_sent_len
        for token_index, token in enumerate(sent):
            prob = prob_mask[prob_index + token_index]
            if prob > 0.15:
                continue
            elif 0.03 < prob <= 0.15:
                # mask
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = MASK
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            elif 0.015 < prob <= 0.03:
                # random replace
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = replace_ids[prob_index + token_index]
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            else:
                # keep the original token (still predicted; mask_flag stays
                # False because nothing was corrupted)
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    mask_pos.append(sent_index * max_len + token_index)
        pre_sent_len = len(sent)
        # ensure at least mask one word in a sentence
        # NOTE(review): this loop assumes len(sent) > 2 and that some index
        # in [1, len(sent)-2] holds a non-special token; otherwise it never
        # terminates — confirm upstream inputs always satisfy this.
        while not mask_flag:
            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
            if sent[token_index] != SEP and sent[token_index] != CLS:
                mask_label.append(sent[token_index])
                sent[token_index] = MASK
                mask_flag = True
                mask_pos.append(sent_index * max_len + token_index)
    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
    return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
                       total_token_num,
                       voc_size=0,
                       pad_id=None,
                       cls_id=None,
                       sep_id=None,
                       mask_id=None,
                       return_input_mask=True,
                       return_max_len=True,
                       return_num_token=False,
                       max_concept_length=50):
    """Convert a list of instances into padded, batch-shaped numpy tensors.

    Each instance is (src_ids, sent_ids, pos_ids, concept_ids, *labels);
    the trailing label fields (e.g. SQuAD start/end positions or unique
    ids) are packed into int64 [-1, 1] arrays. When mask_id >= 0, MLM
    corruption is applied before padding and the mask label/position
    tensors are appended to the output list.
    """
    src_ids = [item[0] for item in insts]
    sent_ids = [item[1] for item in insts]
    pos_ids = [item[2] for item in insts]
    concept_id_lists = [item[3] for item in insts]
    # Fields from index 4 onward are per-example labels / ids; keep them
    # compatible with SQuAD-style instances of varying arity.
    labels_list = [
        np.array([item[idx] for item in insts]).astype("int64").reshape([-1, 1])
        for idx in range(4, len(insts[0]))
    ]
    # Step 1: masked-LM corruption (before padding) when a mask id is given.
    if mask_id >= 0:
        out, mask_label, mask_pos = mask(
            src_ids,
            total_token_num,
            vocab_size=voc_size,
            CLS=cls_id,
            SEP=sep_id,
            MASK=mask_id)
    else:
        out = src_ids
    # Step 2: pad everything to the fixed sequence length.
    src_id, self_input_mask = pad_batch_data(
        out, pad_idx=pad_id, return_input_mask=True)
    pos_id = pad_batch_data(
        pos_ids, pad_idx=pad_id, return_pos=False, return_input_mask=False)
    sent_id = pad_batch_data(
        sent_ids, pad_idx=pad_id, return_pos=False, return_input_mask=False)
    # Concept ids are lists-of-lists; padded with all-zero concept rows.
    concept_ids = pad_batch_data(
        concept_id_lists, pad_idx=[], max_concept_length=max_concept_length)
    base = [src_id, pos_id, sent_id, concept_ids, self_input_mask]
    if mask_id >= 0:
        return_list = base + [mask_label, mask_pos] + labels_list
    else:
        return_list = base + labels_list
    return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False,
                   max_concept_length=50):
    """Pad each instance to the fixed sequence length and build side tensors.

    Returns the padded id tensor, plus (in order, when requested) position
    ids, a float attention input mask, the max length, and the total
    unpadded token count. A single-element result is unwrapped.
    """
    outputs = []
    # Sequence length is pinned to 384 (matching --max_seq_len in the run
    # scripts) rather than the per-batch maximum.
    seq_len = 384
    # Any pad token works: padding loss is masked out downstream, so it has
    # no effect on gradients.
    if type(pad_idx) == list:  # list sentinel => concept ids, pad with zero rows
        padded = np.array([
            inst + [[0] * max_concept_length
                    for _ in range(seq_len - len(inst))] for inst in insts
        ])
        outputs.append(
            padded.astype("int64").reshape(
                [-1, seq_len, max_concept_length, 1]))
    else:
        padded = np.array([
            list(inst) + [pad_idx] * (seq_len - len(inst)) for inst in insts
        ])
        outputs.append(padded.astype("int64").reshape([-1, seq_len, 1]))
    if return_pos:
        # Position ids 0..len-1, padded with pad_idx.
        positions = np.array([
            list(range(len(inst))) + [pad_idx] * (seq_len - len(inst))
            for inst in insts
        ])
        outputs.append(positions.astype("int64").reshape([-1, seq_len, 1]))
    if return_input_mask:
        # 1.0 over real tokens, 0.0 over padding — blocks attention on pads.
        attn = np.array([
            [1] * len(inst) + [0] * (seq_len - len(inst)) for inst in insts
        ])
        outputs.append(np.expand_dims(attn, axis=-1).astype("float32"))
    if return_max_len:
        outputs.append(seq_len)
    if return_num_token:
        outputs.append(sum(len(inst) for inst in insts))
    return outputs if len(outputs) > 1 else outputs[0]
if __name__ == "__main__":
    # Library module: nothing to run when executed directly.
    pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
    """
    Apply masked-LM corruption to a batch of token-id sentences, IN PLACE.

    For every token, one pre-drawn uniform probability decides its fate:
      * prob > 0.15                  -> untouched, not predicted
      * 0.03  < prob <= 0.15  (12%)  -> replaced by MASK, predicted
      * 0.015 < prob <= 0.03  (1.5%) -> replaced by a random id, predicted
      * prob <= 0.015         (1.5%) -> kept unchanged but still predicted
    [CLS] and [SEP] tokens are never corrupted. If a sentence ends up with
    no corrupted token at all, one random non-special position is
    force-masked.

    Args:
        batch_tokens: list of token-id lists; MUTATED in place.
        total_token_num: total token count across the batch; sizes the
            pre-drawn probability and replacement-id arrays.
        vocab_size: exclusive upper bound for random replacement ids.
        CLS, SEP, MASK: ids of the special tokens.

    Returns:
        (batch_tokens, mask_label, mask_pos): the mutated batch, the
        original ids of all predicted tokens as int64 [-1, 1], and their
        flattened positions sent_index * max_len + token_index, i.e.
        positions valid AFTER the batch is padded to its max length.
    """
    max_len = max([len(sent) for sent in batch_tokens])
    mask_label = []
    mask_pos = []
    # One uniform draw per token of the whole batch, drawn up front.
    prob_mask = np.random.rand(total_token_num)
    # Note: the first token is [CLS], so [low=1]
    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
    pre_sent_len = 0
    prob_index = 0
    for sent_index, sent in enumerate(batch_tokens):
        mask_flag = False
        # Advance to this sentence's slice of the pre-drawn arrays.
        prob_index += pre_sent_len
        for token_index, token in enumerate(sent):
            prob = prob_mask[prob_index + token_index]
            if prob > 0.15:
                continue
            elif 0.03 < prob <= 0.15:
                # mask
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = MASK
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            elif 0.015 < prob <= 0.03:
                # random replace
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = replace_ids[prob_index + token_index]
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            else:
                # keep the original token (still predicted; mask_flag stays
                # False because nothing was corrupted)
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    mask_pos.append(sent_index * max_len + token_index)
        pre_sent_len = len(sent)
        # ensure at least mask one word in a sentence
        # NOTE(review): this loop assumes len(sent) > 2 and that some index
        # in [1, len(sent)-2] holds a non-special token; otherwise it never
        # terminates — confirm upstream inputs always satisfy this.
        while not mask_flag:
            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
            if sent[token_index] != SEP and sent[token_index] != CLS:
                mask_label.append(sent[token_index])
                sent[token_index] = MASK
                mask_flag = True
                mask_pos.append(sent_index * max_len + token_index)
    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
    return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
                       total_token_num,
                       voc_size=0,
                       pad_id=None,
                       cls_id=None,
                       sep_id=None,
                       mask_id=None,
                       return_input_mask=True,
                       return_max_len=True,
                       return_num_token=False,
                       max_wn_concept_length=50,
                       max_nell_concept_length=50):
    """Convert a list of instances into padded, batch-shaped numpy tensors.

    Two-memory variant: each instance is (src_ids, sent_ids, pos_ids,
    wn_concept_ids, nell_concept_ids, *labels). Trailing label fields
    (e.g. SQuAD start/end positions or unique ids) are packed into int64
    [-1, 1] arrays. When mask_id >= 0, MLM corruption is applied before
    padding and the mask label/position tensors are appended.
    """
    src_ids = [item[0] for item in insts]
    sent_ids = [item[1] for item in insts]
    pos_ids = [item[2] for item in insts]
    wn_concept_lists = [item[3] for item in insts]
    nell_concept_lists = [item[4] for item in insts]
    # Fields from index 5 onward are per-example labels / ids; keep them
    # compatible with SQuAD-style instances of varying arity.
    labels_list = [
        np.array([item[idx] for item in insts]).astype("int64").reshape([-1, 1])
        for idx in range(5, len(insts[0]))
    ]
    # Step 1: masked-LM corruption (before padding) when a mask id is given.
    if mask_id >= 0:
        out, mask_label, mask_pos = mask(
            src_ids,
            total_token_num,
            vocab_size=voc_size,
            CLS=cls_id,
            SEP=sep_id,
            MASK=mask_id)
    else:
        out = src_ids
    # Step 2: pad everything to the fixed sequence length.
    src_id, self_input_mask = pad_batch_data(
        out, pad_idx=pad_id, return_input_mask=True)
    pos_id = pad_batch_data(
        pos_ids, pad_idx=pad_id, return_pos=False, return_input_mask=False)
    sent_id = pad_batch_data(
        sent_ids, pad_idx=pad_id, return_pos=False, return_input_mask=False)
    # Concept ids are lists-of-lists; padded with all-zero concept rows.
    wn_concept_ids = pad_batch_data(
        wn_concept_lists, pad_idx=[],
        max_concept_length=max_wn_concept_length)
    nell_concept_ids = pad_batch_data(
        nell_concept_lists, pad_idx=[],
        max_concept_length=max_nell_concept_length)
    base = [src_id, pos_id, sent_id, wn_concept_ids, nell_concept_ids,
            self_input_mask]
    if mask_id >= 0:
        return_list = base + [mask_label, mask_pos] + labels_list
    else:
        return_list = base + labels_list
    return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False,
                   max_concept_length=50):
    """Pad each instance to the fixed sequence length and build side tensors.

    Returns the padded id tensor, plus (in order, when requested) position
    ids, a float attention input mask, the max length, and the total
    unpadded token count. A single-element result is unwrapped.
    """
    outputs = []
    # Sequence length is pinned to 384 (matching --max_seq_len in the run
    # scripts) rather than the per-batch maximum.
    seq_len = 384
    # Any pad token works: padding loss is masked out downstream, so it has
    # no effect on gradients.
    if type(pad_idx) == list:  # list sentinel => concept ids, pad with zero rows
        padded = np.array([
            inst + [[0] * max_concept_length
                    for _ in range(seq_len - len(inst))] for inst in insts
        ])
        outputs.append(
            padded.astype("int64").reshape(
                [-1, seq_len, max_concept_length, 1]))
    else:
        padded = np.array([
            list(inst) + [pad_idx] * (seq_len - len(inst)) for inst in insts
        ])
        outputs.append(padded.astype("int64").reshape([-1, seq_len, 1]))
    if return_pos:
        # Position ids 0..len-1, padded with pad_idx.
        positions = np.array([
            list(range(len(inst))) + [pad_idx] * (seq_len - len(inst))
            for inst in insts
        ])
        outputs.append(positions.astype("int64").reshape([-1, seq_len, 1]))
    if return_input_mask:
        # 1.0 over real tokens, 0.0 over padding — blocks attention on pads.
        attn = np.array([
            [1] * len(inst) + [0] * (seq_len - len(inst)) for inst in insts
        ])
        outputs.append(np.expand_dims(attn, axis=-1).astype("float32"))
    if return_max_len:
        outputs.append(seq_len)
    if return_num_token:
        outputs.append(sum(len(inst) for inst in insts))
    return outputs if len(outputs) > 1 else outputs[0]
if __name__ == "__main__":
    # Library module: nothing to run when executed directly.
    pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import logging
import numpy as np
import paddle.fluid as fluid
from model.transformer_encoder import encoder, pre_process_layer
# Configure root logging once at import time; every module logger inherits
# INFO with a timestamped "<time> - <level> - <name> - <message>" format.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class BertConfig(object):
    """Thin wrapper around a JSON BERT config file with dict-style access."""

    def __init__(self, config_path):
        # Parse eagerly so a broken config fails at construction time.
        self._config_dict = self._parse(config_path)

    def _parse(self, config_path):
        """Load the JSON file at config_path; raise IOError on any failure."""
        try:
            with open(config_path) as json_file:
                parsed = json.load(json_file)
        except Exception:
            raise IOError("Error in parsing bert model config file '%s'" %
                          config_path)
        return parsed

    def __getitem__(self, key):
        return self._config_dict[key]

    def print_config(self):
        """Log every configuration entry in sorted key order."""
        for arg, value in sorted(six.iteritems(self._config_dict)):
            logger.info('%s: %s' % (arg, value))
        logger.info('------------------------------------------------')
class BertModel(object):
    def __init__(self,
                 src_ids,
                 position_ids,
                 sentence_ids,
                 input_mask,
                 config,
                 weight_sharing=True,
                 use_fp16=False):
        """Read hyper-parameters from `config` and build the forward graph.

        Args:
            src_ids: token-id input tensor.
            position_ids: position-id input tensor.
            sentence_ids: sentence/segment-id input tensor.
            input_mask: float mask tensor, nonzero over real tokens.
            config: BertConfig-style object supporting config[key] access.
            weight_sharing: share the word-embedding matrix with the
                masked-LM output projection (see get_pretraining_output).
            use_fp16: compute in float16 instead of float32.
        """
        self._emb_size = config['hidden_size']
        self._n_layer = config['num_hidden_layers']
        self._n_head = config['num_attention_heads']
        self._voc_size = config['vocab_size']
        self._max_position_seq_len = config['max_position_embeddings']
        self._sent_types = config['type_vocab_size']
        self._hidden_act = config['hidden_act']
        self._prepostprocess_dropout = config['hidden_dropout_prob']
        self._attention_dropout = config['attention_probs_dropout_prob']
        self._weight_sharing = weight_sharing
        # Parameter names used to look the embeddings up again later
        # (e.g. for weight sharing in the masked-LM head).
        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._dtype = "float16" if use_fp16 else "float32"
        # Initialize all weights by truncated normal initializer, and all
        # biases will be initialized by constant zero by default.
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config['initializer_range'])
        self._build_model(src_ids, position_ids, sentence_ids, input_mask)
    def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
        """Assemble the BERT forward graph: summed embeddings -> dropout/norm
        -> transformer encoder. Stores the encoder output in self._enc_out.
        """
        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name, initializer=self._param_initializer),
            is_sparse=False)
        position_emb_out = fluid.layers.embedding(
            input=position_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name, initializer=self._param_initializer))
        sent_emb_out = fluid.layers.embedding(
            sentence_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name, initializer=self._param_initializer))
        # Token + position + segment embeddings, then dropout/layer-norm
        # ('nd' = norm + dropout processing in pre_process_layer).
        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out
        emb_out = pre_process_layer(
            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
        if self._dtype == "float16":
            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
        # self_attn_mask = fluid.layers.matmul(
        #     x=input_mask, y=input_mask, transpose_y=True)
        # Broadcast the per-token mask to a [batch, 384, seq] attention mask.
        # NOTE(review): the 384 here is hard-coded to match --max_seq_len in
        # the run scripts — confirm if other sequence lengths are ever used.
        self_attn_mask = fluid.layers.expand(fluid.layers.transpose(input_mask, [0, 2, 1]), [1, 384, 1])
        # bias_after_scale=False computes (mask - 1) * 10000: 0 for real
        # tokens, -10000 for padding, i.e. an additive attention bias.
        self_attn_mask = fluid.layers.scale(
            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
        # Replicate the bias once per attention head.
        n_head_self_attn_mask = fluid.layers.stack(
            x=[self_attn_mask] * self._n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True
        self._enc_out = encoder(
            enc_input=emb_out,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd="",
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name='encoder')
    def get_sequence_output(self):
        """Return the final encoder output (one feature vector per token)."""
        return self._enc_out
    def get_pooled_output(self):
        """Get the first feature of each sequence for classification.

        Slices the [CLS] (position 0) feature out of the encoder output and
        projects it through a tanh-activated fully-connected layer.
        """
        next_sent_feat = fluid.layers.slice(
            input=self._enc_out, axes=[1], starts=[0], ends=[1])
        next_sent_feat = fluid.layers.fc(
            input=next_sent_feat,
            size=self._emb_size,
            act="tanh",
            param_attr=fluid.ParamAttr(
                name="pooled_fc.w_0", initializer=self._param_initializer),
            bias_attr="pooled_fc.b_0")
        return next_sent_feat
def get_pretraining_output(self, mask_label, mask_pos, labels):
    """Get the loss & accuracy for pretraining.

    Builds the two BERT pretraining heads on top of the encoder output:
    masked-LM (cross-entropy over masked token positions) and
    next-sentence prediction (binary classification on the pooled [CLS]
    feature).

    :param mask_label: gold token ids at the masked positions.
    :param mask_pos: flat indices of masked tokens into the
        [batch*seq, emb] reshaped encoder output — assumes positions are
        pre-flattened by the caller (TODO confirm against data reader).
    :param labels: next-sentence labels.
    :return: (next_sent_acc, mean_mask_lm_loss, total_loss).
    """
    mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
    # extract the first token feature in each sentence
    next_sent_feat = self.get_pooled_output()
    # Flatten [batch, seq, emb] -> [batch*seq, emb] so gather() can pick
    # masked tokens with flat indices.
    reshaped_emb_out = fluid.layers.reshape(
        x=self._enc_out, shape=[-1, self._emb_size])
    # extract masked tokens' feature
    mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
    # transform: fc
    mask_trans_feat = fluid.layers.fc(
        input=mask_feat,
        size=self._emb_size,
        act=self._hidden_act,
        param_attr=fluid.ParamAttr(
            name='mask_lm_trans_fc.w_0',
            initializer=self._param_initializer),
        bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
    # transform: layer norm
    mask_trans_feat = pre_process_layer(
        mask_trans_feat, 'n', name='mask_lm_trans')
    mask_lm_out_bias_attr = fluid.ParamAttr(
        name="mask_lm_out_fc.b_0",
        initializer=fluid.initializer.Constant(value=0.0))
    if self._weight_sharing:
        # Tie the output projection to the input word-embedding matrix
        # (transposed), adding only a fresh output bias.
        fc_out = fluid.layers.matmul(
            x=mask_trans_feat,
            y=fluid.default_main_program().global_block().var(
                self._word_emb_name),
            transpose_y=True)
        fc_out += fluid.layers.create_parameter(
            shape=[self._voc_size],
            dtype=self._dtype,
            attr=mask_lm_out_bias_attr,
            is_bias=True)
    else:
        # Independent output projection over the vocabulary.
        fc_out = fluid.layers.fc(input=mask_trans_feat,
                                 size=self._voc_size,
                                 param_attr=fluid.ParamAttr(
                                     name="mask_lm_out_fc.w_0",
                                     initializer=self._param_initializer),
                                 bias_attr=mask_lm_out_bias_attr)
    mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
        logits=fc_out, label=mask_label)
    mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
    # Next-sentence prediction head: 2-way classifier on pooled feature.
    next_sent_fc_out = fluid.layers.fc(
        input=next_sent_feat,
        size=2,
        param_attr=fluid.ParamAttr(
            name="next_sent_fc.w_0", initializer=self._param_initializer),
        bias_attr="next_sent_fc.b_0")
    next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
        logits=next_sent_fc_out, label=labels, return_softmax=True)
    next_sent_acc = fluid.layers.accuracy(
        input=next_sent_softmax, label=labels)
    mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
    # Total pretraining objective = NSP loss + masked-LM loss.
    loss = mean_next_sent_loss + mean_mask_lm_loss
    return next_sent_acc, mean_mask_lm_loss, loss
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""bert model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import sys
import six
import logging
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.layers import shape
from model.transformer_encoder import encoder, pre_process_layer
# Configure root logging once at import time: timestamped format shared by
# every logger obtained via getLogger(__name__) in this module.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def dynamic_expand(dynamic_tensor, smaller_tensor):
    """Broadcast `smaller_tensor` to the (possibly dynamic) shape of
    `dynamic_tensor`.

    Implemented by adding `smaller_tensor` to an all-zeros tensor with
    `dynamic_tensor`'s shape, so Paddle's elementwise broadcasting does
    the expansion even when some dims are only known at run time.

    :param dynamic_tensor: tensor whose shape is the broadcast target.
    :param smaller_tensor: tensor with strictly fewer dims; its trailing
        dims must align with `dynamic_tensor`'s trailing dims.
    :return: `smaller_tensor` expanded to `dynamic_tensor`'s shape.
    """
    assert len(dynamic_tensor.shape) > len(smaller_tensor.shape)
    # Sanity-check dim compatibility of the trailing axes.
    # BUG FIX: the original code iterated `smaller_tensor.shape` directly
    # (unpacking bare ints -> TypeError) and called len() on the tensors
    # instead of on their shapes; use enumerate() over the shape list and
    # compare shape lengths instead.
    if isinstance(smaller_tensor.shape, list):
        for dim_idx, dim in enumerate(smaller_tensor.shape):
            dynamic_tensor_dim_idx = (
                len(dynamic_tensor.shape) - len(smaller_tensor.shape) + dim_idx)
            assert dynamic_tensor.shape[dynamic_tensor_dim_idx] % dim == 0
    elif isinstance(smaller_tensor.shape, int):
        assert dynamic_tensor.shape[-1] % smaller_tensor.shape == 0
    memory_embs_zero = fluid.layers.scale(dynamic_tensor, scale=0.0)
    smaller_tensor = fluid.layers.elementwise_add(memory_embs_zero, smaller_tensor)
    return smaller_tensor
def print_tensor(tensor, message, print_runtime=False):
    """Log a tensor's static (graph-time) shape under a label.

    :param tensor: Paddle variable to inspect.
    :param message: label prefixed to the log line.
    :param print_runtime: if True, also insert a Print op into the graph
        so the first few values are dumped when the program executes.
    """
    logger.info("{}: {}".format(message, tensor.shape))
    if print_runtime:
        fluid.layers.Print(tensor, summarize=10, message=message)
class MemoryLayer(object):
    """Knowledge-memory attention layer (KTNET).

    For every token, attends over that token's candidate knowledge-base
    concept embeddings (plus a learned "sentinel" slot that lets the
    model attend to nothing), producing a knowledge summary vector which
    is combined with the BERT output according to `mem_method`.
    """

    def __init__(self, bert_config, concept_size, mem_emb_size, mem_method='cat', prefix=None):
        # bert_config is dict-like (loaded BERT config JSON — TODO confirm).
        self.initializer_range = bert_config['initializer_range']
        self.bert_size = bert_config['hidden_size']
        # Max number of candidate concepts per token.
        self.concept_size = concept_size
        # Dimensionality of each concept embedding.
        self.mem_emb_size = mem_emb_size
        assert mem_method in ['add', 'cat', 'raw']
        # How the knowledge summary is merged with the BERT output.
        self.mem_method = mem_method
        # Optional name prefix so two MemoryLayers can coexist in one graph.
        self.prefix = prefix

    def forward(self, bert_output, memory_embs, mem_length, ignore_no_memory_token=True):
        """
        :param bert_output: [batch_size, seq_size, bert_size]
        :param memory_embs: [batch_size, seq_size, concept_size, mem_emb_size]
        :param mem_length: [batch_size, sent_size, 1] — number of valid
            concepts per token; positions past this count are masked out.
        :param ignore_no_memory_token: if True, zero the summary for
            tokens that have no concepts at all (mem_length == 0).
        :return: merged output (shape depends on mem_method).
        """
        bert_size = self.bert_size
        concept_size = self.concept_size
        mem_emb_size = self.mem_emb_size
        print_tensor(bert_output, "bert_output")
        print_tensor(memory_embs, "memory_embs")
        print_tensor(mem_length, "mem_length")
        # Project BERT hidden states into the concept-embedding space so
        # dot-product attention scores are dimensionally consistent.
        projected_bert = fluid.layers.fc(bert_output, size=mem_emb_size, num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                name='{}_memory_layer_projection.w_0'.format(self.prefix) if self.prefix else 'memory_layer_projection.w_0',
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=self.initializer_range)),
            bias_attr=False)  # [batch_size *seq_size, mem_emb_size]
        logger.info("projected_bert: {}".format(projected_bert.shape))
        expanded_bert = fluid.layers.unsqueeze(projected_bert, axes=[2])  # [batch_size, seq_size, 1, mem_emb_size]
        # Prepend the learned sentinel slot and score all memory slots.
        extended_memory, memory_score = self.add_sentinel(expanded_bert, memory_embs, mem_emb_size)
        # extended_memory: [batch_size, seq_size, 1+concept_size, mem_emb_size]
        # memory_score: [batch_size, seq_size, 1+concept_size]
        concept_ordinal = self.get_concept_oridinal(concept_size, memory_score)  # [bs,sq,1+cs]
        # True where the slot index exceeds the valid concept count, i.e.
        # slots to be masked out of the softmax.
        memory_reverse_mask = fluid.layers.less_than(
            fluid.layers.expand(mem_length, expand_times=[1, 1, 1 + concept_size]),
            concept_ordinal)
        # [batch_size, seq_size, 1+concept_size]
        memory_reverse_mask = fluid.layers.cast(memory_reverse_mask, dtype="float32")
        print_tensor(memory_reverse_mask, "memory_reverse_mask")
        # -1e6 at masked slots -> effectively zero attention after softmax.
        memory_reverse_masked_infinity = fluid.layers.scale(memory_reverse_mask, scale=-1e6)
        # [batch_size, seq_size, 1+concept_size]
        print_tensor(memory_reverse_masked_infinity, "memory_reverse_masked_infinity")
        memory_score = fluid.layers.elementwise_add(memory_score, memory_reverse_masked_infinity)
        # [batch_size, seq_size, 1+concept_size]
        logger.info("memory_score:{}".format(memory_score.shape))
        memory_att = fluid.layers.softmax(memory_score)  # [batch_size, seq_size, 1+concept_size]
        memory_att = fluid.layers.unsqueeze(memory_att, axes=[2])  # [batch_size, seq_size, 1, 1+concept_size]
        logger.info("memory_att: {}".format(memory_att.shape))
        logger.info("extended_memory: {}".format(extended_memory.shape))
        # Attention-weighted sum over memory slots.
        summ = fluid.layers.matmul(memory_att, extended_memory)  # [batch_size, seq_size,1, mem_emb_size]
        summ = fluid.layers.squeeze(summ, axes=[2])  # [batch_size, seq_size,mem_emb_size]
        if ignore_no_memory_token:
            # condition is True only where mem_length > 0 (0 < mem_length).
            condition = fluid.layers.less_than(
                dynamic_expand(mem_length, fluid.layers.zeros([1], "float32")),
                mem_length)  # [bs, sq]
            # Zero the summary for tokens with no attached concepts.
            summ = fluid.layers.elementwise_mul(
                summ,
                fluid.layers.cast(condition, "float32"))  # [bs, sq, ms]
            print_tensor(summ, "summ")
        if self.mem_method == "add":
            # Project summary back to BERT size and add residually.
            summ_transform = fluid.layers.fc(summ, size=bert_size, num_flatten_dims=2)  # [batch_size, seq_size, bert_size]
            output = fluid.layers.sums(input=[summ_transform, bert_output])  # [batch_size, seq_size, bert_size]
        elif self.mem_method == "cat":
            logger.info("bert_output: {}".format(bert_output.shape))
            logger.info("summ: {}".format(summ.shape))
            output = fluid.layers.concat(input=[bert_output, summ], axis=2)  # [batch_size, seq_size, bert_size + mem_emb_size]
        elif self.mem_method == "raw":
            logger.info("bert_output: {}".format(bert_output.shape))
            logger.info("summ: {}".format(summ.shape))
            output = summ  # [batch_size, seq_size, mem_emb_size]
        else:
            raise ValueError("mem_method not supported")
        logger.info("output: {}".format(output.shape))
        return output

    def get_concept_oridinal(self, concept_size, memory_score):
        """Build a per-slot index tensor [0, 1, ..., concept_size] broadcast
        to memory_score's shape; used to mask slots past each token's
        valid concept count.  (Name keeps the original "oridinal" typo —
        it is part of the class's public surface.)

        :param concept_size:
        :param memory_score: [batch_size, seq_size, 1+concept_size]
        :return: [batch_size, seq_size, 1+concept_size]
        """
        concept_ordinal = fluid.layers.create_tensor(dtype="float32")
        fluid.layers.assign(np.arange(start=0, stop=(1 + concept_size), step=1, dtype=np.float32),
                            concept_ordinal)  # [1+cs]
        print_tensor(concept_ordinal, "concept_ordinal")
        print_tensor(memory_score, "memory_score")
        concept_ordinal = dynamic_expand(memory_score, concept_ordinal)  # [bs,sq,1+cs]
        logger.info("concept_ordinal: {}".format(concept_ordinal.shape))
        return concept_ordinal

    def add_sentinel(self, expanded_bert, memory_embs, mem_emb_size):
        """Prepend a learned sentinel vector to the memory and score every
        slot (sentinel + concepts) against the projected BERT feature.

        :param expanded_bert: [batch_size, seq_size, 1, mem_emb_size]
        :param memory_embs: [batch_size, seq_size, concept_size, mem_emb_size]
        :param mem_emb_size:
        :return: (extended_memory [bs, sq, 1+cs, ms],
                  memory_score [bs, sq, 1+cs])
        """
        sentinel = fluid.layers.create_parameter(
            name='{}_memory_layer_sentinel'.format(self.prefix) if self.prefix else 'memory_layer_sentinel',
            dtype="float32",
            shape=[mem_emb_size],
            default_initializer=fluid.initializer.ConstantInitializer(0))  # [mem_emb_size]
        print_tensor(sentinel, "sentinel")
        # Slice one concept slot only to obtain a correctly-shaped target
        # for broadcasting the sentinel.
        memory_embs_squeeze = fluid.layers.slice(memory_embs, axes=[2], starts=[0],
                                                 ends=[1])  # [bs,sq,1,ms]
        print_tensor(memory_embs_squeeze, "memory_embs_squeeze")
        sentinel = dynamic_expand(memory_embs_squeeze, sentinel)  # [bs,sq,1,ms]
        print_tensor(sentinel, "sentinel")
        print_tensor(memory_embs, "memory_embs")
        extended_memory = fluid.layers.concat([sentinel, memory_embs],
                                              axis=2)  # [batch_size, seq_size, 1+concept_size, mem_emb_size]
        extended_memory = fluid.layers.transpose(extended_memory, perm=[0, 1, 3, 2])
        # [batch_size, seq_size, mem_emb_size, 1+concept_size]
        logger.info("extended_memory: {}".format(extended_memory.shape))
        # Dot-product score of the token feature against every memory slot.
        memory_score = fluid.layers.matmul(expanded_bert,
                                           extended_memory)  # [batch_size, seq_size, 1, 1+concept_size]
        memory_score = fluid.layers.squeeze(memory_score, axes=[2])
        # [batch_size, seq_size, 1+concept_size]
        extended_memory = fluid.layers.transpose(extended_memory, perm=[0, 1, 3, 2])
        # [batch_size, seq_size, 1+concept_size, mem_emb_size]
        return extended_memory, memory_score
class TriLinearTwoTimeSelfAttentionLayer(object):
    """Tri-linear self-matching attention (BiDAF-style) with optional
    "two-time" (attention-of-attention) features.

    Similarity between positions i and k is
    w1·h_i + w2·h_k + w_mul·(h_i * h_k) + bias, computed with broadcast
    matmuls.  The output concatenates the input with the attention
    summary and any of the optional interaction features enabled by the
    cat_* flags.
    """

    def __init__(self, hidden_size, dropout_rate=0.0,
                 cat_mul=False, cat_sub=False, cat_twotime=False, cat_twotime_mul=False, cat_twotime_sub=False):
        # NOTE(review): dropout_rate is stored but never applied in
        # forward() — confirm whether dropout was intended here.
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        # Flags controlling which extra feature tensors are concatenated.
        self.cat_mul = cat_mul
        self.cat_sub = cat_sub
        self.cat_twotime = cat_twotime
        self.cat_twotime_mul = cat_twotime_mul
        self.cat_twotime_sub = cat_twotime_sub

    def forward(self, hidden_emb, sequence_mask):
        """
        :param hidden_emb: [batch_size, seq_size, hidden_size]
        :param sequence_mask: [batch_size, seq_size, 1] — 1 for real
            tokens, 0 for padding (masked out of the softmax).
        :return: [batch_size, seq_size, k*hidden_size] where k depends on
            the enabled cat_* flags (at least 2).
        """
        assert len(hidden_emb.shape) == 3 and len(sequence_mask.shape) == 3 \
            and sequence_mask.shape[-1] == 1
        assert hidden_emb.shape[:2] == sequence_mask.shape[:2]
        hidden_size = self.hidden_size
        bias = fluid.layers.create_parameter(name='self_matching_layer_bias', shape=[1], dtype="float32",
            default_initializer=fluid.initializer.ConstantInitializer(0))
        # w1: scores the "query" position of each pair.
        weight_1 = fluid.layers.create_parameter(name='self_matching_layer_weight1', shape=[hidden_size], dtype="float32",
            default_initializer=fluid.initializer.XavierInitializer(uniform=True, fan_in=1, fan_out=hidden_size))  # [HS]
        # One-position slice used only as a broadcast-shape template.
        bs_1_hs = fluid.layers.slice(hidden_emb, axes=[1], starts=[0], ends=[1])  # [bs, 1, hs]
        print_tensor(bs_1_hs, "bs_1_hs")
        bs_hs_1 = fluid.layers.transpose(bs_1_hs, perm=[0, 2, 1])  # [bs, hs, 1]
        print_tensor(bs_hs_1, "bs_hs_1")
        print_tensor(weight_1, "weight_1")
        weight_1 = dynamic_expand(bs_1_hs, weight_1)  # [BS, 1, HS] (a)jk
        weight_1 = fluid.layers.transpose(weight_1, perm=[0, 2, 1])
        print_tensor(hidden_emb, "hidden_emb")
        print_tensor(weight_1, "weight_1")
        r1 = fluid.layers.matmul(hidden_emb, weight_1)  # [BS, SQ, 1] aik
        print_tensor(r1, "r1")
        # w2: scores the "key" position of each pair.
        weight_2 = fluid.layers.create_parameter(name='self_matching_layer_weight2', shape=[hidden_size], dtype="float32",
            default_initializer=fluid.initializer.XavierInitializer(uniform=True, fan_in=1, fan_out=hidden_size))  # [HS]
        weight_2 = dynamic_expand(bs_1_hs, weight_2)  # # [BS, 1, HS] (a)jk
        hidden_emb_transpose = fluid.layers.transpose(hidden_emb, perm=[0, 2, 1])  # [BS, HS, SQ] aji
        r2 = fluid.layers.matmul(weight_2, hidden_emb_transpose)  # [BS, 1, SQ] aki
        print_tensor(r2, "r2")
        # w_mul: weights the elementwise-product interaction term.
        weight_mul = fluid.layers.create_parameter(name='self_matching_layer_weightmul', shape=[hidden_size], dtype="float32",
            default_initializer=fluid.initializer.XavierInitializer(uniform=True))  # [HS]
        weight_mul = dynamic_expand(hidden_emb, weight_mul)
        rmul_1 = fluid.layers.elementwise_mul(hidden_emb, weight_mul)  # for "hidden * self.weight_mul". [bs, sq(i), hs(j)]
        print_tensor(rmul_1, "rmul_1")
        rmul_2 = fluid.layers.matmul(rmul_1, hidden_emb_transpose)  # [bs, sq(i), hs(j)] mul [bs, hs(j), sq(k)] = [bs, sq(i), sq(k)]
        print_tensor(rmul_2, "rmul_2")
        # Broadcast r1 across columns and r2 across rows via dynamic_expand
        # (transposes position the axis being expanded at the front).
        r1 = fluid.layers.squeeze(r1, axes=[2])  # [BS, SQ] aik
        r1 = dynamic_expand(
            fluid.layers.transpose(rmul_2, [1, 0, 2]),  # [sq, bs, sq]
            r1)  # [ SQ(from 1), bs, SQ]
        r1 = fluid.layers.transpose(r1, [1, 2, 0])  # [bs, sq, sq(from 1)]
        r2 = fluid.layers.squeeze(r2, axes=[1])  # [BS, SQ] aik
        r2 = dynamic_expand(
            fluid.layers.transpose(rmul_2, [1, 0, 2]),  # [sq, bs, sq]
            r2)  # [ SQ(from 1), bs, SQ]
        r2 = fluid.layers.transpose(r2, [1, 0, 2])  # [bs,sq(from 1),sq]
        bias = dynamic_expand(rmul_2, bias)  # [BS, SQ, SQ]
        sim_score = fluid.layers.sums(input=[r1, r2, rmul_2, bias])
        # [bs,sq,1]+[bs,1,sq]+[bs,sq,sq]+[bs,sq,sq]=[BS,SQ,SQ]
        print_tensor(sim_score, "sim_score")
        # Build an additive mask: -1e6 at padded key positions.
        sequence_mask = fluid.layers.cast(sequence_mask, dtype="float32")  # [BS,SQ,1]
        softmax_mask = fluid.layers.elementwise_sub(
            sequence_mask,
            fluid.layers.fill_constant([1], "float32", 1))  # [BS,SQ,1]
        softmax_mask = fluid.layers.scale(softmax_mask, -1)
        very_negative_number = fluid.layers.fill_constant([1], value=-1e6, dtype="float32")
        logger.info("softmax_mask: {}".format(softmax_mask.shape))
        logger.info("very_negative_number: {}".format(very_negative_number.shape))
        softmax_mask = fluid.layers.elementwise_mul(softmax_mask, very_negative_number)  # [BS,SQ,1]
        softmax_mask = fluid.layers.squeeze(softmax_mask, axes=[2])  # [BS,SQ]
        softmax_mask = dynamic_expand(fluid.layers.transpose(sim_score, perm=[2, 0, 1]), softmax_mask)  # [sq(1),bs,sq]
        softmax_mask = fluid.layers.transpose(softmax_mask, perm=[1, 0, 2])  # [BS,sq(1),SQ]
        print_tensor(softmax_mask, "softmax_mask")
        sim_score = fluid.layers.elementwise_add(sim_score, softmax_mask)  # [bs,sq,sq]+[bs,sq(1),sq]=[BS,SQ,SQ]
        print_tensor(sim_score, "sim_score")
        attn_prob = fluid.layers.softmax(sim_score)  # [BS,SQ,SQ]
        weighted_sum = fluid.layers.matmul(attn_prob, hidden_emb)  # [bs,sq,sq]*[bs,sq,hs]=[BS,SQ,HS]
        if any([self.cat_twotime, self.cat_twotime_mul, self.cat_twotime_sub]):
            # "Two-time" attention: square the attention matrix so each
            # position also aggregates its neighbors' attention targets.
            twotime_att_prob = fluid.layers.matmul(attn_prob, attn_prob)  # [bs,sq,sq]*[bs,sq,sq]=[BS,SQ,SQ]
            twotime_weited_sum = fluid.layers.matmul(twotime_att_prob, hidden_emb)  # [BS,SQ,HS]
        out_tensors = [hidden_emb, weighted_sum]
        if self.cat_mul:
            out_tensors.append(fluid.layers.elementwise_mul(hidden_emb, weighted_sum))
        if self.cat_sub:
            out_tensors.append(fluid.layers.elementwise_sub(hidden_emb, weighted_sum))
        if self.cat_twotime:
            out_tensors.append(twotime_weited_sum)
        if self.cat_twotime_mul:
            out_tensors.append(fluid.layers.elementwise_mul(hidden_emb, twotime_weited_sum))
        if self.cat_twotime_sub:
            out_tensors.append(fluid.layers.elementwise_sub(hidden_emb, twotime_weited_sum))
        output = fluid.layers.concat(out_tensors, axis=2)  # [BS,SQ, HS+HS+....]
        print_tensor(output, "output")
        return output
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from functools import partial, reduce
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layer_helper import LayerHelper
def layer_norm(x, begin_norm_axis=1, epsilon=1e-12, param_attr=None, bias_attr=None):
    """
    Replace build-in layer_norm op with this function
    """
    # NOTE: the helper must be created before any other locals are bound,
    # because **locals() should capture only the function arguments.
    helper = LayerHelper('layer_norm', **locals())
    # Normalize over the dims starting at begin_norm_axis.
    mu = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
    centered = layers.elementwise_sub(x=x, y=mu, axis=0)
    var = layers.reduce_mean(layers.square(centered), dim=begin_norm_axis, keep_dim=True)
    inv_std = layers.rsqrt(var + epsilon)
    normalized = layers.elementwise_mul(x=centered, y=inv_std, axis=0)
    # One scale/bias element per normalized feature.
    elem_count = reduce(lambda a, b: a * b, normalized.shape[begin_norm_axis:])
    param_shape = [elem_count]
    param_dtype = normalized.dtype
    scale = helper.create_parameter(
        attr=param_attr,
        shape=param_shape,
        dtype=param_dtype,
        default_initializer=fluid.initializer.Constant(1.))
    bias = helper.create_parameter(
        attr=bias_attr,
        shape=param_shape,
        dtype=param_dtype,
        is_bias=True,
        default_initializer=fluid.initializer.Constant(0.))
    scaled = layers.elementwise_mul(x=normalized, y=scale, axis=-1)
    return layers.elementwise_add(x=scaled, y=bias, axis=-1)
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         param_initializer=None,
                         name='multi_head_att'):
    """
    Multi-Head Attention. Note that attn_bias is added to the logit before
    computing softmax activiation to mask certain selected positions so that
    they will not considered in attention weights.

    :param queries: [batch, q_len, d_model]; keys/values default to
        `queries` (self-attention) when passed as None.
    :param attn_bias: additive bias on the attention logits (large
        negative values mask positions), or a falsy value to skip.
    :param cache: optional dict with "k"/"v" entries for incremental
        decoding; mutated in place with the concatenated states.
    :return: [batch, q_len, d_model] projected context.
    """
    # Self-attention shortcuts: missing keys/values fall back to queries.
    keys = queries if keys is None else keys
    values = keys if values is None else values
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: quries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_query_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_query_fc.b_0')
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_key_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_key_fc.b_0')
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_value_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_value_fc.b_0')
        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of inpunt tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(
            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
        # permuate the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of inpunt tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) == 3: return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")
        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        # Scale queries by 1/sqrt(d_key) before the dot product.
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights,
                dropout_prob=dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
    if cache is not None:  # use cache and concat time steps
        # Since the inplace reshape in __split_heads changes the shape of k and
        # v, which is the cache input for next time step, reshape the cache
        # input from the previous time step first.
        k = cache["k"] = layers.concat(
            [layers.reshape(
                cache["k"], shape=[0, 0, d_model]), k], axis=1)
        v = cache["v"] = layers.concat(
            [layers.reshape(
                cache["v"], shape=[0, 0, d_model]), v], axis=1)
    q = __split_heads(q, n_head)
    k = __split_heads(k, n_head)
    v = __split_heads(v, n_head)
    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                                  dropout_rate)
    out = __combine_heads(ctx_multiheads)
    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=2,
                         param_attr=fluid.ParamAttr(
                             name=name + '_output_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_output_fc.b_0')
    return proj_out
def positionwise_feed_forward(x,
                              d_inner_hid,
                              d_hid,
                              dropout_rate,
                              hidden_act,
                              param_initializer=None,
                              name='ffn'):
    """
    Position-wise Feed-Forward Networks.
    This module consists of two linear transformations with a ReLU activation
    in between, which is applied to each position separately and identically.
    """
    # Expand to the inner dimension with the configured activation.
    inner = layers.fc(input=x,
                      size=d_inner_hid,
                      num_flatten_dims=2,
                      act=hidden_act,
                      param_attr=fluid.ParamAttr(
                          name=name + '_fc_0.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_fc_0.b_0')
    if dropout_rate:
        inner = layers.dropout(
            inner,
            dropout_prob=dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=False)
    # Project back down to the model dimension (no activation).
    projected = layers.fc(input=inner,
                          size=d_hid,
                          num_flatten_dims=2,
                          param_attr=fluid.ParamAttr(
                              name=name + '_fc_1.w_0', initializer=param_initializer),
                          bias_attr=name + '_fc_1.b_0')
    return projected
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
                           name=''):
    """
    Add residual connection, layer normalization and droput to the out tensor
    optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.

    :param prev_out: residual input, or None (as bound by pre_process_layer).
    :param process_cmd: string of commands applied in order —
        "a" residual add, "n" layer norm, "d" dropout.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            # FIX: compare against None explicitly instead of relying on
            # truthiness — a Paddle Variable must not be used in a boolean
            # context (only None should disable the residual add).
            out = out + prev_out if prev_out is not None else out
        elif cmd == "n":  # add layer normalization
            # layer_norm runs in fp32 even under fp16 training for
            # numerical stability, then casts back.
            out_dtype = out.dtype
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=False)
    return out
# Pre-processing has no residual input, so bind prev_out=None.
pre_process_layer = partial(pre_post_process_layer, None)
# Post-processing passes the residual input explicitly as the first argument.
post_process_layer = pre_post_process_layer
def encoder_layer(enc_input,
                  attn_bias,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  prepostprocess_dropout,
                  attention_dropout,
                  relu_dropout,
                  hidden_act,
                  preprocess_cmd="n",
                  postprocess_cmd="da",
                  param_initializer=None,
                  name=''):
    """The encoder layers that can be stacked to form a deep encoder.
    This module consits of a multi-head (self) attention followed by
    position-wise feed-forward networks and both the two components companied
    with the post_process_layer to add residual connection, layer normalization
    and droput.

    :param preprocess_cmd/postprocess_cmd: command strings for
        pre_post_process_layer ("a" add residual, "n" layer norm,
        "d" dropout), applied before/after each sub-layer.
    """
    # Self-attention sub-layer: keys/values are None -> self-attention
    # over the (pre-processed) input.
    attn_output = multi_head_attention(
        pre_process_layer(
            enc_input,
            preprocess_cmd,
            prepostprocess_dropout,
            name=name + '_pre_att'),
        None,
        None,
        attn_bias,
        d_key,
        d_value,
        d_model,
        n_head,
        attention_dropout,
        param_initializer=param_initializer,
        name=name + '_multi_head_att')
    # Residual/norm/dropout around the attention output, with the raw
    # enc_input as the residual branch.
    attn_output = post_process_layer(
        enc_input,
        attn_output,
        postprocess_cmd,
        prepostprocess_dropout,
        name=name + '_post_att')
    # Position-wise feed-forward sub-layer.
    ffd_output = positionwise_feed_forward(
        pre_process_layer(
            attn_output,
            preprocess_cmd,
            prepostprocess_dropout,
            name=name + '_pre_ffn'),
        d_inner_hid,
        d_model,
        relu_dropout,
        hidden_act,
        param_initializer=param_initializer,
        name=name + '_ffn')
    return post_process_layer(
        attn_output,
        ffd_output,
        postprocess_cmd,
        prepostprocess_dropout,
        name=name + '_post_ffn')
def encoder(enc_input,
            attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd="n",
            postprocess_cmd="da",
            param_initializer=None,
            name=''):
    """
    The encoder is composed of a stack of identical layers returned by calling
    encoder_layer.  Each layer's output feeds the next; a final pre-process
    step (normalization) is applied to the last layer's output.
    """
    for layer_idx in range(n_layer):
        block_out = encoder_layer(
            enc_input,
            attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd,
            postprocess_cmd,
            param_initializer=param_initializer,
            name=name + '_layer_' + str(layer_idx))
        # Chain: this layer's output becomes the next layer's input.
        enc_input = block_out
    return pre_process_layer(
        block_out, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from utils.fp16 import create_master_params_grads, master_param_to_train_param
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
    """ Applies linear warmup of learning rate from 0 and decay to 0.

    NOTE(review): the `warmup_steps` parameter is NOT used below — the
    warmup phase is hard-coded to the first 10% of `num_train_steps`
    (global_step < num_train_steps * 0.1).  Confirm whether this KTNET
    modification of the stock BERT scheduler is intentional.
    """
    with fluid.default_main_program()._lr_schedule_guard():
        # Persistent scalar holding the current LR; assigned each step.
        lr = fluid.layers.tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="scheduled_learning_rate")
        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(global_step < num_train_steps * 0.1):
                # Linear ramp from 0 to learning_rate over the warmup phase.
                warmup_lr = learning_rate * (global_step / (num_train_steps * 0.1))
                fluid.layers.tensor.assign(warmup_lr, lr)
            with switch.default():
                # After warmup: linear (power=1.0) decay to 0 at
                # num_train_steps.
                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
                    learning_rate=learning_rate,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.tensor.assign(decayed_lr, lr)
        return lr
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 loss_scaling=1.0):
    """Build the Adam training step with optional warmup scheduling,
    gradient clipping, manual (AdamW-style) weight decay, and fp16
    master-weight handling.

    :param loss: scalar loss variable to minimize.
    :param scheduler: 'noam_decay' or 'linear_warmup_decay'; only used
        when warmup_steps > 0.
    :param loss_scaling: static loss scale for fp16 training.
    :return: the scheduled learning-rate variable (or the plain float
        learning_rate when no warmup is used).
    """
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
                .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
                            warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr, epsilon=1e-6)
    else:
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, epsilon=1e-6)
        scheduled_lr = learning_rate
    clip_norm_thres = 1.0
    # When using mixed precision training, scale the gradient clip threshold
    # by loss_scaling
    if use_fp16 and loss_scaling > 1.0:
        clip_norm_thres *= loss_scaling
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))

    def exclude_from_weight_decay(name):
        # NOTE(review): defined but currently unused — the calls below are
        # commented out in favor of excluding only the concept embedding
        # matrices.  Kept for reference.
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    # Snapshot parameter values BEFORE the optimizer step so the decay
    # term uses pre-update weights (decoupled / AdamW-style decay).
    param_list = dict()
    if use_fp16:
        # fp16 path: maintain fp32 master copies, step on those, then copy
        # back to the fp16 training parameters.
        param_grads = optimizer.backward(loss)
        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)
        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True
        optimizer.apply_gradients(master_param_grads)
        if weight_decay > 0:
            for param, grad in master_param_grads:
                # The pretrained KB concept embeddings are deliberately
                # exempt from weight decay.
                if param.name == 'concept_emb_mat' or param.name == 'wn_concept_emb_mat' or param.name == 'nell_concept_emb_mat':
                    continue
                with param.block.program._optimized_guard(
                        [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)
        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)
    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True
        _, param_grads = optimizer.minimize(loss)
        if weight_decay > 0:
            for param, grad in param_grads:
                # Same exemption as above for concept embedding matrices.
                if param.name == 'concept_emb_mat' or param.name == 'wn_concept_emb_mat' or param.name == 'nell_concept_emb_mat':
                    continue
                with param.block.program._optimized_guard(
                        [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)
    return scheduled_lr
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import argparse
import logging
# Configure root logging at import time so argument dumps share the module's
# timestamped format.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def str2bool(v):
    """Parse a command-line string into a boolean.

    argparse cannot parse "True"/"False" strings as booleans on its own,
    so (case-insensitively) "true", "t" and "1" map to True and any
    other value maps to False.
    """
    normalized = v.lower()
    return normalized in ("true", "t", "1")
class ArgumentGroup(object):
    """Thin wrapper around an argparse argument group that appends the
    default value to every help string and parses bool flags through
    the module-level str2bool helper."""

    def __init__(self, parser, title, des):
        self._group = parser.add_argument_group(title=title, description=des)

    def add_arg(self, name, type, default, help, **kwargs):
        # argparse's bool() treats any non-empty string as True, so bool
        # arguments are routed through str2bool instead.
        arg_type = str2bool if type == bool else type
        self._group.add_argument(
            "--" + name,
            default=default,
            type=arg_type,
            help=help + ' Default: %(default)s.',
            **kwargs)
def print_arguments(args):
    """Log every parsed argument as ``name: value``, sorted by name.

    Args:
        args: an ``argparse.Namespace`` (or any object accepted by ``vars()``).
    """
    logger.info('----------- Configuration Arguments -----------')
    # vars(args).items() replaces six.iteritems(): identical output on both
    # Python 2 and 3, and drops the six dependency from this function.
    for arg, value in sorted(vars(args).items()):
        logger.info('%s: %s' % (arg, value))
    logger.info('------------------------------------------------')
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def cast_fp16_to_fp32(i, o, prog):
    """Append a ``cast`` op to ``prog`` converting fp16 var ``i`` into fp32 var ``o``."""
    src_dtype = fluid.core.VarDesc.VarType.FP16
    dst_dtype = fluid.core.VarDesc.VarType.FP32
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={"in_dtype": src_dtype,
               "out_dtype": dst_dtype})
def cast_fp32_to_fp16(i, o, prog):
    """Append a ``cast`` op to ``prog`` converting fp32 var ``i`` into fp16 var ``o``."""
    src_dtype = fluid.core.VarDesc.VarType.FP32
    dst_dtype = fluid.core.VarDesc.VarType.FP16
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={"in_dtype": src_dtype,
               "out_dtype": dst_dtype})
def copy_to_master_param(p, block):
    """Create an fp32 "master" copy of parameter ``p`` inside ``block``.

    The master parameter mirrors every attribute of the block's variable of
    the same name, except dtype (forced to fp32) and name (suffixed with
    ``.master``).

    Raises:
        ValueError: if ``block`` holds no variable named ``p.name``.
    """
    src_var = block.vars.get(p.name, None)
    if src_var is None:
        raise ValueError("no param name %s found!" % p.name)
    return fluid.framework.Parameter(
        block=block,
        shape=src_var.shape,
        dtype=fluid.core.VarDesc.VarType.FP32,
        type=src_var.type,
        lod_level=src_var.lod_level,
        stop_gradient=p.stop_gradient,
        trainable=p.trainable,
        optimize_attr=p.optimize_attr,
        regularizer=p.regularizer,
        gradient_clip_attr=p.gradient_clip_attr,
        error_clip=p.error_clip,
        name=src_var.name + ".master")
def create_master_params_grads(params_grads, main_prog, startup_prog,
                               loss_scaling):
    """Build fp32 master (param, grad) pairs for mixed-precision training.

    For each (param, grad): a ``.master`` fp32 copy of the param is created
    in ``main_prog`` and initialized in ``startup_prog`` by casting the fp16
    startup value. Gradients are cast to fp32 and unscaled by
    ``loss_scaling`` (when > 1). Params whose gradient name contains
    "layer_norm" are passed through as-is (only unscaled), since they are
    not cast to fp16 elsewhere in this module.
    """
    master_pairs = []
    saved_role = main_prog._current_role
    op_role = fluid.core.op_proto_and_checker_maker.OpRole
    # Ops appended below (casts) must be tagged as backward-phase ops.
    main_prog._current_role = op_role.Backward
    for param, grad in params_grads:
        master_param = copy_to_master_param(param, main_prog.global_block())
        startup_master = startup_prog.global_block()._clone_variable(
            master_param)
        startup_param = startup_prog.global_block().var(param.name)
        cast_fp16_to_fp32(startup_param, startup_master, startup_prog)
        if "layer_norm" in grad.name:
            # Keep the original param; just undo the loss scaling.
            if loss_scaling > 1:
                master_pairs.append([param, grad / float(loss_scaling)])
            else:
                master_pairs.append([param, grad])
        else:
            # Cast the fp16 gradient to fp32 before applying it.
            fp32_grad = fluid.layers.cast(grad, "float32")
            if loss_scaling > 1:
                fp32_grad = fp32_grad / float(loss_scaling)
            master_pairs.append([master_param, fp32_grad])
    main_prog._current_role = saved_role
    return master_pairs
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
    """Copy updated fp32 master params back into their fp16 training params.

    The two lists are assumed to be index-aligned. Params whose name
    contains "layer_norm" were never cast to a master copy, so they are
    skipped.
    """
    for idx, (m_param, m_grad) in enumerate(master_params_grads):
        train_param, _ = params_grads[idx]
        if "layer_norm" in train_param.name:
            continue
        with main_prog._optimized_guard([m_param, m_grad]):
            cast_fp32_to_fp16(m_param, train_param, main_prog)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import ast
import copy
import logging
import numpy as np
import paddle.fluid as fluid
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def cast_fp32_to_fp16(exe, main_program):
    """Convert loaded fp32 parameter tensors to fp16 in the global scope.

    ``.master`` parameters are left untouched; params whose name contains
    "layer_norm" keep fp32 values (only their master copy, if any, is
    refreshed). fp16 data is stored via a uint16 view of the float16 array.
    """
    logger.info("Cast parameters to float16 data format.")
    for param in main_program.global_block().all_parameters():
        if param.name.endswith(".master"):
            continue
        tensor = fluid.global_scope().find_var(param.name).get_tensor()
        fp32_values = np.array(tensor)
        if "layer_norm" not in param.name:
            tensor.set(np.float16(fp32_values).view(np.uint16), exe.place)
        master_var = fluid.global_scope().find_var(param.name + ".master")
        if master_var is not None:
            # Keep the fp32 master copy in sync with the loaded values.
            master_var.get_tensor().set(fp32_values, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
    """Restore persistable variables of ``main_program`` from a checkpoint dir.

    Only variables that are persistable AND have a matching file inside
    ``init_checkpoint_path`` are loaded. When ``use_fp16`` is set, loaded
    fp32 parameters are afterwards cast to fp16.
    """
    assert os.path.exists(
        init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path

    def _saved_persistable(var):
        # Skip non-persistables and variables absent from the checkpoint.
        return fluid.io.is_persistable(var) and os.path.exists(
            os.path.join(init_checkpoint_path, var.name))

    fluid.io.load_vars(
        exe,
        init_checkpoint_path,
        main_program=main_program,
        predicate=_saved_persistable)
    logger.info("Load model from {}".format(init_checkpoint_path))
    if use_fp16:
        cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
                            pretraining_params_path,
                            main_program,
                            use_fp16=False):
    """Load pretrained parameter values into ``main_program``.

    Unlike ``init_checkpoint`` this restores only ``Parameter`` variables
    (not every persistable), and only those with a matching file under
    ``pretraining_params_path``. When ``use_fp16`` is set, loaded fp32
    parameters are afterwards cast to fp16.
    """
    assert os.path.exists(pretraining_params_path
                          ), "[%s] cann't be found." % pretraining_params_path

    def _saved_parameter(var):
        # Skip non-parameters and parameters absent from the params dir.
        return isinstance(var, fluid.framework.Parameter) and os.path.exists(
            os.path.join(pretraining_params_path, var.name))

    fluid.io.load_vars(
        exe,
        pretraining_params_path,
        main_program=main_program,
        predicate=_saved_parameter)
    logger.info("Load pretraining parameters from {}.".format(
        pretraining_params_path))
    if use_fp16:
        cast_fp32_to_fp16(exe, main_program)
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册