Commit f8658874 authored by Yang An, committed by Yibing Liu

Add the workspace of ACL2019-KTNET into PaddleNLP Research Version (#3244)

* add readme for KTNET

* update readme

* update readme

* update readme

* update readme of KTNET

* update readme of KTNET

* add source files for KTNET

* update files for KTNET

* update files for KTNET

* update draft of readme for KTNET

* modified scripts for KTNET

* fix typos in readme.md for KTNET

* update scripts for KTNET

* update scripts for KTNET

* update readme for KTNET

* edit two-staged training scripts for KTNET

* add details in the readme of KTNET

* fix typos in the readme of KTNET

* added eval scripts for KTNET

* rename folders for KTNET

* add copyright in the code and add links in readme for KTNET

* add the remaining download link for KTNET

* add md5sum for KTNET

* final version for KTNET
Parent d6c65111
ad550852cf26241b20e8364e40340a99 train.json
60c70c4a7e8190483f9899a1c9bc4178 dev.json
df45d93b87ca3c47b54a33e03fabf719 record_official_evaluate.py
981b29407e0affa3b1b156f72073b945 train-v1.1.json
3e85deb501d4e538b6bc56f786231552 dev-v1.1.json
afb04912d18ff20696f7f88eed49bea9 squad_v1_official_evaluate.py
64010b964ae2ebf00148b3519a4aafc8 KTNET_preprocess_squad_tagging_output.tar.gz
e9352221127b7620427c18e39bfae7fc KTNET_preprocess_tokenize_result_record.tar.gz
e52da2b1d096e889d32df267b82f9c77 KTNET_preprocess_tokenize_result_squad.tar.gz
89db2f5cfb07f0c44998d7f49098eb90 KTNET_preprocess_wordnet_concepts.tar.gz
fb62db2fe82d88480ec853f3c6fa237a NELL.08m.1115.esv.csv.gz
a68e68f9dcf4524b356163369c7f9f50 KTNET_preprocess_nell_concepts.tar.gz
d9b62183c6367ffac3ee6f864c9425a5 wn_concept2vec.txt
1f69c3d092089b0a0652616b72d61bd8 nell_concept2vec.txt
5405c050e64fee4ffec17ee50f079b64 cased_L-24_H-1024_A-16.tar.gz
4bd6e911cdad39c543ba8922a70580cd KTNET_fine-tuned-model_record_both.tar.gz
43fa464d6aeabe6dc7a15315d4ea8288 KTNET_fine-tuned-model_record_nell.tar.gz
20aaefead331f64e435a94ac8a7b58aa KTNET_fine-tuned-model_record_wordnet.tar.gz
3abdb7be3fc5e3b98633c918acc25af4 KTNET_fine-tuned-model_squad_both.tar.gz
9232cf27adda9d64265ccb315e1b9c81 KTNET_fine-tuned-model_squad_nell.tar.gz
a36fdd6d5c88e3e931bb3b28f9aeb4e2 KTNET_fine-tuned-model_squad_wordnet.tar.gz
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1
python3 src/run_record.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
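# Usage sketch (an editorial addition; the exact invocation is an assumption):
# each evaluation script in this commit expects the directory of a fine-tuned
# checkpoint as its first positional argument, e.g.
#   bash <this_eval_script>.sh /path/to/fine-tuned/checkpoint
# The run log is redirected to log/train.log, as set up above.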
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1
python3 src/run_record_twomemory.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
CKPT_DIR=$1
python3 src/run_record.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1
python3 src/run_squad.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1
python3 src/run_squad_twomemory.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
CKPT_DIR=$1
python3 src/run_squad.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint record_nell_first_stage_output/step_41970 \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d record_nell_first_stage_log ]; then
mkdir record_nell_first_stage_log
else
rm -r record_nell_first_stage_log/*
fi
if [ ! -d record_nell_first_stage_output ]; then
mkdir record_nell_first_stage_output
else
rm -r record_nell_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-4 \
--epoch 10 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints record_nell_first_stage_output/ 1>$PWD_DIR/record_nell_first_stage_log/train.log 2>&1
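# Note added for clarity (editorial, not part of the original script): this is
# the first stage of the two-staged ReCoRD+NELL training -- BERT is frozen
# (--freeze true) and only the knowledge-integration layers are trained for 10
# epochs at lr 3e-4 with no warmup. The second-stage script above then resumes
# full fine-tuning from record_nell_first_stage_output/step_41970 with
# --freeze false, lr 3e-5 and warmup 0.1; that step_* name is presumably the
# final checkpoint of this stage and will differ if the batch size, epoch
# count or dataset changes.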
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint record_both_first_stage_output/step_41970 \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d record_both_first_stage_log ]; then
mkdir record_both_first_stage_log
else
rm -r record_both_first_stage_log/*
fi
if [ ! -d record_both_first_stage_output ]; then
mkdir record_both_first_stage_output
else
rm -r record_both_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-4 \
--epoch 10 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints record_both_first_stage_output/ 1>$PWD_DIR/record_both_first_stage_log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint record_wn_first_stage_output/step_41970 \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d record_wn_first_stage_log ]; then
mkdir record_wn_first_stage_log
else
rm -r record_wn_first_stage_log/*
fi
if [ ! -d record_wn_first_stage_output ]; then
mkdir record_wn_first_stage_output
else
rm -r record_wn_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-4 \
--epoch 10 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints record_wn_first_stage_output/ 1>$PWD_DIR/record_wn_first_stage_log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint sqd_nell_first_stage_output/step_3649 \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d sqd_nell_first_stage_log ]; then
mkdir sqd_nell_first_stage_log
else
rm -r sqd_nell_first_stage_log/*
fi
if [ ! -d sqd_nell_first_stage_output ]; then
mkdir sqd_nell_first_stage_output
else
rm -r sqd_nell_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-5 \
--epoch 1 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints sqd_nell_first_stage_output/ 1>$PWD_DIR/sqd_nell_first_stage_log/train.log 2>&1
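# Note added for clarity (editorial, not part of the original script): the
# SQuAD scripts follow the same two-staged scheme as the ReCoRD ones -- a
# frozen first stage (here a single epoch at lr 3e-5, no warmup) whose
# checkpoint, sqd_nell_first_stage_output/step_3649, is then picked up by the
# second-stage script above for full fine-tuning.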
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint sqd_both_first_stage_output/step_3649 \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d sqd_both_first_stage_log ]; then
mkdir sqd_both_first_stage_log
else
rm -r sqd_both_first_stage_log/*
fi
if [ ! -d sqd_both_first_stage_output ]; then
mkdir sqd_both_first_stage_output
else
rm -r sqd_both_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-5 \
--epoch 1 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints sqd_both_first_stage_output/ 1>$PWD_DIR/sqd_both_first_stage_log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint sqd_wn_first_stage_output/step_3649 \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d sqd_wn_first_stage_log ]; then
mkdir sqd_wn_first_stage_log
else
rm -r sqd_wn_first_stage_log/*
fi
if [ ! -d sqd_wn_first_stage_output ]; then
mkdir sqd_wn_first_stage_output
else
rm -r sqd_wn_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-5 \
--epoch 1 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints sqd_wn_first_stage_output/ 1>$PWD_DIR/sqd_wn_first_stage_log/train.log 2>&1
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add masks to batch_tokens and return out, mask_label, mask_pos.
Note: mask_pos refers to positions in batch_tokens after padding.
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure that at least one word per sentence is masked
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=None,
return_input_mask=True,
return_max_len=True,
return_num_token=False,
max_concept_length=50):
"""
1. generate Tensor of data
2. generate Tensor of position
3. generate self attention mask, [shape: batch_size * max_len * max_len]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_sent_ids = [inst[1] for inst in insts]
batch_pos_ids = [inst[2] for inst in insts]
batch_concept_ids = [inst[3] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(4, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
concept_ids = pad_batch_data(
batch_concept_ids, pad_idx=[],
max_concept_length=max_concept_length)  # pad with [0, 0, ...]
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, concept_ids, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, concept_ids, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False,
max_concept_length=50):
"""
Pad the instances to a fixed max sequence length (384), and generate the
corresponding position data and input mask.
"""
return_list = []
# max_len = max(len(inst) for inst in insts)
max_len = 384
# Any token included in dict can be used to pad, since the paddings' loss
will be masked out by weights and have no effect on parameter gradients.
if type(pad_idx) == list: # padding list, for concept_ids
inst_data = np.array(
[inst + list([0] * max_concept_length for x in range(max_len - len(inst))) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, max_len, max_concept_length, 1])]
else:
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
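    # Minimal smoke test added for illustration (not part of the original
    # module; the token ids below are made up). It pads two short id
    # sequences to the fixed max_len of 384 and prints the tensor shapes.
    demo_insts = [[1, 17, 53, 2], [1, 99, 2]]
    demo_src, demo_mask = pad_batch_data(
        demo_insts, pad_idx=0, return_input_mask=True)
    print(demo_src.shape)   # (2, 384, 1) int64 padded token ids
    print(demo_mask.shape)  # (2, 384, 1) float32 attention mask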
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add masks to batch_tokens and return out, mask_label, mask_pos.
Note: mask_pos refers to positions in batch_tokens after padding.
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure that at least one word per sentence is masked
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=None,
return_input_mask=True,
return_max_len=True,
return_num_token=False,
max_wn_concept_length=50,
max_nell_concept_length=50):
"""
1. generate Tensor of data
2. generate Tensor of position
3. generate self attention mask, [shape: batch_size * max_len * max_len]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_sent_ids = [inst[1] for inst in insts]
batch_pos_ids = [inst[2] for inst in insts]
batch_wn_concept_ids = [inst[3] for inst in insts]
batch_nell_concept_ids = [inst[4] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(5, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
wn_concept_ids = pad_batch_data(
batch_wn_concept_ids, pad_idx=[],
        max_concept_length=max_wn_concept_length)  # pad with [0, 0, ...]
nell_concept_ids = pad_batch_data(
batch_nell_concept_ids, pad_idx=[],
        max_concept_length=max_nell_concept_length)  # pad with [0, 0, ...]
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, wn_concept_ids, nell_concept_ids, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, wn_concept_ids, nell_concept_ids, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False,
max_concept_length=50):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
"""
return_list = []
# max_len = max(len(inst) for inst in insts)
max_len = 384
    # Any token included in the dict can be used for padding, since the paddings'
    # loss will be masked out by weights and has no effect on parameter gradients.
if type(pad_idx) == list: # padding list, for concept_ids
inst_data = np.array(
[inst + list([0] * max_concept_length for x in range(max_len - len(inst))) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, max_len, max_concept_length, 1])]
else:
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
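    # A minimal, self-contained sketch of the batching path (the toy ids, vocab
    # size and concept lists below are assumptions for illustration only; real
    # instances are produced by the data readers).
    toy_src_ids = [1, 15, 23, 2]
    toy_sent_ids = [0, 0, 0, 0]
    toy_pos_ids = [0, 1, 2, 3]
    toy_wn_concepts = [[0, 0], [5, 0], [7, 9], [0, 0]]
    toy_nell_concepts = [[0, 0], [3, 0], [0, 0], [0, 0]]
    toy_unique_id = 123
    toy_inst = [toy_src_ids, toy_sent_ids, toy_pos_ids,
                toy_wn_concepts, toy_nell_concepts, toy_unique_id]
    outputs = prepare_batch_data(
        [toy_inst],
        total_token_num=len(toy_src_ids),
        voc_size=100,
        pad_id=0,
        cls_id=1,
        sep_id=2,
        mask_id=-1,  # -1 skips the MLM masking step
        max_wn_concept_length=2,
        max_nell_concept_length=2)
    for name, tensor in zip(
            ["src_id", "pos_id", "sent_id", "wn_concept_ids",
             "nell_concept_ids", "input_mask", "unique_id"], outputs):
        print(name, tensor.shape)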
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import logging
import numpy as np
import paddle.fluid as fluid
from model.transformer_encoder import encoder, pre_process_layer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class BertConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
logger.info('%s: %s' % (arg, value))
logger.info('------------------------------------------------')
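# Hypothetical usage sketch (the config file name below is an assumption based
# on the standard BERT release layout, not something verified in this excerpt):
#   bert_config = BertConfig("cased_L-24_H-1024_A-16/bert_config.json")
#   bert_config.print_config()
#   hidden_size = bert_config['hidden_size']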
class BertModel(object):
def __init__(self,
src_ids,
position_ids,
sentence_ids,
input_mask,
config,
weight_sharing=True,
use_fp16=False):
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
self._sent_types = config['type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._weight_sharing = weight_sharing
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._dtype = "float16" if use_fp16 else "float32"
        # Initialize all weights with a truncated normal initializer; all biases
        # are initialized to zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._build_model(src_ids, position_ids, sentence_ids, input_mask)
def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._dtype,
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
position_emb_out = fluid.layers.embedding(
input=position_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._dtype,
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sentence_ids,
size=[self._sent_types, self._emb_size],
dtype=self._dtype,
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
if self._dtype == "float16":
input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
# self_attn_mask = fluid.layers.matmul(
# x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.expand(fluid.layers.transpose(input_mask, [0, 2, 1]), [1, 384, 1])
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
self._enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name='encoder')
def get_sequence_output(self):
return self._enc_out
def get_pooled_output(self):
"""Get the first feature of each sequence for classification"""
next_sent_feat = fluid.layers.slice(
input=self._enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0")
return next_sent_feat
def get_pretraining_output(self, mask_label, mask_pos, labels):
"""Get the loss & accuracy for pretraining"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
# extract the first token feature in each sentence
next_sent_feat = self.get_pooled_output()
reshaped_emb_out = fluid.layers.reshape(
x=self._enc_out, shape=[-1, self._emb_size])
# extract masked tokens' feature
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
# transform: fc
mask_trans_feat = fluid.layers.fc(
input=mask_feat,
size=self._emb_size,
act=self._hidden_act,
param_attr=fluid.ParamAttr(
name='mask_lm_trans_fc.w_0',
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
# transform: layer norm
mask_trans_feat = pre_process_layer(
mask_trans_feat, 'n', name='mask_lm_trans')
mask_lm_out_bias_attr = fluid.ParamAttr(
name="mask_lm_out_fc.b_0",
initializer=fluid.initializer.Constant(value=0.0))
if self._weight_sharing:
fc_out = fluid.layers.matmul(
x=mask_trans_feat,
y=fluid.default_main_program().global_block().var(
self._word_emb_name),
transpose_y=True)
fc_out += fluid.layers.create_parameter(
shape=[self._voc_size],
dtype=self._dtype,
attr=mask_lm_out_bias_attr,
is_bias=True)
else:
fc_out = fluid.layers.fc(input=mask_trans_feat,
size=self._voc_size,
param_attr=fluid.ParamAttr(
name="mask_lm_out_fc.w_0",
initializer=self._param_initializer),
bias_attr=mask_lm_out_bias_attr)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
logits=fc_out, label=mask_label)
mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
next_sent_fc_out = fluid.layers.fc(
input=next_sent_feat,
size=2,
param_attr=fluid.ParamAttr(
name="next_sent_fc.w_0", initializer=self._param_initializer),
bias_attr="next_sent_fc.b_0")
next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
logits=next_sent_fc_out, label=labels, return_softmax=True)
next_sent_acc = fluid.layers.accuracy(
input=next_sent_softmax, label=labels)
mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
loss = mean_next_sent_loss + mean_mask_lm_loss
return next_sent_acc, mean_mask_lm_loss, loss
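# A hedged construction sketch (placeholder names and the fixed 384-token length
# are assumptions mirroring the hard-coded sequence length used elsewhere in this
# repo; the actual graph construction lives in the run scripts):
#   src_ids = fluid.layers.data(name='src_ids', shape=[384, 1], dtype='int64')
#   pos_ids = fluid.layers.data(name='pos_ids', shape=[384, 1], dtype='int64')
#   sent_ids = fluid.layers.data(name='sent_ids', shape=[384, 1], dtype='int64')
#   input_mask = fluid.layers.data(name='input_mask', shape=[384, 1], dtype='float32')
#   bert = BertModel(src_ids, pos_ids, sent_ids, input_mask,
#                    config=BertConfig('bert_config.json'))
#   sequence_output = bert.get_sequence_output()  # [batch_size, 384, hidden_size]
#   pooled_output = bert.get_pooled_output()      # [batch_size, hidden_size]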
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""bert model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import sys
import six
import logging
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.layers import shape
from model.transformer_encoder import encoder, pre_process_layer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def dynamic_expand(dynamic_tensor, smaller_tensor):
"""
:param dynamic_tensor:
:param smaller_tensor:
:return:
"""
assert len(dynamic_tensor.shape) > len(smaller_tensor.shape)
if type(smaller_tensor.shape) == list:
for dim_idx, dim in smaller_tensor.shape:
dynamic_tensor_dim_idx = len(dynamic_tensor) - len(smaller_tensor) + dim_idx
assert dynamic_tensor.shape[dynamic_tensor_dim_idx] % dim == 0
elif type(smaller_tensor.shape) == int:
assert dynamic_tensor.shape[-1] % smaller_tensor.shape == 0
memory_embs_zero = fluid.layers.scale(dynamic_tensor, scale=0.0)
smaller_tensor = fluid.layers.elementwise_add(memory_embs_zero, smaller_tensor)
return smaller_tensor
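# Illustrative shapes (an assumption for exposition, matching the call sites
# below): with dynamic_tensor of shape [batch_size, seq_size, 1+concept_size]
# and smaller_tensor of shape [1+concept_size], the zero-add broadcast returns
# smaller_tensor tiled to [batch_size, seq_size, 1+concept_size], so the result
# inherits the runtime batch/sequence dimensions without an explicit expand.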
def print_tensor(tensor, message, print_runtime=False):
logger.info("{}: {}".format(message, tensor.shape))
if print_runtime:
fluid.layers.Print(tensor, summarize=10, message=message)
class MemoryLayer(object):
def __init__(self, bert_config, concept_size, mem_emb_size, mem_method='cat', prefix=None):
self.initializer_range = bert_config['initializer_range']
self.bert_size = bert_config['hidden_size']
self.concept_size = concept_size
self.mem_emb_size = mem_emb_size
assert mem_method in ['add', 'cat', 'raw']
self.mem_method = mem_method
self.prefix = prefix
def forward(self, bert_output, memory_embs, mem_length, ignore_no_memory_token=True):
"""
:param bert_output: [batch_size, seq_size, bert_size]
:param memory_embs: [batch_size, seq_size, concept_size, mem_emb_size]
:param mem_length: [batch_size, sent_size, 1]
:return:
"""
bert_size = self.bert_size
concept_size = self.concept_size
mem_emb_size = self.mem_emb_size
print_tensor(bert_output, "bert_output")
print_tensor(memory_embs, "memory_embs")
print_tensor(mem_length, "mem_length")
projected_bert = fluid.layers.fc(bert_output, size=mem_emb_size, num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name='{}_memory_layer_projection.w_0'.format(self.prefix) if self.prefix else 'memory_layer_projection.w_0',
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=self.initializer_range)),
bias_attr=False) # [batch_size *seq_size, mem_emb_size]
logger.info("projected_bert: {}".format(projected_bert.shape))
expanded_bert = fluid.layers.unsqueeze(projected_bert, axes=[2]) # [batch_size, seq_size, 1, mem_emb_size]
extended_memory, memory_score = self.add_sentinel(expanded_bert, memory_embs, mem_emb_size)
# extended_memory: [batch_size, seq_size, 1+concept_size, mem_emb_size]
# memory_score: [batch_size, seq_size, 1+concept_size]
concept_ordinal = self.get_concept_oridinal(concept_size, memory_score) # [bs,sq,1+cs]
memory_reverse_mask = fluid.layers.less_than(
fluid.layers.expand(mem_length, expand_times=[1, 1, 1 + concept_size])
, concept_ordinal)
# [batch_size, seq_size, 1+concept_size]
memory_reverse_mask = fluid.layers.cast(memory_reverse_mask, dtype="float32")
print_tensor(memory_reverse_mask, "memory_reverse_mask")
memory_reverse_masked_infinity = fluid.layers.scale(memory_reverse_mask, scale=-1e6)
# [batch_size, seq_size, 1+concept_size]
print_tensor(memory_reverse_masked_infinity, "memory_reverse_masked_infinity")
memory_score = fluid.layers.elementwise_add(memory_score, memory_reverse_masked_infinity)
# [batch_size, seq_size, 1+concept_size]
logger.info("memory_score:{}".format(memory_score.shape))
memory_att = fluid.layers.softmax(memory_score) # [batch_size, seq_size, 1+concept_size]
memory_att = fluid.layers.unsqueeze(memory_att, axes=[2]) # [batch_size, seq_size, 1, 1+concept_size]
logger.info("memory_att: {}".format(memory_att.shape))
logger.info("extended_memory: {}".format(extended_memory.shape))
summ = fluid.layers.matmul(memory_att,extended_memory) # [batch_size, seq_size,1, mem_emb_size]
summ = fluid.layers.squeeze(summ, axes=[2]) # [batch_size, seq_size,mem_emb_size]
if ignore_no_memory_token:
condition = fluid.layers.less_than(
dynamic_expand(mem_length, fluid.layers.zeros([1],"float32")),
mem_length) # [bs, sq]
# summ_true = fluid.layers.elementwise_mul(
# summ,
# fluid.layers.cast(condition, "float32")) # [bs, sq, ms]
# summ_false = fluid.layers.elementwise_mul(
# summ,
# fluid.layers.scale(fluid.layers.cast(condition, "float32"), -1)) # [bs, sq, ms]
# summ = fluid.layers.elementwise_add(summ_true, summ_false) # [bs, sq, ms]
summ = fluid.layers.elementwise_mul(
summ,
fluid.layers.cast(condition, "float32")) # [bs, sq, ms]
print_tensor(summ, "summ")
if self.mem_method == "add":
summ_transform = fluid.layers.fc(summ, size=bert_size, num_flatten_dims=2) # [batch_size, seq_size, bert_size]
output = fluid.layers.sums(input=[summ_transform, bert_output]) # [batch_size, seq_size, bert_size]
elif self.mem_method == "cat":
logger.info("bert_output: {}".format(bert_output.shape))
logger.info("summ: {}".format(summ.shape))
output = fluid.layers.concat(input=[bert_output, summ], axis=2) # [batch_size, seq_size, bert_size + mem_emb_size]
elif self.mem_method == "raw":
logger.info("bert_output: {}".format(bert_output.shape))
logger.info("summ: {}".format(summ.shape))
output = summ # [batch_size, seq_size, mem_emb_size]
else:
raise ValueError("mem_method not supported")
logger.info("output: {}".format(output.shape))
return output
def get_concept_oridinal(self, concept_size, memory_score):
"""
:param concept_size:
:param memory_score: [batch_size, seq_size, 1+concept_size]
:return:
"""
concept_ordinal = fluid.layers.create_tensor(dtype="float32")
fluid.layers.assign(np.arange(start=0, stop=(1 + concept_size), step=1, dtype=np.float32),
concept_ordinal) # [1+cs]
print_tensor(concept_ordinal, "concept_ordinal")
print_tensor(memory_score, "memory_score")
concept_ordinal = dynamic_expand(memory_score, concept_ordinal) # [bs,sq,1+cs]
logger.info("concept_ordinal: {}".format(concept_ordinal.shape))
return concept_ordinal
def add_sentinel(self, expanded_bert, memory_embs, mem_emb_size):
"""
:param expanded_bert: [batch_size, seq_size, 1, mem_emb_size]
:param memory_embs: [batch_size, seq_size, concept_size, mem_emb_size]
:param mem_emb_size:
:return:
"""
sentinel = fluid.layers.create_parameter(
name='{}_memory_layer_sentinel'.format(self.prefix) if self.prefix else 'memory_layer_sentinel',
dtype="float32",
shape=[mem_emb_size],
default_initializer=fluid.initializer.ConstantInitializer(0)) # [mem_emb_size]
print_tensor(sentinel, "sentinel")
memory_embs_squeeze = fluid.layers.slice(memory_embs, axes=[2], starts=[0],
ends=[1]) # [bs,sq,1,ms]
print_tensor(memory_embs_squeeze, "memory_embs_squeeze")
sentinel = dynamic_expand(memory_embs_squeeze, sentinel) # [bs,sq,1,ms]
print_tensor(sentinel, "sentinel")
print_tensor(memory_embs, "memory_embs")
extended_memory = fluid.layers.concat([sentinel, memory_embs],
axis=2) # [batch_size, seq_size, 1+concept_size, mem_emb_size]
extended_memory = fluid.layers.transpose(extended_memory, perm=[0, 1, 3, 2])
# [batch_size, seq_size, mem_emb_size, 1+concept_size]
logger.info("extended_memory: {}".format(extended_memory.shape))
memory_score = fluid.layers.matmul(expanded_bert,
extended_memory) # [batch_size, seq_size, 1, 1+concept_size]
memory_score = fluid.layers.squeeze(memory_score, axes=[2])
# [batch_size, seq_size, 1+concept_size]
extended_memory = fluid.layers.transpose(extended_memory, perm=[0, 1, 3, 2])
# [batch_size, seq_size, 1+concept_size, mem_emb_size]
return extended_memory, memory_score
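# Hypothetical wiring sketch for MemoryLayer (tensor names and sizes are
# assumptions; the actual graph construction is done in the run scripts, which
# are not part of this excerpt):
#   wn_memory_embs = fluid.layers.embedding(
#       wn_concept_ids, size=[wn_concept_vocab_size, wn_concept_dim],
#       param_attr=fluid.ParamAttr(name='wn_concept_emb_mat'))
#   wn_memory_layer = MemoryLayer(bert_config, concept_size=max_wn_concept_length,
#                                 mem_emb_size=wn_concept_dim, mem_method='cat',
#                                 prefix='wn')
#   memory_output = wn_memory_layer.forward(
#       bert_output, wn_memory_embs, wn_concept_length)
#   # memory_output: [batch_size, seq_size, bert_size + wn_concept_dim]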
class TriLinearTwoTimeSelfAttentionLayer(object):
def __init__(self, hidden_size, dropout_rate=0.0,
cat_mul=False, cat_sub=False, cat_twotime=False, cat_twotime_mul=False, cat_twotime_sub=False):
self.hidden_size = hidden_size
self.dropout_rate = dropout_rate
self.cat_mul = cat_mul
self.cat_sub = cat_sub
self.cat_twotime = cat_twotime
self.cat_twotime_mul = cat_twotime_mul
self.cat_twotime_sub = cat_twotime_sub
def forward(self, hidden_emb, sequence_mask):
"""
:param hidden_emb: [batch_size, seq_size, hidden_size]
:param sequence_mask: [batch_size, seq_size, 1]
:return:
"""
assert len(hidden_emb.shape) ==3 and len(sequence_mask.shape) == 3 \
and sequence_mask.shape[-1] == 1
assert hidden_emb.shape[:2] == sequence_mask.shape[:2]
hidden_size = self.hidden_size
bias = fluid.layers.create_parameter(name='self_matching_layer_bias', shape=[1], dtype="float32",
default_initializer=fluid.initializer.ConstantInitializer(0))
weight_1 = fluid.layers.create_parameter(name='self_matching_layer_weight1', shape=[hidden_size], dtype="float32",
default_initializer=fluid.initializer.XavierInitializer(uniform=True, fan_in=1, fan_out=hidden_size)) # [HS]
bs_1_hs = fluid.layers.slice(hidden_emb, axes=[1], starts=[0], ends=[1]) # [bs, 1, hs]
print_tensor(bs_1_hs, "bs_1_hs")
bs_hs_1 = fluid.layers.transpose(bs_1_hs, perm=[0, 2, 1]) # [bs, hs, 1]
print_tensor(bs_hs_1, "bs_hs_1")
print_tensor(weight_1, "weight_1")
weight_1 = dynamic_expand(bs_1_hs, weight_1) # [BS, 1, HS] (a)jk
weight_1 = fluid.layers.transpose(weight_1, perm=[0, 2, 1])
print_tensor(hidden_emb, "hidden_emb")
print_tensor(weight_1, "weight_1")
r1 = fluid.layers.matmul(hidden_emb, weight_1) # [BS, SQ, 1] aik
print_tensor(r1, "r1")
weight_2 = fluid.layers.create_parameter(name='self_matching_layer_weight2', shape=[hidden_size], dtype="float32",
default_initializer=fluid.initializer.XavierInitializer(uniform=True, fan_in=1, fan_out=hidden_size)) # [HS]
weight_2 = dynamic_expand(bs_1_hs, weight_2) # # [BS, 1, HS] (a)jk
hidden_emb_transpose = fluid.layers.transpose(hidden_emb, perm=[0, 2, 1]) # [BS, HS, SQ] aji
r2 = fluid.layers.matmul(weight_2, hidden_emb_transpose) # [BS, 1, SQ] aki
print_tensor(r2, "r2")
weight_mul = fluid.layers.create_parameter(name='self_matching_layer_weightmul', shape=[hidden_size], dtype="float32",
default_initializer=fluid.initializer.XavierInitializer(uniform=True)) # [HS]
weight_mul = dynamic_expand(hidden_emb, weight_mul)
rmul_1 = fluid.layers.elementwise_mul(hidden_emb, weight_mul) # for "hidden * self.weight_mul". [bs, sq(i), hs(j)]
print_tensor(rmul_1, "rmul_1")
rmul_2 = fluid.layers.matmul(rmul_1, hidden_emb_transpose) # [bs, sq(i), hs(j)] mul [bs, hs(j), sq(k)] = [bs, sq(i), sq(k)]
print_tensor(rmul_2, "rmul_2")
r1 = fluid.layers.squeeze(r1, axes=[2]) # [BS, SQ] aik
r1 = dynamic_expand(
fluid.layers.transpose(rmul_2, [1, 0, 2]), # [sq, bs, sq]
r1) # [ SQ(from 1), bs, SQ]
r1 = fluid.layers.transpose(r1, [1, 2, 0]) # [bs, sq, sq(from 1)]
r2 = fluid.layers.squeeze(r2, axes=[1]) # [BS, SQ] aik
r2 = dynamic_expand(
fluid.layers.transpose(rmul_2, [1, 0, 2]), # [sq, bs, sq]
r2) # [ SQ(from 1), bs, SQ]
r2 = fluid.layers.transpose(r2, [1, 0, 2]) # [bs,sq(from 1),sq]
bias = dynamic_expand(rmul_2, bias) # [BS, SQ, SQ]
sim_score = fluid.layers.sums(input=[r1, r2, rmul_2, bias])
# [bs,sq,1]+[bs,1,sq]+[bs,sq,sq]+[bs,sq,sq]=[BS,SQ,SQ]
print_tensor(sim_score, "sim_score")
sequence_mask = fluid.layers.cast(sequence_mask, dtype="float32") # [BS,SQ,1]
softmax_mask = fluid.layers.elementwise_sub(
sequence_mask,
fluid.layers.fill_constant([1], "float32", 1)) # [BS,SQ,1]
softmax_mask = fluid.layers.scale(softmax_mask, -1)
very_negative_number = fluid.layers.fill_constant([1], value=-1e6, dtype="float32")
logger.info("softmax_mask: {}".format(softmax_mask.shape))
logger.info("very_negative_number: {}".format(very_negative_number.shape))
softmax_mask = fluid.layers.elementwise_mul(softmax_mask, very_negative_number) # [BS,SQ,1]
softmax_mask = fluid.layers.squeeze(softmax_mask, axes=[2]) # [BS,SQ]
softmax_mask = dynamic_expand(fluid.layers.transpose(sim_score, perm=[2, 0, 1]), softmax_mask) # [sq(1),bs,sq]
softmax_mask = fluid.layers.transpose(softmax_mask, perm=[1, 0, 2]) # [BS,sq(1),SQ]
print_tensor(softmax_mask, "softmax_mask")
sim_score = fluid.layers.elementwise_add(sim_score, softmax_mask) # [bs,sq,sq]+[bs,sq(1),sq]=[BS,SQ,SQ]
print_tensor(sim_score, "sim_score")
attn_prob = fluid.layers.softmax(sim_score) # [BS,SQ,SQ]
weighted_sum = fluid.layers.matmul(attn_prob, hidden_emb) # [bs,sq,sq]*[bs,sq,hs]=[BS,SQ,HS]
if any([self.cat_twotime, self.cat_twotime_mul, self.cat_twotime_sub]):
twotime_att_prob = fluid.layers.matmul(attn_prob, attn_prob) # [bs,sq,sq]*[bs,sq,sq]=[BS,SQ,SQ]
twotime_weited_sum = fluid.layers.matmul(twotime_att_prob, hidden_emb) # [BS,SQ,HS]
out_tensors = [hidden_emb, weighted_sum]
if self.cat_mul:
out_tensors.append(fluid.layers.elementwise_mul(hidden_emb, weighted_sum))
if self.cat_sub:
out_tensors.append(fluid.layers.elementwise_sub(hidden_emb, weighted_sum))
if self.cat_twotime:
out_tensors.append(twotime_weited_sum)
if self.cat_twotime_mul:
out_tensors.append(fluid.layers.elementwise_mul(hidden_emb, twotime_weited_sum))
if self.cat_twotime_sub:
out_tensors.append(fluid.layers.elementwise_sub(hidden_emb, twotime_weited_sum))
output = fluid.layers.concat(out_tensors, axis=2) # [BS,SQ, HS+HS+....]
print_tensor(output, "output")
return output
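# Hypothetical usage sketch (the flag settings are assumptions; the layer is
# instantiated from the run scripts, which are not included in this excerpt):
#   self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
#       hidden_size=memory_output_size, dropout_rate=0.0,
#       cat_mul=True, cat_sub=True, cat_twotime=True)
#   att_output = self_att_layer.forward(memory_output, input_mask)
#   # att_output: [batch_size, seq_size, k * memory_output_size], one block per
#   # enabled feature (hidden_emb, weighted_sum, and the selected cat_* terms)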
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from functools import partial, reduce
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layer_helper import LayerHelper
def layer_norm(x, begin_norm_axis=1, epsilon=1e-12, param_attr=None, bias_attr=None):
"""
Replace build-in layer_norm op with this function
"""
helper = LayerHelper('layer_norm', **locals())
mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
variance = layers.reduce_mean(layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
r_stdev = layers.rsqrt(variance + epsilon)
norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
param_dtype = norm_x.dtype
scale = helper.create_parameter(
attr=param_attr,
shape=param_shape,
dtype=param_dtype,
default_initializer=fluid.initializer.Constant(1.))
bias = helper.create_parameter(
attr=bias_attr,
shape=param_shape,
dtype=param_dtype,
is_bias=True,
default_initializer=fluid.initializer.Constant(0.))
out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
out = layers.elementwise_add(x=out, y=bias, axis=-1)
return out
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
param_initializer=None,
name='multi_head_att'):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activiation to mask certain selected positions so that
they will not considered in attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs: quries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of inpunt tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of inpunt tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
return proj_out
def positionwise_feed_forward(x,
d_inner_hid,
d_hid,
dropout_rate,
hidden_act,
param_initializer=None,
name='ffn'):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=2,
act=hidden_act,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_0.b_0')
if dropout_rate:
hidden = layers.dropout(
hidden,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0', initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
name=''):
"""
Add residual connection, layer normalization and droput to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + '_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""The encoder layers that can be stacked to form a deep encoder.
This module consits of a multi-head (self) attention followed by
position-wise feed-forward networks and both the two components companied
with the post_process_layer to add residual connection, layer normalization
and droput.
"""
attn_output = multi_head_attention(
pre_process_layer(
enc_input,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_att'),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + '_multi_head_att')
attn_output = post_process_layer(
enc_input,
attn_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_att')
ffd_output = positionwise_feed_forward(
pre_process_layer(
attn_output,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_ffn'),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + '_ffn')
return post_process_layer(
attn_output,
ffd_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_ffn')
def encoder(enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
for i in range(n_layer):
enc_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
enc_input = enc_output
enc_output = pre_process_layer(
enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
return enc_output
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from utils.fp16 import create_master_params_grads, master_param_to_train_param
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
""" Applies linear warmup of learning rate from 0 and decay to 0."""
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="scheduled_learning_rate")
global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < num_train_steps * 0.1):
warmup_lr = learning_rate * (global_step / (num_train_steps * 0.1))
fluid.layers.tensor.assign(warmup_lr, lr)
with switch.default():
decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
learning_rate=learning_rate,
decay_steps=num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
fluid.layers.tensor.assign(decayed_lr, lr)
return lr
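# A plain-Python mirror of the schedule built above (illustrative only, not
# called anywhere). Note that the Switch above keys the warmup phase off 10% of
# num_train_steps rather than the warmup_steps argument.
def _reference_linear_warmup_decay(step, learning_rate, num_train_steps):
    """Return the scheduled learning rate at a given global step."""
    warmup_boundary = num_train_steps * 0.1
    if step < warmup_boundary:
        return learning_rate * (step / warmup_boundary)
    # linear (power=1.0) polynomial decay towards an end learning rate of 0
    return learning_rate * max(0.0, 1.0 - float(step) / num_train_steps)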
def optimization(loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler='linear_warmup_decay',
use_fp16=False,
loss_scaling=1.0):
if warmup_steps > 0:
if scheduler == 'noam_decay':
scheduled_lr = fluid.layers.learning_rate_scheduler\
.noam_decay(1/(warmup_steps *(learning_rate ** 2)),
warmup_steps)
elif scheduler == 'linear_warmup_decay':
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
num_train_steps)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr, epsilon=1e-6)
else:
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, epsilon=1e-6)
scheduled_lr = learning_rate
clip_norm_thres = 1.0
# When using mixed precision training, scale the gradient clip threshold
# by loss_scaling
if use_fp16 and loss_scaling > 1.0:
clip_norm_thres *= loss_scaling
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
def exclude_from_weight_decay(name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
param_list = dict()
if use_fp16:
param_grads = optimizer.backward(loss)
master_param_grads = create_master_params_grads(
param_grads, train_program, startup_prog, loss_scaling)
for param, _ in master_param_grads:
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
optimizer.apply_gradients(master_param_grads)
if weight_decay > 0:
for param, grad in master_param_grads:
# if exclude_from_weight_decay(param.name.rstrip(".master")):
# continue
if param.name == 'concept_emb_mat' or param.name == 'wn_concept_emb_mat' or param.name == 'nell_concept_emb_mat':
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
master_param_to_train_param(master_param_grads, param_grads,
train_program)
else:
for param in train_program.global_block().all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
# if exclude_from_weight_decay(param.name):
# continue
if param.name == 'concept_emb_mat' or param.name == 'wn_concept_emb_mat' or param.name == 'nell_concept_emb_mat':
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on ReCoRD."""
import six
import math
import json
import random
import collections
import os
import pickle
import logging
import tokenization
from batching import prepare_batch_data
from eval.record_official_evaluate import evaluate, f1_score
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class ReCoRDExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
concept_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
self.concept_ids = concept_ids
def read_record_examples(input_file, is_training, version_2_with_negative=False):
"""Read a ReCoRD json file into a list of ReCoRDExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
paragraph_text = entry["passage"]["text"].replace('\xa0', ' ')
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in entry["qas"]:
qas_id = qa["id"]
question_text = qa["query"].replace('\xa0', ' ')
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
# if (len(qa["answers"]) != 1) and (not is_impossible):
# raise ValueError(
# "For training, each question should have exactly 1 answer."
# )
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(
end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.info("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = ReCoRDExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
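# Shape of the ReCoRD JSON this reader expects (a minimal, abbreviated entry;
# all field values below are invented for illustration):
#   {"data": [
#       {"passage": {"text": "..."},
#        "qas": [
#            {"id": "...",
#             "query": "...",
#             "answers": [{"start": 17, "text": "..."}]}]}]}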
class Examples_To_Features_Converter(object):
def __init__(self, **concept_settings):
self.concept_settings = concept_settings
# load necessary data files for mapping to related concepts
# 1. mapping from subword-level tokenization to word-level tokenization
tokenization_filepath = self.concept_settings['tokenization_path']
assert os.path.exists(tokenization_filepath)
self.all_tokenization_info = {}
for item in pickle.load(open(tokenization_filepath, 'rb')):
self.all_tokenization_info[item['id']] = item
        # 2. mapping from concept name to concept id (currently only one KB is supported)
self.concept2id = self.concept_settings['concept2id']
# 3. retrieved related wordnet concepts (if use_wordnet)
if concept_settings['use_wordnet']:
assert not self.concept_settings['use_nell']
retrieved_synset_filepath = self.concept_settings['retrieved_synset_path']
assert os.path.exists(retrieved_synset_filepath)
            self.synsets_info = pickle.load(open(retrieved_synset_filepath, 'rb'))  # token to synset names
self.max_concept_length = max([len(synsets) for synsets in self.synsets_info.values()])
# 4. retrieved related nell concepts (if use_nell)
if concept_settings['use_nell']:
assert not self.concept_settings['use_wordnet']
retrieved_nell_concept_filepath = self.concept_settings['retrieved_nell_concept_path']
assert os.path.exists(retrieved_nell_concept_filepath)
self.nell_retrieve_info = {}
for item in pickle.load(open(retrieved_nell_concept_filepath, 'rb')):
self.nell_retrieve_info[item['id']] = item
self.max_concept_length = max([max([len(entity_info['retrieved_concepts']) for entity_info in item['query_entities'] + item['document_entities']])
for qid, item in self.nell_retrieve_info.items() if len(item['query_entities'] + item['document_entities']) > 0])
# return list of concept ids given input subword list
def _lookup_wordnet_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, tolower, tokenizer):
concept_ids = []
for index in range(len(sub_tokens)):
original_token = tokens[sub_to_ori_index[index]]
            # if tokens are in upper case, we must lowercase them for retrieval
retrieve_token = tokenizer.basic_tokenizer._run_strip_accents(original_token.lower()) if tolower else original_token
if retrieve_token in self.synsets_info:
concept_ids.append([self.concept2id[synset_name] for synset_name in self.synsets_info[retrieve_token]])
else:
concept_ids.append([])
return concept_ids
def _lookup_nell_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, nell_info):
original_concept_ids = [[] for _ in range(len(tokens))]
for entity_info in nell_info:
for pos in range(entity_info['token_start'], entity_info['token_end'] + 1):
original_concept_ids[pos] += [self.concept2id[category_name] for category_name in entity_info['retrieved_concepts']]
for pos in range(len(original_concept_ids)):
original_concept_ids[pos] = list(set(original_concept_ids[pos]))
concept_ids = [original_concept_ids[sub_to_ori_index[index]] for index in range(len(sub_tokens))]
return concept_ids
def __call__(self,
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
tokenization_info = self.all_tokenization_info[example.qas_id]
query_tokens = tokenizer.tokenize(example.question_text)
            # check that the online subword tokenization result matches the offline result
assert query_tokens == tokenization_info['query_subtokens']
if self.concept_settings['use_wordnet']:
query_concepts = self._lookup_wordnet_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'],
tolower=tokenizer.basic_tokenizer.do_lower_case == False, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
query_concepts = self._lookup_nell_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'], self.nell_retrieve_info[example.qas_id]['query_entities'])
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
query_concepts = query_concepts[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
assert all_doc_tokens == tokenization_info['document_subtokens']
if self.concept_settings['use_wordnet']:
doc_concepts = self._lookup_wordnet_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'],
tolower=tokenizer.basic_tokenizer.do_lower_case == False, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
doc_concepts = self._lookup_nell_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'], self.nell_retrieve_info[example.qas_id]['document_entities'])
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
            # We can have documents that are longer than the maximum sequence length.
            # To deal with this we use a sliding window approach, taking chunks
            # of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
concept_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
concept_ids.append([])
for token, query_concept in zip(query_tokens, query_concepts):
tokens.append(token)
segment_ids.append(0)
concept_ids.append(query_concept)
tokens.append("[SEP]")
segment_ids.append(0)
concept_ids.append([])
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
concept_ids.append(doc_concepts[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
concept_ids.append([])
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
for cindex in range(len(concept_ids)):
concept_ids[cindex] = concept_ids[cindex] + [0] * (self.max_concept_length - len(concept_ids[cindex]))
concept_ids[cindex] = concept_ids[cindex][:self.max_concept_length]
assert all([len(id_list) == self.max_concept_length for id_list in concept_ids])
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
# out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
continue
# out_of_span = True
# if out_of_span:
# start_position = 0
# end_position = 0
# else:
# doc_offset = len(query_tokens) + 2
# start_position = tok_start_position - doc_start + doc_offset
# end_position = tok_end_position - doc_start + doc_offset
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if example_index < 3:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
logger.info("concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(concept_ids)]))
if is_training and example.is_impossible:
logger.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position +
1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
concept_ids=concept_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
unique_id += 1
yield feature
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The ReCoRD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
    # Question: What country is the top exporter of electronics?
    # Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in ReCoRD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
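# Hypothetical usage sketch for the reader below (argument values are
# assumptions; the real values come from the command-line flags passed to
# src/run_record.py):
#   processor = DataProcessor(
#       vocab_path='cased_L-24_H-1024_A-16/vocab.txt', do_lower_case=False,
#       max_seq_length=384, in_tokens=False, doc_stride=128, max_query_length=64)
#   train_generator = processor.data_generator(
#       data_path='../data/train.json', batch_size=6, phase='train',
#       shuffle=True, dev_count=1, epoch=1, **concept_settings)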
class DataProcessor(object):
def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
doc_stride, max_query_length):
self._tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self._max_seq_length = max_seq_length
self._doc_stride = doc_stride
self._max_query_length = max_query_length
self._in_tokens = in_tokens
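# When `in_tokens` is True, `batch_size` is interpreted as a token budget
# (see batch_reader inside data_generator below); otherwise it is the number
# of features per batch.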
self.vocab = self._tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = -1
self.num_train_examples = -1
self.current_train_epoch = -1
self.train_examples = None
self.predict_examples = None
self.num_examples = {'train': -1, 'predict': -1}
self.train_max_concept_length = None
self.predict_max_concept_length = None
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def get_examples(self,
data_path,
is_training,
version_2_with_negative=False):
examples = read_record_examples(
input_file=data_path,
is_training=is_training,
version_2_with_negative=version_2_with_negative)
return examples
def get_num_examples(self, phase):
if phase not in ['train', 'predict']:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
return self.num_examples[phase]
def get_features(self, examples, is_training, **concept_settings):
convert_examples_to_features = Examples_To_Features_Converter(**concept_settings)
features = convert_examples_to_features(
examples=examples,
tokenizer=self._tokenizer,
max_seq_length=self._max_seq_length,
doc_stride=self._doc_stride,
max_query_length=self._max_query_length,
is_training=is_training)
return features
def data_generator(self,
data_path,
batch_size,
phase='train',
shuffle=False,
dev_count=1,
version_2_with_negative=False,
epoch=1,
**concept_settings):
if phase == 'train':
self.train_examples = self.get_examples(
data_path,
is_training=True,
version_2_with_negative=version_2_with_negative)
examples = self.train_examples
self.num_examples['train'] = len(self.train_examples)
elif phase == 'predict':
self.predict_examples = self.get_examples(
data_path,
is_training=False,
version_2_with_negative=version_2_with_negative)
examples = self.predict_examples
self.num_examples['predict'] = len(self.predict_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
def batch_reader(features, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for (index, feature) in enumerate(features):
if phase == 'train':
self.current_train_example = index + 1
seq_len = len(feature.input_ids)
labels = [feature.unique_id
] if feature.start_position is None else [
feature.start_position, feature.end_position
]
example = [
# feature.input_ids, feature.segment_ids, range(seq_len), feature.concept_ids
feature.input_ids, feature.segment_ids, range(384), feature.concept_ids
] + labels
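# NOTE: the position ids above are hard-coded to range(384), presumably the
# max_seq_length used by the provided run scripts, rather than the
# commented-out per-feature range(seq_len).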
max_len = max(max_len, seq_len)
#max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(example)
total_token_num += seq_len
else:
yield batch, total_token_num
batch, total_token_num, max_len = [example
], seq_len, seq_len
if len(batch) > 0:
yield batch, total_token_num
if phase == 'train':
self.train_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_concept_length
else:
self.predict_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_concept_length
def wrapper():
for epoch_index in range(epoch):
if shuffle:
random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
features = self.get_features(examples, is_training=True, **concept_settings)
max_concept_length = self.train_max_concept_length
else:
features = self.get_features(examples, is_training=False, **concept_settings)
max_concept_length = self.predict_max_concept_length
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
features, batch_size, self._in_tokens):
batch_data = prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False,
max_concept_length=max_concept_length)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
verbose, predict_file, evaluation_result_file):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
logger.info("Writing evaluation result to: %s" % (evaluation_result_file))
# load ground truth file for evaluation and post-edit
with open(predict_file, "r", encoding='utf-8') as reader:
predict_json = json.load(reader)["data"]
all_candidates = {}
for passage in predict_json:
passage_text = passage['passage']['text']
candidates = []
for entity_info in passage['passage']['entities']:
start_offset = entity_info['start']
end_offset = entity_info['end']
candidates.append(passage_text[start_offset: end_offset + 1])
for qa in passage['qas']:
all_candidates[qa['id']] = candidates
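# all_candidates maps each question id to the passage's entity mention
# strings; ReCoRD restricts answers to these marked entities, which is why
# predictions are post-edited against them below.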
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
# debug
if best_non_null_entry is None:
logger.info("Emmm..., sth wrong")
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
# restrict the final prediction to the highest-ranked n-best entry that overlaps (F1 > 0) with at least one candidate entity; fall back to the top entry otherwise
picked_index = 0
for pred_index in range(len(nbest_json)):
if any([f1_score(nbest_json[pred_index]['text'], candidate) > 0. for candidate in all_candidates[example.qas_id]]):
picked_index = pred_index
break
all_predictions[example.qas_id] = nbest_json[picked_index]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
eval_result, _ = evaluate(predict_json, all_predictions)
with open(evaluation_result_file, "w") as writer:
writer.write(json.dumps(eval_result, indent=4) + "\n")
return eval_result
def get_final_text(pred_text, orig_text, do_lower_case, verbose):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the ReCoRD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
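# e.g. _strip_spaces("a b") returns ("ab", {0: 0, 1: 2}): index 0 of the
# stripped text maps back to index 0 of the original, index 1 maps to index 2.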
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose:
logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
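# e.g. _get_best_indexes([0.1, 2.0, 0.5], n_best_size=2) returns [1, 2].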
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
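# e.g. _compute_softmax([1.0, 1.0]) returns [0.5, 0.5]; subtracting the max
# score only improves numerical stability and does not change the result.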
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on ReCoRD."""
import six
import math
import json
import random
import collections
import os
import pickle
import logging
import tokenization
from batching_twomemory import prepare_batch_data
from eval.record_official_evaluate import evaluate, f1_score
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class ReCoRDExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
wn_concept_ids,
nell_concept_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
self.wn_concept_ids = wn_concept_ids
self.nell_concept_ids = nell_concept_ids
def read_record_examples(input_file, is_training, version_2_with_negative=False):
"""Read a ReCoRD json file into a list of ReCoRDExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
paragraph_text = entry["passage"]["text"].replace('\xa0', ' ')
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
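# e.g. for the passage "ab cd", doc_tokens becomes ["ab", "cd"] and
# char_to_word_offset becomes [0, 0, 0, 1, 1] (one word index per character).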
for qa in entry["qas"]:
qas_id = qa["id"]
question_text = qa["query"].replace('\xa0', ' ')
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
# if (len(qa["answers"]) != 1) and (not is_impossible):
# raise ValueError(
# "For training, each question should have exactly 1 answer."
# )
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(
end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.info("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = ReCoRDExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
class Examples_To_Features_Converter(object):
def __init__(self, **concept_settings):
self.concept_settings = concept_settings
# load necessary data files for mapping to related concepts
# 1. mapping from subword-level tokenization to word-level tokenization
tokenization_filepath = self.concept_settings['tokenization_path']
assert os.path.exists(tokenization_filepath)
self.all_tokenization_info = {}
for item in pickle.load(open(tokenization_filepath, 'rb')):
self.all_tokenization_info[item['id']] = item
# 2. mapping from concept name to concept id
self.wn_concept2id = self.concept_settings['wn_concept2id']
self.nell_concept2id = self.concept_settings['nell_concept2id']
# 3. retrieved related wordnet concepts (if use_wordnet)
if concept_settings['use_wordnet']:
retrieved_synset_filepath = self.concept_settings['retrieved_synset_path']
assert os.path.exists(retrieved_synset_filepath)
self.synsets_info = pickle.load(open(retrieved_synset_filepath, 'rb')) # token to synset names
self.max_wn_concept_length = max([len(synsets) for synsets in self.synsets_info.values()])
# 4. retrieved related nell concepts (if use_nell)
if concept_settings['use_nell']:
retrieved_nell_concept_filepath = self.concept_settings['retrieved_nell_concept_path']
assert os.path.exists(retrieved_nell_concept_filepath)
self.nell_retrieve_info = {}
for item in pickle.load(open(retrieved_nell_concept_filepath, 'rb')):
self.nell_retrieve_info[item['id']] = item
self.max_nell_concept_length = max([max([len(entity_info['retrieved_concepts']) for entity_info in item['query_entities'] + item['document_entities']])
for qid, item in self.nell_retrieve_info.items() if len(item['query_entities'] + item['document_entities']) > 0])
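# These maxima (max_wn_concept_length / max_nell_concept_length) define the
# padded width of the per-token concept-id lists built in __call__ below.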
# return list of concept ids given input subword list
def _lookup_wordnet_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, tolower, tokenizer):
concept_ids = []
for index in range(len(sub_tokens)):
original_token = tokens[sub_to_ori_index[index]]
# if tokens are cased, we must lowercase them (and strip accents) before retrieval
retrieve_token = tokenizer.basic_tokenizer._run_strip_accents(original_token.lower()) if tolower else original_token
if retrieve_token in self.synsets_info:
concept_ids.append([self.wn_concept2id[synset_name] for synset_name in self.synsets_info[retrieve_token]])
else:
concept_ids.append([])
return concept_ids
def _lookup_nell_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, nell_info):
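# Map word-level NELL entity annotations to concept ids, then project them
# onto the subword positions via sub_to_ori_index.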
original_concept_ids = [[] for _ in range(len(tokens))]
for entity_info in nell_info:
for pos in range(entity_info['token_start'], entity_info['token_end'] + 1):
original_concept_ids[pos] += [self.nell_concept2id[category_name] for category_name in entity_info['retrieved_concepts']]
for pos in range(len(original_concept_ids)):
original_concept_ids[pos] = list(set(original_concept_ids[pos]))
concept_ids = [original_concept_ids[sub_to_ori_index[index]] for index in range(len(sub_tokens))]
return concept_ids
def __call__(self,
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
tokenization_info = self.all_tokenization_info[example.qas_id]
query_tokens = tokenizer.tokenize(example.question_text)
# check that the online subword tokenization result matches the offline (preprocessed) result
assert query_tokens == tokenization_info['query_subtokens']
if self.concept_settings['use_wordnet']:
query_wn_concepts = self._lookup_wordnet_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'],
tolower=tokenizer.basic_tokenizer.do_lower_case == False, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
query_nell_concepts = self._lookup_nell_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'], self.nell_retrieve_info[example.qas_id]['query_entities'])
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
query_wn_concepts = query_wn_concepts[0:max_query_length]
query_nell_concepts = query_nell_concepts[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
assert all_doc_tokens == tokenization_info['document_subtokens']
if self.concept_settings['use_wordnet']:
doc_wn_concepts = self._lookup_wordnet_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'],
tolower=tokenizer.basic_tokenizer.do_lower_case == False, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
doc_nell_concepts = self._lookup_nell_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'], self.nell_retrieve_info[example.qas_id]['document_entities'])
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of the up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
wn_concept_ids = []
nell_concept_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
wn_concept_ids.append([])
nell_concept_ids.append([])
for token, query_wn_concept, query_nell_concept in zip(query_tokens, query_wn_concepts, query_nell_concepts):
tokens.append(token)
segment_ids.append(0)
wn_concept_ids.append(query_wn_concept)
nell_concept_ids.append(query_nell_concept)
tokens.append("[SEP]")
segment_ids.append(0)
wn_concept_ids.append([])
nell_concept_ids.append([])
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
wn_concept_ids.append(doc_wn_concepts[split_token_index])
nell_concept_ids.append(doc_nell_concepts[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
wn_concept_ids.append([])
nell_concept_ids.append([])
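# The feature layout is [CLS] query [SEP] document-span [SEP]; the special
# tokens carry empty concept lists, which are zero-padded below.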
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
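# Pad (with concept id 0) or truncate each token's WordNet/NELL concept-id
# list to the corresponding maximum length so they can be batched as dense
# tensors.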
for concept_ids, max_concept_length in zip((wn_concept_ids, nell_concept_ids), (self.max_wn_concept_length, self.max_nell_concept_length)):
for cindex in range(len(concept_ids)):
concept_ids[cindex] = concept_ids[cindex] + [0] * (max_concept_length - len(concept_ids[cindex]))
concept_ids[cindex] = concept_ids[cindex][:max_concept_length]
assert all([len(id_list) == max_concept_length for id_list in concept_ids])
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
# out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
continue
# out_of_span = True
# if out_of_span:
# start_position = 0
# end_position = 0
# else:
# doc_offset = len(query_tokens) + 2
# start_position = tok_start_position - doc_start + doc_offset
# end_position = tok_end_position - doc_start + doc_offset
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if example_index < 3:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
logger.info("wordnet_concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(wn_concept_ids)]))
logger.info("nell_concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(nell_concept_ids)]))
if is_training and example.is_impossible:
logger.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position +
1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
wn_concept_ids=wn_concept_ids,
nell_concept_ids=nell_concept_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
unique_id += 1
yield feature
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The ReCoRD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
# Question: What country is the top exporter of electronics?
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in ReCoRD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple doc spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
class DataProcessor(object):
def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
doc_stride, max_query_length):
self._tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self._max_seq_length = max_seq_length
self._doc_stride = doc_stride
self._max_query_length = max_query_length
self._in_tokens = in_tokens
self.vocab = self._tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = -1
self.num_train_examples = -1
self.current_train_epoch = -1
self.train_examples = None
self.predict_examples = None
self.num_examples = {'train': -1, 'predict': -1}
self.train_wn_max_concept_length = None
self.predict_wn_max_concept_length = None
self.train_nell_max_concept_length = None
self.predict_nell_max_concept_length = None
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def get_examples(self,
data_path,
is_training,
version_2_with_negative=False):
examples = read_record_examples(
input_file=data_path,
is_training=is_training,
version_2_with_negative=version_2_with_negative)
return examples
def get_num_examples(self, phase):
if phase not in ['train', 'predict']:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
return self.num_examples[phase]
def get_features(self, examples, is_training, **concept_settings):
convert_examples_to_features = Examples_To_Features_Converter(**concept_settings)
features = convert_examples_to_features(
examples=examples,
tokenizer=self._tokenizer,
max_seq_length=self._max_seq_length,
doc_stride=self._doc_stride,
max_query_length=self._max_query_length,
is_training=is_training)
return features
def data_generator(self,
data_path,
batch_size,
phase='train',
shuffle=False,
dev_count=1,
version_2_with_negative=False,
epoch=1,
**concept_settings):
if phase == 'train':
self.train_examples = self.get_examples(
data_path,
is_training=True,
version_2_with_negative=version_2_with_negative)
examples = self.train_examples
self.num_examples['train'] = len(self.train_examples)
elif phase == 'predict':
self.predict_examples = self.get_examples(
data_path,
is_training=False,
version_2_with_negative=version_2_with_negative)
examples = self.predict_examples
self.num_examples['predict'] = len(self.predict_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
def batch_reader(features, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for (index, feature) in enumerate(features):
if phase == 'train':
self.current_train_example = index + 1
seq_len = len(feature.input_ids)
labels = [feature.unique_id
] if feature.start_position is None else [
feature.start_position, feature.end_position
]
example = [
# feature.input_ids, feature.segment_ids, range(seq_len), feature.wn_concept_ids, feature.nell_concept_ids
feature.input_ids, feature.segment_ids, range(384), feature.wn_concept_ids, feature.nell_concept_ids
] + labels
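# NOTE: position ids are hard-coded to range(384) here as well, presumably
# the configured max_seq_length, rather than the per-feature range(seq_len).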
max_len = max(max_len, seq_len)
#max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(example)
total_token_num += seq_len
else:
yield batch, total_token_num
batch, total_token_num, max_len = [example
], seq_len, seq_len
if len(batch) > 0:
yield batch, total_token_num
if phase == 'train':
self.train_wn_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_wn_concept_length
self.train_nell_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_nell_concept_length
else:
self.predict_wn_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_wn_concept_length
self.predict_nell_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_nell_concept_length
def wrapper():
for epoch_index in range(epoch):
if shuffle:
random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
features = self.get_features(examples, is_training=True, **concept_settings)
max_wn_concept_length = self.train_wn_max_concept_length
max_nell_concept_length = self.train_nell_max_concept_length
else:
features = self.get_features(examples, is_training=False, **concept_settings)
max_wn_concept_length = self.predict_wn_max_concept_length
max_nell_concept_length = self.predict_nell_max_concept_length
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
features, batch_size, self._in_tokens):
batch_data = prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False,
max_wn_concept_length=max_wn_concept_length,
max_nell_concept_length=max_nell_concept_length)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
verbose, predict_file, evaluation_result_file):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
logger.info("Writing evaluation result to: %s" % (evaluation_result_file))
# load ground truth file for evaluation and post-edit
with open(predict_file, "r", encoding='utf-8') as reader:
predict_json = json.load(reader)["data"]
all_candidates = {}
for passage in predict_json:
passage_text = passage['passage']['text']
candidates = []
for entity_info in passage['passage']['entities']:
start_offset = entity_info['start']
end_offset = entity_info['end']
candidates.append(passage_text[start_offset: end_offset + 1])
for qa in passage['qas']:
all_candidates[qa['id']] = candidates
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
# debug
if best_non_null_entry is None:
logger.info("Emmm..., sth wrong")
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
# restrict the final prediction to the highest-ranked n-best entry that overlaps (F1 > 0) with at least one candidate entity; fall back to the top entry otherwise
picked_index = 0
for pred_index in range(len(nbest_json)):
if any([f1_score(nbest_json[pred_index]['text'], candidate) > 0. for candidate in all_candidates[example.qas_id]]):
picked_index = pred_index
break
all_predictions[example.qas_id] = nbest_json[picked_index]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
eval_result, _ = evaluate(predict_json, all_predictions)
with open(evaluation_result_file, "w") as writer:
writer.write(json.dumps(eval_result, indent=4) + "\n")
return eval_result
def get_final_text(pred_text, orig_text, do_lower_case, verbose):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the ReCoRD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose:
logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""
import six
import math
import json
import random
import collections
import os
import pickle
import logging
import tokenization
from batching import prepare_batch_data
from eval.squad_v1_official_evaluate import evaluate
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class SquadExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
concept_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
self.concept_ids = concept_ids
def read_squad_examples(input_file, is_training, version_2_with_negative=False):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(
end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.info("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
class Examples_To_Features_Converter(object):
def __init__(self, **concept_settings):
self.concept_settings = concept_settings
# load necessary data files for mapping to related concepts
# 1. mapping from subword-level tokenization to word-level tokenization
tokenization_filepath = self.concept_settings['tokenization_path']
assert os.path.exists(tokenization_filepath)
self.all_tokenization_info = {}
for item in pickle.load(open(tokenization_filepath, 'rb')):
self.all_tokenization_info[item['id']] = item
# 2. mapping from concept name to concept id (currently only support one KB)
self.concept2id = self.concept_settings['concept2id']
# 3. retrieved related wordnet concepts (if use_wordnet)
if concept_settings['use_wordnet']:
assert not self.concept_settings['use_nell']
retrieved_synset_filepath = self.concept_settings['retrieved_synset_path']
assert os.path.exists(retrieved_synset_filepath)
self.synsets_info = pickle.load(open(retrieved_synset_filepath, 'rb')) # token to synset names
self.max_concept_length = max([len(synsets) for synsets in self.synsets_info.values()])
# 4. retrieved related nell concepts (if use_nell)
if concept_settings['use_nell']:
assert not self.concept_settings['use_wordnet']
retrieved_nell_concept_filepath = self.concept_settings['retrieved_nell_concept_path']
assert os.path.exists(retrieved_nell_concept_filepath)
self.nell_retrieve_info = {}
for item in pickle.load(open(retrieved_nell_concept_filepath, 'rb')):
self.nell_retrieve_info[item['id']] = item
self.max_concept_length = max([max([len(entity_info['retrieved_concepts']) for entity_info in item['query_entities'] + item['document_entities']])
for qid, item in self.nell_retrieve_info.items() if len(item['query_entities'] + item['document_entities']) > 0])
# return list of concept ids given input subword list
def _lookup_wordnet_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, tolower, tokenizer):
concept_ids = []
for index in range(len(sub_tokens)):
original_token = tokens[sub_to_ori_index[index]]
# if tokens are cased, lowercase them (and strip accents) before looking them up
retrieve_token = tokenizer.basic_tokenizer._run_strip_accents(original_token.lower()) if tolower else original_token
if retrieve_token in self.synsets_info:
concept_ids.append([self.concept2id[synset_name] for synset_name in self.synsets_info[retrieve_token]])
else:
concept_ids.append([])
return concept_ids
def _lookup_nell_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, nell_info):
original_concept_ids = [[] for _ in range(len(tokens))]
for entity_info in nell_info:
for pos in range(entity_info['token_start'], entity_info['token_end'] + 1):
original_concept_ids[pos] += [self.concept2id[category_name] for category_name in entity_info['retrieved_concepts']]
for pos in range(len(original_concept_ids)):
original_concept_ids[pos] = list(set(original_concept_ids[pos]))
concept_ids = [original_concept_ids[sub_to_ori_index[index]] for index in range(len(sub_tokens))]
return concept_ids
def __call__(self,
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
tokenization_info = self.all_tokenization_info[example.qas_id]
query_tokens = tokenizer.tokenize(example.question_text)
# check that the online subword tokenization result matches the offline result
assert query_tokens == tokenization_info['query_subtokens']
if self.concept_settings['use_wordnet']:
query_concepts = self._lookup_wordnet_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'],
tolower=not tokenizer.basic_tokenizer.do_lower_case, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
query_concepts = self._lookup_nell_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'], self.nell_retrieve_info[example.qas_id]['query_entities'])
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
query_concepts = query_concepts[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
assert all_doc_tokens == tokenization_info['document_subtokens']
if self.concept_settings['use_wordnet']:
doc_concepts = self._lookup_wordnet_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'],
tolower=not tokenizer.basic_tokenizer.do_lower_case, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
doc_concepts = self._lookup_nell_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'], self.nell_retrieve_info[example.qas_id]['document_entities'])
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we use a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
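# Illustrative walkthrough of the loop above (values assumed, not from the
# actual data): with len(all_doc_tokens) == 200, max_tokens_for_doc == 100
# and doc_stride == 64, it yields the overlapping spans
#   DocSpan(start=0,   length=100)
#   DocSpan(start=64,  length=100)
#   DocSpan(start=128, length=72)
# so every document token is covered, and interior tokens appear in more than
# one span; _check_is_max_context below decides which span "owns" each token.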
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
concept_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
concept_ids.append([])
for token, query_concept in zip(query_tokens, query_concepts):
tokens.append(token)
segment_ids.append(0)
concept_ids.append(query_concept)
tokens.append("[SEP]")
segment_ids.append(0)
concept_ids.append([])
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
concept_ids.append(doc_concepts[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
concept_ids.append([])
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
for cindex in range(len(concept_ids)):
concept_ids[cindex] = concept_ids[cindex] + [0] * (self.max_concept_length - len(concept_ids[cindex]))
concept_ids[cindex] = concept_ids[cindex][:self.max_concept_length]
assert all([len(id_list) == self.max_concept_length for id_list in concept_ids])
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
# out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
continue
# out_of_span = True
# if out_of_span:
# start_position = 0
# end_position = 0
# else:
# doc_offset = len(query_tokens) + 2
# start_position = tok_start_position - doc_start + doc_offset
# end_position = tok_end_position - doc_start + doc_offset
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
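# Example of the offset arithmetic above (numbers assumed for illustration):
# a 10-token query gives doc_offset = 10 + 2 = 12 ([CLS] + query + [SEP]),
# so an answer starting at document subtoken 50 in a span with
# doc_span.start == 40 maps to start_position = 50 - 40 + 12 = 22 inside
# this feature's `tokens` list.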
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if example_index < 3:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
logger.info("concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(concept_ids)]))
if is_training and example.is_impossible:
logger.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position +
1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
concept_ids=concept_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
unique_id += 1
yield feature
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
#   Question: What country is the top exporter of electronics?
#   Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
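# Worked example of the search below (tokens assumed for illustration, and
# assuming the tokenizer keeps "1895" as a single piece): with
# doc_tokens = ["(", "1895", "-", "1943", ")", "."], input span (0, 5) and
# orig_answer_text = "1895", tok_answer_text is "1895", and the nested loops
# find new_start == new_end == 1, so the tighter span (1, 1) is returned.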
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple document spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
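# Plugging the example above into the score computed below: for 'bought'
# (position 7, 0-indexed), span B (start=3, length=5) scores
# min(4, 0) + 0.01 * 5 = 0.05, while span C (start=6, length=5) scores
# min(1, 3) + 0.01 * 5 = 1.05, so span C is the maximum-context span.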
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
class DataProcessor(object):
def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
doc_stride, max_query_length):
self._tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self._max_seq_length = max_seq_length
self._doc_stride = doc_stride
self._max_query_length = max_query_length
self._in_tokens = in_tokens
self.vocab = self._tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = -1
self.num_train_examples = -1
self.current_train_epoch = -1
self.train_examples = None
self.predict_examples = None
self.num_examples = {'train': -1, 'predict': -1}
self.train_max_concept_length = None
self.predict_max_concept_length = None
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def get_examples(self,
data_path,
is_training,
version_2_with_negative=False):
examples = read_squad_examples(
input_file=data_path,
is_training=is_training,
version_2_with_negative=version_2_with_negative)
return examples
def get_num_examples(self, phase):
if phase not in ['train', 'predict']:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
return self.num_examples[phase]
def get_features(self, examples, is_training, **concept_settings):
convert_examples_to_features = Examples_To_Features_Converter(**concept_settings)
features = convert_examples_to_features(
examples=examples,
tokenizer=self._tokenizer,
max_seq_length=self._max_seq_length,
doc_stride=self._doc_stride,
max_query_length=self._max_query_length,
is_training=is_training)
return features
def data_generator(self,
data_path,
batch_size,
phase='train',
shuffle=False,
dev_count=1,
version_2_with_negative=False,
epoch=1,
**concept_settings):
if phase == 'train':
self.train_examples = self.get_examples(
data_path,
is_training=True,
version_2_with_negative=version_2_with_negative)
examples = self.train_examples
self.num_examples['train'] = len(self.train_examples)
elif phase == 'predict':
self.predict_examples = self.get_examples(
data_path,
is_training=False,
version_2_with_negative=version_2_with_negative)
examples = self.predict_examples
self.num_examples['predict'] = len(self.predict_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
def batch_reader(features, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for (index, feature) in enumerate(features):
if phase == 'train':
self.current_train_example = index + 1
seq_len = len(feature.input_ids)
labels = [feature.unique_id
] if feature.start_position is None else [
feature.start_position, feature.end_position
]
example = [
# position ids are hardcoded below to the model's maximum sequence length (384) rather than this feature's seq_len:
# feature.input_ids, feature.segment_ids, range(seq_len), feature.concept_ids
feature.input_ids, feature.segment_ids, range(384), feature.concept_ids
] + labels
max_len = max(max_len, seq_len)
#max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(example)
total_token_num += seq_len
else:
yield batch, total_token_num
batch, total_token_num, max_len = [example
], seq_len, seq_len
if len(batch) > 0:
yield batch, total_token_num
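# Note on the two batching modes above (illustrative numbers): when
# in_tokens is True, batch_size is a token budget, e.g. a budget of 3072
# with max_len == 384 admits at most 8 features per batch; otherwise
# batch_size simply caps the number of features per batch.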
if phase == 'train':
self.train_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_concept_length
else:
self.predict_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_concept_length
def wrapper():
for epoch_index in range(epoch):
if shuffle:
random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
features = self.get_features(examples, is_training=True, **concept_settings)
max_concept_length = self.train_max_concept_length
else:
features = self.get_features(examples, is_training=False, **concept_settings)
max_concept_length = self.predict_max_concept_length
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
features, batch_size, self._in_tokens):
batch_data = prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False,
max_concept_length=max_concept_length)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
verbose, predict_file, evaluation_result_file):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
logger.info("Writing evaluation result to: %s" % (evaluation_result_file))
# load ground truth file for evaluation and post-edit
with open(predict_file, "r", encoding='utf-8') as reader:
predict_json = json.load(reader)["data"]
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
# debug
if best_non_null_entry is None:
logger.info("Emmm..., sth wrong")
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
eval_result = evaluate(predict_json, all_predictions)
with open(evaluation_result_file, "w") as writer:
writer.write(json.dumps(eval_result, indent=4) + "\n")
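# For reference, `all_predictions` written above is a flat mapping from
# question id to answer string, e.g. {"<qas_id>": "Denver Broncos", ...}
# (id and answer made up for illustration), and `eval_result` is whatever
# the official SQuAD v1.1 script returns (an exact-match / F1 summary).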
return eval_result
def get_final_text(pred_text, orig_text, do_lower_case, verbose):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
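# Example of _strip_spaces (input chosen for illustration): for text "a b c"
# it returns ns_text == "abc" with ns_to_s_map == {0: 0, 1: 2, 2: 4}, i.e.
# each non-space character index is mapped back to its position in the
# original, space-containing string.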
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose:
logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""
import six
import math
import json
import random
import collections
import os
import pickle
import logging
import tokenization
from batching_twomemory import prepare_batch_data
from eval.squad_v1_official_evaluate import evaluate
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class SquadExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.end_position:
s += ", end_position: %d" % (self.end_position)
if self.is_impossible:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
wn_concept_ids,
nell_concept_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
self.wn_concept_ids = wn_concept_ids
self.nell_concept_ids = nell_concept_ids
def read_squad_examples(input_file, is_training, version_2_with_negative=False):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(
end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.info("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
class Examples_To_Features_Converter(object):
def __init__(self, **concept_settings):
self.concept_settings = concept_settings
# load necessary data files for mapping to related concepts
# 1. mapping from subword-level tokenization to word-level tokenization
tokenization_filepath = self.concept_settings['tokenization_path']
assert os.path.exists(tokenization_filepath)
self.all_tokenization_info = {}
for item in pickle.load(open(tokenization_filepath, 'rb')):
self.all_tokenization_info[item['id']] = item
# 2. mapping from concept name to concept id
self.wn_concept2id = self.concept_settings['wn_concept2id']
self.nell_concept2id = self.concept_settings['nell_concept2id']
# 3. retrieved related wordnet concepts (if use_wordnet)
if concept_settings['use_wordnet']:
retrieved_synset_filepath = self.concept_settings['retrieved_synset_path']
assert os.path.exists(retrieved_synset_filepath)
self.synsets_info = pickle.load(open(retrieved_synset_filepath, 'rb')) # token to synset names
self.max_wn_concept_length = max([len(synsets) for synsets in self.synsets_info.values()])
# 4. retrieved related nell concepts (if use_nell)
if concept_settings['use_nell']:
retrieved_nell_concept_filepath = self.concept_settings['retrieved_nell_concept_path']
assert os.path.exists(retrieved_nell_concept_filepath)
self.nell_retrieve_info = {}
for item in pickle.load(open(retrieved_nell_concept_filepath, 'rb')):
self.nell_retrieve_info[item['id']] = item
self.max_nell_concept_length = max([max([len(entity_info['retrieved_concepts']) for entity_info in item['query_entities'] + item['document_entities']])
for qid, item in self.nell_retrieve_info.items() if len(item['query_entities'] + item['document_entities']) > 0])
# return list of concept ids given input subword list
def _lookup_wordnet_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, tolower, tokenizer):
concept_ids = []
for index in range(len(sub_tokens)):
original_token = tokens[sub_to_ori_index[index]]
# if tokens are cased, lowercase them (and strip accents) before looking them up
retrieve_token = tokenizer.basic_tokenizer._run_strip_accents(original_token.lower()) if tolower else original_token
if retrieve_token in self.synsets_info:
concept_ids.append([self.wn_concept2id[synset_name] for synset_name in self.synsets_info[retrieve_token]])
else:
concept_ids.append([])
return concept_ids
def _lookup_nell_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, nell_info):
original_concept_ids = [[] for _ in range(len(tokens))]
for entity_info in nell_info:
for pos in range(entity_info['token_start'], entity_info['token_end'] + 1):
original_concept_ids[pos] += [self.nell_concept2id[category_name] for category_name in entity_info['retrieved_concepts']]
for pos in range(len(original_concept_ids)):
original_concept_ids[pos] = list(set(original_concept_ids[pos]))
concept_ids = [original_concept_ids[sub_to_ori_index[index]] for index in range(len(sub_tokens))]
return concept_ids
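# Illustrative example (ids assumed): if nell_info contains one entity with
# token_start=2, token_end=3 whose retrieved categories map to ids [7, 12],
# then original_concept_ids becomes [[], [], [7, 12], [7, 12], [], ...] and
# every subtoken derived from word 2 or 3 receives [7, 12] in concept_ids.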
def __call__(self,
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
tokenization_info = self.all_tokenization_info[example.qas_id]
query_tokens = tokenizer.tokenize(example.question_text)
# check that the online subword tokenization result matches the offline result
assert query_tokens == tokenization_info['query_subtokens']
if self.concept_settings['use_wordnet']:
query_wn_concepts = self._lookup_wordnet_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'],
tolower=not tokenizer.basic_tokenizer.do_lower_case, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
query_nell_concepts = self._lookup_nell_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'], self.nell_retrieve_info[example.qas_id]['query_entities'])
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
query_wn_concepts = query_wn_concepts[0:max_query_length]
query_nell_concepts = query_nell_concepts[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
assert all_doc_tokens == tokenization_info['document_subtokens']
if self.concept_settings['use_wordnet']:
doc_wn_concepts = self._lookup_wordnet_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'],
tolower=not tokenizer.basic_tokenizer.do_lower_case, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
doc_nell_concepts = self._lookup_nell_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'], self.nell_retrieve_info[example.qas_id]['document_entities'])
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we use a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
wn_concept_ids = []
nell_concept_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
wn_concept_ids.append([])
nell_concept_ids.append([])
for token, query_wn_concept, query_nell_concept in zip(query_tokens, query_wn_concepts, query_nell_concepts):
tokens.append(token)
segment_ids.append(0)
wn_concept_ids.append(query_wn_concept)
nell_concept_ids.append(query_nell_concept)
tokens.append("[SEP]")
segment_ids.append(0)
wn_concept_ids.append([])
nell_concept_ids.append([])
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
wn_concept_ids.append(doc_wn_concepts[split_token_index])
nell_concept_ids.append(doc_nell_concepts[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
wn_concept_ids.append([])
nell_concept_ids.append([])
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
for concept_ids, max_concept_length in zip((wn_concept_ids, nell_concept_ids), (self.max_wn_concept_length, self.max_nell_concept_length)):
for cindex in range(len(concept_ids)):
concept_ids[cindex] = concept_ids[cindex] + [0] * (max_concept_length - len(concept_ids[cindex]))
concept_ids[cindex] = concept_ids[cindex][:max_concept_length]
assert all([len(id_list) == max_concept_length for id_list in concept_ids])
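# Padding sketch (lengths assumed): with max_wn_concept_length == 3, a token
# whose WordNet lookup returned ids [5, 9] is padded to [5, 9, 0]; NELL ids
# are padded independently to max_nell_concept_length, so the two memories
# keep separate, rectangular id matrices of shape [seq_len, max_*_length].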
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
# out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
continue
# out_of_span = True
# if out_of_span:
# start_position = 0
# end_position = 0
# else:
# doc_offset = len(query_tokens) + 2
# start_position = tok_start_position - doc_start + doc_offset
# end_position = tok_end_position - doc_start + doc_offset
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if example_index < 3:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
logger.info("wordnet_concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(wn_concept_ids)]))
logger.info("nell_concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(nell_concept_ids)]))
if is_training and example.is_impossible:
logger.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position +
1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
wn_concept_ids=wn_concept_ids,
nell_concept_ids=nell_concept_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
unique_id += 1
yield feature
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
#   Question: What country is the top exporter of electronics?
#   Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple document spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
class DataProcessor(object):
def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
doc_stride, max_query_length):
self._tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self._max_seq_length = max_seq_length
self._doc_stride = doc_stride
self._max_query_length = max_query_length
self._in_tokens = in_tokens
self.vocab = self._tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = -1
self.num_train_examples = -1
self.current_train_epoch = -1
self.train_examples = None
self.predict_examples = None
self.num_examples = {'train': -1, 'predict': -1}
self.train_wn_max_concept_length = None
self.predict_wn_max_concept_length = None
self.train_nell_max_concept_length = None
self.predict_nell_max_concept_length = None
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def get_examples(self,
data_path,
is_training,
version_2_with_negative=False):
examples = read_squad_examples(
input_file=data_path,
is_training=is_training,
version_2_with_negative=version_2_with_negative)
return examples
def get_num_examples(self, phase):
if phase not in ['train', 'predict']:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
return self.num_examples[phase]
def get_features(self, examples, is_training, **concept_settings):
convert_examples_to_features = Examples_To_Features_Converter(**concept_settings)
features = convert_examples_to_features(
examples=examples,
tokenizer=self._tokenizer,
max_seq_length=self._max_seq_length,
doc_stride=self._doc_stride,
max_query_length=self._max_query_length,
is_training=is_training)
return features
def data_generator(self,
data_path,
batch_size,
phase='train',
shuffle=False,
dev_count=1,
version_2_with_negative=False,
epoch=1,
**concept_settings):
if phase == 'train':
self.train_examples = self.get_examples(
data_path,
is_training=True,
version_2_with_negative=version_2_with_negative)
examples = self.train_examples
self.num_examples['train'] = len(self.train_examples)
elif phase == 'predict':
self.predict_examples = self.get_examples(
data_path,
is_training=False,
version_2_with_negative=version_2_with_negative)
examples = self.predict_examples
self.num_examples['predict'] = len(self.predict_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
def batch_reader(features, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for (index, feature) in enumerate(features):
if phase == 'train':
self.current_train_example = index + 1
seq_len = len(feature.input_ids)
labels = [feature.unique_id
] if feature.start_position is None else [
feature.start_position, feature.end_position
]
example = [
# position ids are hardcoded below to the model's maximum sequence length (384) rather than this feature's seq_len:
# feature.input_ids, feature.segment_ids, range(seq_len), feature.wn_concept_ids, feature.nell_concept_ids
feature.input_ids, feature.segment_ids, range(384), feature.wn_concept_ids, feature.nell_concept_ids
] + labels
max_len = max(max_len, seq_len)
#max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(example)
total_token_num += seq_len
else:
yield batch, total_token_num
batch, total_token_num, max_len = [example
], seq_len, seq_len
if len(batch) > 0:
yield batch, total_token_num
if phase == 'train':
self.train_wn_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_wn_concept_length
self.train_nell_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_nell_concept_length
else:
self.predict_wn_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_wn_concept_length
self.predict_nell_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_nell_concept_length
def wrapper():
for epoch_index in range(epoch):
if shuffle:
random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
features = self.get_features(examples, is_training=True, **concept_settings)
max_wn_concept_length = self.train_wn_max_concept_length
max_nell_concept_length = self.train_nell_max_concept_length
else:
features = self.get_features(examples, is_training=False, **concept_settings)
max_wn_concept_length = self.predict_wn_max_concept_length
max_nell_concept_length = self.predict_nell_max_concept_length
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
features, batch_size, self._in_tokens):
batch_data = prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False,
max_wn_concept_length=max_wn_concept_length,
max_nell_concept_length=max_nell_concept_length)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
verbose, predict_file, evaluation_result_file):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
logger.info("Writing evaluation result to: %s" % (evaluation_result_file))
# load ground truth file for evaluation and post-edit
with open(predict_file, "r", encoding='utf-8') as reader:
predict_json = json.load(reader)["data"]
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
# this should only happen if every n-best entry is the empty string
if best_non_null_entry is None:
logger.info("No non-null prediction found in the n-best list")
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
eval_result = evaluate(predict_json, all_predictions)
with open(evaluation_result_file, "w") as writer:
writer.write(json.dumps(eval_result, indent=4) + "\n")
return eval_result
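# Illustrative sketch (hypothetical helper, never called by this module): the
# v2.0 branch above predicts the empty string whenever the null score beats the
# best non-null span by more than null_score_diff_threshold.
def _example_null_decision(score_null, best_start_logit, best_end_logit,
                           best_text, null_score_diff_threshold=0.0):
    """Return the final prediction text under the null-score-diff rule."""
    score_diff = score_null - best_start_logit - best_end_logit
    return "" if score_diff > null_score_diff_threshold else best_text
# e.g. _example_null_decision(1.0, 2.0, 2.0, "Steve Smith") keeps "Steve Smith",
# while _example_null_decision(5.0, 2.0, 2.0, "Steve Smith") returns "".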
def get_final_text(pred_text, orig_text, do_lower_case, verbose):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases, in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose:
logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
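# Worked example (assuming BasicTokenizer splits off punctuation as described
# above): get_final_text("Steve Smith", "Steve Smith's", do_lower_case=False,
# verbose=False) aligns the stripped strings "SteveSmith's" on both sides and
# returns "Steve Smith", dropping the trailing "'s" that the span never covered.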
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
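# Quick illustration: with logits [0.1, 2.3, -1.0, 0.7] and n_best_size=2,
# _get_best_indexes returns [1, 3] -- the positions of the two largest logits,
# ordered from the highest score down.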
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
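# Sanity-check sketch (hypothetical helper, not part of the pipeline): the
# max-subtraction above is the usual numerical-stability trick, so the result
# should match a straightforward numpy softmax.
def _example_softmax_check(scores):
    import numpy as np
    shifted = np.asarray(scores, dtype=np.float64) - max(scores)
    expected = np.exp(shifted) / np.exp(shifted).sum()
    return np.allclose(_compute_softmax(scores), expected)
# e.g. _example_softmax_check([1.0, 2.0, 3.0]) evaluates to True.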
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on ReCoRD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import multiprocessing
import os
import time
import logging
import random
import numpy as np
import paddle
import paddle.fluid as fluid
from reader.record import DataProcessor, write_predictions
from model.bert import BertConfig, BertModel
from model.layers import MemoryLayer, TriLinearTwoTimeSelfAttentionLayer
from utils.args import ArgumentGroup, print_arguments
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
# yapf: disable
parser = argparse.ArgumentParser()
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval for validation (effective only when do_val is True).")
train_g.add_arg("use_ema", bool, True, "Whether to use ema.")
train_g.add_arg("ema_decay", float, 0.9999, "Decay rate for expoential moving average.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_file", str, None, "ReCoRD json for training. E.g., train.json.")
data_g.add_arg("predict_file", str, None, "ReCoRD json for predictions. E.g. dev.json.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
"If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("doc_stride", int, 128,
"When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20,
"The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
"If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 42, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, False, "Whether to perform validation during training.")
run_type_g.add_arg("do_predict", bool, False, "Whether to perform prediction.")
run_type_g.add_arg("freeze", bool, False, "freeze bert parameters")
mem_settings_g = ArgumentGroup(parser, "memory", "memory settings.")
mem_settings_g.add_arg('concept_embedding_path', str, None, 'path of pretrained concept file')
mem_settings_g.add_arg('use_wordnet', bool, False, 'whether to use wordnet memory')
mem_settings_g.add_arg('retrieved_synset_path', str, '../retrieve_concepts/retrieve_wordnet/output_record/retrived_synsets.data', 'path of retrieved synsets')
mem_settings_g.add_arg('use_nell', bool, False, 'whether to use nell memory')
mem_settings_g.add_arg('train_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_record/train.retrieved_nell_concepts.data', 'path of retrieved concepts for trainset')
mem_settings_g.add_arg('dev_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_record/dev.retrieved_nell_concepts.data', 'path of retrieved concepts for devset')
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config, max_concept_length, concept_embedding_mat, is_training=False, freeze=False):
if is_training:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, concept_ids, input_mask, start_positions,
end_positions) = fluid.layers.read_file(pyreader)
else:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1]],
dtypes=['int64', 'int64', 'int64', 'int64', 'float32', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, concept_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)
'''1st Layer: BERT Layer'''
bert = BertModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
input_mask=input_mask,
config=bert_config,
use_fp16=args.use_fp16)
enc_out = bert.get_sequence_output()
if freeze:
enc_out.stop_gradient=True
logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))
'''2nd layer: Memory Layer'''
# get memory embedding
concept_vocab_size = concept_embedding_mat.shape[0]
concept_dim = concept_embedding_mat.shape[1]
memory_embs = fluid.layers.embedding(concept_ids,
size=(concept_vocab_size, concept_dim),
param_attr=fluid.ParamAttr(name="concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
# get memory length
concept_ids_reduced = fluid.layers.equal(concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
concept_ids_reduced = fluid.layers.cast(concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
mem_length = fluid.layers.reduce_sum(concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
# select and integrate
memory_layer = MemoryLayer(bert_config, max_concept_length, concept_dim, mem_method='cat')
memory_output = memory_layer.forward(enc_out, memory_embs, mem_length, ignore_no_memory_token=True)
'''3rd layer: Self-Matching Layer'''
# calculate input dim for self-matching layer
if memory_layer.mem_method == 'add':
memory_output_size = bert_config['hidden_size']
elif memory_layer.mem_method == 'cat':
memory_output_size = bert_config['hidden_size'] + concept_dim
else:
raise ValueError("memory_layer.mem_method must be 'add' or 'cat'")
logger.info("memory_output_size: {}".format(memory_output_size))
# do matching
self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
memory_output_size, dropout_rate=0.0,
cat_mul=True, cat_sub=True, cat_twotime=True,
cat_twotime_mul=False, cat_twotime_sub=True) # [bs, sq, concat_hs]
att_output = self_att_layer.forward(memory_output, input_mask) # [bs, sq, concat_hs]
'''4th layer: Output Layer'''
logits = fluid.layers.fc(
input=att_output,
size=2,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_squad_out_w",
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=bert_config['initializer_range'])),
bias_attr=fluid.ParamAttr(
name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
batch_ones = fluid.layers.fill_constant_batch_size_like(
input=start_logits, dtype='int64', shape=[1], value=1)
num_seqs = fluid.layers.reduce_sum(input=batch_ones)
if is_training:
def compute_loss(logits, positions):
loss = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=positions)
loss = fluid.layers.mean(x=loss)
return loss
start_loss = compute_loss(start_logits, start_positions)
end_loss = compute_loss(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2.0
if args.use_fp16 and args.loss_scaling > 1.0:
total_loss = total_loss * args.loss_scaling
return pyreader, total_loss, num_seqs
else:
return pyreader, unique_id, start_logits, end_logits, num_seqs
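# Minimal numpy sketch (hypothetical helper, not used at runtime) of the
# memory-length computation in create_model: concept id 0 is the padding slot
# (see read_concept_embedding below), so a concept counts toward the memory
# length iff its id is non-zero, i.e. mem_length = -((ids == 0) - 1) summed
# over the concept axis.
def _example_mem_length(concept_ids):
    import numpy as np
    ids = np.asarray(concept_ids)                    # [seq_len, max_concept_len]
    non_pad = -((ids == 0).astype("float32") - 1.0)  # 1.0 for real concepts, 0.0 for padding
    return non_pad.sum(axis=-1)                      # concepts attached to each token
# e.g. _example_mem_length([[3, 7, 0], [0, 0, 0]]) gives array([2., 0.], dtype=float32).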
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def predict(test_exe, test_program, test_pyreader, fetch_list, processor, eval_concept_settings, eval_output_name='eval_result.json'):
if not os.path.exists(args.checkpoints):
os.makedirs(args.checkpoints)
output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
output_evaluation_result_file = os.path.join(args.checkpoints, eval_output_name)
test_pyreader.start()
all_results = []
time_begin = time.time()
while True:
try:
np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run(
fetch_list=fetch_list, program=test_program)
for idx in range(np_unique_ids.shape[0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % len(all_results))
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
features = processor.get_features(
processor.predict_examples, is_training=False, **eval_concept_settings)
eval_result = write_predictions(processor.predict_examples, features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
args.version_2_with_negative,
args.null_score_diff_threshold, args.verbose, args.predict_file, output_evaluation_result_file)
return eval_result
def read_concept_embedding(embedding_path):
fin = open(embedding_path, encoding='utf-8')
info = [line.strip() for line in fin]
dim = len(info[0].split(' ')[1:])
n_concept = len(info)
embedding_mat = []
id2concept, concept2id = [], {}
# add padding concept into vocab
id2concept.append('<pad_concept>')
concept2id['<pad_concept>'] = 0
embedding_mat.append([0.0 for _ in range(dim)])
for line in info:
concept_name = line.split(' ')[0]
embedding = [float(value_str) for value_str in line.split(' ')[1:]]
assert len(embedding) == dim and not np.any(np.isnan(embedding))
embedding_mat.append(embedding)
concept2id[concept_name] = len(id2concept)
id2concept.append(concept_name)
embedding_mat = np.array(embedding_mat, dtype=np.float32)
return id2concept, concept2id, embedding_mat
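# The loader above expects a plain-text, word2vec-style file: one concept per
# line, the concept name first and its embedding values after it, all separated
# by single spaces (as in wn_concept2vec.txt / nell_concept2vec.txt). A
# hypothetical two-line file for a quick local check could look like:
#
#     concept_a 0.1 0.2 0.3
#     concept_b 0.4 0.5 0.6
#
# read_concept_embedding on such a file returns a three-entry vocabulary (the
# '<pad_concept>' row plus the two concepts) and a 3x3 float32 matrix whose
# first row is all zeros.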
def train(args):
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
if not (args.do_train or args.do_predict or args.do_val):
raise ValueError("For args `do_train` and `do_predict`, at "
"least one of them must be True.")
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
id2concept, concept2id, concept_embedding_mat = read_concept_embedding(
args.concept_embedding_path)
processor = DataProcessor(
vocab_path=args.vocab_path,
do_lower_case=args.do_lower_case,
max_seq_length=args.max_seq_len,
in_tokens=args.in_tokens,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length)
startup_prog = fluid.Program()
if args.random_seed is not None:
startup_prog.random_seed = args.random_seed
random.seed(args.random_seed)
np.random.seed(args.random_seed)
if args.do_train:
train_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_record/tokens/train.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'concept2id': concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
}
train_data_generator = processor.data_generator(
data_path=args.train_file,
batch_size=args.batch_size,
phase='train',
shuffle=True,
dev_count=dev_count,
version_2_with_negative=args.version_2_with_negative,
epoch=args.epoch,
**train_concept_settings)
num_train_examples = processor.get_num_examples(phase='train')
if args.in_tokens:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size // args.max_seq_len) // dev_count
else:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size) // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
logger.info("Device count: %d" % dev_count)
logger.info("Num train examples: %d" % num_train_examples)
logger.info("Max train steps: %d" % max_train_steps)
logger.info("Num warmup steps: %d" % warmup_steps)
train_program = fluid.Program()
# if args.random_seed is not None:
# train_program.random_seed = args.random_seed
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, loss, num_seqs = create_model(
pyreader_name='train_reader',
bert_config=bert_config,
max_concept_length=processor.train_max_concept_length,
concept_embedding_mat=concept_embedding_mat,
is_training=True,
freeze=args.freeze)
scheduled_lr = optimization(
loss=loss,
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
train_program=train_program,
startup_prog=startup_prog,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
use_fp16=args.use_fp16,
loss_scaling=args.loss_scaling)
if args.use_ema:
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
ema.update()
fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
if args.verbose:
if args.in_tokens:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program,
batch_size=args.batch_size // args.max_seq_len)
else:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program, batch_size=args.batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit))
if args.do_predict or args.do_val:
eval_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_record/tokens/dev.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'concept2id': concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
}
eval_data_generator = processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
test_prog = fluid.Program()
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
pyreader_name='test_reader',
bert_config=bert_config,
max_concept_length=processor.predict_max_concept_length,
concept_embedding_mat=concept_embedding_mat,
is_training=False)
if args.use_ema and 'ema' not in dir():
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
start_logits.name, end_logits.name, num_seqs.name])
test_prog = test_prog.clone(for_test=True)
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
exe.run(startup_prog)
if args.do_train:
logger.info('load pretrained concept embedding')
fluid.global_scope().find_var('concept_emb_mat').get_tensor().set(concept_embedding_mat, place)
if args.init_checkpoint and args.init_pretraining_params:
logger.info(
"WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
"both are set! Only arg 'init_checkpoint' is made valid.")
if args.init_checkpoint:
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.init_pretraining_params:
init_pretraining_params(
exe,
args.init_pretraining_params,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.do_predict or args.do_val:
if not args.init_checkpoint:
raise ValueError("args 'init_checkpoint' should be set if"
"only doing prediction!")
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
if args.do_train:
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = args.use_fast_executor
exec_strategy.num_threads = dev_count
exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
train_exe = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
main_program=train_program)
train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.start()
steps = 0
total_cost, total_num_seqs = [], []
time_begin = time.time()
while steps < max_train_steps:
try:
steps += 1
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
fetch_list = [loss.name, num_seqs.name]
else:
fetch_list = [
loss.name, scheduled_lr.name, num_seqs.name
]
else:
fetch_list = []
outputs = train_exe.run(fetch_list=fetch_list)
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
np_loss, np_num_seqs = outputs
else:
np_loss, np_lr, np_num_seqs = outputs
total_cost.extend(np_loss * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
)
verbose += "learning rate: %f" % (
np_lr[0]
if warmup_steps > 0 else args.learning_rate)
logger.info(verbose)
time_end = time.time()
used_time = time_end - time_begin
current_example, epoch = processor.get_train_progress()
logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"speed: %f steps/s" %
(epoch, current_example, num_train_examples, steps,
np.sum(total_cost) / np.sum(total_num_seqs),
args.skip_steps / used_time))
total_cost, total_num_seqs = [], []
time_begin = time.time()
if steps % args.save_steps == 0 or steps == max_train_steps:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
if steps % args.validation_steps == 0 or steps == max_train_steps:
if args.do_val:
test_pyreader.decorate_tensor_provider(
processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
)
val_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings, 'validate_result_step_{}.json'.format(steps))
logger.info("Validation performance after step {}:\n* Exact_match: {}\n* F1: {}".format(steps, val_performance['exact_match'], val_performance['f1']))
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps) + "_final")
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.do_predict:
test_pyreader.decorate_tensor_provider(eval_data_generator)
if args.use_ema:
with ema.apply(exe):
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
else:
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(eval_performance['exact_match'], eval_performance['f1']))
if __name__ == '__main__':
print_arguments(args)
train(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on ReCoRD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import multiprocessing
import os
import time
import logging
import random
import numpy as np
import paddle
import paddle.fluid as fluid
from reader.record_twomemory import DataProcessor, write_predictions
from model.bert import BertConfig, BertModel
from model.layers import MemoryLayer, TriLinearTwoTimeSelfAttentionLayer
from utils.args import ArgumentGroup, print_arguments
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
# yapf: disable
parser = argparse.ArgumentParser()
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval for validation (effective only when do_val is True).")
train_g.add_arg("use_ema", bool, True, "Whether to use ema.")
train_g.add_arg("ema_decay", float, 0.9999, "Decay rate for expoential moving average.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_file", str, None, "ReCoRD json for training. E.g., train.json.")
data_g.add_arg("predict_file", str, None, "ReCoRD json for predictions. E.g. dev.json.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
"If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("doc_stride", int, 128,
"When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20,
"The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
"If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 42, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, False, "Whether to perform validation during training.")
run_type_g.add_arg("do_predict", bool, False, "Whether to perform prediction.")
run_type_g.add_arg("freeze", bool, False, "freeze bert parameters")
mem_settings_g = ArgumentGroup(parser, "memory", "memory settings.")
mem_settings_g.add_arg('wn_concept_embedding_path', str, None, 'path of wordnet pretrained concept file')
mem_settings_g.add_arg('nell_concept_embedding_path', str, None, 'path of nell pretrained concept file')
mem_settings_g.add_arg('use_wordnet', bool, False, 'whether to use wordnet memory')
mem_settings_g.add_arg('retrieved_synset_path', str, '../retrieve_concepts/retrieve_wordnet/output_record/retrived_synsets.data', 'path of retrieved synsets')
mem_settings_g.add_arg('use_nell', bool, False, 'whether to use nell memory')
mem_settings_g.add_arg('train_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_record/train.retrieved_nell_concepts.data', 'path of retrieved concepts for trainset')
mem_settings_g.add_arg('dev_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_record/dev.retrieved_nell_concepts.data', 'path of retrieved concepts for devset')
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config, max_wn_concept_length, max_nell_concept_length, wn_concept_embedding_mat, nell_concept_embedding_mat, is_training=False, freeze=False):
if is_training:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_wn_concept_length, 1],
[-1, args.max_seq_len, max_nell_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, start_positions,
end_positions) = fluid.layers.read_file(pyreader)
else:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_wn_concept_length, 1],
[-1, args.max_seq_len, max_nell_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1]],
dtypes=['int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)
'''1st Layer: BERT Layer'''
bert = BertModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
input_mask=input_mask,
config=bert_config,
use_fp16=args.use_fp16)
enc_out = bert.get_sequence_output()
if freeze:
enc_out.stop_gradient=True
logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))
'''2nd layer: Memory Layer'''
# get memory embedding
wn_concept_vocab_size = wn_concept_embedding_mat.shape[0]
wn_concept_dim = wn_concept_embedding_mat.shape[1]
nell_concept_vocab_size = nell_concept_embedding_mat.shape[0]
nell_concept_dim = nell_concept_embedding_mat.shape[1]
wn_memory_embs = fluid.layers.embedding(wn_concept_ids,
size=(wn_concept_vocab_size, wn_concept_dim),
param_attr=fluid.ParamAttr(name="wn_concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
nell_memory_embs = fluid.layers.embedding(nell_concept_ids,
size=(nell_concept_vocab_size, nell_concept_dim),
param_attr=fluid.ParamAttr(name="nell_concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
# get memory length
wn_concept_ids_reduced = fluid.layers.equal(wn_concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
wn_concept_ids_reduced = fluid.layers.cast(wn_concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
wn_concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
wn_concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
wn_mem_length = fluid.layers.reduce_sum(wn_concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
nell_concept_ids_reduced = fluid.layers.equal(nell_concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
nell_concept_ids_reduced = fluid.layers.cast(nell_concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
nell_concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
nell_concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
nell_mem_length = fluid.layers.reduce_sum(nell_concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
# select and integrate
wn_memory_layer = MemoryLayer(bert_config, max_wn_concept_length, wn_concept_dim, mem_method='raw', prefix='wn')
wn_memory_output = wn_memory_layer.forward(enc_out, wn_memory_embs, wn_mem_length, ignore_no_memory_token=True)
nell_memory_layer = MemoryLayer(bert_config, max_nell_concept_length, nell_concept_dim, mem_method='raw', prefix='nell')
nell_memory_output = nell_memory_layer.forward(enc_out, nell_memory_embs, nell_mem_length, ignore_no_memory_token=True)
memory_output = fluid.layers.concat([enc_out, wn_memory_output, nell_memory_output], axis=2)
'''3rd layer: Self-Matching Layer'''
# calculate input dim for self-matching layer
memory_output_size = bert_config['hidden_size'] + wn_concept_dim + nell_concept_dim
logger.info("memory_output_size: {}".format(memory_output_size))
# do matching
self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
memory_output_size, dropout_rate=0.0,
cat_mul=True, cat_sub=True, cat_twotime=True,
cat_twotime_mul=False, cat_twotime_sub=True) # [bs, sq, concat_hs]
att_output = self_att_layer.forward(memory_output, input_mask) # [bs, sq, concat_hs]
'''4th layer: Output Layer'''
logits = fluid.layers.fc(
input=att_output,
size=2,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_squad_out_w",
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=bert_config['initializer_range'])),
bias_attr=fluid.ParamAttr(
name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
batch_ones = fluid.layers.fill_constant_batch_size_like(
input=start_logits, dtype='int64', shape=[1], value=1)
num_seqs = fluid.layers.reduce_sum(input=batch_ones)
if is_training:
def compute_loss(logits, positions):
loss = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=positions)
loss = fluid.layers.mean(x=loss)
return loss
start_loss = compute_loss(start_logits, start_positions)
end_loss = compute_loss(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2.0
if args.use_fp16 and args.loss_scaling > 1.0:
total_loss = total_loss * args.loss_scaling
return pyreader, total_loss, num_seqs
else:
return pyreader, unique_id, start_logits, end_logits, num_seqs
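# Rough size check (the embedding widths here are assumptions for illustration,
# not a statement about the released checkpoints): with BERT-large's hidden
# size of 1024 and, say, 100-dimensional WordNet and NELL concept embeddings,
# the concatenated memory output fed to the self-matching layer carries
# 1024 + 100 + 100 = 1224 features per token.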
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def predict(test_exe, test_program, test_pyreader, fetch_list, processor, eval_concept_settings, eval_output_name='eval_result.json'):
if not os.path.exists(args.checkpoints):
os.makedirs(args.checkpoints)
output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
output_evaluation_result_file = os.path.join(args.checkpoints, eval_output_name)
test_pyreader.start()
all_results = []
time_begin = time.time()
while True:
try:
np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run(
fetch_list=fetch_list, program=test_program)
for idx in range(np_unique_ids.shape[0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % len(all_results))
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
features = processor.get_features(
processor.predict_examples, is_training=False, **eval_concept_settings)
eval_result = write_predictions(processor.predict_examples, features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
args.version_2_with_negative,
args.null_score_diff_threshold, args.verbose, args.predict_file, output_evaluation_result_file)
return eval_result
def read_concept_embedding(embedding_path):
fin = open(embedding_path, encoding='utf-8')
info = [line.strip() for line in fin]
dim = len(info[0].split(' ')[1:])
n_concept = len(info)
embedding_mat = []
id2concept, concept2id = [], {}
# add padding concept into vocab
id2concept.append('<pad_concept>')
concept2id['<pad_concept>'] = 0
embedding_mat.append([0.0 for _ in range(dim)])
for line in info:
concept_name = line.split(' ')[0]
embedding = [float(value_str) for value_str in line.split(' ')[1:]]
assert len(embedding) == dim and not np.any(np.isnan(embedding))
embedding_mat.append(embedding)
concept2id[concept_name] = len(id2concept)
id2concept.append(concept_name)
embedding_mat = np.array(embedding_mat, dtype=np.float32)
return id2concept, concept2id, embedding_mat
def train(args):
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
if not (args.do_train or args.do_predict or args.do_val):
raise ValueError("For args `do_train` and `do_predict`, at "
"least one of them must be True.")
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
wn_id2concept, wn_concept2id, wn_concept_embedding_mat = read_concept_embedding(
args.wn_concept_embedding_path)
nell_id2concept, nell_concept2id, nell_concept_embedding_mat = read_concept_embedding(
args.nell_concept_embedding_path)
processor = DataProcessor(
vocab_path=args.vocab_path,
do_lower_case=args.do_lower_case,
max_seq_length=args.max_seq_len,
in_tokens=args.in_tokens,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length)
startup_prog = fluid.Program()
if args.random_seed is not None:
startup_prog.random_seed = args.random_seed
random.seed(args.random_seed)
np.random.seed(args.random_seed)
if args.do_train:
train_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_record/tokens/train.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'wn_concept2id': wn_concept2id,
'nell_concept2id': nell_concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
}
train_data_generator = processor.data_generator(
data_path=args.train_file,
batch_size=args.batch_size,
phase='train',
shuffle=True,
dev_count=dev_count,
version_2_with_negative=args.version_2_with_negative,
epoch=args.epoch,
**train_concept_settings)
num_train_examples = processor.get_num_examples(phase='train')
if args.in_tokens:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size // args.max_seq_len) // dev_count
else:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size) // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
logger.info("Device count: %d" % dev_count)
logger.info("Num train examples: %d" % num_train_examples)
logger.info("Max train steps: %d" % max_train_steps)
logger.info("Num warmup steps: %d" % warmup_steps)
train_program = fluid.Program()
# if args.random_seed is not None:
# train_program.random_seed = args.random_seed
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, loss, num_seqs = create_model(
pyreader_name='train_reader',
bert_config=bert_config,
max_wn_concept_length=processor.train_wn_max_concept_length,
max_nell_concept_length=processor.train_nell_max_concept_length,
wn_concept_embedding_mat=wn_concept_embedding_mat,
nell_concept_embedding_mat=nell_concept_embedding_mat,
is_training=True,
freeze=args.freeze)
scheduled_lr = optimization(
loss=loss,
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
train_program=train_program,
startup_prog=startup_prog,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
use_fp16=args.use_fp16,
loss_scaling=args.loss_scaling)
if args.use_ema:
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
ema.update()
fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
if args.verbose:
if args.in_tokens:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program,
batch_size=args.batch_size // args.max_seq_len)
else:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program, batch_size=args.batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit))
if args.do_predict or args.do_val:
eval_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_record/tokens/dev.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'wn_concept2id': wn_concept2id,
'nell_concept2id': nell_concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
}
eval_data_generator = processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
test_prog = fluid.Program()
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
pyreader_name='test_reader',
bert_config=bert_config,
max_wn_concept_length=processor.predict_wn_max_concept_length,
max_nell_concept_length=processor.predict_nell_max_concept_length,
wn_concept_embedding_mat=wn_concept_embedding_mat,
nell_concept_embedding_mat=nell_concept_embedding_mat,
is_training=False)
if args.use_ema and 'ema' not in dir():
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
start_logits.name, end_logits.name, num_seqs.name])
test_prog = test_prog.clone(for_test=True)
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
exe.run(startup_prog)
if args.do_train:
logger.info('load pretrained concept embedding')
fluid.global_scope().find_var('wn_concept_emb_mat').get_tensor().set(wn_concept_embedding_mat, place)
fluid.global_scope().find_var('nell_concept_emb_mat').get_tensor().set(nell_concept_embedding_mat, place)
if args.init_checkpoint and args.init_pretraining_params:
logger.info(
"WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
"both are set! Only arg 'init_checkpoint' is made valid.")
if args.init_checkpoint:
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.init_pretraining_params:
init_pretraining_params(
exe,
args.init_pretraining_params,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.do_predict or args.do_val:
if not args.init_checkpoint:
raise ValueError("args 'init_checkpoint' should be set if"
"only doing prediction!")
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
if args.do_train:
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = args.use_fast_executor
exec_strategy.num_threads = dev_count
exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
train_exe = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
main_program=train_program)
train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.start()
steps = 0
total_cost, total_num_seqs = [], []
time_begin = time.time()
while steps < max_train_steps:
try:
steps += 1
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
fetch_list = [loss.name, num_seqs.name]
else:
fetch_list = [
loss.name, scheduled_lr.name, num_seqs.name
]
else:
fetch_list = []
outputs = train_exe.run(fetch_list=fetch_list)
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
np_loss, np_num_seqs = outputs
else:
np_loss, np_lr, np_num_seqs = outputs
total_cost.extend(np_loss * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
)
verbose += "learning rate: %f" % (
np_lr[0]
if warmup_steps > 0 else args.learning_rate)
logger.info(verbose)
time_end = time.time()
used_time = time_end - time_begin
current_example, epoch = processor.get_train_progress()
logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"speed: %f steps/s" %
(epoch, current_example, num_train_examples, steps,
np.sum(total_cost) / np.sum(total_num_seqs),
args.skip_steps / used_time))
total_cost, total_num_seqs = [], []
time_begin = time.time()
if steps % args.save_steps == 0 or steps == max_train_steps:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
if steps % args.validation_steps == 0 or steps == max_train_steps:
if args.do_val:
test_pyreader.decorate_tensor_provider(
processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
)
val_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings, 'validate_result_step_{}.json'.format(steps))
logger.info("Validation performance after step {}:\n* Exact_match: {}\n* F1: {}".format(steps, val_performance['exact_match'], val_performance['f1']))
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps) + "_final")
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.do_predict:
test_pyreader.decorate_tensor_provider(eval_data_generator)
if args.use_ema:
with ema.apply(exe):
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
else:
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(eval_performance['exact_match'], eval_performance['f1']))
if __name__ == '__main__':
print_arguments(args)
train(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on SQuAD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import multiprocessing
import os
import time
import logging
import random
import numpy as np
import paddle
import paddle.fluid as fluid
from reader.squad import DataProcessor, write_predictions
from model.bert import BertConfig, BertModel
from model.layers import MemoryLayer, TriLinearTwoTimeSelfAttentionLayer
from utils.args import ArgumentGroup, print_arguments
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
# yapf: disable
parser = argparse.ArgumentParser()
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval for validation (effective only when do_val is True).")
train_g.add_arg("use_ema", bool, True, "Whether to use ema.")
train_g.add_arg("ema_decay", float, 0.9999, "Decay rate for expoential moving average.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_file", str, None, "SQuAD json for training. E.g., train-v1.1.json.")
data_g.add_arg("predict_file", str, None, "SQuAD json for predictions. E.g. dev-v1.1.json or test-v1.1.json.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
"If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("doc_stride", int, 128,
"When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20,
"The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
"If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 42, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, False, "Whether to perform validation during training.")
run_type_g.add_arg("do_predict", bool, False, "Whether to perform prediction.")
run_type_g.add_arg("freeze", bool, False, "freeze bert parameters")
mem_settings_g = ArgumentGroup(parser, "memory", "memory settings.")
mem_settings_g.add_arg('concept_embedding_path', str, None, 'Path of the pretrained concept embedding file.')
mem_settings_g.add_arg('use_wordnet', bool, False, 'Whether to use the WordNet memory.')
mem_settings_g.add_arg('retrieved_synset_path', str, '../retrieve_concepts/retrieve_wordnet/output_squad/retrived_synsets.data', 'Path of the retrieved synsets.')
mem_settings_g.add_arg('use_nell', bool, False, 'Whether to use the NELL memory.')
mem_settings_g.add_arg('train_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_squad/train.retrieved_nell_concepts.data', 'Path of the retrieved NELL concepts for the train set.')
mem_settings_g.add_arg('dev_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_squad/dev.retrieved_nell_concepts.data', 'Path of the retrieved NELL concepts for the dev set.')
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config, max_concept_length, concept_embedding_mat, is_training=False, freeze=False):
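    """Build the KTNET program for SQuAD with a single knowledge memory: a BERT
    encoder, a memory layer over pretrained concept embeddings, a trilinear
    self-matching layer, and a span-prediction output layer. Returns the
    py_reader plus the loss tensors when `is_training` is True, otherwise the
    prediction tensors (unique_id and start/end logits)."""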
if is_training:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, concept_ids, input_mask, start_positions,
end_positions) = fluid.layers.read_file(pyreader)
else:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1]],
dtypes=['int64', 'int64', 'int64', 'int64', 'float32', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, concept_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)
'''1st Layer: BERT Layer'''
bert = BertModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
input_mask=input_mask,
config=bert_config,
use_fp16=args.use_fp16)
enc_out = bert.get_sequence_output()
if freeze:
enc_out.stop_gradient=True
logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))
'''2nd layer: Memory Layer'''
# get memory embedding
concept_vocab_size = concept_embedding_mat.shape[0]
concept_dim = concept_embedding_mat.shape[1]
memory_embs = fluid.layers.embedding(concept_ids,
size=(concept_vocab_size, concept_dim),
param_attr=fluid.ParamAttr(name="concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
# get memory length
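    # Concept id 0 is the <pad_concept> slot, so counting non-zero ids per token
    # gives the number of retrieved concepts: equal(..., 0) + cast marks padding
    # positions with 1.0, subtracting 1 and scaling by -1 flips the mask so real
    # concepts are 1.0, and reduce_sum over the concept axis yields the length.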
concept_ids_reduced = fluid.layers.equal(concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
concept_ids_reduced = fluid.layers.cast(concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
mem_length = fluid.layers.reduce_sum(concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
# select and integrate
memory_layer = MemoryLayer(bert_config, max_concept_length, concept_dim, mem_method='cat')
memory_output = memory_layer.forward(enc_out, memory_embs, mem_length, ignore_no_memory_token=True)
'''3rd layer: Self-Matching Layer'''
# calculate input dim for self-matching layer
if memory_layer.mem_method == 'add':
memory_output_size = bert_config['hidden_size']
elif memory_layer.mem_method == 'cat':
memory_output_size = bert_config['hidden_size'] + concept_dim
else:
raise ValueError("memory_layer.mem_method must be 'add' or 'cat'")
logger.info("memory_output_size: {}".format(memory_output_size))
# do matching
self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
memory_output_size, dropout_rate=0.0,
cat_mul=True, cat_sub=True, cat_twotime=True,
cat_twotime_mul=False, cat_twotime_sub=True) # [bs, sq, concat_hs]
att_output = self_att_layer.forward(memory_output, input_mask) # [bs, sq, concat_hs]
'''4th layer: Output Layer'''
logits = fluid.layers.fc(
input=att_output,
size=2,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_squad_out_w",
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=bert_config['initializer_range'])),
bias_attr=fluid.ParamAttr(
name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
batch_ones = fluid.layers.fill_constant_batch_size_like(
input=start_logits, dtype='int64', shape=[1], value=1)
num_seqs = fluid.layers.reduce_sum(input=batch_ones)
if is_training:
def compute_loss(logits, positions):
loss = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=positions)
loss = fluid.layers.mean(x=loss)
return loss
start_loss = compute_loss(start_logits, start_positions)
end_loss = compute_loss(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2.0
if args.use_fp16 and args.loss_scaling > 1.0:
total_loss = total_loss * args.loss_scaling
return pyreader, total_loss, num_seqs
else:
return pyreader, unique_id, start_logits, end_logits, num_seqs
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def predict(test_exe, test_program, test_pyreader, fetch_list, processor, eval_concept_settings, eval_output_name='eval_result.json'):
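    """Drain `test_pyreader`, collect per-example start/end logits into RawResult
    tuples, then call write_predictions to produce predictions.json,
    nbest_predictions.json, the null-odds file and the evaluation result file
    under args.checkpoints. Returns the evaluation result dict."""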
if not os.path.exists(args.checkpoints):
os.makedirs(args.checkpoints)
output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
output_evaluation_result_file = os.path.join(args.checkpoints, eval_output_name)
test_pyreader.start()
all_results = []
time_begin = time.time()
while True:
try:
np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run(
fetch_list=fetch_list, program=test_program)
for idx in range(np_unique_ids.shape[0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % len(all_results))
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
features = processor.get_features(
processor.predict_examples, is_training=False, **eval_concept_settings)
eval_result = write_predictions(processor.predict_examples, features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
args.version_2_with_negative,
args.null_score_diff_threshold, args.verbose, args.predict_file, output_evaluation_result_file)
return eval_result
def read_concept_embedding(embedding_path):
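    """Load a whitespace-separated text embedding file (one line per concept:
    the concept name followed by its vector) and prepend an all-zero
    <pad_concept> vector at index 0. Returns (id2concept, concept2id,
    embedding_mat)."""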
fin = open(embedding_path, encoding='utf-8')
info = [line.strip() for line in fin]
dim = len(info[0].split(' ')[1:])
n_concept = len(info)
embedding_mat = []
id2concept, concept2id = [], {}
# add padding concept into vocab
id2concept.append('<pad_concept>')
concept2id['<pad_concept>'] = 0
embedding_mat.append([0.0 for _ in range(dim)])
for line in info:
concept_name = line.split(' ')[0]
embedding = [float(value_str) for value_str in line.split(' ')[1:]]
assert len(embedding) == dim and not np.any(np.isnan(embedding))
embedding_mat.append(embedding)
concept2id[concept_name] = len(id2concept)
id2concept.append(concept_name)
embedding_mat = np.array(embedding_mat, dtype=np.float32)
return id2concept, concept2id, embedding_mat
def train(args):
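    """End-to-end driver: build the train/eval programs, initialize parameters
    from a checkpoint or from pretrained BERT params, run the training loop with
    periodic checkpointing and optional validation, and finally run prediction
    on args.predict_file when `do_predict` is set."""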
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
    if not (args.do_train or args.do_predict or args.do_val):
        raise ValueError("At least one of the args `do_train`, `do_val` and "
                         "`do_predict` must be True.")
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
id2concept, concept2id, concept_embedding_mat = read_concept_embedding(
args.concept_embedding_path)
processor = DataProcessor(
vocab_path=args.vocab_path,
do_lower_case=args.do_lower_case,
max_seq_length=args.max_seq_len,
in_tokens=args.in_tokens,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length)
startup_prog = fluid.Program()
if args.random_seed is not None:
startup_prog.random_seed = args.random_seed
random.seed(args.random_seed)
np.random.seed(args.random_seed)
if args.do_train:
train_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_squad/tokens/train.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'concept2id': concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
}
train_data_generator = processor.data_generator(
data_path=args.train_file,
batch_size=args.batch_size,
phase='train',
shuffle=True,
dev_count=dev_count,
version_2_with_negative=args.version_2_with_negative,
epoch=args.epoch,
**train_concept_settings)
num_train_examples = processor.get_num_examples(phase='train')
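        # Estimate the total number of optimization steps per device: when
        # in_tokens is set, batch_size counts tokens, so roughly
        # batch_size // max_seq_len examples fit in one batch.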
if args.in_tokens:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size // args.max_seq_len) // dev_count
else:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size) // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
logger.info("Device count: %d" % dev_count)
logger.info("Num train examples: %d" % num_train_examples)
logger.info("Max train steps: %d" % max_train_steps)
logger.info("Num warmup steps: %d" % warmup_steps)
train_program = fluid.Program()
# if args.random_seed is not None:
# train_program.random_seed = args.random_seed
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, loss, num_seqs = create_model(
pyreader_name='train_reader',
bert_config=bert_config,
max_concept_length=processor.train_max_concept_length,
concept_embedding_mat=concept_embedding_mat,
is_training=True,
freeze=args.freeze)
scheduled_lr = optimization(
loss=loss,
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
train_program=train_program,
startup_prog=startup_prog,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
use_fp16=args.use_fp16,
loss_scaling=args.loss_scaling)
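                # Track an exponential moving average of the trainable
                # parameters: ema.update() registers the averaging ops in the
                # surrounding program, and ema.apply() is used at prediction
                # time to run with the averaged weights.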
if args.use_ema:
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
ema.update()
fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
if args.verbose:
if args.in_tokens:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program,
batch_size=args.batch_size // args.max_seq_len)
else:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program, batch_size=args.batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit))
if args.do_predict or args.do_val:
eval_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_squad/tokens/dev.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'concept2id': concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
}
eval_data_generator = processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
test_prog = fluid.Program()
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
pyreader_name='test_reader',
bert_config=bert_config,
max_concept_length=processor.predict_max_concept_length,
concept_embedding_mat=concept_embedding_mat,
is_training=False)
if args.use_ema and 'ema' not in dir():
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
start_logits.name, end_logits.name, num_seqs.name])
test_prog = test_prog.clone(for_test=True)
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
exe.run(startup_prog)
if args.do_train:
logger.info('load pretrained concept embedding')
fluid.global_scope().find_var('concept_emb_mat').get_tensor().set(concept_embedding_mat, place)
if args.init_checkpoint and args.init_pretraining_params:
logger.info(
"WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
"both are set! Only arg 'init_checkpoint' is made valid.")
if args.init_checkpoint:
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.init_pretraining_params:
init_pretraining_params(
exe,
args.init_pretraining_params,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.do_predict or args.do_val:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
if args.do_train:
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = args.use_fast_executor
exec_strategy.num_threads = dev_count
exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
train_exe = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
main_program=train_program)
train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.start()
steps = 0
total_cost, total_num_seqs = [], []
time_begin = time.time()
while steps < max_train_steps:
try:
steps += 1
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
fetch_list = [loss.name, num_seqs.name]
else:
fetch_list = [
loss.name, scheduled_lr.name, num_seqs.name
]
else:
fetch_list = []
outputs = train_exe.run(fetch_list=fetch_list)
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
np_loss, np_num_seqs = outputs
else:
np_loss, np_lr, np_num_seqs = outputs
total_cost.extend(np_loss * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
)
verbose += "learning rate: %f" % (
np_lr[0]
if warmup_steps > 0 else args.learning_rate)
logger.info(verbose)
time_end = time.time()
used_time = time_end - time_begin
current_example, epoch = processor.get_train_progress()
logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"speed: %f steps/s" %
(epoch, current_example, num_train_examples, steps,
np.sum(total_cost) / np.sum(total_num_seqs),
args.skip_steps / used_time))
total_cost, total_num_seqs = [], []
time_begin = time.time()
if steps % args.save_steps == 0 or steps == max_train_steps:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
if steps % args.validation_steps == 0 or steps == max_train_steps:
if args.do_val:
test_pyreader.decorate_tensor_provider(
processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
)
val_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings, 'validate_result_step_{}.json'.format(steps))
logger.info("Validation performance after step {}:\n* Exact_match: {}\n* F1: {}".format(steps, val_performance['exact_match'], val_performance['f1']))
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps) + "_final")
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.do_predict:
test_pyreader.decorate_tensor_provider(eval_data_generator)
if args.use_ema:
with ema.apply(exe):
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
else:
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(eval_performance['exact_match'], eval_performance['f1']))
if __name__ == '__main__':
print_arguments(args)
train(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on SQuAD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import multiprocessing
import os
import time
import logging
import random
import numpy as np
import paddle
import paddle.fluid as fluid
from reader.squad_twomemory import DataProcessor, write_predictions
from model.bert import BertConfig, BertModel
from model.layers import MemoryLayer, TriLinearTwoTimeSelfAttentionLayer
from utils.args import ArgumentGroup, print_arguments
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
# yapf: disable
parser = argparse.ArgumentParser()
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval for validation (effective only when do_val is True).")
train_g.add_arg("use_ema", bool, True, "Whether to use ema.")
train_g.add_arg("ema_decay", float, 0.9999, "Decay rate for expoential moving average.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_file", str, None, "SQuAD json for training. E.g., train-v1.1.json.")
data_g.add_arg("predict_file", str, None, "SQuAD json for predictions. E.g. dev-v1.1.json or test-v1.1.json.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
"If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("doc_stride", int, 128,
"When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20,
"The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
"If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 42, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, False, "Whether to perform validation during training.")
run_type_g.add_arg("do_predict", bool, False, "Whether to perform prediction.")
run_type_g.add_arg("freeze", bool, False, "freeze bert parameters")
mem_settings_g = ArgumentGroup(parser, "memory", "memory settings.")
mem_settings_g.add_arg('wn_concept_embedding_path', str, None, 'Path of the pretrained WordNet concept embedding file.')
mem_settings_g.add_arg('nell_concept_embedding_path', str, None, 'Path of the pretrained NELL concept embedding file.')
mem_settings_g.add_arg('use_wordnet', bool, False, 'Whether to use the WordNet memory.')
mem_settings_g.add_arg('retrieved_synset_path', str, '../retrieve_concepts/retrieve_wordnet/output_squad/retrived_synsets.data', 'Path of the retrieved synsets.')
mem_settings_g.add_arg('use_nell', bool, False, 'Whether to use the NELL memory.')
mem_settings_g.add_arg('train_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_squad/train.retrieved_nell_concepts.data', 'Path of the retrieved NELL concepts for the train set.')
mem_settings_g.add_arg('dev_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_squad/dev.retrieved_nell_concepts.data', 'Path of the retrieved NELL concepts for the dev set.')
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config, max_wn_concept_length, max_nell_concept_length, wn_concept_embedding_mat, nell_concept_embedding_mat, is_training=False, freeze=False):
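    """Same structure as the single-memory model, but with two knowledge
    memories: WordNet and NELL concepts each get their own embedding table and
    MemoryLayer (mem_method='raw'), and their outputs are concatenated with the
    BERT encoding before the self-matching layer."""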
if is_training:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_wn_concept_length, 1],
[-1, args.max_seq_len, max_nell_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, start_positions,
end_positions) = fluid.layers.read_file(pyreader)
else:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_wn_concept_length, 1],
[-1, args.max_seq_len, max_nell_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1]],
dtypes=['int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)
'''1st Layer: BERT Layer'''
bert = BertModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
input_mask=input_mask,
config=bert_config,
use_fp16=args.use_fp16)
enc_out = bert.get_sequence_output()
if freeze:
enc_out.stop_gradient=True
logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))
'''2nd layer: Memory Layer'''
# get memory embedding
wn_concept_vocab_size = wn_concept_embedding_mat.shape[0]
wn_concept_dim = wn_concept_embedding_mat.shape[1]
nell_concept_vocab_size = nell_concept_embedding_mat.shape[0]
nell_concept_dim = nell_concept_embedding_mat.shape[1]
wn_memory_embs = fluid.layers.embedding(wn_concept_ids,
size=(wn_concept_vocab_size, wn_concept_dim),
param_attr=fluid.ParamAttr(name="wn_concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
nell_memory_embs = fluid.layers.embedding(nell_concept_ids,
size=(nell_concept_vocab_size, nell_concept_dim),
param_attr=fluid.ParamAttr(name="nell_concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
# get memory length
wn_concept_ids_reduced = fluid.layers.equal(wn_concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
wn_concept_ids_reduced = fluid.layers.cast(wn_concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
wn_concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
wn_concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
wn_mem_length = fluid.layers.reduce_sum(wn_concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
nell_concept_ids_reduced = fluid.layers.equal(nell_concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
nell_concept_ids_reduced = fluid.layers.cast(nell_concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
nell_concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
nell_concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
nell_mem_length = fluid.layers.reduce_sum(nell_concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
# select and integrate
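    # Each knowledge source is attended over independently. With
    # mem_method='raw' each memory layer returns a per-token vector of its
    # concept embedding dimension, so after concatenation with enc_out the
    # feature size is hidden_size + wn_concept_dim + nell_concept_dim
    # (memory_output_size below).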
wn_memory_layer = MemoryLayer(bert_config, max_wn_concept_length, wn_concept_dim, mem_method='raw', prefix='wn')
wn_memory_output = wn_memory_layer.forward(enc_out, wn_memory_embs, wn_mem_length, ignore_no_memory_token=True)
nell_memory_layer = MemoryLayer(bert_config, max_nell_concept_length, nell_concept_dim, mem_method='raw', prefix='nell')
nell_memory_output = nell_memory_layer.forward(enc_out, nell_memory_embs, nell_mem_length, ignore_no_memory_token=True)
memory_output = fluid.layers.concat([enc_out, wn_memory_output, nell_memory_output], axis=2)
'''3rd layer: Self-Matching Layer'''
# calculate input dim for self-matching layer
memory_output_size = bert_config['hidden_size'] + wn_concept_dim + nell_concept_dim
logger.info("memory_output_size: {}".format(memory_output_size))
# do matching
self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
memory_output_size, dropout_rate=0.0,
cat_mul=True, cat_sub=True, cat_twotime=True,
cat_twotime_mul=False, cat_twotime_sub=True) # [bs, sq, concat_hs]
att_output = self_att_layer.forward(memory_output, input_mask) # [bs, sq, concat_hs]
'''4th layer: Output Layer'''
logits = fluid.layers.fc(
input=att_output,
size=2,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_squad_out_w",
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=bert_config['initializer_range'])),
bias_attr=fluid.ParamAttr(
name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
batch_ones = fluid.layers.fill_constant_batch_size_like(
input=start_logits, dtype='int64', shape=[1], value=1)
num_seqs = fluid.layers.reduce_sum(input=batch_ones)
if is_training:
def compute_loss(logits, positions):
loss = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=positions)
loss = fluid.layers.mean(x=loss)
return loss
start_loss = compute_loss(start_logits, start_positions)
end_loss = compute_loss(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2.0
if args.use_fp16 and args.loss_scaling > 1.0:
total_loss = total_loss * args.loss_scaling
return pyreader, total_loss, num_seqs
else:
return pyreader, unique_id, start_logits, end_logits, num_seqs
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def predict(test_exe, test_program, test_pyreader, fetch_list, processor, eval_concept_settings, eval_output_name='eval_result.json'):
if not os.path.exists(args.checkpoints):
os.makedirs(args.checkpoints)
output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
output_evaluation_result_file = os.path.join(args.checkpoints, eval_output_name)
test_pyreader.start()
all_results = []
time_begin = time.time()
while True:
try:
np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run(
fetch_list=fetch_list, program=test_program)
for idx in range(np_unique_ids.shape[0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % len(all_results))
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
features = processor.get_features(
processor.predict_examples, is_training=False, **eval_concept_settings)
eval_result = write_predictions(processor.predict_examples, features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
args.version_2_with_negative,
args.null_score_diff_threshold, args.verbose, args.predict_file, output_evaluation_result_file)
return eval_result
def read_concept_embedding(embedding_path):
fin = open(embedding_path, encoding='utf-8')
info = [line.strip() for line in fin]
dim = len(info[0].split(' ')[1:])
n_concept = len(info)
embedding_mat = []
id2concept, concept2id = [], {}
# add padding concept into vocab
id2concept.append('<pad_concept>')
concept2id['<pad_concept>'] = 0
embedding_mat.append([0.0 for _ in range(dim)])
for line in info:
concept_name = line.split(' ')[0]
embedding = [float(value_str) for value_str in line.split(' ')[1:]]
assert len(embedding) == dim and not np.any(np.isnan(embedding))
embedding_mat.append(embedding)
concept2id[concept_name] = len(id2concept)
id2concept.append(concept_name)
embedding_mat = np.array(embedding_mat, dtype=np.float32)
return id2concept, concept2id, embedding_mat
def train(args):
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
    if not (args.do_train or args.do_predict or args.do_val):
        raise ValueError("At least one of the args `do_train`, `do_val` and "
                         "`do_predict` must be True.")
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
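    # Load the two concept vocabularies separately: WordNet synset vectors and
    # NELL concept vectors, each prefixed with a zero <pad_concept> row at index 0.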
wn_id2concept, wn_concept2id, wn_concept_embedding_mat = read_concept_embedding(
args.wn_concept_embedding_path)
nell_id2concept, nell_concept2id, nell_concept_embedding_mat = read_concept_embedding(
args.nell_concept_embedding_path)
processor = DataProcessor(
vocab_path=args.vocab_path,
do_lower_case=args.do_lower_case,
max_seq_length=args.max_seq_len,
in_tokens=args.in_tokens,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length)
startup_prog = fluid.Program()
if args.random_seed is not None:
startup_prog.random_seed = args.random_seed
random.seed(args.random_seed)
np.random.seed(args.random_seed)
if args.do_train:
train_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_squad/tokens/train.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'wn_concept2id': wn_concept2id,
'nell_concept2id': nell_concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
}
train_data_generator = processor.data_generator(
data_path=args.train_file,
batch_size=args.batch_size,
phase='train',
shuffle=True,
dev_count=dev_count,
version_2_with_negative=args.version_2_with_negative,
epoch=args.epoch,
**train_concept_settings)
num_train_examples = processor.get_num_examples(phase='train')
if args.in_tokens:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size // args.max_seq_len) // dev_count
else:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size) // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
logger.info("Device count: %d" % dev_count)
logger.info("Num train examples: %d" % num_train_examples)
logger.info("Max train steps: %d" % max_train_steps)
logger.info("Num warmup steps: %d" % warmup_steps)
train_program = fluid.Program()
# if args.random_seed is not None:
# train_program.random_seed = args.random_seed
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, loss, num_seqs = create_model(
pyreader_name='train_reader',
bert_config=bert_config,
max_wn_concept_length=processor.train_wn_max_concept_length,
max_nell_concept_length=processor.train_nell_max_concept_length,
wn_concept_embedding_mat=wn_concept_embedding_mat,
nell_concept_embedding_mat=nell_concept_embedding_mat,
is_training=True,
freeze=args.freeze)
scheduled_lr = optimization(
loss=loss,
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
train_program=train_program,
startup_prog=startup_prog,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
use_fp16=args.use_fp16,
loss_scaling=args.loss_scaling)
if args.use_ema:
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
ema.update()
fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
if args.verbose:
if args.in_tokens:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program,
batch_size=args.batch_size // args.max_seq_len)
else:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program, batch_size=args.batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit))
if args.do_predict or args.do_val:
eval_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_squad/tokens/dev.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'wn_concept2id': wn_concept2id,
'nell_concept2id': nell_concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
}
eval_data_generator = processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
test_prog = fluid.Program()
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
pyreader_name='test_reader',
bert_config=bert_config,
max_wn_concept_length=processor.predict_wn_max_concept_length,
max_nell_concept_length=processor.predict_nell_max_concept_length,
wn_concept_embedding_mat=wn_concept_embedding_mat,
nell_concept_embedding_mat=nell_concept_embedding_mat,
is_training=False)
if args.use_ema and 'ema' not in dir():
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
start_logits.name, end_logits.name, num_seqs.name])
test_prog = test_prog.clone(for_test=True)
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
exe.run(startup_prog)
if args.do_train:
logger.info('load pretrained concept embedding')
fluid.global_scope().find_var('wn_concept_emb_mat').get_tensor().set(wn_concept_embedding_mat, place)
fluid.global_scope().find_var('nell_concept_emb_mat').get_tensor().set(nell_concept_embedding_mat, place)
if args.init_checkpoint and args.init_pretraining_params:
logger.info(
"WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
"both are set! Only arg 'init_checkpoint' is made valid.")
if args.init_checkpoint:
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.init_pretraining_params:
init_pretraining_params(
exe,
args.init_pretraining_params,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.do_predict or args.do_val:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
if args.do_train:
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = args.use_fast_executor
exec_strategy.num_threads = dev_count
exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
train_exe = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
main_program=train_program)
train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.start()
steps = 0
total_cost, total_num_seqs = [], []
time_begin = time.time()
while steps < max_train_steps:
try:
steps += 1
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
fetch_list = [loss.name, num_seqs.name]
else:
fetch_list = [
loss.name, scheduled_lr.name, num_seqs.name
]
else:
fetch_list = []
outputs = train_exe.run(fetch_list=fetch_list)
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
np_loss, np_num_seqs = outputs
else:
np_loss, np_lr, np_num_seqs = outputs
total_cost.extend(np_loss * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
)
verbose += "learning rate: %f" % (
np_lr[0]
if warmup_steps > 0 else args.learning_rate)
logger.info(verbose)
time_end = time.time()
used_time = time_end - time_begin
current_example, epoch = processor.get_train_progress()
logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"speed: %f steps/s" %
(epoch, current_example, num_train_examples, steps,
np.sum(total_cost) / np.sum(total_num_seqs),
args.skip_steps / used_time))
total_cost, total_num_seqs = [], []
time_begin = time.time()
if steps % args.save_steps == 0 or steps == max_train_steps:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
if steps % args.validation_steps == 0 or steps == max_train_steps:
if args.do_val:
test_pyreader.decorate_tensor_provider(
processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
)
val_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings, 'validate_result_step_{}.json'.format(steps))
logger.info("Validation performance after step {}:\n* Exact_match: {}\n* F1: {}".format(steps, val_performance['exact_match'], val_performance['f1']))
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps) + "_final")
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.do_predict:
test_pyreader.decorate_tensor_provider(eval_data_generator)
if args.use_ema:
with ema.apply(exe):
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
else:
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(eval_performance['exact_match'], eval_performance['f1']))
if __name__ == '__main__':
print_arguments(args)
train(args)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = open(vocab_file)
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
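# A minimal usage sketch (the vocab path below is only a placeholder for
# illustration, not a file shipped in this folder): build a FullTokenizer from
# a BERT vocab and run it end to end.
#
#   tokenizer = FullTokenizer(vocab_file="cased_L-24_H-1024_A-16/vocab.txt",
#                             do_lower_case=False)
#   tokens = tokenizer.tokenize("KT-NET reads SQuAD passages.")
#   token_ids = tokenizer.convert_tokens_to_ids(tokens)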
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import argparse
import logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def str2bool(v):
    # argparse cannot parse "True"/"False" strings as Python booleans directly,
    # so treat the strings "true", "t" and "1" (case-insensitively) as True.
return v.lower() in ("true", "t", "1")
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
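        """Register --<name> on this argument group; bool arguments are parsed
        via str2bool and the default value is appended to the help text."""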
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def print_arguments(args):
logger.info('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
logger.info('%s: %s' % (arg, value))
logger.info('------------------------------------------------')
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def cast_fp16_to_fp32(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP16,
"out_dtype": fluid.core.VarDesc.VarType.FP32
})
def cast_fp32_to_fp16(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP32,
"out_dtype": fluid.core.VarDesc.VarType.FP16
})
def copy_to_master_param(p, block):
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
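# Keep an FP32 "master" copy of each FP16 parameter so the optimizer update is
# computed in full precision; gradients are cast back to FP32 and, if needed,
# divided by loss_scaling. LayerNorm parameters are left as-is and only have
# their gradients unscaled.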
master_params_grads = []
tmp_role = main_prog._current_role
OpRole = fluid.core.op_proto_and_checker_maker.OpRole
main_prog._current_role = OpRole.Backward
for p, g in params_grads:
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
# cast fp16 gradients to fp32 before apply gradients
if g.name.find("layer_norm") > -1:
if loss_scaling > 1:
scaled_g = g / float(loss_scaling)
else:
scaled_g = g
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
if loss_scaling > 1:
master_grad = master_grad / float(loss_scaling)
master_params_grads.append([master_param, master_grad])
main_prog._current_role = tmp_role
return master_params_grads
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import ast
import copy
import logging
import numpy as np
import paddle.fluid as fluid
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def cast_fp32_to_fp16(exe, main_program):
logger.info("Cast parameters to float16 data format.")
for param in main_program.global_block().all_parameters():
if not param.name.endswith(".master"):
param_t = fluid.global_scope().find_var(param.name).get_tensor()
data = np.array(param_t)
if param.name.find("layer_norm") == -1:
param_t.set(np.float16(data).view(np.uint16), exe.place)
master_param_var = fluid.global_scope().find_var(param.name +
".master")
if master_param_var is not None:
master_param_var.get_tensor().set(data, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
assert os.path.exists(
init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path
def existed_persistables(var):
if not fluid.io.is_persistable(var):
return False
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(
exe,
init_checkpoint_path,
main_program=main_program,
predicate=existed_persistables)
logger.info("Load model from {}".format(init_checkpoint_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
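# Only variables that are model Parameters and that exist on disk are loaded,
# so task-specific layers absent from the BERT checkpoint keep their fresh
# initialization.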
assert os.path.exists(pretraining_params_path
), "[%s] cannot be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
logger.info("Load pretraining parameters from {}.".format(
pretraining_params_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
# KT-NET
## Introduction
KT-NET (Knowledge and Text fusion NET) is a machine reading comprehension (MRC) model that integrates knowledge from knowledge bases (KBs) into pre-trained contextualized representations. The model was proposed in the ACL 2019 paper [Enhancing Pre-Trained Language Representations with Rich Knowledge for Machine Reading Comprehension](https://www.aclweb.org/anthology/P19-1226). The overall architecture of the model is shown below:
<p align="center">
<img src="images/architecture.png" width = "340" height = "300" /> <br />
Overall Architecture of KT-NET
</p>
This repository contains the PaddlePaddle implementation of KT-NET. The trained checkpoints are also provided for reproducing the results in the paper.
## How to Run
### Environment
This project should work as expected once the following requirements are satisfied:
+ python >= 3.7
+ paddlepaddle-gpu (the latest develop version is recommended)
+ NLTK >= 3.3 (with WordNet 3.0)
+ tqdm
+ CoreNLP (3.8.0 version is recommended)
+ pycorenlp
+ CUDA, CuDNN and NCCL (CUDA 9.0, CuDNN v7 and NCCL 2.3.7 are recommended)
All of the experiments in the paper are performed on 4 P40 GPUs.
### Download the MRC datasets
In this work, we empirically evaluate our model on two benchmarks:
#### 1. ReCoRD
[ReCoRD](https://sheng-z.github.io/ReCoRD-explorer/) (Reading Comprehension with Commonsense Reasoning Dataset) is a large-scale MRC dataset requiring commonsense reasoning. The official dataset in JSON format can be downloaded from Google Drive (training set: [link](https://drive.google.com/file/d/1PoHmphyH79pETNws8kU2OwuerU7SWLHj/view), dev set: [link](https://drive.google.com/file/d/1WNaxBpXEGgPbymTzyN249P4ub-uU5dkO/view)). *(For convenience, the MD5 checksum of every downloadable file referenced in this README is provided in `downloaded_files.md5`; it is recommended to verify the integrity of each download against it.)* Please place the downloaded files `train.json` and `dev.json` into the `data/ReCoRD/` directory of this repository. We will also use the official ReCoRD evaluation script, so please run the following commands:
```
curl -o record_official_evaluate.py https://sheng-z.github.io/ReCoRD-explorer/evaluation.py
mv record_official_evaluate.py reading_comprehension/src/eval/
```
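If you want to verify the checksums against `downloaded_files.md5` programmatically, the following is a minimal Python sketch (it assumes all listed files sit in the current working directory):
```
import hashlib

def md5_of(path, chunk_size=1 << 20):
    # stream the file so large archives do not have to fit in memory
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

with open('downloaded_files.md5') as f:
    for line in f:
        expected, name = line.split()
        print(name, 'OK' if md5_of(name) == expected else 'MISMATCH')
```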
#### 2. SQuAD v1.1
[SQuAD v1.1](https://rajpurkar.github.io/SQuAD-explorer/) is a well-known extractive MRC dataset that consists of questions posed by crowdworkers on Wikipedia articles. Please run the following commands to download the official dataset and the evaluation script.
```
curl -O https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
curl -O https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
mv train-v1.1.json dev-v1.1.json data/SQuAD/
curl -o squad_v1_official_evaluate.py https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/
mv squad_v1_official_evaluate.py reading_comprehension/src/eval/
```
### Retrieve KB entries
Relevant knowledge should be retrieved and encoded before training the model. In this project, we leveraged two KBs: [WordNet](https://wordnet.princeton.edu/) and [NELL](http://rtw.ml.cmu.edu/rtw/). WordNet records lexical relations between words and NELL stores beliefs about entities. The following procedure describes how we retrieve relevant WordNet synsets and NELL concepts for MRC samples.
#### 1. Named entity recognition (only for SQuAD)
To retrieve NELL concepts about entities, the named entity mentions in the MRC samples must be annotated. For ReCoRD, entity mentions are already provided in the dataset. For SQuAD, named entity recognition (NER) needs to be performed before retrieval. We use [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/index.html) in this step. After CoreNLP is [downloaded](http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip) and unzipped, run the following command from the CoreNLP directory to start the CoreNLP server:
```
java -mx10g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9753 -timeout 20000
```
Then run the command:
```
cd retrieve_concepts/ner_tagging_squad
python3 tagging.py
```
The tagged datasets will be saved to the `retrieve_concepts/ner_tagging_squad/output` directory. We have provided our output files for convenience ([download link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_squad_tagging_output.tar.gz)).
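For reference, `tagging.py` talks to the server through `pycorenlp`; the following is a minimal sketch of the call it makes (the sample sentence is just an illustration, and the port must match the one the server was started with):
```
import urllib.parse
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9753')
text = "Barack Obama was born in Hawaii."
# the text is URL-quoted before annotation, as in tagging.py
result = nlp.annotate(urllib.parse.quote(text),
                      properties={'annotators': 'ner', 'outputFormat': 'json'})
for sent in result['sentences']:
    for token in sent['tokens']:
        print(token['word'], token['ner'], token['characterOffsetBegin'])
```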
#### 2. Tokenization
Tokenization must be performed before retrieval. We use the same tokenizer as [BERT](https://github.com/google-research/bert).
For ReCoRD, run the following command to tokenize the raw dataset (or directly download our output from [link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_tokenize_result_record.tar.gz)):
```
cd retrieve_concepts/tokenization_record
python3 do_tokenization.py
```
For SQuAD, run the following command to process the NER tagged dataset (or directly download our output from [link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_tokenize_result_squad.tar.gz)):
```
cd retrieve_concepts/tokenization_squad
python3 do_tokenization.py
```
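Both `do_tokenization.py` scripts rely on the BERT-style `FullTokenizer` bundled in this repository (basic tokenization followed by WordPiece). Below is a minimal sketch of the interface, assuming the vocabulary file used by the scripts (e.g. `vocab.cased.txt`) is in the working directory:
```
import tokenization  # the tokenizer module shipped with the preprocessing scripts

tokenizer = tokenization.FullTokenizer(vocab_file='vocab.cased.txt', do_lower_case=False)
tokens = tokenizer.basic_tokenizer.tokenize("KT-NET enhances BERT with knowledge.")
subtokens = [st for t in tokens for st in tokenizer.wordpiece_tokenizer.tokenize(t)]
print(tokens)
print(subtokens)
print(tokenizer.convert_tokens_to_ids(subtokens))
```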
#### 3. Retrieve WordNet
This step retrieves the WordNet (WN18) synsets for each non-stopword in the MRC samples.
For ReCoRD, run the command:
```
cd retrieve_concepts/retrieve_wordnet
python3 retrieve.py --train_token ../tokenization_record/tokens/train.tokenization.uncased.data --eval_token ../tokenization_record/tokens/dev.tokenization.uncased.data --output_dir output_record/ --no_stopwords
```
For SQuAD, run the command:
```
cd retrieve_concepts/retrieve_wordnet
python3 retrieve.py --train_token ../tokenization_squad/tokens/train.tokenization.uncased.data --eval_token ../tokenization_squad/tokens/dev.tokenization.uncased.data --output_dir output_squad/ --no_stopwords
```
The outputs are pickled into binary files. We have also provided our output files for convenience ([download link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_wordnet_concepts.tar.gz)).
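Internally, `retrieve_wordnet/retrieve.py` maps each remaining token to WN18 synset names through NLTK and the offset-to-name mapping file `wordnet-mlj12-definitions.txt` used by the script. A simplified sketch of the per-token lookup, assuming the NLTK WordNet corpus has been downloaded:
```
from nltk.corpus import wordnet as wn

# offset -> WN18 synset name, as loaded by retrieve.py
offset_to_wn18name = {}
with open('wordnet-mlj12-definitions.txt') as fin:
    for line in fin:
        offset_str, synset_name = line.strip().split('\t')[:2]
        offset_to_wn18name[offset_str] = synset_name

def wn18_synsets(token):
    # keep only synsets whose 8-digit offset appears in the WN18 subset
    names = []
    for synset in wn.synsets(token):
        offset_str = str(synset.offset()).zfill(8)
        if offset_str in offset_to_wn18name:
            names.append(offset_to_wn18name[offset_str])
    return names

print(wn18_synsets('coach'))
```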
#### 4. Retrieve NELL
Using string matching, this step finds the corresponding NELL entities for each entity mention in a given MRC example and returns their categories as the relevant NELL concepts. The latest NELL beliefs should be downloaded first.
```
wget http://rtw.ml.cmu.edu/resources/results/08m/NELL.08m.1115.esv.csv.gz
gzip -d NELL.08m.1115.esv.csv.gz
mv NELL.08m.1115.esv.csv retrieve_concepts/retrieve_nell
```
For ReCoRD, run the command:
```
cd retrieve_concepts/retrieve_nell
python3 retrieve.py --train_token ../tokenization_record/tokens/train.tokenization.uncased.data --eval_token ../tokenization_record/tokens/dev.tokenization.uncased.data --output_dir output_record/
```
For SQuAD, run the command:
```
cd retrieve_concepts/retrieve_nell
python3 retrieve.py --train_token ../tokenization_squad/tokens/train.tokenization.uncased.data --eval_token ../tokenization_squad/tokens/dev.tokenization.uncased.data --output_dir output_squad/
```
The outputs are pickled into binary files. The output files can also be downloaded from [download link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_nell_concepts.tar.gz).
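For reference, `retrieve_nell/retrieve.py` keeps only high-confidence `generalizations` rows from the NELL dump when building its entity-to-category dictionary (the default score threshold is 0.9). A simplified sketch of that filtering step, with the column layout as assumed by the script:
```
nell_ent_to_cpt = {}
with open('NELL.08m.1115.esv.csv') as fin:
    next(fin)  # skip the header line
    for line in fin:
        items = line.rstrip('\n').split('\t')
        # items[0]: entity, items[1]: relation, items[2]: value, items[4]: confidence
        if items[1] == 'generalizations' and float(items[4]) >= 0.9:
            nell_ent_to_cpt.setdefault(items[0], set()).add(items[2])
print(len(nell_ent_to_cpt), 'entities with at least one category')
```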
#### 5. Prepare KB embedding
Following [Yang et al., 2015](https://arxiv.org/pdf/1412.6575.pdf), we leverage their KB embeddings for WordNet synsets and NELL categories, trained with the BILINEAR model.
```
curl -O https://raw.githubusercontent.com/bishanyang/kblstm/master/embeddings/wn_concept2vec.txt
curl -O https://raw.githubusercontent.com/bishanyang/kblstm/master/embeddings/nell_concept2vec.txt
mv wn_concept2vec.txt nell_concept2vec.txt retrieve_concepts/KB_embeddings
```
The 100-dimensional embeddings are stored in the following format:
```
concept:coach -0.123886 0.0477016 0.517474 0.154645 0.32559 ...
```
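For illustration, these files can be loaded into an in-memory lookup table with a few lines of Python (a minimal sketch using NumPy; the concept name below is just the example from above):
```
import numpy as np

def load_concept_embeddings(path):
    # each line: concept name followed by 100 space-separated floats
    embeddings = {}
    with open(path) as fin:
        for line in fin:
            parts = line.rstrip().split()
            if len(parts) < 2:
                continue
            embeddings[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return embeddings

nell_emb = load_concept_embeddings('retrieve_concepts/KB_embeddings/nell_concept2vec.txt')
print(len(nell_emb), nell_emb.get('concept:coach', np.zeros(100))[:5])
```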
For other knowledge bases, please refer to the source code for training the BILINEAR model from [Yang's github repo](https://github.com/bishanyang/kblstm/tree/master/code/models).
### Training KT-NET
#### Prepare BERT checkpoint
The text encoder module of KT-NET is initialized with pretrained BERT-large (cased) parameters. Run the command:
```
cd reading_comprehension
wget https://bert-models.bj.bcebos.com/cased_L-24_H-1024_A-16.tar.gz --no-check-certificate
tar xvf cased_L-24_H-1024_A-16.tar.gz
```
#### Direct fine-tuning
We provide scripts for training and inference of KT-NET. To train a model on the ReCoRD dataset with both WordNet and NELL concepts, simply run:
```
cd reading_comprehension && sh ./run_record_twomemory.sh
```
The hyper-parameters, such as training epochs, learning rate and batch size, can be adjusted in the script. After training and evaluation, the following files and directories will be created:
+ `output/eval_result.json`: the performance of the trained model on the benchmark
+ `output/predictions.json`: the predicted answers for the development set
+ `output/nbest_predictions.json`: n-best predicted answers for the development set
+ `output/step_XXXX`: the directory of model checkpoint
+ `log/train.log`: the logging file
To run with a single KB, replace `run_record_twomemory.sh` with `run_record_wordnet.sh` or `run_record_nell.sh`.
Similarly, for SQuAD, use `run_squad_twomemory.sh`, `run_squad_wordnet.sh` or `run_squad_nell.sh`.
#### Two-staged fine-tuning (Recommended)
In our experiments, we found that a "two-staged" training strategy, which freezes the BERT parameters in the first stage and unfreezes them in the second, yields better model performance. We recommend adopting this strategy to train KT-NET. To run two-staged fine-tuning, first execute the `XXX_pretrain.sh` script and then run `XXX_finetune.sh`. E.g., to train KT-NET on ReCoRD with both KBs, first run
```
cd reading_comprehension && sh ./run_record_twomemory_pretrain.sh
```
and then, once the first stage has finished, run
```
sh ./run_record_twomemory_finetune.sh
```
The resulting `output/` and `log/` directories have the same structure as in direct fine-tuning.
In the first stage, we trained for 10 epochs on ReCoRD and for 1 epoch on SQuAD. For the second stage, we recommend fine-tuning for 2-4 epochs on ReCoRD and 2-3 epochs on SQuAD.
#### Reproduce the paper results
We have released the following checkpoints of trained KT-NET models, which reproduce the performance reported in the paper:
| ReCoRD Model | F1 score | Exact Match | Inference Script |
| :------------- | :---------: | :----------: | :--------- |
| [KT-NET (WordNet)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_record_wordnet.tar.gz) | 72.76 | 70.56 | eval_record_wordnet.sh |
| [KT-NET (NELL)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_record_nell.tar.gz) | 72.52 | 70.54 | eval_record_nell.sh |
| [KT-NET (Both)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_record_both.tar.gz) | 73.62 | 71.60 | eval_record_twomemory.sh |

| SQuAD Model | F1 score | Exact Match | Inference Script |
| :------------- | :---------: | :----------: | :--------- |
| [KT-NET (WordNet)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_squad_wordnet.tar.gz) | 91.70 | 85.16 | eval_squad_wordnet.sh |
| [KT-NET (NELL)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_squad_nell.tar.gz) | 91.70 | 85.02 | eval_squad_nell.sh |
| [KT-NET (Both)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_squad_both.tar.gz) | 91.65 | 84.97 | eval_squad_twomemory.sh |
After downloading and extracting the checkpoint file, please execute the corresponding inference script. E.g.:
```
cd reading_comprehension && sh ./eval_record_twomemory.sh extracted_ckpt_dir_path
```
The following result is expected to be created in the `output/` directory:
```
{
"exact_match": 71.61,
"f1": 73.62396522806482
}
```
## Citation
If you use any source code included in this project in your work, please cite the following paper:
```
@inproceedings{yang-etal-2019-enhancing-pre,
title = {Enhancing Pre-Trained Language Representations with Rich Knowledge for Machine Reading Comprehension},
author = {Yang, An and Wang, Quan and Liu, Jing and Liu, Kai and Lyu, Yajuan and Wu, Hua and She, Qiaoqiao and Li, Sujian},
booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
year = {2019},
publisher = {Association for Computational Linguistics},
pages = {2346--2357},
}
```
## Copyright and License
Copyright 2019 Baidu.com, Inc. All Rights Reserved
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This script performs NER tagging on the raw SQuAD datasets.
# All named entities found in the questions and contexts are recorded, together with their offsets, in the output file.
# CoreNLP is used for NER tagging.
import os
import json
import argparse
import logging
import urllib
import sys
from tqdm import tqdm
from pycorenlp import StanfordCoreNLP
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", default='output', type=str,
help="The output directory to store tagging results.")
parser.add_argument("--train_file", default='../../data/SQuAD/train-v1.1.json', type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default='../../data/SQuAD/dev-v1.1.json', type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
return parser.parse_args()
# transform the CoreNLP tagging output into an entity list
# some questions begin with whitespace that CoreNLP strips, so a begin offset has to be added
def parse_output(text, tagging_output, begin_offset=0):
entities = []
select_states = ['ORGANIZATION', 'PERSON', 'MISC', 'LOCATION']
for sent in tagging_output['sentences']:
state = 'O'
start_pos, end_pos = -1, -1
for token in sent['tokens']:
tag = token['ner']
if tag == 'O' and state != 'O':
if state in select_states:
entities.append({'text': text[begin_offset + start_pos: begin_offset + end_pos], 'start': begin_offset + start_pos, 'end': begin_offset + end_pos - 1})
state = 'O'
elif tag != 'O':
if state == tag:
end_pos = token['characterOffsetEnd']
else:
if state in select_states:
entities.append({'text': text[begin_offset + start_pos: begin_offset + end_pos], 'start': begin_offset + start_pos, 'end': begin_offset + end_pos - 1})
state = tag
start_pos = token['characterOffsetBegin']
end_pos = token['characterOffsetEnd']
if state in select_states:
entities.append({'text': text[begin_offset + start_pos: begin_offset + end_pos], 'start': begin_offset + start_pos, 'end': begin_offset + end_pos - 1})
return entities
def tagging(dataset, nlp):
skip_context_cnt, skip_question_cnt = 0, 0
for article in tqdm(dataset['data']):
for paragraph in tqdm(article['paragraphs']):
context = paragraph['context']
context_tagging_output = nlp.annotate(urllib.parse.quote(context), properties={'annotators': 'ner', 'outputFormat': 'json'})
# assert the context length is not changed
if len(context.strip()) == context_tagging_output['sentences'][-1]['tokens'][-1]['characterOffsetEnd']:
context_entities = parse_output(context, context_tagging_output, len(context) - len(context.lstrip()))
else:
context_entities = []
skip_context_cnt += 1
logger.info('Skipped context due to offset mismatch:')
logger.info(context)
paragraph['context_entities'] = context_entities
for qa in tqdm(paragraph['qas']):
question = qa['question']
question_tagging_output = nlp.annotate(urllib.parse.quote(question), properties={'annotators': 'ner', 'outputFormat': 'json'})
if len(question.strip()) == question_tagging_output['sentences'][-1]['tokens'][-1]['characterOffsetEnd']:
question_entities = parse_output(question, question_tagging_output, len(question) - len(question.lstrip()))
else:
question_entities = []
skip_question_cnt += 1
logger.info('Skipped question due to offset mismatch:')
logger.info(question)
qa['question_entities'] = question_entities
logger.info('In total, {} contexts and {} questions are skipped...'.format(skip_context_cnt, skip_question_cnt))
if __name__ == '__main__':
args = parse_args()
# make output directory if not exist
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
# register corenlp server
nlp = StanfordCoreNLP('http://localhost:9753')
# load train and dev datasets
ftrain = open(args.train_file, 'r', encoding='utf-8')
trainset = json.load(ftrain)
fdev = open(args.predict_file, 'r', encoding='utf-8')
devset = json.load(fdev)
for dataset, path, name in zip((trainset, devset), (args.train_file, args.predict_file), ('train', 'dev')):
tagging(dataset, nlp)
output_path = os.path.join(args.output_dir, "{}.tagged.json".format(os.path.basename(path)[:-5]))
json.dump(dataset, open(output_path, 'w', encoding='utf-8'))
logger.info('Finished tagging {} set'.format(name))
concept:coach
concept:musicfestival
concept:book
concept:professor
concept:dateliteral
concept:mountainrange
concept:wine
concept:flooritem
concept:clothing
concept:mlalgorithm
concept:drug
concept:musicgenre
concept:parlourgame
concept:website
concept:eventoutcome
concept:planet
concept:mammal
concept:organization
concept:female
concept:vehicle
concept:event
concept:legume
concept:weatherphenomenon
concept:perceptionevent
concept:emotion
concept:bombingevent
concept:highway
concept:creativework
concept:comedian
concept:gamescore
concept:software
concept:personcanada
concept:musicalbum
concept:beach
concept:geopoliticalorganization
concept:product
concept:street
concept:astronaut
concept:virus
concept:criminal
concept:trail
concept:roadaccidentevent
concept:physicalaction
concept:archaea
concept:personafrica
concept:personasia
concept:medicalprocedure
concept:monument
concept:tool
concept:politician
concept:conference
concept:insect
concept:restaurant
concept:sportsequipment
concept:politicsblog
concept:physicalcharacteristic
concept:bakedgood
concept:sociopolitical
concept:meetingeventtype
concept:blog
concept:mediacompany
concept:bridge
concept:male
concept:researchproject
concept:traditionalgame
concept:recipe
concept:crustacean
concept:militaryeventtype
concept:color
concept:race
concept:religion
concept:furniture
concept:building
concept:geopoliticallocation
concept:personsouthamerica
concept:beverage
concept:nondiseasecondition
concept:school
concept:politicalparty
concept:politicsbill
concept:zoo
concept:artery
concept:recordlabel
concept:cave
concept:visualartmovement
concept:musicartist
concept:olympics
concept:visualizableattribute
concept:sportsteamposition
concept:boardgame
concept:person
concept:actor
concept:perceptionaction
concept:dayofweek
concept:householditem
concept:fungus
concept:bird
concept:fruit
concept:amphibian
concept:victim
concept:musicsong
concept:newspaper
concept:farm
concept:tradeunion
concept:bone
concept:month
concept:personaustralia
concept:movie
concept:convention
concept:nonneginteger
concept:nerve
concept:highschool
concept:time
concept:lake
concept:placeofworship
concept:mlmetric
concept:visualartform
concept:grandprix
concept:agriculturalproduct
concept:bedroomitem
concept:chemical
concept:muscle
concept:sportsgame
concept:physiologicalcondition
concept:radiostation
concept:televisionstation
concept:personus
concept:coffeedrink
concept:airport
concept:invertebrate
concept:bathroomitem
concept:physicsterm
concept:company
concept:meetingeventtitle
concept:earthquakeevent
concept:judge
concept:skiarea
concept:personeurope
concept:politicsissue
concept:nongovorganization
concept:mlconference
concept:politicaloffice
concept:url
concept:visualartist
concept:hotel
concept:caf_
concept:bacteria
concept:kitchenitem
concept:militaryconflict
concept:protestevent
concept:sportsteam
concept:politicianus
concept:mlauthor
concept:retailstore
concept:architect
concept:location
concept:shoppingmall
concept:sportsevent
concept:politicsgroup
concept:buildingmaterial
concept:televisionshow
concept:consumerelectronicitem
concept:petroleumrefiningcompany
concept:room
concept:academicfield
concept:reptile
concept:wallitem
concept:buildingfeature
concept:programminglanguage
concept:mollusk
concept:monarch
concept:bank
concept:creditunion
concept:park
concept:island
concept:governmentorganization
concept:celltype
concept:game
concept:videogamesystem
concept:automobileengine
concept:biotechcompany
concept:nonprofitorganization
concept:geometricshape
concept:museum
concept:port
concept:cardgame
concept:landscapefeatures
concept:televisionnetwork
concept:musicinstrument
concept:ethnicgroup
concept:language
concept:grain
concept:mlarea
concept:director
concept:weapon
concept:cognitiveactions
concept:mlsoftware
concept:species
concept:fish
concept:athlete
concept:ceo
concept:publication
concept:vertebrate
concept:sportsleague
concept:mediatype
concept:filmfestival
concept:university
concept:stadiumoreventvenue
concept:zipcode
concept:writer
concept:continent
concept:oilgasfield
concept:videogame
concept:country
concept:river
concept:personnorthamerica
concept:currency
concept:nut
concept:hallwayitem
concept:professionalorganization
concept:skyscraper
concept:lymphnode
concept:meat
concept:scientist
concept:tableitem
concept:winery
concept:disease
concept:magazine
concept:condiment
concept:economicsector
concept:visualizablescene
concept:mldataset
concept:mountain
concept:braintissue
concept:chef
concept:vegetable
concept:model
concept:protein
concept:city
concept:personbylocation
concept:arachnid
concept:date
concept:scientificterm
concept:officeitem
concept:automobilemodel
concept:musician
concept:election
concept:automobilemaker
concept:sport
concept:food
concept:attraction
concept:candy
concept:profession
concept:county
concept:celebrity
concept:crimeorcharge
concept:vein
concept:aquarium
concept:year
concept:plant
concept:journalist
concept:bodypart
concept:stateorprovince
concept:refineryproduct
concept:jobposition
concept:personmexico
concept:trainstation
concept:productlaunchevent
concept:awardtrophytournament
concept:officebuildingroom
concept:animal
concept:arthropod
concept:hobby
concept:charactertrait
concept:hospital
concept:transportation
concept:cheese
concept:terroristorganization
concept:personalcareitem
concept:geopoliticalentity
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This script retrieves the related NELL entities and their concepts for each named entity in ReCoRD:
# 1. transform each ReCoRD entity from a word sequence into a string (replace whitespace with _ and drop punctuation)
# 2. preprocess each NELL entity name (strip the leading 'n' when it prefixes digits, and collapse extra _)
# 3. for ReCoRD entities with more than one token, use exact match
# 4. for one-word ReCoRD entities, apply WordNet lemmatization before matching (match both the raw and the morphed form)
# 5. within a passage, if entity A is a suffix of entity B, use B's categories instead
import pickle
import logging
import string
import argparse
import os
import nltk
from collections import namedtuple
from tqdm import tqdm
from nltk.corpus import wordnet as wn
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
# remove the category part of a NELL entity name, the 'n' prefix before digits, and extra '_'
def preprocess_nell_ent_name(raw_name):
ent_name = raw_name.split(':')[-1]
digits = set(string.digits)
if ent_name.startswith('n') and all([char in digits for char in ent_name.split('_')[0][1:]]):
ent_name = ent_name[1:]
ent_name = "_".join(filter(lambda x:len(x) > 0, ent_name.split('_')))
return ent_name
puncs = set(string.punctuation)
def preprocess_record_ent_name(raw_token_seq):
return "_".join(filter(lambda x:x not in puncs, raw_token_seq))
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--train_token', type=str, default='../tokenization_record/tokens/train.tokenization.uncased.data', help='token file of train set')
parser.add_argument('--eval_token', type=str, default='../tokenization_record/tokens/dev.tokenization.uncased.data', help='token file of dev set')
parser.add_argument('--score_threshold', type=float, default=0.9, help='only keep generalizations relations with score >= threshold')
parser.add_argument('--output_dir', type=str, default='output_record/', help='output directory')
args = parser.parse_args()
# make output directory if not exist
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
# load set of concepts with pre-trained embedding
concept_set = set()
with open('nell_concept_list.txt') as fin:
for line in fin:
concept_name = line.strip()
concept_set.add(concept_name)
# read nell csv file and build NELL entity to category dict
logger.info('Begin to read NELL csv...')
fin = open('NELL.08m.1115.esv.csv')
nell_ent_to_cpt = {}
nell_ent_to_fullname = {}
header = True
for line in fin:
if header:
header = False
continue
line = line.strip()
items = line.split('\t')
if items[1] == 'generalizations' and float(items[4]) >= args.score_threshold:
nell_ent_name = preprocess_nell_ent_name(items[0])
category = items[2]
if nell_ent_name not in nell_ent_to_cpt:
nell_ent_to_cpt[nell_ent_name] = set()
nell_ent_to_fullname[nell_ent_name] = set()
nell_ent_to_cpt[nell_ent_name].add(category)
nell_ent_to_fullname[nell_ent_name].add(items[0])
logger.info('Finished reading NELL csv.')
# load record dataset
logger.info('Begin to load tokenization results...')
train_samples = pickle.load(open(args.train_token, 'rb'))
dev_samples = pickle.load(open(args.eval_token, 'rb'))
logger.info('Finished loading tokenization results.')
# build record entity set
record_ent_set = set()
for sample in train_samples + dev_samples:
query_tokens = sample['query_tokens']
document_tokens = sample['document_tokens']
for entity_info in sample['document_entities']:
entity_token_seq = document_tokens[entity_info[1]: entity_info[2] + 1]
record_ent_set.add(preprocess_record_ent_name(entity_token_seq))
for entity_info in sample['query_entities']:
entity_token_seq = query_tokens[entity_info[1]: entity_info[2] + 1]
record_ent_set.add(preprocess_record_ent_name(entity_token_seq))
logger.info('Finished making tokenization results into entity set.')
# do mapping
record_ent_to_cpt = {}
record_ent_to_nell_ent = {}
for record_ent in tqdm(record_ent_set):
cpt, nell_ent = set(), set()
if record_ent in nell_ent_to_cpt:
cpt.update(nell_ent_to_cpt[record_ent])
nell_ent.update(nell_ent_to_fullname[record_ent])
# length is 1, do morphy
if '_' not in record_ent:
for pos_tag in ['n', 'v', 'a', 'r']:
morph = wn.morphy(record_ent, pos_tag)
if morph is not None and morph in nell_ent_to_cpt:
cpt.update(nell_ent_to_cpt[morph])
nell_ent.update(nell_ent_to_fullname[morph])
record_ent_to_cpt[record_ent] = cpt
record_ent_to_nell_ent[record_ent] = nell_ent
logger.info('Finished matching record entities to nell entities.')
# map the record entity in the set back to passage
logger.info('Begin to generate output file...')
_TempRectuple = namedtuple('entity_record', [
'entity_string', 'start', 'end', 'retrieved_concepts', 'retrieved_entities'])
for outfn, samples in zip(('{}.retrieved_nell_concepts.data'.format(prefix) for prefix in ('train', 'dev')), (train_samples, dev_samples)):
all_outputs = []
for sample in tqdm(samples):
doc_entities = []
document_tokens = sample['document_tokens']
for entity_info in sample['document_entities']:
entity_token_seq = document_tokens[entity_info[1]: entity_info[2] + 1]
entity_whitespace_str = " ".join(entity_token_seq)
entity_retrieve_str = preprocess_record_ent_name(
entity_token_seq)
doc_entities.append(_TempRectuple(
entity_whitespace_str, entity_info[1], entity_info[2], record_ent_to_cpt[entity_retrieve_str], record_ent_to_nell_ent[entity_retrieve_str]))
query_entities = []
query_tokens = sample['query_tokens']
for entity_info in sample['query_entities']:
entity_token_seq = query_tokens[entity_info[1]: entity_info[2] + 1]
entity_whitespace_str = " ".join(entity_token_seq)
entity_retrieve_str = preprocess_record_ent_name(
entity_token_seq)
query_entities.append(_TempRectuple(
entity_whitespace_str, entity_info[1], entity_info[2], record_ent_to_cpt[entity_retrieve_str], record_ent_to_nell_ent[entity_retrieve_str]))
# perform suffix replacement rule (eg. use the result of "Donald Trump" to replace "Trump" in the passage)
doc_entities_final, query_entities_final = [], []
for entities, entities_final in zip((doc_entities, query_entities), (doc_entities_final, query_entities_final)):
for trt in entities:
new_nell_cpt_set, new_nell_ent_set = set(), set()
for other_trt in doc_entities + query_entities:
if other_trt.entity_string != trt.entity_string and other_trt.entity_string.endswith(trt.entity_string):
new_nell_cpt_set.update(other_trt.retrieved_concepts)
new_nell_ent_set.update(other_trt.retrieved_entities)
# no need to replace
if len(new_nell_cpt_set) == 0:
new_nell_cpt_set = trt.retrieved_concepts
new_nell_ent_set = trt.retrieved_entities
new_nell_cpt_set = new_nell_cpt_set & concept_set # filter concepts with pretrained embedding
entities_final.append({
'entity_string': trt.entity_string,
'token_start': trt.start,
'token_end': trt.end,
'retrieved_concepts': list(new_nell_cpt_set),
'retrieved_entities': list(new_nell_ent_set),
})
all_outputs.append({
'id': sample['id'],
'document_entities': doc_entities_final,
'query_entities': query_entities_final,
})
pickle.dump(all_outputs, open(os.path.join(args.output_dir, outfn), 'wb'))
logger.info('Output retrieved results have been dumped.')
if __name__ == '__main__':
main()
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import pickle
import argparse
import os
import nltk
import logging
import string
from tqdm import tqdm
from nltk.corpus import wordnet as wn
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--train_token', type=str, default='../tokenization_record/tokens/train.tokenization.uncased.data', help='token file of train set')
parser.add_argument('--eval_token', type=str, default='../tokenization_record/tokens/dev.tokenization.uncased.data', help='token file of dev set')
parser.add_argument('--output_dir', type=str, default='output_record/', help='output directory')
parser.add_argument('--no_stopwords', action='store_true', help='ignore stopwords')
parser.add_argument('--ignore_length', type=int, default=0, help='ignore words with length <= ignore_length')
args = parser.parse_args()
# initialize mapping from offset id to wn18 synset name
offset_to_wn18name_dict = {}
fin = open('wordnet-mlj12-definitions.txt')
for line in fin:
info = line.strip().split('\t')
offset_str, synset_name = info[0], info[1]
offset_to_wn18name_dict[offset_str] = synset_name
logger.info('Finished loading wn18 definition file.')
# load pickled samples
logger.info('Begin to load tokenization results...')
train_samples = pickle.load(open(args.train_token, 'rb'))
dev_samples = pickle.load(open(args.eval_token, 'rb'))
logger.info('Finished loading tokenization results.')
# build token set
all_token_set = set()
for sample in train_samples + dev_samples:
for token in sample['query_tokens'] + sample['document_tokens']:
all_token_set.add(token)
logger.info('Finished making tokenization results into token set.')
# load stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))
logger.info('Finished loading stopwords list.')
# retrieve synsets
logger.info('Begin to retrieve synsets...')
token2synset = dict()
stopword_cnt = 0
punctuation_cnt = 0
for token in tqdm(all_token_set):
if token in set(string.punctuation):
logger.info('{} is punctuation, skipped!'.format(token))
punctuation_cnt += 1
continue
if args.no_stopwords and token in stopwords:
logger.info('{} is stopword, skipped!'.format(token))
stopword_cnt += 1
continue
if args.ignore_length > 0 and len(token) <= args.ignore_length:
logger.info('{} is too short, skipped!'.format(token))
continue
synsets = wn.synsets(token)
wn18synset_names = []
for synset in synsets:
offset_str = str(synset.offset()).zfill(8)
if offset_str in offset_to_wn18name_dict:
wn18synset_names.append(offset_to_wn18name_dict[offset_str])
if len(wn18synset_names) > 0:
token2synset[token] = wn18synset_names
logger.info('Finished retrieving synsets.')
logger.info('{} / {} tokens retrieved at least 1 synset. {} stopwords and {} punctuation tokens skipped.'.format(len(token2synset), len(all_token_set), stopword_cnt, punctuation_cnt))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
with open(os.path.join(args.output_dir, 'retrived_synsets.data'), 'wb') as fout:
pickle.dump(token2synset, fout)
logger.info('Finished dumping retrieved synsets.')
if __name__ == '__main__':
main()
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This script performs the same tokenization process as run_record.py and dumps the tokenization results.
# Compared with v1: query and passage entity spans are added to the output.
import argparse
import logging
import json
import os
import pickle
from tqdm import tqdm, trange
import tokenization
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class ReCoRDExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self,
qas_id,
question_text,
doc_tokens,
passage_entities,
orig_answer_text=None,
start_position=None,
end_position=None):
self.passage_entities = passage_entities
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
return s
# the tokenization process when reading examples
def read_record_examples(input_file, is_training):
"""Read a ReCoRD json file into a list of ReCoRDExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
paragraph_text = entry["passage"]["text"].replace('\xa0', ' ')
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
# load entities in passage
passage_entities = []
for entity in entry['passage']['entities']:
entity_start_offset = entity['start']
entity_end_offset = entity['end']
if entity_end_offset < entity_start_offset: # skip mislabeled entities in the ReCoRD dataset
continue
entity_text = paragraph_text[entity_start_offset: entity_end_offset + 1]
passage_entities.append({'orig_text': entity_text,
'start_position': char_to_word_offset[entity_start_offset],
'end_position': char_to_word_offset[entity_end_offset]})
for qa in entry["qas"]:
qas_id = qa["id"]
question_text = qa["query"].replace('\xa0', ' ')
start_position = None
end_position = None
orig_answer_text = None
if is_training:
# if len(qa["answers"]) != 1:
# raise ValueError(
# "For training, each question should have exactly 1 answer.")
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
example = ReCoRDExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
passage_entities=passage_entities,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
examples.append(example)
return examples
def _improve_entity_span(doc_tokens, input_start, input_end, tokenizer,
orig_entity_text):
"""Returns token-level tokenized entity spans that better match the annotated entity."""
tok_entity_text = " ".join(tokenizer.basic_tokenizer.tokenize(orig_entity_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_entity_text:
return (new_start, new_end)
return (input_start, input_end)
def _is_real_subspan(start, end, other_start, other_end):
return (start >= other_start and end < other_end) or (start > other_start and end <= other_end)
def match_query_entities(query_tokens, document_entities, document_tokens):
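# Build a character-offset -> token-index map over the whitespace-joined query,
# string-match every document entity against it, keep only matches aligned on
# token boundaries, and drop spans strictly contained in another matched span.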
# transform query_tokens list into a whitespace separated string
query_string = " ".join(query_tokens)
offset_to_tid_map = []
tid = 0
for char in query_string:
offset_to_tid_map.append(tid)
if char == ' ':
tid += 1
# transform entity_tokens into whitespace separated strings
entity_strings = set()
for document_entity in document_entities:
entity_tokens = document_tokens[document_entity[1]: document_entity[2] + 1]
entity_strings.add(" ".join(entity_tokens))
# do matching
results = []
for entity_string in entity_strings:
start = 0
while True:
pos = query_string.find(entity_string, start)
if pos == -1:
break
token_start, token_end = offset_to_tid_map[pos], offset_to_tid_map[pos] + entity_string.count(' ')
# ensure the match is not a partial match (e.g. "ville" matching inside "danville")
if " ".join(query_tokens[token_start: token_end + 1]) == entity_string:
results.append((token_start, token_end))
start = pos + len(entity_string)
# filter out a result span if it's a subspan of another span
no_subspan_results = []
for result in results:
if not any([_is_real_subspan(result[0], result[1], other_result[0], other_result[1]) for other_result in results]):
no_subspan_results.append((" ".join(query_tokens[result[0]: result[1] + 1]), result[0], result[1]))
assert len(no_subspan_results) == len(set(no_subspan_results))
return no_subspan_results
# the further tokenization process when generating features
def tokenization_on_examples(examples, tokenizer):
tokenization_result = []
for example in tqdm(examples):
# do tokenization on raw question text
query_subtokens = []
query_sub_to_ori_index = [] # mapping from sub-token index to token index
query_tokens = tokenizer.basic_tokenizer.tokenize(example.question_text)
for index, token in enumerate(query_tokens):
for sub_token in tokenizer.wordpiece_tokenizer.tokenize(token):
query_subtokens.append(sub_token)
query_sub_to_ori_index.append(index)
# do tokenization on whitespace tokenized document
document_tokens = []
document_subtokens = []
document_sub_to_ori_index = []
document_up_to_ori_index = [] # map whitespace-token index (before punctuation splitting) to the index in document_tokens
for unpunc_tokenized_tokens in example.doc_tokens:
tokens = tokenizer.basic_tokenizer.tokenize(unpunc_tokenized_tokens) # do punctuation tokenization
document_up_to_ori_index.append(len(document_tokens))
for token in tokens:
for sub_token in tokenizer.wordpiece_tokenizer.tokenize(token):
document_subtokens.append(sub_token)
document_sub_to_ori_index.append(len(document_tokens))
document_tokens.append(token)
# generate token-level document entity index
document_entities = []
for entity in example.passage_entities:
entity_start_position = document_up_to_ori_index[entity['start_position']]
entity_end_position = None
if entity['end_position'] < len(example.doc_tokens) - 1:
entity_end_position = document_up_to_ori_index[entity['end_position'] + 1] - 1
else:
entity_end_position = len(document_tokens) - 1
(entity_start_position, entity_end_position) = _improve_entity_span(
document_tokens, entity_start_position, entity_end_position, tokenizer, entity['orig_text'])
document_entities.append((entity['orig_text'], entity_start_position, entity_end_position)) # ('Trump', 10, 10)
# match query to passage entities
query_entities = match_query_entities(query_tokens, document_entities, document_tokens) # [('trump', 10, 10)]
tokenization_result.append({
'id': example.qas_id,
'query_tokens': query_tokens,
'query_subtokens': query_subtokens,
'query_sub_to_ori_index': query_sub_to_ori_index,
'query_entities': query_entities,
'document_tokens': document_tokens,
'document_subtokens': document_subtokens,
'document_entities': document_entities,
'document_sub_to_ori_index': document_sub_to_ori_index,
})
return tokenization_result
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", default='tokens', type=str,
help="The output directory to dump tokenization results.")
parser.add_argument("--train_file", default='../../data/ReCoRD/train.json', type=str, help="ReCoRD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default='../../data/ReCoRD/dev.json', type=str,
help="ReCoRD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
# parser.add_argument("--do_lower_case", default=False, action='store_true',
# help="Whether to lower case the input text. Should be True for uncased "
# "models and False for cased models.")
# parser.add_argument('--dump_token', action='store_true', help='whether dump the token-level tokenization result')
# parser.add_argument('--dump_subtoken', action='store_true', help='whether dump the subtoken-level tokenization result, with its mapping with token-level result')
args = parser.parse_args()
# make output directory if not exist
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
# We do both cased and uncased tokenization
for do_lower_case in (True, False):
tokenizer = tokenization.FullTokenizer(
vocab_file='vocab.{}.txt'.format('uncased' if do_lower_case else 'cased'), do_lower_case=do_lower_case)
train_examples = read_record_examples(input_file=args.train_file, is_training=True)
train_tokenization_result = tokenization_on_examples(
examples=train_examples,
tokenizer=tokenizer)
with open(os.path.join(args.output_dir, 'train.tokenization.{}.data'.format('uncased' if do_lower_case else 'cased')), 'wb') as fout:
pickle.dump(train_tokenization_result, fout)
logger.info('Finished {} tokenization for train set.'.format('uncased' if do_lower_case else 'cased'))
eval_examples = read_record_examples(input_file=args.predict_file, is_training=False)
eval_tokenization_result = tokenization_on_examples(
examples=eval_examples,
tokenizer=tokenizer)
with open(os.path.join(args.output_dir, 'dev.tokenization.{}.data'.format('uncased' if do_lower_case else 'cased')), 'wb') as fout:
pickle.dump(eval_tokenization_result, fout)
logger.info('Finished {} tokenization for dev set.'.format('uncased' if do_lower_case else 'cased'))
if __name__ == "__main__":
main()
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r") as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def convert_tokens_to_ids(vocab, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(vocab[token])
return ids
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_tokens_to_ids(self.vocab, tokens)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
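The tokenizer module above is what the preprocessing script below imports. A minimal usage sketch (not part of the repository; "vocab.cased.txt" is a hypothetical local path to a BERT WordPiece vocabulary file):

import tokenization

# Hypothetical vocab path; any BERT WordPiece vocab file can be substituted here.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.cased.txt", do_lower_case=False)

tokens = tokenizer.tokenize("unaffable")
# Per the WordpieceTokenizer docstring, a standard BERT vocab splits this into
# ["un", "##aff", "##able"].
ids = tokenizer.convert_tokens_to_ids(tokens)

FullTokenizer simply chains BasicTokenizer (text cleanup, punctuation splitting, optional lower-casing) with WordpieceTokenizer (greedy longest-match-first subword lookup against the vocabulary).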
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This script performs the same tokenization process as run_squad.py and dumps the tokenization results to disk.
# Compared with v1, it additionally records query and passage entity spans in the output.
import argparse
import logging
import json
import os
import pickle
from tqdm import tqdm, trange
import tokenization
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
class SQuADExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self,
qas_id,
question_text,
question_entities_strset,
doc_tokens,
passage_entities,
orig_answer_text=None,
start_position=None,
end_position=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.passage_entities = passage_entities
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.question_entities_strset = question_entities_strset
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.end_position:
s += ", end_position: %d" % (self.end_position)
return s
# the tokenization process when reading examples
def read_squad_examples(input_file, is_training):
"""Read a SQuAD json file into a list of SQuADExample."""
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
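# Illustrative example (not taken from the data): for paragraph_text "Hi there",
# doc_tokens becomes ["Hi", "there"] and char_to_word_offset becomes
# [0, 0, 0, 1, 1, 1, 1, 1]; every character, including the separating space,
# maps to the index of the whitespace token it belongs to (spaces map to the
# preceding token).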
# load entities in passage
passage_entities = []
for entity in paragraph['context_entities']:
entity_start_offset = entity['start']
entity_end_offset = entity['end']
entity_text = entity['text']
assert entity_text == paragraph_text[entity_start_offset: entity_end_offset + 1]
passage_entities.append({'orig_text': entity_text,
'start_position': char_to_word_offset[entity_start_offset],
'end_position': char_to_word_offset[entity_end_offset]})
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
question_entities_strset = set([entity_info["text"] for entity_info in qa["question_entities"]])
start_position = None
end_position = None
orig_answer_text = None
if is_training:
if len(qa["answers"]) != 1:
raise ValueError(
"For training, each question should have exactly 1 answer.")
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
example = SQuADExample(
qas_id=qas_id,
question_text=question_text,
question_entities_strset=question_entities_strset,
doc_tokens=doc_tokens,
passage_entities=passage_entities,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
examples.append(example)
return examples
def _improve_entity_span(doc_tokens, input_start, input_end, tokenizer,
orig_entity_text):
"""Returns token-level tokenized entity spans that better match the annotated entity."""
tok_entity_text = " ".join(tokenizer.basic_tokenizer.tokenize(orig_entity_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_entity_text:
return (new_start, new_end)
return (input_start, input_end)
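# Illustrative example (not taken from the data): if document_tokens contains
# [..., "Denver", "Broncos", ".", ...] and the whitespace-derived span for the
# entity text "Denver Broncos" covers all three tokens, the search above narrows
# the span so that it ends at "Broncos", dropping the trailing punctuation token.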
def _is_real_subspan(start, end, other_start, other_end):
return (start >= other_start and end < other_end) or (start > other_start and end <= other_end)
def match_query_entities(query_tokens, entities_tokens):
# transform query_tokens list into a whitespace separated string
query_string = " ".join(query_tokens)
offset_to_tid_map = []
tid = 0
for char in query_string:
offset_to_tid_map.append(tid)
if char == ' ':
tid += 1
# transform entity_tokens into whitespace separated strings
entity_strings = set()
for entity_tokens in entities_tokens:
entity_strings.add(" ".join(entity_tokens))
# do matching
results = []
for entity_string in entity_strings:
start = 0
while True:
pos = query_string.find(entity_string, start)
if pos == -1:
break
token_start, token_end = offset_to_tid_map[pos], offset_to_tid_map[pos] + entity_string.count(' ')
# make sure the match is not a partial match (e.g. "ville" matching inside "danville")
if " ".join(query_tokens[token_start: token_end + 1]) == entity_string:
results.append((token_start, token_end))
start = pos + len(entity_string)
# filter out a result span if it's a subspan of another span
no_subspan_results = []
for result in results:
if not any([_is_real_subspan(result[0], result[1], other_result[0], other_result[1]) for other_result in results]):
no_subspan_results.append((" ".join(query_tokens[result[0]: result[1] + 1]), result[0], result[1]))
assert len(no_subspan_results) == len(set(no_subspan_results))
return no_subspan_results
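# Illustrative example (not taken from the data): for
# query_tokens = ["who", "is", "donald", "trump"] and
# entities_tokens = [["donald", "trump"], ["trump"]], the function returns
# [("donald trump", 2, 3)]; the inner match ("trump", 3, 3) is filtered out
# because it is a proper subspan of the longer match.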
# the additional tokenization performed when generating features
def tokenization_on_examples(examples, tokenizer):
tokenization_result = []
for example in tqdm(examples):
# do tokenization on raw question text
query_subtokens = []
query_sub_to_ori_index = [] # mapping from sub-token index to token index
query_tokens = tokenizer.basic_tokenizer.tokenize(example.question_text)
for index, token in enumerate(query_tokens):
for sub_token in tokenizer.wordpiece_tokenizer.tokenize(token):
query_subtokens.append(sub_token)
query_sub_to_ori_index.append(index)
# do tokenization on whitespace tokenized document
document_tokens = []
document_subtokens = []
document_sub_to_ori_index = []
document_up_to_ori_index = []  # map each whitespace-token index to the index of its first basic-tokenized token in document_tokens
for unpunc_tokenized_tokens in example.doc_tokens:
tokens = tokenizer.basic_tokenizer.tokenize(unpunc_tokenized_tokens) # do punctuation tokenization
document_up_to_ori_index.append(len(document_tokens))
for token in tokens:
for sub_token in tokenizer.wordpiece_tokenizer.tokenize(token):
document_subtokens.append(sub_token)
document_sub_to_ori_index.append(len(document_tokens))
document_tokens.append(token)
# generate token-level document entity index
document_entities = []
for entity in example.passage_entities:
entity_start_position = document_up_to_ori_index[entity['start_position']]
entity_end_position = None
if entity['end_position'] < len(example.doc_tokens) - 1:
entity_end_position = document_up_to_ori_index[entity['end_position'] + 1] - 1
else:
entity_end_position = len(document_tokens) - 1
(entity_start_position, entity_end_position) = _improve_entity_span(
document_tokens, entity_start_position, entity_end_position, tokenizer, entity['orig_text'])
document_entities.append((entity['orig_text'], entity_start_position, entity_end_position)) # ('Trump', 10, 10)
# match query entities (including tagged and document entities)
entities_tokens = []
for question_entity_str in example.question_entities_strset:
entities_tokens.append(tokenizer.basic_tokenizer.tokenize(question_entity_str))
for document_entity in document_entities:
entities_tokens.append(document_tokens[document_entity[1]: document_entity[2] + 1])
query_entities = match_query_entities(query_tokens, entities_tokens) # [('trump', 10, 10)]
tokenization_result.append({
'id': example.qas_id,
'query_tokens': query_tokens,
'query_subtokens': query_subtokens,
'query_entities': query_entities,
'query_sub_to_ori_index': query_sub_to_ori_index,
'document_tokens': document_tokens,
'document_subtokens': document_subtokens,
'document_entities': document_entities,
'document_sub_to_ori_index': document_sub_to_ori_index,
})
return tokenization_result
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", default='tokens', type=str,
help="The output directory to dump tokenization results.")
parser.add_argument("--train_file", default='../ner_tagging_squad/output/train-v1.1.tagged.json', type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default='../ner_tagging_squad/output/dev-v1.1.tagged.json', type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
# parser.add_argument("--do_lower_case", default=False, action='store_true',
# help="Whether to lower case the input text. Should be True for uncased "
# "models and False for cased models.")
# parser.add_argument('--dump_token', action='store_true', help='whether dump the token-level tokenization result')
# parser.add_argument('--dump_subtoken', action='store_true', help='whether dump the subtoken-level tokenization result, with its mapping with token-level result')
args = parser.parse_args()
# make output directory if not exist
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
# We do both cased and uncased tokenization
for do_lower_case in (True, False):
tokenizer = tokenization.FullTokenizer(
vocab_file='vocab.{}.txt'.format('uncased' if do_lower_case else 'cased'), do_lower_case=do_lower_case)
train_examples = read_squad_examples(input_file=args.train_file, is_training=True)
train_tokenization_result = tokenization_on_examples(
examples=train_examples,
tokenizer=tokenizer)
with open(os.path.join(args.output_dir, 'train.tokenization.{}.data'.format('uncased' if do_lower_case else 'cased')), 'wb') as fout:
pickle.dump(train_tokenization_result, fout)
logger.info('Finished {} tokenization for train set.'.format('uncased' if do_lower_case else 'cased'))
eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False)
eval_tokenization_result = tokenization_on_examples(
examples=eval_examples,
tokenizer=tokenizer)
with open(os.path.join(args.output_dir, 'dev.tokenization.{}.data'.format('uncased' if do_lower_case else 'cased')), 'wb') as fout:
pickle.dump(eval_tokenization_result, fout)
logger.info('Finished {} tokenization for dev set.'.format('uncased' if do_lower_case else 'cased'))
if __name__ == "__main__":
main()
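The script writes one pickled list per split and per casing into the output directory. A minimal sketch of how the dumped result can be inspected afterwards (the path below follows the script's default --output_dir and naming scheme; the field names are the dict keys built in tokenization_on_examples):

import pickle

# "tokens" is the script's default --output_dir.
with open("tokens/dev.tokenization.cased.data", "rb") as fin:
    records = pickle.load(fin)

first = records[0]
print(first["id"])                      # SQuAD question id
print(first["query_tokens"][:10])       # basic-tokenized question
print(first["document_entities"][:5])   # (orig_text, start_token, end_token) triples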
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r") as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def convert_tokens_to_ids(vocab, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(vocab[token])
return ids
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_tokens_to_ids(self.vocab, tokens)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False