Commit f8658874 authored by Yang An, committed by Yibing Liu

Add the workspace of ACL2019-KTNET into PaddleNLP Research Version (#3244)

* add readme for KTNET

* update readme

* update readme

* update readme

* update readme of KTNET

* update readme of KTNET

* add source files for KTNET

* update files for KTNET

* update files for KTNET

* update draft of readme for KTNET

* modified scripts for KTNET

* fix typos in readme.md for KTNET

* update scripts for KTNET

* update scripts for KTNET

* update readme for KTNET

* edit two-staged training scripts for KTNET

* add details in the readme of KTNET

* fix typos in the readme of KTNET

* added eval scripts for KTNET

* rename folders for KTNET

* add copyright in the code and add links in readme for KTNET

* add the remaining download link for KTNET

* add md5sum for KTNET

* final version for KTNET
Parent d6c65111
ad550852cf26241b20e8364e40340a99 train.json
60c70c4a7e8190483f9899a1c9bc4178 dev.json
df45d93b87ca3c47b54a33e03fabf719 record_official_evaluate.py
981b29407e0affa3b1b156f72073b945 train-v1.1.json
3e85deb501d4e538b6bc56f786231552 dev-v1.1.json
afb04912d18ff20696f7f88eed49bea9 squad_v1_official_evaluate.py
64010b964ae2ebf00148b3519a4aafc8 KTNET_preprocess_squad_tagging_output.tar.gz
e9352221127b7620427c18e39bfae7fc KTNET_preprocess_tokenize_result_record.tar.gz
e52da2b1d096e889d32df267b82f9c77 KTNET_preprocess_tokenize_result_squad.tar.gz
89db2f5cfb07f0c44998d7f49098eb90 KTNET_preprocess_wordnet_concepts.tar.gz
fb62db2fe82d88480ec853f3c6fa237a NELL.08m.1115.esv.csv.gz
a68e68f9dcf4524b356163369c7f9f50 KTNET_preprocess_nell_concepts.tar.gz
d9b62183c6367ffac3ee6f864c9425a5 wn_concept2vec.txt
1f69c3d092089b0a0652616b72d61bd8 nell_concept2vec.txt
5405c050e64fee4ffec17ee50f079b64 cased_L-24_H-1024_A-16.tar.gz
4bd6e911cdad39c543ba8922a70580cd KTNET_fine-tuned-model_record_both.tar.gz
43fa464d6aeabe6dc7a15315d4ea8288 KTNET_fine-tuned-model_record_nell.tar.gz
20aaefead331f64e435a94ac8a7b58aa KTNET_fine-tuned-model_record_wordnet.tar.gz
3abdb7be3fc5e3b98633c918acc25af4 KTNET_fine-tuned-model_squad_both.tar.gz
9232cf27adda9d64265ccb315e1b9c81 KTNET_fine-tuned-model_squad_nell.tar.gz
a36fdd6d5c88e3e931bb3b28f9aeb4e2 KTNET_fine-tuned-model_squad_wordnet.tar.gz
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1
python3 src/run_record.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
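# Usage sketch (an editorial addition; the exact invocation is an assumption):
# each evaluation script in this commit expects the directory of a fine-tuned
# checkpoint as its first positional argument, e.g.
#   bash <this_eval_script>.sh /path/to/fine-tuned/checkpoint
# The run log is redirected to log/train.log, as set up above.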
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1
python3 src/run_record_twomemory.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
CKPT_DIR=$1
python3 src/run_record.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1
python3 src/run_squad.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1
python3 src/run_squad_twomemory.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
CKPT_DIR=$1
python3 src/run_squad.py \
--batch_size 6 \
--do_train false \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--init_checkpoint $CKPT_DIR \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint record_nell_first_stage_output/step_41970 \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d record_nell_first_stage_log ]; then
mkdir record_nell_first_stage_log
else
rm -r record_nell_first_stage_log/*
fi
if [ ! -d record_nell_first_stage_output ]; then
mkdir record_nell_first_stage_output
else
rm -r record_nell_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-4 \
--epoch 10 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints record_nell_first_stage_output/ 1>$PWD_DIR/record_nell_first_stage_log/train.log 2>&1
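# Note added for clarity (editorial, not part of the original script): this is
# the first stage of the two-staged ReCoRD+NELL training -- BERT is frozen
# (--freeze true) and only the knowledge-integration layers are trained for 10
# epochs at lr 3e-4 with no warmup. The second-stage script above then resumes
# full fine-tuning from record_nell_first_stage_output/step_41970 with
# --freeze false, lr 3e-5 and warmup 0.1; that step_* name is presumably the
# final checkpoint of this stage and will differ if the batch size, epoch
# count or dataset changes.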
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint record_both_first_stage_output/step_41970 \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d record_both_first_stage_log ]; then
mkdir record_both_first_stage_log
else
rm -r record_both_first_stage_log/*
fi
if [ ! -d record_both_first_stage_output ]; then
mkdir record_both_first_stage_output
else
rm -r record_both_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_record_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-4 \
--epoch 10 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints record_both_first_stage_output/ 1>$PWD_DIR/record_both_first_stage_log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint record_wn_first_stage_output/step_41970 \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 4 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d record_wn_first_stage_log ]; then
mkdir record_wn_first_stage_log
else
rm -r record_wn_first_stage_log/*
fi
if [ ! -d record_wn_first_stage_output ]; then
mkdir record_wn_first_stage_output
else
rm -r record_wn_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_record.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/ReCoRD/train.json \
--predict_file $DATA/ReCoRD/dev.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-4 \
--epoch 10 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints record_wn_first_stage_output/ 1>$PWD_DIR/record_wn_first_stage_log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint sqd_nell_first_stage_output/step_3649 \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d sqd_nell_first_stage_log ]; then
mkdir sqd_nell_first_stage_log
else
rm -r sqd_nell_first_stage_log/*
fi
if [ ! -d sqd_nell_first_stage_output ]; then
mkdir sqd_nell_first_stage_output
else
rm -r sqd_nell_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-5 \
--epoch 1 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_nell true \
--random_seed 45 \
--checkpoints sqd_nell_first_stage_output/ 1>$PWD_DIR/sqd_nell_first_stage_log/train.log 2>&1
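# Note added for clarity (editorial, not part of the original script): the
# SQuAD scripts follow the same two-staged scheme as the ReCoRD ones -- a
# frozen first stage (here a single epoch at lr 3e-5, no warmup) whose
# checkpoint, sqd_nell_first_stage_output/step_3649, is then picked up by the
# second-stage script above for full fine-tuning.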
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint sqd_both_first_stage_output/step_3649 \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d sqd_both_first_stage_log ]; then
mkdir sqd_both_first_stage_log
else
rm -r sqd_both_first_stage_log/*
fi
if [ ! -d sqd_both_first_stage_output ]; then
mkdir sqd_both_first_stage_output
else
rm -r sqd_both_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
python3 src/run_squad_twomemory.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-5 \
--epoch 1 \
--max_seq_len 384 \
--doc_stride 128 \
--wn_concept_embedding_path $WN_CPT_EMBEDDING_PATH \
--nell_concept_embedding_path $NELL_CPT_EMBEDDING_PATH \
--use_wordnet true \
--use_nell true \
--random_seed 45 \
--checkpoints sqd_both_first_stage_output/ 1>$PWD_DIR/sqd_both_first_stage_log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d log ]; then
mkdir log
else
rm -r log/*
fi
if [ ! -d output ]; then
mkdir output
else
rm -r output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--do_lower_case false \
--init_checkpoint sqd_wn_first_stage_output/step_3649 \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze false \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--learning_rate 3e-5 \
--epoch 3 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints output/ 1>$PWD_DIR/log/train.log 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
if [ ! -d sqd_wn_first_stage_log ]; then
mkdir sqd_wn_first_stage_log
else
rm -r sqd_wn_first_stage_log/*
fi
if [ ! -d sqd_wn_first_stage_output ]; then
mkdir sqd_wn_first_stage_output
else
rm -r sqd_wn_first_stage_output/*
fi
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=`pwd`
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
python3 src/run_squad.py \
--batch_size 6 \
--do_train true \
--do_predict true \
--use_ema false \
--do_lower_case false \
--init_pretraining_params $BERT_DIR/params \
--train_file $DATA/SQuAD/train-v1.1.json \
--predict_file $DATA/SQuAD/dev-v1.1.json \
--vocab_path $BERT_DIR/vocab.txt \
--bert_config_path $BERT_DIR/bert_config.json \
--freeze true \
--save_steps 4000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--learning_rate 3e-5 \
--epoch 1 \
--max_seq_len 384 \
--doc_stride 128 \
--concept_embedding_path $CPT_EMBEDDING_PATH \
--use_wordnet true \
--random_seed 45 \
--checkpoints sqd_wn_first_stage_output/ 1>$PWD_DIR/sqd_wn_first_stage_log/train.log 2>&1
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add masks to batch_tokens and return out, mask_label, mask_pos.
Note: mask_pos refers to positions in batch_tokens after padding.
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure that at least one word per sentence is masked
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=None,
return_input_mask=True,
return_max_len=True,
return_num_token=False,
max_concept_length=50):
"""
1. generate Tensor of data
2. generate Tensor of position
3. generate self attention mask, [shape: batch_size * max_len * max_len]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_sent_ids = [inst[1] for inst in insts]
batch_pos_ids = [inst[2] for inst in insts]
batch_concept_ids = [inst[3] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(4, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
concept_ids = pad_batch_data(
batch_concept_ids, pad_idx=[],
max_concept_length=max_concept_length)  # pad with [0, 0, ...]
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, concept_ids, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, concept_ids, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False,
max_concept_length=50):
"""
Pad the instances to a fixed max sequence length (384), and generate the
corresponding position data and input mask.
"""
return_list = []
# max_len = max(len(inst) for inst in insts)
max_len = 384
# Any token included in dict can be used to pad, since the paddings' loss
will be masked out by weights and have no effect on parameter gradients.
if type(pad_idx) == list: # padding list, for concept_ids
inst_data = np.array(
[inst + list([0] * max_concept_length for x in range(max_len - len(inst))) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, max_len, max_concept_length, 1])]
else:
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
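    # Minimal smoke test added for illustration (not part of the original
    # module; the token ids below are made up). It pads two short id
    # sequences to the fixed max_len of 384 and prints the tensor shapes.
    demo_insts = [[1, 17, 53, 2], [1, 99, 2]]
    demo_src, demo_mask = pad_batch_data(
        demo_insts, pad_idx=0, return_input_mask=True)
    print(demo_src.shape)   # (2, 384, 1) int64 padded token ids
    print(demo_mask.shape)  # (2, 384, 1) float32 attention mask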
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add masks to batch_tokens and return out, mask_label, mask_pos.
Note: mask_pos refers to positions in batch_tokens after padding.
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure that at least one word per sentence is masked
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=None,
return_input_mask=True,
return_max_len=True,
return_num_token=False,
max_wn_concept_length=50,
max_nell_concept_length=50):
"""
1. generate Tensor of data
2. generate Tensor of position
3. generate self attention mask, [shape: batch_size * max_len * max_len]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_sent_ids = [inst[1] for inst in insts]
batch_pos_ids = [inst[2] for inst in insts]
batch_wn_concept_ids = [inst[3] for inst in insts]
batch_nell_concept_ids = [inst[4] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(5, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
wn_concept_ids = pad_batch_data(
batch_wn_concept_ids, pad_idx=[],
        max_concept_length=max_wn_concept_length)  # pad with [0, 0, ...]
nell_concept_ids = pad_batch_data(
batch_nell_concept_ids, pad_idx=[],
        max_concept_length=max_nell_concept_length)  # pad with [0, 0, ...]
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, wn_concept_ids, nell_concept_ids, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, wn_concept_ids, nell_concept_ids, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False,
max_concept_length=50):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
"""
return_list = []
# max_len = max(len(inst) for inst in insts)
max_len = 384
    # Any token included in the dict can be used for padding, since the paddings'
    # loss will be masked out by weights and has no effect on parameter gradients.
if type(pad_idx) == list: # padding list, for concept_ids
inst_data = np.array(
[inst + list([0] * max_concept_length for x in range(max_len - len(inst))) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, max_len, max_concept_length, 1])]
else:
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
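    # A minimal, self-contained sketch of the batching path (the toy ids, vocab
    # size and concept lists below are assumptions for illustration only; real
    # instances are produced by the data readers).
    toy_src_ids = [1, 15, 23, 2]
    toy_sent_ids = [0, 0, 0, 0]
    toy_pos_ids = [0, 1, 2, 3]
    toy_wn_concepts = [[0, 0], [5, 0], [7, 9], [0, 0]]
    toy_nell_concepts = [[0, 0], [3, 0], [0, 0], [0, 0]]
    toy_unique_id = 123
    toy_inst = [toy_src_ids, toy_sent_ids, toy_pos_ids,
                toy_wn_concepts, toy_nell_concepts, toy_unique_id]
    outputs = prepare_batch_data(
        [toy_inst],
        total_token_num=len(toy_src_ids),
        voc_size=100,
        pad_id=0,
        cls_id=1,
        sep_id=2,
        mask_id=-1,  # -1 skips the MLM masking step
        max_wn_concept_length=2,
        max_nell_concept_length=2)
    for name, tensor in zip(
            ["src_id", "pos_id", "sent_id", "wn_concept_ids",
             "nell_concept_ids", "input_mask", "unique_id"], outputs):
        print(name, tensor.shape)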
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import logging
import numpy as np
import paddle.fluid as fluid
from model.transformer_encoder import encoder, pre_process_layer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class BertConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
logger.info('%s: %s' % (arg, value))
logger.info('------------------------------------------------')
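# Hypothetical usage sketch (the config file name below is an assumption based
# on the standard BERT release layout, not something verified in this excerpt):
#   bert_config = BertConfig("cased_L-24_H-1024_A-16/bert_config.json")
#   bert_config.print_config()
#   hidden_size = bert_config['hidden_size']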
class BertModel(object):
def __init__(self,
src_ids,
position_ids,
sentence_ids,
input_mask,
config,
weight_sharing=True,
use_fp16=False):
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
self._sent_types = config['type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._weight_sharing = weight_sharing
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._dtype = "float16" if use_fp16 else "float32"
        # Initialize all weights with a truncated normal initializer; all biases
        # are initialized to zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._build_model(src_ids, position_ids, sentence_ids, input_mask)
def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._dtype,
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
position_emb_out = fluid.layers.embedding(
input=position_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._dtype,
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sentence_ids,
size=[self._sent_types, self._emb_size],
dtype=self._dtype,
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
if self._dtype == "float16":
input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
# self_attn_mask = fluid.layers.matmul(
# x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.expand(fluid.layers.transpose(input_mask, [0, 2, 1]), [1, 384, 1])
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
self._enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name='encoder')
def get_sequence_output(self):
return self._enc_out
def get_pooled_output(self):
"""Get the first feature of each sequence for classification"""
next_sent_feat = fluid.layers.slice(
input=self._enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0")
return next_sent_feat
def get_pretraining_output(self, mask_label, mask_pos, labels):
"""Get the loss & accuracy for pretraining"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
# extract the first token feature in each sentence
next_sent_feat = self.get_pooled_output()
reshaped_emb_out = fluid.layers.reshape(
x=self._enc_out, shape=[-1, self._emb_size])
# extract masked tokens' feature
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
# transform: fc
mask_trans_feat = fluid.layers.fc(
input=mask_feat,
size=self._emb_size,
act=self._hidden_act,
param_attr=fluid.ParamAttr(
name='mask_lm_trans_fc.w_0',
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
# transform: layer norm
mask_trans_feat = pre_process_layer(
mask_trans_feat, 'n', name='mask_lm_trans')
mask_lm_out_bias_attr = fluid.ParamAttr(
name="mask_lm_out_fc.b_0",
initializer=fluid.initializer.Constant(value=0.0))
if self._weight_sharing:
fc_out = fluid.layers.matmul(
x=mask_trans_feat,
y=fluid.default_main_program().global_block().var(
self._word_emb_name),
transpose_y=True)
fc_out += fluid.layers.create_parameter(
shape=[self._voc_size],
dtype=self._dtype,
attr=mask_lm_out_bias_attr,
is_bias=True)
else:
fc_out = fluid.layers.fc(input=mask_trans_feat,
size=self._voc_size,
param_attr=fluid.ParamAttr(
name="mask_lm_out_fc.w_0",
initializer=self._param_initializer),
bias_attr=mask_lm_out_bias_attr)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
logits=fc_out, label=mask_label)
mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
next_sent_fc_out = fluid.layers.fc(
input=next_sent_feat,
size=2,
param_attr=fluid.ParamAttr(
name="next_sent_fc.w_0", initializer=self._param_initializer),
bias_attr="next_sent_fc.b_0")
next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
logits=next_sent_fc_out, label=labels, return_softmax=True)
next_sent_acc = fluid.layers.accuracy(
input=next_sent_softmax, label=labels)
mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
loss = mean_next_sent_loss + mean_mask_lm_loss
return next_sent_acc, mean_mask_lm_loss, loss
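# A hedged construction sketch (placeholder names and the fixed 384-token length
# are assumptions mirroring the hard-coded sequence length used elsewhere in this
# repo; the actual graph construction lives in the run scripts):
#   src_ids = fluid.layers.data(name='src_ids', shape=[384, 1], dtype='int64')
#   pos_ids = fluid.layers.data(name='pos_ids', shape=[384, 1], dtype='int64')
#   sent_ids = fluid.layers.data(name='sent_ids', shape=[384, 1], dtype='int64')
#   input_mask = fluid.layers.data(name='input_mask', shape=[384, 1], dtype='float32')
#   bert = BertModel(src_ids, pos_ids, sent_ids, input_mask,
#                    config=BertConfig('bert_config.json'))
#   sequence_output = bert.get_sequence_output()  # [batch_size, 384, hidden_size]
#   pooled_output = bert.get_pooled_output()      # [batch_size, hidden_size]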
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""bert model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import sys
import six
import logging
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.layers import shape
from model.transformer_encoder import encoder, pre_process_layer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def dynamic_expand(dynamic_tensor, smaller_tensor):
"""
:param dynamic_tensor:
:param smaller_tensor:
:return:
"""
assert len(dynamic_tensor.shape) > len(smaller_tensor.shape)
if type(smaller_tensor.shape) == list:
for dim_idx, dim in smaller_tensor.shape:
dynamic_tensor_dim_idx = len(dynamic_tensor) - len(smaller_tensor) + dim_idx
assert dynamic_tensor.shape[dynamic_tensor_dim_idx] % dim == 0
elif type(smaller_tensor.shape) == int:
assert dynamic_tensor.shape[-1] % smaller_tensor.shape == 0
memory_embs_zero = fluid.layers.scale(dynamic_tensor, scale=0.0)
smaller_tensor = fluid.layers.elementwise_add(memory_embs_zero, smaller_tensor)
return smaller_tensor
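# Illustrative shapes (an assumption for exposition, matching the call sites
# below): with dynamic_tensor of shape [batch_size, seq_size, 1+concept_size]
# and smaller_tensor of shape [1+concept_size], the zero-add broadcast returns
# smaller_tensor tiled to [batch_size, seq_size, 1+concept_size], so the result
# inherits the runtime batch/sequence dimensions without an explicit expand.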
def print_tensor(tensor, message, print_runtime=False):
logger.info("{}: {}".format(message, tensor.shape))
if print_runtime:
fluid.layers.Print(tensor, summarize=10, message=message)
class MemoryLayer(object):
def __init__(self, bert_config, concept_size, mem_emb_size, mem_method='cat', prefix=None):
self.initializer_range = bert_config['initializer_range']
self.bert_size = bert_config['hidden_size']
self.concept_size = concept_size
self.mem_emb_size = mem_emb_size
assert mem_method in ['add', 'cat', 'raw']
self.mem_method = mem_method
self.prefix = prefix
def forward(self, bert_output, memory_embs, mem_length, ignore_no_memory_token=True):
"""
:param bert_output: [batch_size, seq_size, bert_size]
:param memory_embs: [batch_size, seq_size, concept_size, mem_emb_size]
:param mem_length: [batch_size, sent_size, 1]
:return:
"""
bert_size = self.bert_size
concept_size = self.concept_size
mem_emb_size = self.mem_emb_size
print_tensor(bert_output, "bert_output")
print_tensor(memory_embs, "memory_embs")
print_tensor(mem_length, "mem_length")
projected_bert = fluid.layers.fc(bert_output, size=mem_emb_size, num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name='{}_memory_layer_projection.w_0'.format(self.prefix) if self.prefix else 'memory_layer_projection.w_0',
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=self.initializer_range)),
bias_attr=False) # [batch_size *seq_size, mem_emb_size]
logger.info("projected_bert: {}".format(projected_bert.shape))
expanded_bert = fluid.layers.unsqueeze(projected_bert, axes=[2]) # [batch_size, seq_size, 1, mem_emb_size]
extended_memory, memory_score = self.add_sentinel(expanded_bert, memory_embs, mem_emb_size)
# extended_memory: [batch_size, seq_size, 1+concept_size, mem_emb_size]
# memory_score: [batch_size, seq_size, 1+concept_size]
concept_ordinal = self.get_concept_oridinal(concept_size, memory_score) # [bs,sq,1+cs]
memory_reverse_mask = fluid.layers.less_than(
fluid.layers.expand(mem_length, expand_times=[1, 1, 1 + concept_size])
, concept_ordinal)
# [batch_size, seq_size, 1+concept_size]
memory_reverse_mask = fluid.layers.cast(memory_reverse_mask, dtype="float32")
print_tensor(memory_reverse_mask, "memory_reverse_mask")
memory_reverse_masked_infinity = fluid.layers.scale(memory_reverse_mask, scale=-1e6)
# [batch_size, seq_size, 1+concept_size]
print_tensor(memory_reverse_masked_infinity, "memory_reverse_masked_infinity")
memory_score = fluid.layers.elementwise_add(memory_score, memory_reverse_masked_infinity)
# [batch_size, seq_size, 1+concept_size]
logger.info("memory_score:{}".format(memory_score.shape))
memory_att = fluid.layers.softmax(memory_score) # [batch_size, seq_size, 1+concept_size]
memory_att = fluid.layers.unsqueeze(memory_att, axes=[2]) # [batch_size, seq_size, 1, 1+concept_size]
logger.info("memory_att: {}".format(memory_att.shape))
logger.info("extended_memory: {}".format(extended_memory.shape))
summ = fluid.layers.matmul(memory_att,extended_memory) # [batch_size, seq_size,1, mem_emb_size]
summ = fluid.layers.squeeze(summ, axes=[2]) # [batch_size, seq_size,mem_emb_size]
if ignore_no_memory_token:
condition = fluid.layers.less_than(
dynamic_expand(mem_length, fluid.layers.zeros([1],"float32")),
mem_length) # [bs, sq]
# summ_true = fluid.layers.elementwise_mul(
# summ,
# fluid.layers.cast(condition, "float32")) # [bs, sq, ms]
# summ_false = fluid.layers.elementwise_mul(
# summ,
# fluid.layers.scale(fluid.layers.cast(condition, "float32"), -1)) # [bs, sq, ms]
# summ = fluid.layers.elementwise_add(summ_true, summ_false) # [bs, sq, ms]
summ = fluid.layers.elementwise_mul(
summ,
fluid.layers.cast(condition, "float32")) # [bs, sq, ms]
print_tensor(summ, "summ")
if self.mem_method == "add":
summ_transform = fluid.layers.fc(summ, size=bert_size, num_flatten_dims=2) # [batch_size, seq_size, bert_size]
output = fluid.layers.sums(input=[summ_transform, bert_output]) # [batch_size, seq_size, bert_size]
elif self.mem_method == "cat":
logger.info("bert_output: {}".format(bert_output.shape))
logger.info("summ: {}".format(summ.shape))
output = fluid.layers.concat(input=[bert_output, summ], axis=2) # [batch_size, seq_size, bert_size + mem_emb_size]
elif self.mem_method == "raw":
logger.info("bert_output: {}".format(bert_output.shape))
logger.info("summ: {}".format(summ.shape))
output = summ # [batch_size, seq_size, mem_emb_size]
else:
raise ValueError("mem_method not supported")
logger.info("output: {}".format(output.shape))
return output
def get_concept_oridinal(self, concept_size, memory_score):
"""
:param concept_size:
:param memory_score: [batch_size, seq_size, 1+concept_size]
:return:
"""
concept_ordinal = fluid.layers.create_tensor(dtype="float32")
fluid.layers.assign(np.arange(start=0, stop=(1 + concept_size), step=1, dtype=np.float32),
concept_ordinal) # [1+cs]
print_tensor(concept_ordinal, "concept_ordinal")
print_tensor(memory_score, "memory_score")
concept_ordinal = dynamic_expand(memory_score, concept_ordinal) # [bs,sq,1+cs]
logger.info("concept_ordinal: {}".format(concept_ordinal.shape))
return concept_ordinal
def add_sentinel(self, expanded_bert, memory_embs, mem_emb_size):
"""
:param expanded_bert: [batch_size, seq_size, 1, mem_emb_size]
:param memory_embs: [batch_size, seq_size, concept_size, mem_emb_size]
:param mem_emb_size:
:return:
"""
sentinel = fluid.layers.create_parameter(
name='{}_memory_layer_sentinel'.format(self.prefix) if self.prefix else 'memory_layer_sentinel',
dtype="float32",
shape=[mem_emb_size],
default_initializer=fluid.initializer.ConstantInitializer(0)) # [mem_emb_size]
print_tensor(sentinel, "sentinel")
memory_embs_squeeze = fluid.layers.slice(memory_embs, axes=[2], starts=[0],
ends=[1]) # [bs,sq,1,ms]
print_tensor(memory_embs_squeeze, "memory_embs_squeeze")
sentinel = dynamic_expand(memory_embs_squeeze, sentinel) # [bs,sq,1,ms]
print_tensor(sentinel, "sentinel")
print_tensor(memory_embs, "memory_embs")
extended_memory = fluid.layers.concat([sentinel, memory_embs],
axis=2) # [batch_size, seq_size, 1+concept_size, mem_emb_size]
extended_memory = fluid.layers.transpose(extended_memory, perm=[0, 1, 3, 2])
# [batch_size, seq_size, mem_emb_size, 1+concept_size]
logger.info("extended_memory: {}".format(extended_memory.shape))
memory_score = fluid.layers.matmul(expanded_bert,
extended_memory) # [batch_size, seq_size, 1, 1+concept_size]
memory_score = fluid.layers.squeeze(memory_score, axes=[2])
# [batch_size, seq_size, 1+concept_size]
extended_memory = fluid.layers.transpose(extended_memory, perm=[0, 1, 3, 2])
# [batch_size, seq_size, 1+concept_size, mem_emb_size]
return extended_memory, memory_score
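# Hypothetical wiring sketch for MemoryLayer (tensor names and sizes are
# assumptions; the actual graph construction is done in the run scripts, which
# are not part of this excerpt):
#   wn_memory_embs = fluid.layers.embedding(
#       wn_concept_ids, size=[wn_concept_vocab_size, wn_concept_dim],
#       param_attr=fluid.ParamAttr(name='wn_concept_emb_mat'))
#   wn_memory_layer = MemoryLayer(bert_config, concept_size=max_wn_concept_length,
#                                 mem_emb_size=wn_concept_dim, mem_method='cat',
#                                 prefix='wn')
#   memory_output = wn_memory_layer.forward(
#       bert_output, wn_memory_embs, wn_concept_length)
#   # memory_output: [batch_size, seq_size, bert_size + wn_concept_dim]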
class TriLinearTwoTimeSelfAttentionLayer(object):
def __init__(self, hidden_size, dropout_rate=0.0,
cat_mul=False, cat_sub=False, cat_twotime=False, cat_twotime_mul=False, cat_twotime_sub=False):
self.hidden_size = hidden_size
self.dropout_rate = dropout_rate
self.cat_mul = cat_mul
self.cat_sub = cat_sub
self.cat_twotime = cat_twotime
self.cat_twotime_mul = cat_twotime_mul
self.cat_twotime_sub = cat_twotime_sub
def forward(self, hidden_emb, sequence_mask):
"""
:param hidden_emb: [batch_size, seq_size, hidden_size]
:param sequence_mask: [batch_size, seq_size, 1]
:return:
"""
assert len(hidden_emb.shape) ==3 and len(sequence_mask.shape) == 3 \
and sequence_mask.shape[-1] == 1
assert hidden_emb.shape[:2] == sequence_mask.shape[:2]
hidden_size = self.hidden_size
bias = fluid.layers.create_parameter(name='self_matching_layer_bias', shape=[1], dtype="float32",
default_initializer=fluid.initializer.ConstantInitializer(0))
weight_1 = fluid.layers.create_parameter(name='self_matching_layer_weight1', shape=[hidden_size], dtype="float32",
default_initializer=fluid.initializer.XavierInitializer(uniform=True, fan_in=1, fan_out=hidden_size)) # [HS]
bs_1_hs = fluid.layers.slice(hidden_emb, axes=[1], starts=[0], ends=[1]) # [bs, 1, hs]
print_tensor(bs_1_hs, "bs_1_hs")
bs_hs_1 = fluid.layers.transpose(bs_1_hs, perm=[0, 2, 1]) # [bs, hs, 1]
print_tensor(bs_hs_1, "bs_hs_1")
print_tensor(weight_1, "weight_1")
weight_1 = dynamic_expand(bs_1_hs, weight_1) # [BS, 1, HS] (a)jk
weight_1 = fluid.layers.transpose(weight_1, perm=[0, 2, 1])
print_tensor(hidden_emb, "hidden_emb")
print_tensor(weight_1, "weight_1")
r1 = fluid.layers.matmul(hidden_emb, weight_1) # [BS, SQ, 1] aik
print_tensor(r1, "r1")
weight_2 = fluid.layers.create_parameter(name='self_matching_layer_weight2', shape=[hidden_size], dtype="float32",
default_initializer=fluid.initializer.XavierInitializer(uniform=True, fan_in=1, fan_out=hidden_size)) # [HS]
weight_2 = dynamic_expand(bs_1_hs, weight_2) # # [BS, 1, HS] (a)jk
hidden_emb_transpose = fluid.layers.transpose(hidden_emb, perm=[0, 2, 1]) # [BS, HS, SQ] aji
r2 = fluid.layers.matmul(weight_2, hidden_emb_transpose) # [BS, 1, SQ] aki
print_tensor(r2, "r2")
weight_mul = fluid.layers.create_parameter(name='self_matching_layer_weightmul', shape=[hidden_size], dtype="float32",
default_initializer=fluid.initializer.XavierInitializer(uniform=True)) # [HS]
weight_mul = dynamic_expand(hidden_emb, weight_mul)
rmul_1 = fluid.layers.elementwise_mul(hidden_emb, weight_mul) # for "hidden * self.weight_mul". [bs, sq(i), hs(j)]
print_tensor(rmul_1, "rmul_1")
rmul_2 = fluid.layers.matmul(rmul_1, hidden_emb_transpose) # [bs, sq(i), hs(j)] mul [bs, hs(j), sq(k)] = [bs, sq(i), sq(k)]
print_tensor(rmul_2, "rmul_2")
r1 = fluid.layers.squeeze(r1, axes=[2]) # [BS, SQ] aik
r1 = dynamic_expand(
fluid.layers.transpose(rmul_2, [1, 0, 2]), # [sq, bs, sq]
r1) # [ SQ(from 1), bs, SQ]
r1 = fluid.layers.transpose(r1, [1, 2, 0]) # [bs, sq, sq(from 1)]
r2 = fluid.layers.squeeze(r2, axes=[1]) # [BS, SQ] aik
r2 = dynamic_expand(
fluid.layers.transpose(rmul_2, [1, 0, 2]), # [sq, bs, sq]
r2) # [ SQ(from 1), bs, SQ]
r2 = fluid.layers.transpose(r2, [1, 0, 2]) # [bs,sq(from 1),sq]
bias = dynamic_expand(rmul_2, bias) # [BS, SQ, SQ]
sim_score = fluid.layers.sums(input=[r1, r2, rmul_2, bias])
# [bs,sq,1]+[bs,1,sq]+[bs,sq,sq]+[bs,sq,sq]=[BS,SQ,SQ]
print_tensor(sim_score, "sim_score")
sequence_mask = fluid.layers.cast(sequence_mask, dtype="float32") # [BS,SQ,1]
softmax_mask = fluid.layers.elementwise_sub(
sequence_mask,
fluid.layers.fill_constant([1], "float32", 1)) # [BS,SQ,1]
softmax_mask = fluid.layers.scale(softmax_mask, -1)
very_negative_number = fluid.layers.fill_constant([1], value=-1e6, dtype="float32")
logger.info("softmax_mask: {}".format(softmax_mask.shape))
logger.info("very_negative_number: {}".format(very_negative_number.shape))
softmax_mask = fluid.layers.elementwise_mul(softmax_mask, very_negative_number) # [BS,SQ,1]
softmax_mask = fluid.layers.squeeze(softmax_mask, axes=[2]) # [BS,SQ]
softmax_mask = dynamic_expand(fluid.layers.transpose(sim_score, perm=[2, 0, 1]), softmax_mask) # [sq(1),bs,sq]
softmax_mask = fluid.layers.transpose(softmax_mask, perm=[1, 0, 2]) # [BS,sq(1),SQ]
print_tensor(softmax_mask, "softmax_mask")
sim_score = fluid.layers.elementwise_add(sim_score, softmax_mask) # [bs,sq,sq]+[bs,sq(1),sq]=[BS,SQ,SQ]
print_tensor(sim_score, "sim_score")
attn_prob = fluid.layers.softmax(sim_score) # [BS,SQ,SQ]
weighted_sum = fluid.layers.matmul(attn_prob, hidden_emb) # [bs,sq,sq]*[bs,sq,hs]=[BS,SQ,HS]
if any([self.cat_twotime, self.cat_twotime_mul, self.cat_twotime_sub]):
twotime_att_prob = fluid.layers.matmul(attn_prob, attn_prob) # [bs,sq,sq]*[bs,sq,sq]=[BS,SQ,SQ]
twotime_weited_sum = fluid.layers.matmul(twotime_att_prob, hidden_emb) # [BS,SQ,HS]
out_tensors = [hidden_emb, weighted_sum]
if self.cat_mul:
out_tensors.append(fluid.layers.elementwise_mul(hidden_emb, weighted_sum))
if self.cat_sub:
out_tensors.append(fluid.layers.elementwise_sub(hidden_emb, weighted_sum))
if self.cat_twotime:
out_tensors.append(twotime_weited_sum)
if self.cat_twotime_mul:
out_tensors.append(fluid.layers.elementwise_mul(hidden_emb, twotime_weited_sum))
if self.cat_twotime_sub:
out_tensors.append(fluid.layers.elementwise_sub(hidden_emb, twotime_weited_sum))
output = fluid.layers.concat(out_tensors, axis=2) # [BS,SQ, HS+HS+....]
print_tensor(output, "output")
return output
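# Hypothetical usage sketch (the flag settings are assumptions; the layer is
# instantiated from the run scripts, which are not included in this excerpt):
#   self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
#       hidden_size=memory_output_size, dropout_rate=0.0,
#       cat_mul=True, cat_sub=True, cat_twotime=True)
#   att_output = self_att_layer.forward(memory_output, input_mask)
#   # att_output: [batch_size, seq_size, k * memory_output_size], one block per
#   # enabled feature (hidden_emb, weighted_sum, and the selected cat_* terms)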
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from functools import partial, reduce
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layer_helper import LayerHelper
def layer_norm(x, begin_norm_axis=1, epsilon=1e-12, param_attr=None, bias_attr=None):
"""
Replace build-in layer_norm op with this function
"""
helper = LayerHelper('layer_norm', **locals())
mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
variance = layers.reduce_mean(layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
r_stdev = layers.rsqrt(variance + epsilon)
norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
param_dtype = norm_x.dtype
scale = helper.create_parameter(
attr=param_attr,
shape=param_shape,
dtype=param_dtype,
default_initializer=fluid.initializer.Constant(1.))
bias = helper.create_parameter(
attr=bias_attr,
shape=param_shape,
dtype=param_dtype,
is_bias=True,
default_initializer=fluid.initializer.Constant(0.))
out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
out = layers.elementwise_add(x=out, y=bias, axis=-1)
return out
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
param_initializer=None,
name='multi_head_att'):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activiation to mask certain selected positions so that
they will not considered in attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs: quries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of inpunt tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of inpunt tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
return proj_out
def positionwise_feed_forward(x,
d_inner_hid,
d_hid,
dropout_rate,
hidden_act,
param_initializer=None,
name='ffn'):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=2,
act=hidden_act,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_0.b_0')
if dropout_rate:
hidden = layers.dropout(
hidden,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0', initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
name=''):
"""
Add residual connection, layer normalization and droput to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + '_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""The encoder layers that can be stacked to form a deep encoder.
This module consits of a multi-head (self) attention followed by
position-wise feed-forward networks and both the two components companied
with the post_process_layer to add residual connection, layer normalization
and droput.
"""
attn_output = multi_head_attention(
pre_process_layer(
enc_input,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_att'),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + '_multi_head_att')
attn_output = post_process_layer(
enc_input,
attn_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_att')
ffd_output = positionwise_feed_forward(
pre_process_layer(
attn_output,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_ffn'),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + '_ffn')
return post_process_layer(
attn_output,
ffd_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_ffn')
def encoder(enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
for i in range(n_layer):
enc_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
enc_input = enc_output
enc_output = pre_process_layer(
enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
return enc_output
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from utils.fp16 import create_master_params_grads, master_param_to_train_param
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
""" Applies linear warmup of learning rate from 0 and decay to 0."""
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="scheduled_learning_rate")
global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < num_train_steps * 0.1):
warmup_lr = learning_rate * (global_step / (num_train_steps * 0.1))
fluid.layers.tensor.assign(warmup_lr, lr)
with switch.default():
decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
learning_rate=learning_rate,
decay_steps=num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
fluid.layers.tensor.assign(decayed_lr, lr)
return lr
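# A plain-Python mirror of the schedule built above (illustrative only, not
# called anywhere). Note that the Switch above keys the warmup phase off 10% of
# num_train_steps rather than the warmup_steps argument.
def _reference_linear_warmup_decay(step, learning_rate, num_train_steps):
    """Return the scheduled learning rate at a given global step."""
    warmup_boundary = num_train_steps * 0.1
    if step < warmup_boundary:
        return learning_rate * (step / warmup_boundary)
    # linear (power=1.0) polynomial decay towards an end learning rate of 0
    return learning_rate * max(0.0, 1.0 - float(step) / num_train_steps)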
def optimization(loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler='linear_warmup_decay',
use_fp16=False,
loss_scaling=1.0):
if warmup_steps > 0:
if scheduler == 'noam_decay':
scheduled_lr = fluid.layers.learning_rate_scheduler\
.noam_decay(1/(warmup_steps *(learning_rate ** 2)),
warmup_steps)
elif scheduler == 'linear_warmup_decay':
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
num_train_steps)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr, epsilon=1e-6)
else:
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, epsilon=1e-6)
scheduled_lr = learning_rate
clip_norm_thres = 1.0
# When using mixed precision training, scale the gradient clip threshold
# by loss_scaling
if use_fp16 and loss_scaling > 1.0:
clip_norm_thres *= loss_scaling
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
def exclude_from_weight_decay(name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
param_list = dict()
if use_fp16:
param_grads = optimizer.backward(loss)
master_param_grads = create_master_params_grads(
param_grads, train_program, startup_prog, loss_scaling)
for param, _ in master_param_grads:
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
optimizer.apply_gradients(master_param_grads)
if weight_decay > 0:
for param, grad in master_param_grads:
# if exclude_from_weight_decay(param.name.rstrip(".master")):
# continue
if param.name == 'concept_emb_mat' or param.name == 'wn_concept_emb_mat' or param.name == 'nell_concept_emb_mat':
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
master_param_to_train_param(master_param_grads, param_grads,
train_program)
else:
for param in train_program.global_block().all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
# if exclude_from_weight_decay(param.name):
# continue
if param.name == 'concept_emb_mat' or param.name == 'wn_concept_emb_mat' or param.name == 'nell_concept_emb_mat':
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on ReCoRD."""
import six
import math
import json
import random
import collections
import os
import pickle
import logging
import tokenization
from batching import prepare_batch_data
from eval.record_official_evaluate import evaluate, f1_score
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class ReCoRDExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
concept_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
self.concept_ids = concept_ids
def read_record_examples(input_file, is_training, version_2_with_negative=False):
"""Read a ReCoRD json file into a list of ReCoRDExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
paragraph_text = entry["passage"]["text"].replace('\xa0', ' ')
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in entry["qas"]:
qas_id = qa["id"]
question_text = qa["query"].replace('\xa0', ' ')
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
# if (len(qa["answers"]) != 1) and (not is_impossible):
# raise ValueError(
# "For training, each question should have exactly 1 answer."
# )
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(
end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.info("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = ReCoRDExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
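# Shape of the ReCoRD JSON this reader expects (a minimal, abbreviated entry;
# all field values below are invented for illustration):
#   {"data": [
#       {"passage": {"text": "..."},
#        "qas": [
#            {"id": "...",
#             "query": "...",
#             "answers": [{"start": 17, "text": "..."}]}]}]}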
class Examples_To_Features_Converter(object):
def __init__(self, **concept_settings):
self.concept_settings = concept_settings
# load necessary data files for mapping to related concepts
# 1. mapping from subword-level tokenization to word-level tokenization
tokenization_filepath = self.concept_settings['tokenization_path']
assert os.path.exists(tokenization_filepath)
self.all_tokenization_info = {}
for item in pickle.load(open(tokenization_filepath, 'rb')):
self.all_tokenization_info[item['id']] = item
        # 2. mapping from concept name to concept id (currently only one KB is supported)
self.concept2id = self.concept_settings['concept2id']
# 3. retrieved related wordnet concepts (if use_wordnet)
if concept_settings['use_wordnet']:
assert not self.concept_settings['use_nell']
retrieved_synset_filepath = self.concept_settings['retrieved_synset_path']
assert os.path.exists(retrieved_synset_filepath)
            self.synsets_info = pickle.load(open(retrieved_synset_filepath, 'rb'))  # token to synset names
self.max_concept_length = max([len(synsets) for synsets in self.synsets_info.values()])
# 4. retrieved related nell concepts (if use_nell)
if concept_settings['use_nell']:
assert not self.concept_settings['use_wordnet']
retrieved_nell_concept_filepath = self.concept_settings['retrieved_nell_concept_path']
assert os.path.exists(retrieved_nell_concept_filepath)
self.nell_retrieve_info = {}
for item in pickle.load(open(retrieved_nell_concept_filepath, 'rb')):
self.nell_retrieve_info[item['id']] = item
self.max_concept_length = max([max([len(entity_info['retrieved_concepts']) for entity_info in item['query_entities'] + item['document_entities']])
for qid, item in self.nell_retrieve_info.items() if len(item['query_entities'] + item['document_entities']) > 0])
# return list of concept ids given input subword list
def _lookup_wordnet_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, tolower, tokenizer):
concept_ids = []
for index in range(len(sub_tokens)):
original_token = tokens[sub_to_ori_index[index]]
            # if tokens are in upper case, we must lowercase them for retrieval
retrieve_token = tokenizer.basic_tokenizer._run_strip_accents(original_token.lower()) if tolower else original_token
if retrieve_token in self.synsets_info:
concept_ids.append([self.concept2id[synset_name] for synset_name in self.synsets_info[retrieve_token]])
else:
concept_ids.append([])
return concept_ids
def _lookup_nell_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, nell_info):
original_concept_ids = [[] for _ in range(len(tokens))]
for entity_info in nell_info:
for pos in range(entity_info['token_start'], entity_info['token_end'] + 1):
original_concept_ids[pos] += [self.concept2id[category_name] for category_name in entity_info['retrieved_concepts']]
for pos in range(len(original_concept_ids)):
original_concept_ids[pos] = list(set(original_concept_ids[pos]))
concept_ids = [original_concept_ids[sub_to_ori_index[index]] for index in range(len(sub_tokens))]
return concept_ids
def __call__(self,
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
tokenization_info = self.all_tokenization_info[example.qas_id]
query_tokens = tokenizer.tokenize(example.question_text)
            # check that the online subword tokenization result matches the offline result
assert query_tokens == tokenization_info['query_subtokens']
if self.concept_settings['use_wordnet']:
query_concepts = self._lookup_wordnet_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'],
tolower=tokenizer.basic_tokenizer.do_lower_case == False, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
query_concepts = self._lookup_nell_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'], self.nell_retrieve_info[example.qas_id]['query_entities'])
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
query_concepts = query_concepts[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
assert all_doc_tokens == tokenization_info['document_subtokens']
if self.concept_settings['use_wordnet']:
doc_concepts = self._lookup_wordnet_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'],
tolower=tokenizer.basic_tokenizer.do_lower_case == False, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
doc_concepts = self._lookup_nell_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'], self.nell_retrieve_info[example.qas_id]['document_entities'])
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
            # We can have documents that are longer than the maximum sequence length.
            # To deal with this we use a sliding window approach, taking chunks
            # of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
concept_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
concept_ids.append([])
for token, query_concept in zip(query_tokens, query_concepts):
tokens.append(token)
segment_ids.append(0)
concept_ids.append(query_concept)
tokens.append("[SEP]")
segment_ids.append(0)
concept_ids.append([])
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
concept_ids.append(doc_concepts[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
concept_ids.append([])
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
for cindex in range(len(concept_ids)):
concept_ids[cindex] = concept_ids[cindex] + [0] * (self.max_concept_length - len(concept_ids[cindex]))
concept_ids[cindex] = concept_ids[cindex][:self.max_concept_length]
assert all([len(id_list) == self.max_concept_length for id_list in concept_ids])
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
# out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
continue
# out_of_span = True
# if out_of_span:
# start_position = 0
# end_position = 0
# else:
# doc_offset = len(query_tokens) + 2
# start_position = tok_start_position - doc_start + doc_offset
# end_position = tok_end_position - doc_start + doc_offset
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if example_index < 3:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
logger.info("concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(concept_ids)]))
if is_training and example.is_impossible:
logger.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position +
1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
concept_ids=concept_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
unique_id += 1
yield feature
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The ReCoRD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
    # Question: What country is the top exporter of electronics?
    # Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in ReCoRD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
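# Hypothetical usage sketch for the reader below (argument values are
# assumptions; the real values come from the command-line flags passed to
# src/run_record.py):
#   processor = DataProcessor(
#       vocab_path='cased_L-24_H-1024_A-16/vocab.txt', do_lower_case=False,
#       max_seq_length=384, in_tokens=False, doc_stride=128, max_query_length=64)
#   train_generator = processor.data_generator(
#       data_path='../data/train.json', batch_size=6, phase='train',
#       shuffle=True, dev_count=1, epoch=1, **concept_settings)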
class DataProcessor(object):
def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
doc_stride, max_query_length):
self._tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self._max_seq_length = max_seq_length
self._doc_stride = doc_stride
self._max_query_length = max_query_length
self._in_tokens = in_tokens
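# When `in_tokens` is True, `batch_size` is interpreted as a token budget
# (see batch_reader inside data_generator below); otherwise it is the number
# of features per batch.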
self.vocab = self._tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = -1
self.num_train_examples = -1
self.current_train_epoch = -1
self.train_examples = None
self.predict_examples = None
self.num_examples = {'train': -1, 'predict': -1}
self.train_max_concept_length = None
self.predict_max_concept_length = None
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def get_examples(self,
data_path,
is_training,
version_2_with_negative=False):
examples = read_record_examples(
input_file=data_path,
is_training=is_training,
version_2_with_negative=version_2_with_negative)
return examples
def get_num_examples(self, phase):
if phase not in ['train', 'predict']:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
return self.num_examples[phase]
def get_features(self, examples, is_training, **concept_settings):
convert_examples_to_features = Examples_To_Features_Converter(**concept_settings)
features = convert_examples_to_features(
examples=examples,
tokenizer=self._tokenizer,
max_seq_length=self._max_seq_length,
doc_stride=self._doc_stride,
max_query_length=self._max_query_length,
is_training=is_training)
return features
def data_generator(self,
data_path,
batch_size,
phase='train',
shuffle=False,
dev_count=1,
version_2_with_negative=False,
epoch=1,
**concept_settings):
if phase == 'train':
self.train_examples = self.get_examples(
data_path,
is_training=True,
version_2_with_negative=version_2_with_negative)
examples = self.train_examples
self.num_examples['train'] = len(self.train_examples)
elif phase == 'predict':
self.predict_examples = self.get_examples(
data_path,
is_training=False,
version_2_with_negative=version_2_with_negative)
examples = self.predict_examples
self.num_examples['predict'] = len(self.predict_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
def batch_reader(features, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for (index, feature) in enumerate(features):
if phase == 'train':
self.current_train_example = index + 1
seq_len = len(feature.input_ids)
labels = [feature.unique_id
] if feature.start_position is None else [
feature.start_position, feature.end_position
]
example = [
# feature.input_ids, feature.segment_ids, range(seq_len), feature.concept_ids
feature.input_ids, feature.segment_ids, range(384), feature.concept_ids
] + labels
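# NOTE: the position ids above are hard-coded to range(384), presumably the
# max_seq_length used by the provided run scripts, rather than the
# commented-out per-feature range(seq_len).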
max_len = max(max_len, seq_len)
#max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(example)
total_token_num += seq_len
else:
yield batch, total_token_num
batch, total_token_num, max_len = [example
], seq_len, seq_len
if len(batch) > 0:
yield batch, total_token_num
if phase == 'train':
self.train_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_concept_length
else:
self.predict_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_concept_length
def wrapper():
for epoch_index in range(epoch):
if shuffle:
random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
features = self.get_features(examples, is_training=True, **concept_settings)
max_concept_length = self.train_max_concept_length
else:
features = self.get_features(examples, is_training=False, **concept_settings)
max_concept_length = self.predict_max_concept_length
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
features, batch_size, self._in_tokens):
batch_data = prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False,
max_concept_length=max_concept_length)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
verbose, predict_file, evaluation_result_file):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
logger.info("Writing evaluation result to: %s" % (evaluation_result_file))
# load ground truth file for evaluation and post-edit
with open(predict_file, "r", encoding='utf-8') as reader:
predict_json = json.load(reader)["data"]
all_candidates = {}
for passage in predict_json:
passage_text = passage['passage']['text']
candidates = []
for entity_info in passage['passage']['entities']:
start_offset = entity_info['start']
end_offset = entity_info['end']
candidates.append(passage_text[start_offset: end_offset + 1])
for qa in passage['qas']:
all_candidates[qa['id']] = candidates
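# all_candidates maps each question id to the passage's entity mention
# strings; ReCoRD restricts answers to these marked entities, which is why
# predictions are post-edited against them below.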
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
# debug
if best_non_null_entry is None:
logger.info("Emmm..., sth wrong")
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
# restrict the final prediction to the highest-ranked n-best entry that overlaps (F1 > 0) with at least one candidate entity; fall back to the top entry otherwise
picked_index = 0
for pred_index in range(len(nbest_json)):
if any([f1_score(nbest_json[pred_index]['text'], candidate) > 0. for candidate in all_candidates[example.qas_id]]):
picked_index = pred_index
break
all_predictions[example.qas_id] = nbest_json[picked_index]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
eval_result, _ = evaluate(predict_json, all_predictions)
with open(evaluation_result_file, "w") as writer:
writer.write(json.dumps(eval_result, indent=4) + "\n")
return eval_result
def get_final_text(pred_text, orig_text, do_lower_case, verbose):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the ReCoRD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
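# e.g. _strip_spaces("a b") returns ("ab", {0: 0, 1: 2}): index 0 of the
# stripped text maps back to index 0 of the original, index 1 maps to index 2.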
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose:
logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
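# e.g. _get_best_indexes([0.1, 2.0, 0.5], n_best_size=2) returns [1, 2].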
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
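# e.g. _compute_softmax([1.0, 1.0]) returns [0.5, 0.5]; subtracting the max
# score only improves numerical stability and does not change the result.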
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on ReCoRD."""
import six
import math
import json
import random
import collections
import os
import pickle
import logging
import tokenization
from batching_twomemory import prepare_batch_data
from eval.record_official_evaluate import evaluate, f1_score
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class ReCoRDExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
wn_concept_ids,
nell_concept_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
self.wn_concept_ids = wn_concept_ids
self.nell_concept_ids = nell_concept_ids
def read_record_examples(input_file, is_training, version_2_with_negative=False):
"""Read a ReCoRD json file into a list of ReCoRDExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
paragraph_text = entry["passage"]["text"].replace('\xa0', ' ')
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
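# e.g. for the passage "ab cd", doc_tokens becomes ["ab", "cd"] and
# char_to_word_offset becomes [0, 0, 0, 1, 1] (one word index per character).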
for qa in entry["qas"]:
qas_id = qa["id"]
question_text = qa["query"].replace('\xa0', ' ')
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
# if (len(qa["answers"]) != 1) and (not is_impossible):
# raise ValueError(
# "For training, each question should have exactly 1 answer."
# )
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(
end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.info("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = ReCoRDExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
class Examples_To_Features_Converter(object):
def __init__(self, **concept_settings):
self.concept_settings = concept_settings
# load necessary data files for mapping to related concepts
# 1. mapping from subword-level tokenization to word-level tokenization
tokenization_filepath = self.concept_settings['tokenization_path']
assert os.path.exists(tokenization_filepath)
self.all_tokenization_info = {}
for item in pickle.load(open(tokenization_filepath, 'rb')):
self.all_tokenization_info[item['id']] = item
# 2. mapping from concept name to concept id
self.wn_concept2id = self.concept_settings['wn_concept2id']
self.nell_concept2id = self.concept_settings['nell_concept2id']
# 3. retrieved related wordnet concepts (if use_wordnet)
if concept_settings['use_wordnet']:
retrieved_synset_filepath = self.concept_settings['retrieved_synset_path']
assert os.path.exists(retrieved_synset_filepath)
self.synsets_info = pickle.load(open(retrieved_synset_filepath, 'rb')) # token to synset names
self.max_wn_concept_length = max([len(synsets) for synsets in self.synsets_info.values()])
# 4. retrieved related nell concepts (if use_nell)
if concept_settings['use_nell']:
retrieved_nell_concept_filepath = self.concept_settings['retrieved_nell_concept_path']
assert os.path.exists(retrieved_nell_concept_filepath)
self.nell_retrieve_info = {}
for item in pickle.load(open(retrieved_nell_concept_filepath, 'rb')):
self.nell_retrieve_info[item['id']] = item
self.max_nell_concept_length = max([max([len(entity_info['retrieved_concepts']) for entity_info in item['query_entities'] + item['document_entities']])
for qid, item in self.nell_retrieve_info.items() if len(item['query_entities'] + item['document_entities']) > 0])
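# These maxima (max_wn_concept_length / max_nell_concept_length) define the
# padded width of the per-token concept-id lists built in __call__ below.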
# return list of concept ids given input subword list
def _lookup_wordnet_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, tolower, tokenizer):
concept_ids = []
for index in range(len(sub_tokens)):
original_token = tokens[sub_to_ori_index[index]]
# if tokens are cased, we must lowercase them (and strip accents) before retrieval
retrieve_token = tokenizer.basic_tokenizer._run_strip_accents(original_token.lower()) if tolower else original_token
if retrieve_token in self.synsets_info:
concept_ids.append([self.wn_concept2id[synset_name] for synset_name in self.synsets_info[retrieve_token]])
else:
concept_ids.append([])
return concept_ids
def _lookup_nell_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, nell_info):
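# Map word-level NELL entity annotations to concept ids, then project them
# onto the subword positions via sub_to_ori_index.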
original_concept_ids = [[] for _ in range(len(tokens))]
for entity_info in nell_info:
for pos in range(entity_info['token_start'], entity_info['token_end'] + 1):
original_concept_ids[pos] += [self.nell_concept2id[category_name] for category_name in entity_info['retrieved_concepts']]
for pos in range(len(original_concept_ids)):
original_concept_ids[pos] = list(set(original_concept_ids[pos]))
concept_ids = [original_concept_ids[sub_to_ori_index[index]] for index in range(len(sub_tokens))]
return concept_ids
def __call__(self,
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
tokenization_info = self.all_tokenization_info[example.qas_id]
query_tokens = tokenizer.tokenize(example.question_text)
# check that the online subword tokenization result matches the offline (preprocessed) result
assert query_tokens == tokenization_info['query_subtokens']
if self.concept_settings['use_wordnet']:
query_wn_concepts = self._lookup_wordnet_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'],
tolower=tokenizer.basic_tokenizer.do_lower_case == False, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
query_nell_concepts = self._lookup_nell_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'], self.nell_retrieve_info[example.qas_id]['query_entities'])
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
query_wn_concepts = query_wn_concepts[0:max_query_length]
query_nell_concepts = query_nell_concepts[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
assert all_doc_tokens == tokenization_info['document_subtokens']
if self.concept_settings['use_wordnet']:
doc_wn_concepts = self._lookup_wordnet_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'],
tolower=tokenizer.basic_tokenizer.do_lower_case == False, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
doc_nell_concepts = self._lookup_nell_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'], self.nell_retrieve_info[example.qas_id]['document_entities'])
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of the up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
wn_concept_ids = []
nell_concept_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
wn_concept_ids.append([])
nell_concept_ids.append([])
for token, query_wn_concept, query_nell_concept in zip(query_tokens, query_wn_concepts, query_nell_concepts):
tokens.append(token)
segment_ids.append(0)
wn_concept_ids.append(query_wn_concept)
nell_concept_ids.append(query_nell_concept)
tokens.append("[SEP]")
segment_ids.append(0)
wn_concept_ids.append([])
nell_concept_ids.append([])
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
wn_concept_ids.append(doc_wn_concepts[split_token_index])
nell_concept_ids.append(doc_nell_concepts[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
wn_concept_ids.append([])
nell_concept_ids.append([])
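# The feature layout is [CLS] query [SEP] document-span [SEP]; the special
# tokens carry empty concept lists, which are zero-padded below.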
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
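# Pad (with concept id 0) or truncate each token's WordNet/NELL concept-id
# list to the corresponding maximum length so they can be batched as dense
# tensors.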
for concept_ids, max_concept_length in zip((wn_concept_ids, nell_concept_ids), (self.max_wn_concept_length, self.max_nell_concept_length)):
for cindex in range(len(concept_ids)):
concept_ids[cindex] = concept_ids[cindex] + [0] * (max_concept_length - len(concept_ids[cindex]))
concept_ids[cindex] = concept_ids[cindex][:max_concept_length]
assert all([len(id_list) == max_concept_length for id_list in concept_ids])
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
# out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
continue
# out_of_span = True
# if out_of_span:
# start_position = 0
# end_position = 0
# else:
# doc_offset = len(query_tokens) + 2
# start_position = tok_start_position - doc_start + doc_offset
# end_position = tok_end_position - doc_start + doc_offset
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if example_index < 3:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
logger.info("wordnet_concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(wn_concept_ids)]))
logger.info("nell_concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(nell_concept_ids)]))
if is_training and example.is_impossible:
logger.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position +
1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
wn_concept_ids=wn_concept_ids,
nell_concept_ids=nell_concept_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
unique_id += 1
yield feature
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The ReCoRD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
# Question: What country is the top exporter of electronics?
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in ReCoRD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple doc spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
class DataProcessor(object):
def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
doc_stride, max_query_length):
self._tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self._max_seq_length = max_seq_length
self._doc_stride = doc_stride
self._max_query_length = max_query_length
self._in_tokens = in_tokens
self.vocab = self._tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = -1
self.num_train_examples = -1
self.current_train_epoch = -1
self.train_examples = None
self.predict_examples = None
self.num_examples = {'train': -1, 'predict': -1}
self.train_wn_max_concept_length = None
self.predict_wn_max_concept_length = None
self.train_nell_max_concept_length = None
self.predict_nell_max_concept_length = None
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def get_examples(self,
data_path,
is_training,
version_2_with_negative=False):
examples = read_record_examples(
input_file=data_path,
is_training=is_training,
version_2_with_negative=version_2_with_negative)
return examples
def get_num_examples(self, phase):
if phase not in ['train', 'predict']:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
return self.num_examples[phase]
def get_features(self, examples, is_training, **concept_settings):
convert_examples_to_features = Examples_To_Features_Converter(**concept_settings)
features = convert_examples_to_features(
examples=examples,
tokenizer=self._tokenizer,
max_seq_length=self._max_seq_length,
doc_stride=self._doc_stride,
max_query_length=self._max_query_length,
is_training=is_training)
return features
def data_generator(self,
data_path,
batch_size,
phase='train',
shuffle=False,
dev_count=1,
version_2_with_negative=False,
epoch=1,
**concept_settings):
if phase == 'train':
self.train_examples = self.get_examples(
data_path,
is_training=True,
version_2_with_negative=version_2_with_negative)
examples = self.train_examples
self.num_examples['train'] = len(self.train_examples)
elif phase == 'predict':
self.predict_examples = self.get_examples(
data_path,
is_training=False,
version_2_with_negative=version_2_with_negative)
examples = self.predict_examples
self.num_examples['predict'] = len(self.predict_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
def batch_reader(features, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for (index, feature) in enumerate(features):
if phase == 'train':
self.current_train_example = index + 1
seq_len = len(feature.input_ids)
labels = [feature.unique_id
] if feature.start_position is None else [
feature.start_position, feature.end_position
]
example = [
# feature.input_ids, feature.segment_ids, range(seq_len), feature.wn_concept_ids, feature.nell_concept_ids
feature.input_ids, feature.segment_ids, range(384), feature.wn_concept_ids, feature.nell_concept_ids
] + labels
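# NOTE: position ids are hard-coded to range(384) here as well, presumably
# the configured max_seq_length, rather than the per-feature range(seq_len).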
max_len = max(max_len, seq_len)
#max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(example)
total_token_num += seq_len
else:
yield batch, total_token_num
batch, total_token_num, max_len = [example
], seq_len, seq_len
if len(batch) > 0:
yield batch, total_token_num
if phase == 'train':
self.train_wn_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_wn_concept_length
self.train_nell_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_nell_concept_length
else:
self.predict_wn_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_wn_concept_length
self.predict_nell_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_nell_concept_length
def wrapper():
for epoch_index in range(epoch):
if shuffle:
random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
features = self.get_features(examples, is_training=True, **concept_settings)
max_wn_concept_length = self.train_wn_max_concept_length
max_nell_concept_length = self.train_nell_max_concept_length
else:
features = self.get_features(examples, is_training=False, **concept_settings)
max_wn_concept_length = self.predict_wn_max_concept_length
max_nell_concept_length = self.predict_nell_max_concept_length
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
features, batch_size, self._in_tokens):
batch_data = prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False,
max_wn_concept_length=max_wn_concept_length,
max_nell_concept_length=max_nell_concept_length)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
verbose, predict_file, evaluation_result_file):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
logger.info("Writing evaluation result to: %s" % (evaluation_result_file))
# load ground truth file for evaluation and post-edit
with open(predict_file, "r", encoding='utf-8') as reader:
predict_json = json.load(reader)["data"]
all_candidates = {}
for passage in predict_json:
passage_text = passage['passage']['text']
candidates = []
for entity_info in passage['passage']['entities']:
start_offset = entity_info['start']
end_offset = entity_info['end']
candidates.append(passage_text[start_offset: end_offset + 1])
for qa in passage['qas']:
all_candidates[qa['id']] = candidates
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
# debug
if best_non_null_entry is None:
logger.info("Emmm..., sth wrong")
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
# restrict the final prediction to the highest-ranked n-best entry that overlaps (F1 > 0) with at least one candidate entity; fall back to the top entry otherwise
picked_index = 0
for pred_index in range(len(nbest_json)):
if any([f1_score(nbest_json[pred_index]['text'], candidate) > 0. for candidate in all_candidates[example.qas_id]]):
picked_index = pred_index
break
all_predictions[example.qas_id] = nbest_json[picked_index]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
eval_result, _ = evaluate(predict_json, all_predictions)
with open(evaluation_result_file, "w") as writer:
writer.write(json.dumps(eval_result, indent=4) + "\n")
return eval_result
def get_final_text(pred_text, orig_text, do_lower_case, verbose):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the ReCoRD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose:
logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""
import six
import math
import json
import random
import collections
import os
import pickle
import logging
import tokenization
from batching import prepare_batch_data
from eval.squad_v1_official_evaluate import evaluate
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class SquadExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
concept_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
self.concept_ids = concept_ids
def read_squad_examples(input_file, is_training, version_2_with_negative=False):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(
end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.info("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
class Examples_To_Features_Converter(object):
def __init__(self, **concept_settings):
self.concept_settings = concept_settings
# load necessary data files for mapping to related concepts
# 1. mapping from subword-level tokenization to word-level tokenization
tokenization_filepath = self.concept_settings['tokenization_path']
assert os.path.exists(tokenization_filepath)
self.all_tokenization_info = {}
for item in pickle.load(open(tokenization_filepath, 'rb')):
self.all_tokenization_info[item['id']] = item
# 2. mapping from concept name to concept id (currently only support one KB)
self.concept2id = self.concept_settings['concept2id']
# 3. retrieved related wordnet concepts (if use_wordnet)
if concept_settings['use_wordnet']:
assert not self.concept_settings['use_nell']
retrieved_synset_filepath = self.concept_settings['retrieved_synset_path']
assert os.path.exists(retrieved_synset_filepath)
self.synsets_info = pickle.load(open(retrieved_synset_filepath, 'rb')) # token to synset names
self.max_concept_length = max([len(synsets) for synsets in self.synsets_info.values()])
# 4. retrieved related nell concepts (if use_nell)
if concept_settings['use_nell']:
assert not self.concept_settings['use_wordnet']
retrieved_nell_concept_filepath = self.concept_settings['retrieved_nell_concept_path']
assert os.path.exists(retrieved_nell_concept_filepath)
self.nell_retrieve_info = {}
for item in pickle.load(open(retrieved_nell_concept_filepath, 'rb')):
self.nell_retrieve_info[item['id']] = item
self.max_concept_length = max([max([len(entity_info['retrieved_concepts']) for entity_info in item['query_entities'] + item['document_entities']])
for qid, item in self.nell_retrieve_info.items() if len(item['query_entities'] + item['document_entities']) > 0])
# return list of concept ids given input subword list
def _lookup_wordnet_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, tolower, tokenizer):
concept_ids = []
for index in range(len(sub_tokens)):
original_token = tokens[sub_to_ori_index[index]]
# if tokens are cased, lowercase them (and strip accents) before looking them up
retrieve_token = tokenizer.basic_tokenizer._run_strip_accents(original_token.lower()) if tolower else original_token
if retrieve_token in self.synsets_info:
concept_ids.append([self.concept2id[synset_name] for synset_name in self.synsets_info[retrieve_token]])
else:
concept_ids.append([])
return concept_ids
def _lookup_nell_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, nell_info):
original_concept_ids = [[] for _ in range(len(tokens))]
for entity_info in nell_info:
for pos in range(entity_info['token_start'], entity_info['token_end'] + 1):
original_concept_ids[pos] += [self.concept2id[category_name] for category_name in entity_info['retrieved_concepts']]
for pos in range(len(original_concept_ids)):
original_concept_ids[pos] = list(set(original_concept_ids[pos]))
concept_ids = [original_concept_ids[sub_to_ori_index[index]] for index in range(len(sub_tokens))]
return concept_ids
def __call__(self,
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
tokenization_info = self.all_tokenization_info[example.qas_id]
query_tokens = tokenizer.tokenize(example.question_text)
# check that the online subword tokenization result matches the offline result
assert query_tokens == tokenization_info['query_subtokens']
if self.concept_settings['use_wordnet']:
query_concepts = self._lookup_wordnet_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'],
tolower=not tokenizer.basic_tokenizer.do_lower_case, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
query_concepts = self._lookup_nell_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'], self.nell_retrieve_info[example.qas_id]['query_entities'])
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
query_concepts = query_concepts[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
assert all_doc_tokens == tokenization_info['document_subtokens']
if self.concept_settings['use_wordnet']:
doc_concepts = self._lookup_wordnet_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'],
tolower=not tokenizer.basic_tokenizer.do_lower_case, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
doc_concepts = self._lookup_nell_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'], self.nell_retrieve_info[example.qas_id]['document_entities'])
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we use a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
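# Illustrative walkthrough of the loop above (values assumed, not from the
# actual data): with len(all_doc_tokens) == 200, max_tokens_for_doc == 100
# and doc_stride == 64, it yields the overlapping spans
#   DocSpan(start=0,   length=100)
#   DocSpan(start=64,  length=100)
#   DocSpan(start=128, length=72)
# so every document token is covered, and interior tokens appear in more than
# one span; _check_is_max_context below decides which span "owns" each token.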
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
concept_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
concept_ids.append([])
for token, query_concept in zip(query_tokens, query_concepts):
tokens.append(token)
segment_ids.append(0)
concept_ids.append(query_concept)
tokens.append("[SEP]")
segment_ids.append(0)
concept_ids.append([])
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
concept_ids.append(doc_concepts[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
concept_ids.append([])
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
for cindex in range(len(concept_ids)):
concept_ids[cindex] = concept_ids[cindex] + [0] * (self.max_concept_length - len(concept_ids[cindex]))
concept_ids[cindex] = concept_ids[cindex][:self.max_concept_length]
assert all([len(id_list) == self.max_concept_length for id_list in concept_ids])
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
# out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
continue
# out_of_span = True
# if out_of_span:
# start_position = 0
# end_position = 0
# else:
# doc_offset = len(query_tokens) + 2
# start_position = tok_start_position - doc_start + doc_offset
# end_position = tok_end_position - doc_start + doc_offset
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
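# Example of the offset arithmetic above (numbers assumed for illustration):
# a 10-token query gives doc_offset = 10 + 2 = 12 ([CLS] + query + [SEP]),
# so an answer starting at document subtoken 50 in a span with
# doc_span.start == 40 maps to start_position = 50 - 40 + 12 = 22 inside
# this feature's `tokens` list.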
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if example_index < 3:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
logger.info("concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(concept_ids)]))
if is_training and example.is_impossible:
logger.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position +
1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
concept_ids=concept_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
unique_id += 1
yield feature
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
#   Question: What country is the top exporter of electronics?
#   Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
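# Worked example of the search below (tokens assumed for illustration, and
# assuming the tokenizer keeps "1895" as a single piece): with
# doc_tokens = ["(", "1895", "-", "1943", ")", "."], input span (0, 5) and
# orig_answer_text = "1895", tok_answer_text is "1895", and the nested loops
# find new_start == new_end == 1, so the tighter span (1, 1) is returned.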
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple document spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
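# Plugging the example above into the score computed below: for 'bought'
# (position 7, 0-indexed), span B (start=3, length=5) scores
# min(4, 0) + 0.01 * 5 = 0.05, while span C (start=6, length=5) scores
# min(1, 3) + 0.01 * 5 = 1.05, so span C is the maximum-context span.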
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
class DataProcessor(object):
def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
doc_stride, max_query_length):
self._tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self._max_seq_length = max_seq_length
self._doc_stride = doc_stride
self._max_query_length = max_query_length
self._in_tokens = in_tokens
self.vocab = self._tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = -1
self.num_train_examples = -1
self.current_train_epoch = -1
self.train_examples = None
self.predict_examples = None
self.num_examples = {'train': -1, 'predict': -1}
self.train_max_concept_length = None
self.predict_max_concept_length = None
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def get_examples(self,
data_path,
is_training,
version_2_with_negative=False):
examples = read_squad_examples(
input_file=data_path,
is_training=is_training,
version_2_with_negative=version_2_with_negative)
return examples
def get_num_examples(self, phase):
if phase not in ['train', 'predict']:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
return self.num_examples[phase]
def get_features(self, examples, is_training, **concept_settings):
convert_examples_to_features = Examples_To_Features_Converter(**concept_settings)
features = convert_examples_to_features(
examples=examples,
tokenizer=self._tokenizer,
max_seq_length=self._max_seq_length,
doc_stride=self._doc_stride,
max_query_length=self._max_query_length,
is_training=is_training)
return features
def data_generator(self,
data_path,
batch_size,
phase='train',
shuffle=False,
dev_count=1,
version_2_with_negative=False,
epoch=1,
**concept_settings):
if phase == 'train':
self.train_examples = self.get_examples(
data_path,
is_training=True,
version_2_with_negative=version_2_with_negative)
examples = self.train_examples
self.num_examples['train'] = len(self.train_examples)
elif phase == 'predict':
self.predict_examples = self.get_examples(
data_path,
is_training=False,
version_2_with_negative=version_2_with_negative)
examples = self.predict_examples
self.num_examples['predict'] = len(self.predict_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
def batch_reader(features, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for (index, feature) in enumerate(features):
if phase == 'train':
self.current_train_example = index + 1
seq_len = len(feature.input_ids)
labels = [feature.unique_id
] if feature.start_position is None else [
feature.start_position, feature.end_position
]
example = [
# position ids are hardcoded below to the model's maximum sequence length (384) rather than this feature's seq_len:
# feature.input_ids, feature.segment_ids, range(seq_len), feature.concept_ids
feature.input_ids, feature.segment_ids, range(384), feature.concept_ids
] + labels
max_len = max(max_len, seq_len)
#max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(example)
total_token_num += seq_len
else:
yield batch, total_token_num
batch, total_token_num, max_len = [example
], seq_len, seq_len
if len(batch) > 0:
yield batch, total_token_num
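# Note on the two batching modes above (illustrative numbers): when
# in_tokens is True, batch_size is a token budget, e.g. a budget of 3072
# with max_len == 384 admits at most 8 features per batch; otherwise
# batch_size simply caps the number of features per batch.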
if phase == 'train':
self.train_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_concept_length
else:
self.predict_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_concept_length
def wrapper():
for epoch_index in range(epoch):
if shuffle:
random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
features = self.get_features(examples, is_training=True, **concept_settings)
max_concept_length = self.train_max_concept_length
else:
features = self.get_features(examples, is_training=False, **concept_settings)
max_concept_length = self.predict_max_concept_length
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
features, batch_size, self._in_tokens):
batch_data = prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False,
max_concept_length=max_concept_length)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
verbose, predict_file, evaluation_result_file):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
logger.info("Writing evaluation result to: %s" % (evaluation_result_file))
# load ground truth file for evaluation and post-edit
with open(predict_file, "r", encoding='utf-8') as reader:
predict_json = json.load(reader)["data"]
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
# debug
if best_non_null_entry is None:
logger.info("Emmm..., sth wrong")
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
eval_result = evaluate(predict_json, all_predictions)
with open(evaluation_result_file, "w") as writer:
writer.write(json.dumps(eval_result, indent=4) + "\n")
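# For reference, `all_predictions` written above is a flat mapping from
# question id to answer string, e.g. {"<qas_id>": "Denver Broncos", ...}
# (id and answer made up for illustration), and `eval_result` is whatever
# the official SQuAD v1.1 script returns (an exact-match / F1 summary).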
return eval_result
def get_final_text(pred_text, orig_text, do_lower_case, verbose):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
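# Example of _strip_spaces (input chosen for illustration): for text "a b c"
# it returns ns_text == "abc" with ns_to_s_map == {0: 0, 1: 2, 2: 4}, i.e.
# each non-space character index is mapped back to its position in the
# original, space-containing string.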
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose:
logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""
import six
import math
import json
import random
import collections
import os
import pickle
import logging
import tokenization
from batching_twomemory import prepare_batch_data
from eval.squad_v1_official_evaluate import evaluate
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class SquadExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.end_position:
s += ", end_position: %d" % (self.end_position)
if self.is_impossible:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
wn_concept_ids,
nell_concept_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
self.wn_concept_ids = wn_concept_ids
self.nell_concept_ids = nell_concept_ids
def read_squad_examples(input_file, is_training, version_2_with_negative=False):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(
end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.info("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
class Examples_To_Features_Converter(object):
def __init__(self, **concept_settings):
self.concept_settings = concept_settings
# load necessary data files for mapping to related concepts
# 1. mapping from subword-level tokenization to word-level tokenization
tokenization_filepath = self.concept_settings['tokenization_path']
assert os.path.exists(tokenization_filepath)
self.all_tokenization_info = {}
for item in pickle.load(open(tokenization_filepath, 'rb')):
self.all_tokenization_info[item['id']] = item
# 2. mapping from concept name to concept id
self.wn_concept2id = self.concept_settings['wn_concept2id']
self.nell_concept2id = self.concept_settings['nell_concept2id']
# 3. retrieved related wordnet concepts (if use_wordnet)
if concept_settings['use_wordnet']:
retrieved_synset_filepath = self.concept_settings['retrieved_synset_path']
assert os.path.exists(retrieved_synset_filepath)
self.synsets_info = pickle.load(open(retrieved_synset_filepath, 'rb')) # token to synset names
self.max_wn_concept_length = max([len(synsets) for synsets in self.synsets_info.values()])
# 4. retrieved related nell concepts (if use_nell)
if concept_settings['use_nell']:
retrieved_nell_concept_filepath = self.concept_settings['retrieved_nell_concept_path']
assert os.path.exists(retrieved_nell_concept_filepath)
self.nell_retrieve_info = {}
for item in pickle.load(open(retrieved_nell_concept_filepath, 'rb')):
self.nell_retrieve_info[item['id']] = item
self.max_nell_concept_length = max([max([len(entity_info['retrieved_concepts']) for entity_info in item['query_entities'] + item['document_entities']])
for qid, item in self.nell_retrieve_info.items() if len(item['query_entities'] + item['document_entities']) > 0])
# return list of concept ids given input subword list
def _lookup_wordnet_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, tolower, tokenizer):
concept_ids = []
for index in range(len(sub_tokens)):
original_token = tokens[sub_to_ori_index[index]]
# if tokens are cased, lowercase them (and strip accents) before looking them up
retrieve_token = tokenizer.basic_tokenizer._run_strip_accents(original_token.lower()) if tolower else original_token
if retrieve_token in self.synsets_info:
concept_ids.append([self.wn_concept2id[synset_name] for synset_name in self.synsets_info[retrieve_token]])
else:
concept_ids.append([])
return concept_ids
def _lookup_nell_concept_ids(self, sub_tokens, sub_to_ori_index, tokens, nell_info):
original_concept_ids = [[] for _ in range(len(tokens))]
for entity_info in nell_info:
for pos in range(entity_info['token_start'], entity_info['token_end'] + 1):
original_concept_ids[pos] += [self.nell_concept2id[category_name] for category_name in entity_info['retrieved_concepts']]
for pos in range(len(original_concept_ids)):
original_concept_ids[pos] = list(set(original_concept_ids[pos]))
concept_ids = [original_concept_ids[sub_to_ori_index[index]] for index in range(len(sub_tokens))]
return concept_ids
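# Illustrative example (ids assumed): if nell_info contains one entity with
# token_start=2, token_end=3 whose retrieved categories map to ids [7, 12],
# then original_concept_ids becomes [[], [], [7, 12], [7, 12], [], ...] and
# every subtoken derived from word 2 or 3 receives [7, 12] in concept_ids.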
def __call__(self,
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
tokenization_info = self.all_tokenization_info[example.qas_id]
query_tokens = tokenizer.tokenize(example.question_text)
# check that the online subword tokenization result matches the offline result
assert query_tokens == tokenization_info['query_subtokens']
if self.concept_settings['use_wordnet']:
query_wn_concepts = self._lookup_wordnet_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'],
tolower=not tokenizer.basic_tokenizer.do_lower_case, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
query_nell_concepts = self._lookup_nell_concept_ids(query_tokens, tokenization_info['query_sub_to_ori_index'],
tokenization_info['query_tokens'], self.nell_retrieve_info[example.qas_id]['query_entities'])
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
query_wn_concepts = query_wn_concepts[0:max_query_length]
query_nell_concepts = query_nell_concepts[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
assert all_doc_tokens == tokenization_info['document_subtokens']
if self.concept_settings['use_wordnet']:
doc_wn_concepts = self._lookup_wordnet_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'],
tolower=not tokenizer.basic_tokenizer.do_lower_case, tokenizer=tokenizer) # if tolower is True, tokenizer must be given
if self.concept_settings['use_nell']:
doc_nell_concepts = self._lookup_nell_concept_ids(all_doc_tokens, tokenization_info['document_sub_to_ori_index'],
tokenization_info['document_tokens'], self.nell_retrieve_info[example.qas_id]['document_entities'])
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we use a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
wn_concept_ids = []
nell_concept_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
wn_concept_ids.append([])
nell_concept_ids.append([])
for token, query_wn_concept, query_nell_concept in zip(query_tokens, query_wn_concepts, query_nell_concepts):
tokens.append(token)
segment_ids.append(0)
wn_concept_ids.append(query_wn_concept)
nell_concept_ids.append(query_nell_concept)
tokens.append("[SEP]")
segment_ids.append(0)
wn_concept_ids.append([])
nell_concept_ids.append([])
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
wn_concept_ids.append(doc_wn_concepts[split_token_index])
nell_concept_ids.append(doc_nell_concepts[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
wn_concept_ids.append([])
nell_concept_ids.append([])
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
for concept_ids, max_concept_length in zip((wn_concept_ids, nell_concept_ids), (self.max_wn_concept_length, self.max_nell_concept_length)):
for cindex in range(len(concept_ids)):
concept_ids[cindex] = concept_ids[cindex] + [0] * (max_concept_length - len(concept_ids[cindex]))
concept_ids[cindex] = concept_ids[cindex][:max_concept_length]
assert all([len(id_list) == max_concept_length for id_list in concept_ids])
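# Padding sketch (lengths assumed): with max_wn_concept_length == 3, a token
# whose WordNet lookup returned ids [5, 9] is padded to [5, 9, 0]; NELL ids
# are padded independently to max_nell_concept_length, so the two memories
# keep separate, rectangular id matrices of shape [seq_len, max_*_length].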
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
# out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
continue
# out_of_span = True
# if out_of_span:
# start_position = 0
# end_position = 0
# else:
# doc_offset = len(query_tokens) + 2
# start_position = tok_start_position - doc_start + doc_offset
# end_position = tok_end_position - doc_start + doc_offset
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if example_index < 3:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
logger.info("wordnet_concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(wn_concept_ids)]))
logger.info("nell_concept_ids: %s" % " ".join(["{}:{}".format(tidx, list(filter(lambda index:index != 0, x))) for tidx, x in enumerate(nell_concept_ids)]))
if is_training and example.is_impossible:
logger.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position +
1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
wn_concept_ids=wn_concept_ids,
nell_concept_ids=nell_concept_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
unique_id += 1
yield feature
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
#   Question: What country is the top exporter of electronics?
#   Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple document spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
class DataProcessor(object):
def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
doc_stride, max_query_length):
self._tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self._max_seq_length = max_seq_length
self._doc_stride = doc_stride
self._max_query_length = max_query_length
self._in_tokens = in_tokens
self.vocab = self._tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = -1
self.num_train_examples = -1
self.current_train_epoch = -1
self.train_examples = None
self.predict_examples = None
self.num_examples = {'train': -1, 'predict': -1}
self.train_wn_max_concept_length = None
self.predict_wn_max_concept_length = None
self.train_nell_max_concept_length = None
self.predict_nell_max_concept_length = None
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def get_examples(self,
data_path,
is_training,
version_2_with_negative=False):
examples = read_squad_examples(
input_file=data_path,
is_training=is_training,
version_2_with_negative=version_2_with_negative)
return examples
def get_num_examples(self, phase):
if phase not in ['train', 'predict']:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
return self.num_examples[phase]
def get_features(self, examples, is_training, **concept_settings):
convert_examples_to_features = Examples_To_Features_Converter(**concept_settings)
features = convert_examples_to_features(
examples=examples,
tokenizer=self._tokenizer,
max_seq_length=self._max_seq_length,
doc_stride=self._doc_stride,
max_query_length=self._max_query_length,
is_training=is_training)
return features
def data_generator(self,
data_path,
batch_size,
phase='train',
shuffle=False,
dev_count=1,
version_2_with_negative=False,
epoch=1,
**concept_settings):
if phase == 'train':
self.train_examples = self.get_examples(
data_path,
is_training=True,
version_2_with_negative=version_2_with_negative)
examples = self.train_examples
self.num_examples['train'] = len(self.train_examples)
elif phase == 'predict':
self.predict_examples = self.get_examples(
data_path,
is_training=False,
version_2_with_negative=version_2_with_negative)
examples = self.predict_examples
self.num_examples['predict'] = len(self.predict_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
def batch_reader(features, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for (index, feature) in enumerate(features):
if phase == 'train':
self.current_train_example = index + 1
seq_len = len(feature.input_ids)
labels = [feature.unique_id
] if feature.start_position is None else [
feature.start_position, feature.end_position
]
example = [
# position ids are hardcoded below to the model's maximum sequence length (384) rather than this feature's seq_len:
# feature.input_ids, feature.segment_ids, range(seq_len), feature.wn_concept_ids, feature.nell_concept_ids
feature.input_ids, feature.segment_ids, range(384), feature.wn_concept_ids, feature.nell_concept_ids
] + labels
max_len = max(max_len, seq_len)
#max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(example)
total_token_num += seq_len
else:
yield batch, total_token_num
batch, total_token_num, max_len = [example
], seq_len, seq_len
if len(batch) > 0:
yield batch, total_token_num
if phase == 'train':
self.train_wn_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_wn_concept_length
self.train_nell_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_nell_concept_length
else:
self.predict_wn_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_wn_concept_length
self.predict_nell_max_concept_length = Examples_To_Features_Converter(**concept_settings).max_nell_concept_length
def wrapper():
for epoch_index in range(epoch):
if shuffle:
random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
features = self.get_features(examples, is_training=True, **concept_settings)
max_wn_concept_length = self.train_wn_max_concept_length
max_nell_concept_length = self.train_nell_max_concept_length
else:
features = self.get_features(examples, is_training=False, **concept_settings)
max_wn_concept_length = self.predict_wn_max_concept_length
max_nell_concept_length = self.predict_nell_max_concept_length
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
features, batch_size, self._in_tokens):
batch_data = prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False,
max_wn_concept_length=max_wn_concept_length,
max_nell_concept_length=max_nell_concept_length)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
verbose, predict_file, evaluation_result_file):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
logger.info("Writing evaluation result to: %s" % (evaluation_result_file))
# load ground truth file for evaluation and post-edit
with open(predict_file, "r", encoding='utf-8') as reader:
predict_json = json.load(reader)["data"]
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
# this should only happen if every n-best entry is the empty string
if best_non_null_entry is None:
logger.info("No non-null prediction found in the n-best list")
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
eval_result = evaluate(predict_json, all_predictions)
with open(evaluation_result_file, "w") as writer:
writer.write(json.dumps(eval_result, indent=4) + "\n")
return eval_result
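# Illustrative sketch (hypothetical helper, never called by this module): the
# v2.0 branch above predicts the empty string whenever the null score beats the
# best non-null span by more than null_score_diff_threshold.
def _example_null_decision(score_null, best_start_logit, best_end_logit,
                           best_text, null_score_diff_threshold=0.0):
    """Return the final prediction text under the null-score-diff rule."""
    score_diff = score_null - best_start_logit - best_end_logit
    return "" if score_diff > null_score_diff_threshold else best_text
# e.g. _example_null_decision(1.0, 2.0, 2.0, "Steve Smith") keeps "Steve Smith",
# while _example_null_decision(5.0, 2.0, 2.0, "Steve Smith") returns "".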
def get_final_text(pred_text, orig_text, do_lower_case, verbose):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases, in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose:
logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
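# Worked example (assuming BasicTokenizer splits off punctuation as described
# above): get_final_text("Steve Smith", "Steve Smith's", do_lower_case=False,
# verbose=False) aligns the stripped strings "SteveSmith's" on both sides and
# returns "Steve Smith", dropping the trailing "'s" that the span never covered.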
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
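# Quick illustration: with logits [0.1, 2.3, -1.0, 0.7] and n_best_size=2,
# _get_best_indexes returns [1, 3] -- the positions of the two largest logits,
# ordered from the highest score down.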
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
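# Sanity-check sketch (hypothetical helper, not part of the pipeline): the
# max-subtraction above is the usual numerical-stability trick, so the result
# should match a straightforward numpy softmax.
def _example_softmax_check(scores):
    import numpy as np
    shifted = np.asarray(scores, dtype=np.float64) - max(scores)
    expected = np.exp(shifted) / np.exp(shifted).sum()
    return np.allclose(_compute_softmax(scores), expected)
# e.g. _example_softmax_check([1.0, 2.0, 3.0]) evaluates to True.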
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on ReCoRD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import multiprocessing
import os
import time
import logging
import random
import numpy as np
import paddle
import paddle.fluid as fluid
from reader.record import DataProcessor, write_predictions
from model.bert import BertConfig, BertModel
from model.layers import MemoryLayer, TriLinearTwoTimeSelfAttentionLayer
from utils.args import ArgumentGroup, print_arguments
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
# yapf: disable
parser = argparse.ArgumentParser()
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval for validation (effective only when do_val is True).")
train_g.add_arg("use_ema", bool, True, "Whether to use ema.")
train_g.add_arg("ema_decay", float, 0.9999, "Decay rate for expoential moving average.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_file", str, None, "ReCoRD json for training. E.g., train.json.")
data_g.add_arg("predict_file", str, None, "ReCoRD json for predictions. E.g. dev.json.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
"If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("doc_stride", int, 128,
"When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20,
"The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
"If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 42, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, False, "Whether to perform validation during training.")
run_type_g.add_arg("do_predict", bool, False, "Whether to perform prediction.")
run_type_g.add_arg("freeze", bool, False, "freeze bert parameters")
mem_settings_g = ArgumentGroup(parser, "memory", "memory settings.")
mem_settings_g.add_arg('concept_embedding_path', str, None, 'path of pretrained concept file')
mem_settings_g.add_arg('use_wordnet', bool, False, 'whether to use wordnet memory')
mem_settings_g.add_arg('retrieved_synset_path', str, '../retrieve_concepts/retrieve_wordnet/output_record/retrived_synsets.data', 'path of retrieved synsets')
mem_settings_g.add_arg('use_nell', bool, False, 'whether to use nell memory')
mem_settings_g.add_arg('train_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_record/train.retrieved_nell_concepts.data', 'path of retrieved concepts for trainset')
mem_settings_g.add_arg('dev_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_record/dev.retrieved_nell_concepts.data', 'path of retrieved concepts for devset')
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config, max_concept_length, concept_embedding_mat, is_training=False, freeze=False):
if is_training:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, concept_ids, input_mask, start_positions,
end_positions) = fluid.layers.read_file(pyreader)
else:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1]],
dtypes=['int64', 'int64', 'int64', 'int64', 'float32', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, concept_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)
'''1st Layer: BERT Layer'''
bert = BertModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
input_mask=input_mask,
config=bert_config,
use_fp16=args.use_fp16)
enc_out = bert.get_sequence_output()
if freeze:
enc_out.stop_gradient=True
logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))
'''2nd layer: Memory Layer'''
# get memory embedding
concept_vocab_size = concept_embedding_mat.shape[0]
concept_dim = concept_embedding_mat.shape[1]
memory_embs = fluid.layers.embedding(concept_ids,
size=(concept_vocab_size, concept_dim),
param_attr=fluid.ParamAttr(name="concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
# get memory length
concept_ids_reduced = fluid.layers.equal(concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
concept_ids_reduced = fluid.layers.cast(concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
mem_length = fluid.layers.reduce_sum(concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
# select and integrate
memory_layer = MemoryLayer(bert_config, max_concept_length, concept_dim, mem_method='cat')
memory_output = memory_layer.forward(enc_out, memory_embs, mem_length, ignore_no_memory_token=True)
'''3rd layer: Self-Matching Layer'''
# calculate input dim for self-matching layer
if memory_layer.mem_method == 'add':
memory_output_size = bert_config['hidden_size']
elif memory_layer.mem_method == 'cat':
memory_output_size = bert_config['hidden_size'] + concept_dim
else:
raise ValueError("memory_layer.mem_method must be 'add' or 'cat'")
logger.info("memory_output_size: {}".format(memory_output_size))
# do matching
self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
memory_output_size, dropout_rate=0.0,
cat_mul=True, cat_sub=True, cat_twotime=True,
cat_twotime_mul=False, cat_twotime_sub=True) # [bs, sq, concat_hs]
att_output = self_att_layer.forward(memory_output, input_mask) # [bs, sq, concat_hs]
'''4th layer: Output Layer'''
logits = fluid.layers.fc(
input=att_output,
size=2,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_squad_out_w",
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=bert_config['initializer_range'])),
bias_attr=fluid.ParamAttr(
name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
batch_ones = fluid.layers.fill_constant_batch_size_like(
input=start_logits, dtype='int64', shape=[1], value=1)
num_seqs = fluid.layers.reduce_sum(input=batch_ones)
if is_training:
def compute_loss(logits, positions):
loss = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=positions)
loss = fluid.layers.mean(x=loss)
return loss
start_loss = compute_loss(start_logits, start_positions)
end_loss = compute_loss(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2.0
if args.use_fp16 and args.loss_scaling > 1.0:
total_loss = total_loss * args.loss_scaling
return pyreader, total_loss, num_seqs
else:
return pyreader, unique_id, start_logits, end_logits, num_seqs
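# Minimal numpy sketch (hypothetical helper, not used at runtime) of the
# memory-length computation in create_model: concept id 0 is the padding slot
# (see read_concept_embedding below), so a concept counts toward the memory
# length iff its id is non-zero, i.e. mem_length = -((ids == 0) - 1) summed
# over the concept axis.
def _example_mem_length(concept_ids):
    import numpy as np
    ids = np.asarray(concept_ids)                    # [seq_len, max_concept_len]
    non_pad = -((ids == 0).astype("float32") - 1.0)  # 1.0 for real concepts, 0.0 for padding
    return non_pad.sum(axis=-1)                      # concepts attached to each token
# e.g. _example_mem_length([[3, 7, 0], [0, 0, 0]]) gives array([2., 0.], dtype=float32).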
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def predict(test_exe, test_program, test_pyreader, fetch_list, processor, eval_concept_settings, eval_output_name='eval_result.json'):
if not os.path.exists(args.checkpoints):
os.makedirs(args.checkpoints)
output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
output_evaluation_result_file = os.path.join(args.checkpoints, eval_output_name)
test_pyreader.start()
all_results = []
time_begin = time.time()
while True:
try:
np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run(
fetch_list=fetch_list, program=test_program)
for idx in range(np_unique_ids.shape[0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % len(all_results))
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
features = processor.get_features(
processor.predict_examples, is_training=False, **eval_concept_settings)
eval_result = write_predictions(processor.predict_examples, features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
args.version_2_with_negative,
args.null_score_diff_threshold, args.verbose, args.predict_file, output_evaluation_result_file)
return eval_result
def read_concept_embedding(embedding_path):
fin = open(embedding_path, encoding='utf-8')
info = [line.strip() for line in fin]
dim = len(info[0].split(' ')[1:])
n_concept = len(info)
embedding_mat = []
id2concept, concept2id = [], {}
# add padding concept into vocab
id2concept.append('<pad_concept>')
concept2id['<pad_concept>'] = 0
embedding_mat.append([0.0 for _ in range(dim)])
for line in info:
concept_name = line.split(' ')[0]
embedding = [float(value_str) for value_str in line.split(' ')[1:]]
assert len(embedding) == dim and not np.any(np.isnan(embedding))
embedding_mat.append(embedding)
concept2id[concept_name] = len(id2concept)
id2concept.append(concept_name)
embedding_mat = np.array(embedding_mat, dtype=np.float32)
return id2concept, concept2id, embedding_mat
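# The loader above expects a plain-text, word2vec-style file: one concept per
# line, the concept name first and its embedding values after it, all separated
# by single spaces (as in wn_concept2vec.txt / nell_concept2vec.txt). A
# hypothetical two-line file for a quick local check could look like:
#
#     concept_a 0.1 0.2 0.3
#     concept_b 0.4 0.5 0.6
#
# read_concept_embedding on such a file returns a three-entry vocabulary (the
# '<pad_concept>' row plus the two concepts) and a 3x3 float32 matrix whose
# first row is all zeros.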
def train(args):
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
if not (args.do_train or args.do_predict or args.do_val):
raise ValueError("For args `do_train` and `do_predict`, at "
"least one of them must be True.")
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
id2concept, concept2id, concept_embedding_mat = read_concept_embedding(
args.concept_embedding_path)
processor = DataProcessor(
vocab_path=args.vocab_path,
do_lower_case=args.do_lower_case,
max_seq_length=args.max_seq_len,
in_tokens=args.in_tokens,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length)
startup_prog = fluid.Program()
if args.random_seed is not None:
startup_prog.random_seed = args.random_seed
random.seed(args.random_seed)
np.random.seed(args.random_seed)
if args.do_train:
train_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_record/tokens/train.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'concept2id': concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
}
train_data_generator = processor.data_generator(
data_path=args.train_file,
batch_size=args.batch_size,
phase='train',
shuffle=True,
dev_count=dev_count,
version_2_with_negative=args.version_2_with_negative,
epoch=args.epoch,
**train_concept_settings)
num_train_examples = processor.get_num_examples(phase='train')
if args.in_tokens:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size // args.max_seq_len) // dev_count
else:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size) // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
logger.info("Device count: %d" % dev_count)
logger.info("Num train examples: %d" % num_train_examples)
logger.info("Max train steps: %d" % max_train_steps)
logger.info("Num warmup steps: %d" % warmup_steps)
train_program = fluid.Program()
# if args.random_seed is not None:
# train_program.random_seed = args.random_seed
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, loss, num_seqs = create_model(
pyreader_name='train_reader',
bert_config=bert_config,
max_concept_length=processor.train_max_concept_length,
concept_embedding_mat=concept_embedding_mat,
is_training=True,
freeze=args.freeze)
scheduled_lr = optimization(
loss=loss,
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
train_program=train_program,
startup_prog=startup_prog,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
use_fp16=args.use_fp16,
loss_scaling=args.loss_scaling)
if args.use_ema:
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
ema.update()
fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
if args.verbose:
if args.in_tokens:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program,
batch_size=args.batch_size // args.max_seq_len)
else:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program, batch_size=args.batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit))
if args.do_predict or args.do_val:
eval_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_record/tokens/dev.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'concept2id': concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
}
eval_data_generator = processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
test_prog = fluid.Program()
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
pyreader_name='test_reader',
bert_config=bert_config,
max_concept_length=processor.predict_max_concept_length,
concept_embedding_mat=concept_embedding_mat,
is_training=False)
if args.use_ema and 'ema' not in dir():
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
start_logits.name, end_logits.name, num_seqs.name])
test_prog = test_prog.clone(for_test=True)
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
exe.run(startup_prog)
if args.do_train:
logger.info('load pretrained concept embedding')
fluid.global_scope().find_var('concept_emb_mat').get_tensor().set(concept_embedding_mat, place)
if args.init_checkpoint and args.init_pretraining_params:
logger.info(
"WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
"both are set! Only arg 'init_checkpoint' is made valid.")
if args.init_checkpoint:
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.init_pretraining_params:
init_pretraining_params(
exe,
args.init_pretraining_params,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.do_predict or args.do_val:
if not args.init_checkpoint:
raise ValueError("args 'init_checkpoint' should be set if"
"only doing prediction!")
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
if args.do_train:
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = args.use_fast_executor
exec_strategy.num_threads = dev_count
exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
train_exe = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
main_program=train_program)
train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.start()
steps = 0
total_cost, total_num_seqs = [], []
time_begin = time.time()
while steps < max_train_steps:
try:
steps += 1
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
fetch_list = [loss.name, num_seqs.name]
else:
fetch_list = [
loss.name, scheduled_lr.name, num_seqs.name
]
else:
fetch_list = []
outputs = train_exe.run(fetch_list=fetch_list)
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
np_loss, np_num_seqs = outputs
else:
np_loss, np_lr, np_num_seqs = outputs
total_cost.extend(np_loss * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
)
verbose += "learning rate: %f" % (
np_lr[0]
if warmup_steps > 0 else args.learning_rate)
logger.info(verbose)
time_end = time.time()
used_time = time_end - time_begin
current_example, epoch = processor.get_train_progress()
logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"speed: %f steps/s" %
(epoch, current_example, num_train_examples, steps,
np.sum(total_cost) / np.sum(total_num_seqs),
args.skip_steps / used_time))
total_cost, total_num_seqs = [], []
time_begin = time.time()
if steps % args.save_steps == 0 or steps == max_train_steps:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
if steps % args.validation_steps == 0 or steps == max_train_steps:
if args.do_val:
test_pyreader.decorate_tensor_provider(
processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
)
val_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings, 'validate_result_step_{}.json'.format(steps))
logger.info("Validation performance after step {}:\n* Exact_match: {}\n* F1: {}".format(steps, val_performance['exact_match'], val_performance['f1']))
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps) + "_final")
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.do_predict:
test_pyreader.decorate_tensor_provider(eval_data_generator)
if args.use_ema:
with ema.apply(exe):
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
else:
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(eval_performance['exact_match'], eval_performance['f1']))
if __name__ == '__main__':
print_arguments(args)
train(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on ReCoRD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import multiprocessing
import os
import time
import logging
import random
import numpy as np
import paddle
import paddle.fluid as fluid
from reader.record_twomemory import DataProcessor, write_predictions
from model.bert import BertConfig, BertModel
from model.layers import MemoryLayer, TriLinearTwoTimeSelfAttentionLayer
from utils.args import ArgumentGroup, print_arguments
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
# yapf: disable
parser = argparse.ArgumentParser()
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval for validation (effective only when do_val is True).")
train_g.add_arg("use_ema", bool, True, "Whether to use ema.")
train_g.add_arg("ema_decay", float, 0.9999, "Decay rate for expoential moving average.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_file", str, None, "ReCoRD json for training. E.g., train.json.")
data_g.add_arg("predict_file", str, None, "ReCoRD json for predictions. E.g. dev.json.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
"If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("doc_stride", int, 128,
"When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20,
"The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
"If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 42, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, False, "Whether to perform validation during training.")
run_type_g.add_arg("do_predict", bool, False, "Whether to perform prediction.")
run_type_g.add_arg("freeze", bool, False, "freeze bert parameters")
mem_settings_g = ArgumentGroup(parser, "memory", "memory settings.")
mem_settings_g.add_arg('wn_concept_embedding_path', str, None, 'path of wordnet pretrained concept file')
mem_settings_g.add_arg('nell_concept_embedding_path', str, None, 'path of nell pretrained concept file')
mem_settings_g.add_arg('use_wordnet', bool, False, 'whether to use wordnet memory')
mem_settings_g.add_arg('retrieved_synset_path', str, '../retrieve_concepts/retrieve_wordnet/output_record/retrived_synsets.data', 'path of retrieved synsets')
mem_settings_g.add_arg('use_nell', bool, False, 'whether to use nell memory')
mem_settings_g.add_arg('train_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_record/train.retrieved_nell_concepts.data', 'path of retrieved concepts for trainset')
mem_settings_g.add_arg('dev_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_record/dev.retrieved_nell_concepts.data', 'path of retrieved concepts for devset')
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config, max_wn_concept_length, max_nell_concept_length, wn_concept_embedding_mat, nell_concept_embedding_mat, is_training=False, freeze=False):
if is_training:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_wn_concept_length, 1],
[-1, args.max_seq_len, max_nell_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, start_positions,
end_positions) = fluid.layers.read_file(pyreader)
else:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_wn_concept_length, 1],
[-1, args.max_seq_len, max_nell_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1]],
dtypes=['int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)
'''1st Layer: BERT Layer'''
bert = BertModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
input_mask=input_mask,
config=bert_config,
use_fp16=args.use_fp16)
enc_out = bert.get_sequence_output()
if freeze:
enc_out.stop_gradient=True
logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))
'''2nd layer: Memory Layer'''
# get memory embedding
wn_concept_vocab_size = wn_concept_embedding_mat.shape[0]
wn_concept_dim = wn_concept_embedding_mat.shape[1]
nell_concept_vocab_size = nell_concept_embedding_mat.shape[0]
nell_concept_dim = nell_concept_embedding_mat.shape[1]
wn_memory_embs = fluid.layers.embedding(wn_concept_ids,
size=(wn_concept_vocab_size, wn_concept_dim),
param_attr=fluid.ParamAttr(name="wn_concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
nell_memory_embs = fluid.layers.embedding(nell_concept_ids,
size=(nell_concept_vocab_size, nell_concept_dim),
param_attr=fluid.ParamAttr(name="nell_concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
# get memory length
wn_concept_ids_reduced = fluid.layers.equal(wn_concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
wn_concept_ids_reduced = fluid.layers.cast(wn_concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
wn_concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
wn_concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
wn_mem_length = fluid.layers.reduce_sum(wn_concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
nell_concept_ids_reduced = fluid.layers.equal(nell_concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
nell_concept_ids_reduced = fluid.layers.cast(nell_concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
nell_concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
nell_concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
nell_mem_length = fluid.layers.reduce_sum(nell_concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
# select and integrate
wn_memory_layer = MemoryLayer(bert_config, max_wn_concept_length, wn_concept_dim, mem_method='raw', prefix='wn')
wn_memory_output = wn_memory_layer.forward(enc_out, wn_memory_embs, wn_mem_length, ignore_no_memory_token=True)
nell_memory_layer = MemoryLayer(bert_config, max_nell_concept_length, nell_concept_dim, mem_method='raw', prefix='nell')
nell_memory_output = nell_memory_layer.forward(enc_out, nell_memory_embs, nell_mem_length, ignore_no_memory_token=True)
memory_output = fluid.layers.concat([enc_out, wn_memory_output, nell_memory_output], axis=2)
'''3rd layer: Self-Matching Layer'''
# calculate input dim for self-matching layer
memory_output_size = bert_config['hidden_size'] + wn_concept_dim + nell_concept_dim
logger.info("memory_output_size: {}".format(memory_output_size))
# do matching
self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
memory_output_size, dropout_rate=0.0,
cat_mul=True, cat_sub=True, cat_twotime=True,
cat_twotime_mul=False, cat_twotime_sub=True) # [bs, sq, concat_hs]
att_output = self_att_layer.forward(memory_output, input_mask) # [bs, sq, concat_hs]
'''4th layer: Output Layer'''
logits = fluid.layers.fc(
input=att_output,
size=2,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_squad_out_w",
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=bert_config['initializer_range'])),
bias_attr=fluid.ParamAttr(
name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
batch_ones = fluid.layers.fill_constant_batch_size_like(
input=start_logits, dtype='int64', shape=[1], value=1)
num_seqs = fluid.layers.reduce_sum(input=batch_ones)
if is_training:
def compute_loss(logits, positions):
loss = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=positions)
loss = fluid.layers.mean(x=loss)
return loss
start_loss = compute_loss(start_logits, start_positions)
end_loss = compute_loss(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2.0
if args.use_fp16 and args.loss_scaling > 1.0:
total_loss = total_loss * args.loss_scaling
return pyreader, total_loss, num_seqs
else:
return pyreader, unique_id, start_logits, end_logits, num_seqs
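# Rough size check (the embedding widths here are assumptions for illustration,
# not a statement about the released checkpoints): with BERT-large's hidden
# size of 1024 and, say, 100-dimensional WordNet and NELL concept embeddings,
# the concatenated memory output fed to the self-matching layer carries
# 1024 + 100 + 100 = 1224 features per token.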
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def predict(test_exe, test_program, test_pyreader, fetch_list, processor, eval_concept_settings, eval_output_name='eval_result.json'):
if not os.path.exists(args.checkpoints):
os.makedirs(args.checkpoints)
output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
output_evaluation_result_file = os.path.join(args.checkpoints, eval_output_name)
test_pyreader.start()
all_results = []
time_begin = time.time()
while True:
try:
np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run(
fetch_list=fetch_list, program=test_program)
for idx in range(np_unique_ids.shape[0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % len(all_results))
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
features = processor.get_features(
processor.predict_examples, is_training=False, **eval_concept_settings)
eval_result = write_predictions(processor.predict_examples, features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
args.version_2_with_negative,
args.null_score_diff_threshold, args.verbose, args.predict_file, output_evaluation_result_file)
return eval_result
def read_concept_embedding(embedding_path):
fin = open(embedding_path, encoding='utf-8')
info = [line.strip() for line in fin]
dim = len(info[0].split(' ')[1:])
n_concept = len(info)
embedding_mat = []
id2concept, concept2id = [], {}
# add padding concept into vocab
id2concept.append('<pad_concept>')
concept2id['<pad_concept>'] = 0
embedding_mat.append([0.0 for _ in range(dim)])
for line in info:
concept_name = line.split(' ')[0]
embedding = [float(value_str) for value_str in line.split(' ')[1:]]
assert len(embedding) == dim and not np.any(np.isnan(embedding))
embedding_mat.append(embedding)
concept2id[concept_name] = len(id2concept)
id2concept.append(concept_name)
embedding_mat = np.array(embedding_mat, dtype=np.float32)
return id2concept, concept2id, embedding_mat
def train(args):
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
if not (args.do_train or args.do_predict or args.do_val):
raise ValueError("For args `do_train` and `do_predict`, at "
"least one of them must be True.")
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
wn_id2concept, wn_concept2id, wn_concept_embedding_mat = read_concept_embedding(
args.wn_concept_embedding_path)
nell_id2concept, nell_concept2id, nell_concept_embedding_mat = read_concept_embedding(
args.nell_concept_embedding_path)
processor = DataProcessor(
vocab_path=args.vocab_path,
do_lower_case=args.do_lower_case,
max_seq_length=args.max_seq_len,
in_tokens=args.in_tokens,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length)
startup_prog = fluid.Program()
if args.random_seed is not None:
startup_prog.random_seed = args.random_seed
random.seed(args.random_seed)
np.random.seed(args.random_seed)
if args.do_train:
train_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_record/tokens/train.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'wn_concept2id': wn_concept2id,
'nell_concept2id': nell_concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
}
train_data_generator = processor.data_generator(
data_path=args.train_file,
batch_size=args.batch_size,
phase='train',
shuffle=True,
dev_count=dev_count,
version_2_with_negative=args.version_2_with_negative,
epoch=args.epoch,
**train_concept_settings)
num_train_examples = processor.get_num_examples(phase='train')
if args.in_tokens:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size // args.max_seq_len) // dev_count
else:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size) // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
logger.info("Device count: %d" % dev_count)
logger.info("Num train examples: %d" % num_train_examples)
logger.info("Max train steps: %d" % max_train_steps)
logger.info("Num warmup steps: %d" % warmup_steps)
train_program = fluid.Program()
# if args.random_seed is not None:
# train_program.random_seed = args.random_seed
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, loss, num_seqs = create_model(
pyreader_name='train_reader',
bert_config=bert_config,
max_wn_concept_length=processor.train_wn_max_concept_length,
max_nell_concept_length=processor.train_nell_max_concept_length,
wn_concept_embedding_mat=wn_concept_embedding_mat,
nell_concept_embedding_mat=nell_concept_embedding_mat,
is_training=True,
freeze=args.freeze)
scheduled_lr = optimization(
loss=loss,
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
train_program=train_program,
startup_prog=startup_prog,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
use_fp16=args.use_fp16,
loss_scaling=args.loss_scaling)
if args.use_ema:
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
ema.update()
fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
if args.verbose:
if args.in_tokens:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program,
batch_size=args.batch_size // args.max_seq_len)
else:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program, batch_size=args.batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit))
if args.do_predict or args.do_val:
eval_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_record/tokens/dev.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'wn_concept2id': wn_concept2id,
'nell_concept2id': nell_concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
}
eval_data_generator = processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
test_prog = fluid.Program()
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
pyreader_name='test_reader',
bert_config=bert_config,
max_wn_concept_length=processor.predict_wn_max_concept_length,
max_nell_concept_length=processor.predict_nell_max_concept_length,
wn_concept_embedding_mat=wn_concept_embedding_mat,
nell_concept_embedding_mat=nell_concept_embedding_mat,
is_training=False)
if args.use_ema and 'ema' not in dir():
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
start_logits.name, end_logits.name, num_seqs.name])
test_prog = test_prog.clone(for_test=True)
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
exe.run(startup_prog)
if args.do_train:
logger.info('load pretrained concept embedding')
fluid.global_scope().find_var('wn_concept_emb_mat').get_tensor().set(wn_concept_embedding_mat, place)
fluid.global_scope().find_var('nell_concept_emb_mat').get_tensor().set(nell_concept_embedding_mat, place)
if args.init_checkpoint and args.init_pretraining_params:
logger.info(
"WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
"both are set! Only arg 'init_checkpoint' is made valid.")
if args.init_checkpoint:
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.init_pretraining_params:
init_pretraining_params(
exe,
args.init_pretraining_params,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.do_predict or args.do_val:
if not args.init_checkpoint:
raise ValueError("args 'init_checkpoint' should be set if"
"only doing prediction!")
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
if args.do_train:
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = args.use_fast_executor
exec_strategy.num_threads = dev_count
exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
train_exe = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
main_program=train_program)
train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.start()
steps = 0
total_cost, total_num_seqs = [], []
time_begin = time.time()
while steps < max_train_steps:
try:
steps += 1
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
fetch_list = [loss.name, num_seqs.name]
else:
fetch_list = [
loss.name, scheduled_lr.name, num_seqs.name
]
else:
fetch_list = []
outputs = train_exe.run(fetch_list=fetch_list)
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
np_loss, np_num_seqs = outputs
else:
np_loss, np_lr, np_num_seqs = outputs
total_cost.extend(np_loss * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
)
verbose += "learning rate: %f" % (
np_lr[0]
if warmup_steps > 0 else args.learning_rate)
logger.info(verbose)
time_end = time.time()
used_time = time_end - time_begin
current_example, epoch = processor.get_train_progress()
logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"speed: %f steps/s" %
(epoch, current_example, num_train_examples, steps,
np.sum(total_cost) / np.sum(total_num_seqs),
args.skip_steps / used_time))
total_cost, total_num_seqs = [], []
time_begin = time.time()
if steps % args.save_steps == 0 or steps == max_train_steps:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
if steps % args.validation_steps == 0 or steps == max_train_steps:
if args.do_val:
test_pyreader.decorate_tensor_provider(
processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
)
val_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings, 'validate_result_step_{}.json'.format(steps))
logger.info("Validation performance after step {}:\n* Exact_match: {}\n* F1: {}".format(steps, val_performance['exact_match'], val_performance['f1']))
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps) + "_final")
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.do_predict:
test_pyreader.decorate_tensor_provider(eval_data_generator)
if args.use_ema:
with ema.apply(exe):
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
else:
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(eval_performance['exact_match'], eval_performance['f1']))
if __name__ == '__main__':
print_arguments(args)
train(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on SQuAD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import multiprocessing
import os
import time
import logging
import random
import numpy as np
import paddle
import paddle.fluid as fluid
from reader.squad import DataProcessor, write_predictions
from model.bert import BertConfig, BertModel
from model.layers import MemoryLayer, TriLinearTwoTimeSelfAttentionLayer
from utils.args import ArgumentGroup, print_arguments
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
# yapf: disable
parser = argparse.ArgumentParser()
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval for validation (effective only when do_val is True).")
train_g.add_arg("use_ema", bool, True, "Whether to use ema.")
train_g.add_arg("ema_decay", float, 0.9999, "Decay rate for expoential moving average.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_file", str, None, "SQuAD json for training. E.g., train-v1.1.json.")
data_g.add_arg("predict_file", str, None, "SQuAD json for predictions. E.g. dev-v1.1.json or test-v1.1.json.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
"If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("doc_stride", int, 128,
"When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20,
"The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
"If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 42, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, False, "Whether to perform validation during training.")
run_type_g.add_arg("do_predict", bool, False, "Whether to perform prediction.")
run_type_g.add_arg("freeze", bool, False, "freeze bert parameters")
mem_settings_g = ArgumentGroup(parser, "memory", "memory settings.")
mem_settings_g.add_arg('concept_embedding_path', str, None, 'Path of the pretrained concept embedding file.')
mem_settings_g.add_arg('use_wordnet', bool, False, 'Whether to use the WordNet memory.')
mem_settings_g.add_arg('retrieved_synset_path', str, '../retrieve_concepts/retrieve_wordnet/output_squad/retrived_synsets.data', 'Path of the retrieved synsets.')
mem_settings_g.add_arg('use_nell', bool, False, 'Whether to use the NELL memory.')
mem_settings_g.add_arg('train_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_squad/train.retrieved_nell_concepts.data', 'Path of the retrieved NELL concepts for the train set.')
mem_settings_g.add_arg('dev_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_squad/dev.retrieved_nell_concepts.data', 'Path of the retrieved NELL concepts for the dev set.')
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config, max_concept_length, concept_embedding_mat, is_training=False, freeze=False):
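    """Build the KTNET program for SQuAD with a single knowledge memory: a BERT
    encoder, a memory layer over pretrained concept embeddings, a trilinear
    self-matching layer, and a span-prediction output layer. Returns the
    py_reader plus the loss tensors when `is_training` is True, otherwise the
    prediction tensors (unique_id and start/end logits)."""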
if is_training:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, concept_ids, input_mask, start_positions,
end_positions) = fluid.layers.read_file(pyreader)
else:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1]],
dtypes=['int64', 'int64', 'int64', 'int64', 'float32', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, concept_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)
'''1st Layer: BERT Layer'''
bert = BertModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
input_mask=input_mask,
config=bert_config,
use_fp16=args.use_fp16)
enc_out = bert.get_sequence_output()
if freeze:
enc_out.stop_gradient=True
logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))
'''2nd layer: Memory Layer'''
# get memory embedding
concept_vocab_size = concept_embedding_mat.shape[0]
concept_dim = concept_embedding_mat.shape[1]
memory_embs = fluid.layers.embedding(concept_ids,
size=(concept_vocab_size, concept_dim),
param_attr=fluid.ParamAttr(name="concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
# get memory length
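    # Concept id 0 is the <pad_concept> slot, so counting non-zero ids per token
    # gives the number of retrieved concepts: equal(..., 0) + cast marks padding
    # positions with 1.0, subtracting 1 and scaling by -1 flips the mask so real
    # concepts are 1.0, and reduce_sum over the concept axis yields the length.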
concept_ids_reduced = fluid.layers.equal(concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
concept_ids_reduced = fluid.layers.cast(concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
mem_length = fluid.layers.reduce_sum(concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
# select and integrate
memory_layer = MemoryLayer(bert_config, max_concept_length, concept_dim, mem_method='cat')
memory_output = memory_layer.forward(enc_out, memory_embs, mem_length, ignore_no_memory_token=True)
'''3rd layer: Self-Matching Layer'''
# calculate input dim for self-matching layer
if memory_layer.mem_method == 'add':
memory_output_size = bert_config['hidden_size']
elif memory_layer.mem_method == 'cat':
memory_output_size = bert_config['hidden_size'] + concept_dim
else:
raise ValueError("memory_layer.mem_method must be 'add' or 'cat'")
logger.info("memory_output_size: {}".format(memory_output_size))
# do matching
self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
memory_output_size, dropout_rate=0.0,
cat_mul=True, cat_sub=True, cat_twotime=True,
cat_twotime_mul=False, cat_twotime_sub=True) # [bs, sq, concat_hs]
att_output = self_att_layer.forward(memory_output, input_mask) # [bs, sq, concat_hs]
'''4th layer: Output Layer'''
logits = fluid.layers.fc(
input=att_output,
size=2,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_squad_out_w",
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=bert_config['initializer_range'])),
bias_attr=fluid.ParamAttr(
name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
batch_ones = fluid.layers.fill_constant_batch_size_like(
input=start_logits, dtype='int64', shape=[1], value=1)
num_seqs = fluid.layers.reduce_sum(input=batch_ones)
if is_training:
def compute_loss(logits, positions):
loss = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=positions)
loss = fluid.layers.mean(x=loss)
return loss
start_loss = compute_loss(start_logits, start_positions)
end_loss = compute_loss(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2.0
if args.use_fp16 and args.loss_scaling > 1.0:
total_loss = total_loss * args.loss_scaling
return pyreader, total_loss, num_seqs
else:
return pyreader, unique_id, start_logits, end_logits, num_seqs
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def predict(test_exe, test_program, test_pyreader, fetch_list, processor, eval_concept_settings, eval_output_name='eval_result.json'):
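    """Drain `test_pyreader`, collect per-example start/end logits into RawResult
    tuples, then call write_predictions to produce predictions.json,
    nbest_predictions.json, the null-odds file and the evaluation result file
    under args.checkpoints. Returns the evaluation result dict."""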
if not os.path.exists(args.checkpoints):
os.makedirs(args.checkpoints)
output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
output_evaluation_result_file = os.path.join(args.checkpoints, eval_output_name)
test_pyreader.start()
all_results = []
time_begin = time.time()
while True:
try:
np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run(
fetch_list=fetch_list, program=test_program)
for idx in range(np_unique_ids.shape[0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % len(all_results))
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
features = processor.get_features(
processor.predict_examples, is_training=False, **eval_concept_settings)
eval_result = write_predictions(processor.predict_examples, features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
args.version_2_with_negative,
args.null_score_diff_threshold, args.verbose, args.predict_file, output_evaluation_result_file)
return eval_result
def read_concept_embedding(embedding_path):
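    """Load a whitespace-separated text embedding file (one line per concept:
    the concept name followed by its vector) and prepend an all-zero
    <pad_concept> vector at index 0. Returns (id2concept, concept2id,
    embedding_mat)."""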
fin = open(embedding_path, encoding='utf-8')
info = [line.strip() for line in fin]
dim = len(info[0].split(' ')[1:])
n_concept = len(info)
embedding_mat = []
id2concept, concept2id = [], {}
# add padding concept into vocab
id2concept.append('<pad_concept>')
concept2id['<pad_concept>'] = 0
embedding_mat.append([0.0 for _ in range(dim)])
for line in info:
concept_name = line.split(' ')[0]
embedding = [float(value_str) for value_str in line.split(' ')[1:]]
assert len(embedding) == dim and not np.any(np.isnan(embedding))
embedding_mat.append(embedding)
concept2id[concept_name] = len(id2concept)
id2concept.append(concept_name)
embedding_mat = np.array(embedding_mat, dtype=np.float32)
return id2concept, concept2id, embedding_mat
def train(args):
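    """End-to-end driver: build the train/eval programs, initialize parameters
    from a checkpoint or from pretrained BERT params, run the training loop with
    periodic checkpointing and optional validation, and finally run prediction
    on args.predict_file when `do_predict` is set."""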
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
    if not (args.do_train or args.do_predict or args.do_val):
        raise ValueError("At least one of the args `do_train`, `do_val` and "
                         "`do_predict` must be True.")
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
id2concept, concept2id, concept_embedding_mat = read_concept_embedding(
args.concept_embedding_path)
processor = DataProcessor(
vocab_path=args.vocab_path,
do_lower_case=args.do_lower_case,
max_seq_length=args.max_seq_len,
in_tokens=args.in_tokens,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length)
startup_prog = fluid.Program()
if args.random_seed is not None:
startup_prog.random_seed = args.random_seed
random.seed(args.random_seed)
np.random.seed(args.random_seed)
if args.do_train:
train_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_squad/tokens/train.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'concept2id': concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
}
train_data_generator = processor.data_generator(
data_path=args.train_file,
batch_size=args.batch_size,
phase='train',
shuffle=True,
dev_count=dev_count,
version_2_with_negative=args.version_2_with_negative,
epoch=args.epoch,
**train_concept_settings)
num_train_examples = processor.get_num_examples(phase='train')
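        # Estimate the total number of optimization steps per device: when
        # in_tokens is set, batch_size counts tokens, so roughly
        # batch_size // max_seq_len examples fit in one batch.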
if args.in_tokens:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size // args.max_seq_len) // dev_count
else:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size) // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
logger.info("Device count: %d" % dev_count)
logger.info("Num train examples: %d" % num_train_examples)
logger.info("Max train steps: %d" % max_train_steps)
logger.info("Num warmup steps: %d" % warmup_steps)
train_program = fluid.Program()
# if args.random_seed is not None:
# train_program.random_seed = args.random_seed
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, loss, num_seqs = create_model(
pyreader_name='train_reader',
bert_config=bert_config,
max_concept_length=processor.train_max_concept_length,
concept_embedding_mat=concept_embedding_mat,
is_training=True,
freeze=args.freeze)
scheduled_lr = optimization(
loss=loss,
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
train_program=train_program,
startup_prog=startup_prog,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
use_fp16=args.use_fp16,
loss_scaling=args.loss_scaling)
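                # Track an exponential moving average of the trainable
                # parameters: ema.update() registers the averaging ops in the
                # surrounding program, and ema.apply() is used at prediction
                # time to run with the averaged weights.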
if args.use_ema:
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
ema.update()
fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
if args.verbose:
if args.in_tokens:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program,
batch_size=args.batch_size // args.max_seq_len)
else:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program, batch_size=args.batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit))
if args.do_predict or args.do_val:
eval_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_squad/tokens/dev.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'concept2id': concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
}
eval_data_generator = processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
test_prog = fluid.Program()
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
pyreader_name='test_reader',
bert_config=bert_config,
max_concept_length=processor.predict_max_concept_length,
concept_embedding_mat=concept_embedding_mat,
is_training=False)
if args.use_ema and 'ema' not in dir():
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
start_logits.name, end_logits.name, num_seqs.name])
test_prog = test_prog.clone(for_test=True)
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
exe.run(startup_prog)
if args.do_train:
logger.info('load pretrained concept embedding')
fluid.global_scope().find_var('concept_emb_mat').get_tensor().set(concept_embedding_mat, place)
if args.init_checkpoint and args.init_pretraining_params:
logger.info(
"WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
"both are set! Only arg 'init_checkpoint' is made valid.")
if args.init_checkpoint:
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.init_pretraining_params:
init_pretraining_params(
exe,
args.init_pretraining_params,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.do_predict or args.do_val:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
if args.do_train:
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = args.use_fast_executor
exec_strategy.num_threads = dev_count
exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
train_exe = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
main_program=train_program)
train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.start()
steps = 0
total_cost, total_num_seqs = [], []
time_begin = time.time()
while steps < max_train_steps:
try:
steps += 1
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
fetch_list = [loss.name, num_seqs.name]
else:
fetch_list = [
loss.name, scheduled_lr.name, num_seqs.name
]
else:
fetch_list = []
outputs = train_exe.run(fetch_list=fetch_list)
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
np_loss, np_num_seqs = outputs
else:
np_loss, np_lr, np_num_seqs = outputs
total_cost.extend(np_loss * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
)
verbose += "learning rate: %f" % (
np_lr[0]
if warmup_steps > 0 else args.learning_rate)
logger.info(verbose)
time_end = time.time()
used_time = time_end - time_begin
current_example, epoch = processor.get_train_progress()
logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"speed: %f steps/s" %
(epoch, current_example, num_train_examples, steps,
np.sum(total_cost) / np.sum(total_num_seqs),
args.skip_steps / used_time))
total_cost, total_num_seqs = [], []
time_begin = time.time()
if steps % args.save_steps == 0 or steps == max_train_steps:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
if steps % args.validation_steps == 0 or steps == max_train_steps:
if args.do_val:
test_pyreader.decorate_tensor_provider(
processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
)
val_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings, 'validate_result_step_{}.json'.format(steps))
logger.info("Validation performance after step {}:\n* Exact_match: {}\n* F1: {}".format(steps, val_performance['exact_match'], val_performance['f1']))
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps) + "_final")
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.do_predict:
test_pyreader.decorate_tensor_provider(eval_data_generator)
if args.use_ema:
with ema.apply(exe):
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
else:
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(eval_performance['exact_match'], eval_performance['f1']))
if __name__ == '__main__':
print_arguments(args)
train(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on SQuAD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import multiprocessing
import os
import time
import logging
import random
import numpy as np
import paddle
import paddle.fluid as fluid
from reader.squad_twomemory import DataProcessor, write_predictions
from model.bert import BertConfig, BertModel
from model.layers import MemoryLayer, TriLinearTwoTimeSelfAttentionLayer
from utils.args import ArgumentGroup, print_arguments
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
# yapf: disable
parser = argparse.ArgumentParser()
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval for validation (effective only when do_val is True).")
train_g.add_arg("use_ema", bool, True, "Whether to use ema.")
train_g.add_arg("ema_decay", float, 0.9999, "Decay rate for expoential moving average.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_file", str, None, "SQuAD json for training. E.g., train-v1.1.json.")
data_g.add_arg("predict_file", str, None, "SQuAD json for predictions. E.g. dev-v1.1.json or test-v1.1.json.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
"If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("doc_stride", int, 128,
"When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20,
"The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
"If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 42, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, False, "Whether to perform validation during training.")
run_type_g.add_arg("do_predict", bool, False, "Whether to perform prediction.")
run_type_g.add_arg("freeze", bool, False, "freeze bert parameters")
mem_settings_g = ArgumentGroup(parser, "memory", "memory settings.")
mem_settings_g.add_arg('wn_concept_embedding_path', str, None, 'Path of the pretrained WordNet concept embedding file.')
mem_settings_g.add_arg('nell_concept_embedding_path', str, None, 'Path of the pretrained NELL concept embedding file.')
mem_settings_g.add_arg('use_wordnet', bool, False, 'Whether to use the WordNet memory.')
mem_settings_g.add_arg('retrieved_synset_path', str, '../retrieve_concepts/retrieve_wordnet/output_squad/retrived_synsets.data', 'Path of the retrieved synsets.')
mem_settings_g.add_arg('use_nell', bool, False, 'Whether to use the NELL memory.')
mem_settings_g.add_arg('train_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_squad/train.retrieved_nell_concepts.data', 'Path of the retrieved NELL concepts for the train set.')
mem_settings_g.add_arg('dev_retrieved_nell_concept_path', str, '../retrieve_concepts/retrieve_nell/output_squad/dev.retrieved_nell_concepts.data', 'Path of the retrieved NELL concepts for the dev set.')
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config, max_wn_concept_length, max_nell_concept_length, wn_concept_embedding_mat, nell_concept_embedding_mat, is_training=False, freeze=False):
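    """Same structure as the single-memory model, but with two knowledge
    memories: WordNet and NELL concepts each get their own embedding table and
    MemoryLayer (mem_method='raw'), and their outputs are concatenated with the
    BERT encoding before the self-matching layer."""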
if is_training:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_wn_concept_length, 1],
[-1, args.max_seq_len, max_nell_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, start_positions,
end_positions) = fluid.layers.read_file(pyreader)
else:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, max_wn_concept_length, 1],
[-1, args.max_seq_len, max_nell_concept_length, 1],
[-1, args.max_seq_len, 1], [-1, 1]],
dtypes=['int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)
'''1st Layer: BERT Layer'''
bert = BertModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
input_mask=input_mask,
config=bert_config,
use_fp16=args.use_fp16)
enc_out = bert.get_sequence_output()
if freeze:
enc_out.stop_gradient=True
logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))
'''2nd layer: Memory Layer'''
# get memory embedding
wn_concept_vocab_size = wn_concept_embedding_mat.shape[0]
wn_concept_dim = wn_concept_embedding_mat.shape[1]
nell_concept_vocab_size = nell_concept_embedding_mat.shape[0]
nell_concept_dim = nell_concept_embedding_mat.shape[1]
wn_memory_embs = fluid.layers.embedding(wn_concept_ids,
size=(wn_concept_vocab_size, wn_concept_dim),
param_attr=fluid.ParamAttr(name="wn_concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
nell_memory_embs = fluid.layers.embedding(nell_concept_ids,
size=(nell_concept_vocab_size, nell_concept_dim),
param_attr=fluid.ParamAttr(name="nell_concept_emb_mat",
do_model_average=False,
trainable=False),
dtype='float32')
# get memory length
wn_concept_ids_reduced = fluid.layers.equal(wn_concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
wn_concept_ids_reduced = fluid.layers.cast(wn_concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
wn_concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
wn_concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
wn_mem_length = fluid.layers.reduce_sum(wn_concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
nell_concept_ids_reduced = fluid.layers.equal(nell_concept_ids,
fluid.layers.fill_constant(shape=[1], value=0, dtype="int64")) # [batch_size, sent_size, concept_size, 1]
nell_concept_ids_reduced = fluid.layers.cast(nell_concept_ids_reduced, dtype="float32") # [batch_size, sent_size, concept_size, 1]
nell_concept_ids_reduced = fluid.layers.scale(
fluid.layers.elementwise_sub(
nell_concept_ids_reduced,
fluid.layers.fill_constant([1], "float32", 1)
),
scale=-1
)
nell_mem_length = fluid.layers.reduce_sum(nell_concept_ids_reduced, dim=2) # [batch_size, sent_size, 1]
# select and integrate
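    # Each knowledge source is attended over independently. With
    # mem_method='raw' each memory layer returns a per-token vector of its
    # concept embedding dimension, so after concatenation with enc_out the
    # feature size is hidden_size + wn_concept_dim + nell_concept_dim
    # (memory_output_size below).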
wn_memory_layer = MemoryLayer(bert_config, max_wn_concept_length, wn_concept_dim, mem_method='raw', prefix='wn')
wn_memory_output = wn_memory_layer.forward(enc_out, wn_memory_embs, wn_mem_length, ignore_no_memory_token=True)
nell_memory_layer = MemoryLayer(bert_config, max_nell_concept_length, nell_concept_dim, mem_method='raw', prefix='nell')
nell_memory_output = nell_memory_layer.forward(enc_out, nell_memory_embs, nell_mem_length, ignore_no_memory_token=True)
memory_output = fluid.layers.concat([enc_out, wn_memory_output, nell_memory_output], axis=2)
'''3rd layer: Self-Matching Layer'''
# calculate input dim for self-matching layer
memory_output_size = bert_config['hidden_size'] + wn_concept_dim + nell_concept_dim
logger.info("memory_output_size: {}".format(memory_output_size))
# do matching
self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
memory_output_size, dropout_rate=0.0,
cat_mul=True, cat_sub=True, cat_twotime=True,
cat_twotime_mul=False, cat_twotime_sub=True) # [bs, sq, concat_hs]
att_output = self_att_layer.forward(memory_output, input_mask) # [bs, sq, concat_hs]
'''4th layer: Output Layer'''
logits = fluid.layers.fc(
input=att_output,
size=2,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_squad_out_w",
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=bert_config['initializer_range'])),
bias_attr=fluid.ParamAttr(
name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
batch_ones = fluid.layers.fill_constant_batch_size_like(
input=start_logits, dtype='int64', shape=[1], value=1)
num_seqs = fluid.layers.reduce_sum(input=batch_ones)
if is_training:
def compute_loss(logits, positions):
loss = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=positions)
loss = fluid.layers.mean(x=loss)
return loss
start_loss = compute_loss(start_logits, start_positions)
end_loss = compute_loss(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2.0
if args.use_fp16 and args.loss_scaling > 1.0:
total_loss = total_loss * args.loss_scaling
return pyreader, total_loss, num_seqs
else:
return pyreader, unique_id, start_logits, end_logits, num_seqs
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def predict(test_exe, test_program, test_pyreader, fetch_list, processor, eval_concept_settings, eval_output_name='eval_result.json'):
if not os.path.exists(args.checkpoints):
os.makedirs(args.checkpoints)
output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
output_evaluation_result_file = os.path.join(args.checkpoints, eval_output_name)
test_pyreader.start()
all_results = []
time_begin = time.time()
while True:
try:
np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run(
fetch_list=fetch_list, program=test_program)
for idx in range(np_unique_ids.shape[0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % len(all_results))
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
features = processor.get_features(
processor.predict_examples, is_training=False, **eval_concept_settings)
eval_result = write_predictions(processor.predict_examples, features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
args.version_2_with_negative,
args.null_score_diff_threshold, args.verbose, args.predict_file, output_evaluation_result_file)
return eval_result
def read_concept_embedding(embedding_path):
fin = open(embedding_path, encoding='utf-8')
info = [line.strip() for line in fin]
dim = len(info[0].split(' ')[1:])
n_concept = len(info)
embedding_mat = []
id2concept, concept2id = [], {}
# add padding concept into vocab
id2concept.append('<pad_concept>')
concept2id['<pad_concept>'] = 0
embedding_mat.append([0.0 for _ in range(dim)])
for line in info:
concept_name = line.split(' ')[0]
embedding = [float(value_str) for value_str in line.split(' ')[1:]]
assert len(embedding) == dim and not np.any(np.isnan(embedding))
embedding_mat.append(embedding)
concept2id[concept_name] = len(id2concept)
id2concept.append(concept_name)
embedding_mat = np.array(embedding_mat, dtype=np.float32)
return id2concept, concept2id, embedding_mat
def train(args):
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
    if not (args.do_train or args.do_predict or args.do_val):
        raise ValueError("At least one of the args `do_train`, `do_val` and "
                         "`do_predict` must be True.")
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
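    # Load the two concept vocabularies separately: WordNet synset vectors and
    # NELL concept vectors, each prefixed with a zero <pad_concept> row at index 0.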
wn_id2concept, wn_concept2id, wn_concept_embedding_mat = read_concept_embedding(
args.wn_concept_embedding_path)
nell_id2concept, nell_concept2id, nell_concept_embedding_mat = read_concept_embedding(
args.nell_concept_embedding_path)
processor = DataProcessor(
vocab_path=args.vocab_path,
do_lower_case=args.do_lower_case,
max_seq_length=args.max_seq_len,
in_tokens=args.in_tokens,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length)
startup_prog = fluid.Program()
if args.random_seed is not None:
startup_prog.random_seed = args.random_seed
random.seed(args.random_seed)
np.random.seed(args.random_seed)
if args.do_train:
train_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_squad/tokens/train.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'wn_concept2id': wn_concept2id,
'nell_concept2id': nell_concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.train_retrieved_nell_concept_path,
}
train_data_generator = processor.data_generator(
data_path=args.train_file,
batch_size=args.batch_size,
phase='train',
shuffle=True,
dev_count=dev_count,
version_2_with_negative=args.version_2_with_negative,
epoch=args.epoch,
**train_concept_settings)
num_train_examples = processor.get_num_examples(phase='train')
if args.in_tokens:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size // args.max_seq_len) // dev_count
else:
max_train_steps = args.epoch * num_train_examples // (
args.batch_size) // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
logger.info("Device count: %d" % dev_count)
logger.info("Num train examples: %d" % num_train_examples)
logger.info("Max train steps: %d" % max_train_steps)
logger.info("Num warmup steps: %d" % warmup_steps)
train_program = fluid.Program()
# if args.random_seed is not None:
# train_program.random_seed = args.random_seed
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, loss, num_seqs = create_model(
pyreader_name='train_reader',
bert_config=bert_config,
max_wn_concept_length=processor.train_wn_max_concept_length,
max_nell_concept_length=processor.train_nell_max_concept_length,
wn_concept_embedding_mat=wn_concept_embedding_mat,
nell_concept_embedding_mat=nell_concept_embedding_mat,
is_training=True,
freeze=args.freeze)
scheduled_lr = optimization(
loss=loss,
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
train_program=train_program,
startup_prog=startup_prog,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
use_fp16=args.use_fp16,
loss_scaling=args.loss_scaling)
if args.use_ema:
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
ema.update()
fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
if args.verbose:
if args.in_tokens:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program,
batch_size=args.batch_size // args.max_seq_len)
else:
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=train_program, batch_size=args.batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit))
if args.do_predict or args.do_val:
eval_concept_settings = {
'tokenization_path': '../retrieve_concepts/tokenization_squad/tokens/dev.tokenization.{}.data'.format('uncased' if args.do_lower_case else 'cased'),
'wn_concept2id': wn_concept2id,
'nell_concept2id': nell_concept2id,
'use_wordnet': args.use_wordnet,
'retrieved_synset_path': args.retrieved_synset_path,
'use_nell': args.use_nell,
'retrieved_nell_concept_path': args.dev_retrieved_nell_concept_path,
}
eval_data_generator = processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
test_prog = fluid.Program()
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
pyreader_name='test_reader',
bert_config=bert_config,
max_wn_concept_length=processor.predict_wn_max_concept_length,
max_nell_concept_length=processor.predict_nell_max_concept_length,
wn_concept_embedding_mat=wn_concept_embedding_mat,
nell_concept_embedding_mat=nell_concept_embedding_mat,
is_training=False)
if args.use_ema and 'ema' not in dir():
ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
start_logits.name, end_logits.name, num_seqs.name])
test_prog = test_prog.clone(for_test=True)
# if args.random_seed is not None:
# test_prog.random_seed = args.random_seed
exe.run(startup_prog)
if args.do_train:
logger.info('load pretrained concept embedding')
fluid.global_scope().find_var('wn_concept_emb_mat').get_tensor().set(wn_concept_embedding_mat, place)
fluid.global_scope().find_var('nell_concept_emb_mat').get_tensor().set(nell_concept_embedding_mat, place)
if args.init_checkpoint and args.init_pretraining_params:
logger.info(
"WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
"both are set! Only arg 'init_checkpoint' is made valid.")
if args.init_checkpoint:
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.init_pretraining_params:
init_pretraining_params(
exe,
args.init_pretraining_params,
main_program=startup_prog,
use_fp16=args.use_fp16)
elif args.do_predict or args.do_val:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
init_checkpoint(
exe,
args.init_checkpoint,
main_program=startup_prog,
use_fp16=args.use_fp16)
if args.do_train:
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = args.use_fast_executor
exec_strategy.num_threads = dev_count
exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
train_exe = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
main_program=train_program)
train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.start()
steps = 0
total_cost, total_num_seqs = [], []
time_begin = time.time()
while steps < max_train_steps:
try:
steps += 1
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
fetch_list = [loss.name, num_seqs.name]
else:
fetch_list = [
loss.name, scheduled_lr.name, num_seqs.name
]
else:
fetch_list = []
outputs = train_exe.run(fetch_list=fetch_list)
if steps % args.skip_steps == 0:
if warmup_steps <= 0:
np_loss, np_num_seqs = outputs
else:
np_loss, np_lr, np_num_seqs = outputs
total_cost.extend(np_loss * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
)
verbose += "learning rate: %f" % (
np_lr[0]
if warmup_steps > 0 else args.learning_rate)
logger.info(verbose)
time_end = time.time()
used_time = time_end - time_begin
current_example, epoch = processor.get_train_progress()
logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"speed: %f steps/s" %
(epoch, current_example, num_train_examples, steps,
np.sum(total_cost) / np.sum(total_num_seqs),
args.skip_steps / used_time))
total_cost, total_num_seqs = [], []
time_begin = time.time()
if steps % args.save_steps == 0 or steps == max_train_steps:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
if steps % args.validation_steps == 0 or steps == max_train_steps:
if args.do_val:
test_pyreader.decorate_tensor_provider(
processor.data_generator(
data_path=args.predict_file,
batch_size=args.batch_size,
phase='predict',
shuffle=False,
dev_count=1,
epoch=1,
**eval_concept_settings)
)
val_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings, 'validate_result_step_{}.json'.format(steps))
logger.info("Validation performance after step {}:\n* Exact_match: {}\n* F1: {}".format(steps, val_performance['exact_match'], val_performance['f1']))
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps) + "_final")
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.do_predict:
test_pyreader.decorate_tensor_provider(eval_data_generator)
if args.use_ema:
with ema.apply(exe):
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
else:
eval_performance = predict(exe, test_prog, test_pyreader, [
unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
], processor, eval_concept_settings)
logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(eval_performance['exact_match'], eval_performance['f1']))
if __name__ == '__main__':
print_arguments(args)
train(args)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = open(vocab_file)
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
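# A minimal usage sketch (the vocab path below is only a placeholder for
# illustration, not a file shipped in this folder): build a FullTokenizer from
# a BERT vocab and run it end to end.
#
#   tokenizer = FullTokenizer(vocab_file="cased_L-24_H-1024_A-16/vocab.txt",
#                             do_lower_case=False)
#   tokens = tokenizer.tokenize("KT-NET reads SQuAD passages.")
#   token_ids = tokenizer.convert_tokens_to_ids(tokens)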
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import argparse
import logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def str2bool(v):
    # argparse cannot parse "True"/"False" strings as Python booleans directly,
    # so treat the strings "true", "t" and "1" (case-insensitively) as True.
return v.lower() in ("true", "t", "1")
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
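        """Register --<name> on this argument group; bool arguments are parsed
        via str2bool and the default value is appended to the help text."""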
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def print_arguments(args):
logger.info('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
logger.info('%s: %s' % (arg, value))
logger.info('------------------------------------------------')
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def cast_fp16_to_fp32(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP16,
"out_dtype": fluid.core.VarDesc.VarType.FP32
})
def cast_fp32_to_fp16(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP32,
"out_dtype": fluid.core.VarDesc.VarType.FP16
})
def copy_to_master_param(p, block):
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
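# Keep an FP32 "master" copy of each FP16 parameter so the optimizer update is
# computed in full precision; gradients are cast back to FP32 and, if needed,
# divided by loss_scaling. LayerNorm parameters are left as-is and only have
# their gradients unscaled.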
master_params_grads = []
tmp_role = main_prog._current_role
OpRole = fluid.core.op_proto_and_checker_maker.OpRole
main_prog._current_role = OpRole.Backward
for p, g in params_grads:
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
# cast fp16 gradients to fp32 before apply gradients
if g.name.find("layer_norm") > -1:
if loss_scaling > 1:
scaled_g = g / float(loss_scaling)
else:
scaled_g = g
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
if loss_scaling > 1:
master_grad = master_grad / float(loss_scaling)
master_params_grads.append([master_param, master_grad])
main_prog._current_role = tmp_role
return master_params_grads
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import ast
import copy
import logging
import numpy as np
import paddle.fluid as fluid
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def cast_fp32_to_fp16(exe, main_program):
logger.info("Cast parameters to float16 data format.")
for param in main_program.global_block().all_parameters():
if not param.name.endswith(".master"):
param_t = fluid.global_scope().find_var(param.name).get_tensor()
data = np.array(param_t)
if param.name.find("layer_norm") == -1:
param_t.set(np.float16(data).view(np.uint16), exe.place)
master_param_var = fluid.global_scope().find_var(param.name +
".master")
if master_param_var is not None:
master_param_var.get_tensor().set(data, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
assert os.path.exists(
init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path
def existed_persistables(var):
if not fluid.io.is_persistable(var):
return False
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(
exe,
init_checkpoint_path,
main_program=main_program,
predicate=existed_persistables)
logger.info("Load model from {}".format(init_checkpoint_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
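# Only variables that are model Parameters and that exist on disk are loaded,
# so task-specific layers absent from the BERT checkpoint keep their fresh
# initialization.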
assert os.path.exists(pretraining_params_path
), "[%s] cannot be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
logger.info("Load pretraining parameters from {}.".format(
pretraining_params_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
# KT-NET
## Introduction
KT-NET (Knowledge and Text fusion NET) is a machine reading comprehension (MRC) model that integrates knowledge from knowledge bases (KBs) into pre-trained contextualized representations. The model was proposed in the ACL 2019 paper [Enhancing Pre-Trained Language Representations with Rich Knowledge for Machine Reading Comprehension](https://www.aclweb.org/anthology/P19-1226). The overall architecture of the model is shown below:
<p align="center">
<img src="images/architecture.png" width = "340" height = "300" /> <br />
Overall Architecture of KT-NET
</p>
This repository contains the PaddlePaddle implementation of KT-NET. The trained checkpoints are also provided for reproducing the results in the paper.
## How to Run
### Environment
This project should work as expected once the following requirements are satisfied:
+ python >= 3.7
+ paddlepaddle-gpu (the latest develop version is recommended)
+ NLTK >= 3.3 (with WordNet 3.0)
+ tqdm
+ CoreNLP (3.8.0 version is recommended)
+ pycorenlp
+ CUDA, CuDNN and NCCL (CUDA 9.0, CuDNN v7 and NCCL 2.3.7 are recommended)
All of the experiments in the paper are performed on 4 P40 GPUs.
### Download the MRC datasets
In this work, we empirically evaluate our model on two benchmarks:
#### 1. ReCoRD
[ReCoRD](https://sheng-z.github.io/ReCoRD-explorer/) (Reading Comprehension with Commonsense Reasoning Dataset) is a large-scale MRC dataset requiring commonsense reasoning. The official dataset in JSON format can be downloaded from Google Drive (training set: [link](https://drive.google.com/file/d/1PoHmphyH79pETNws8kU2OwuerU7SWLHj/view), dev set: [link](https://drive.google.com/file/d/1WNaxBpXEGgPbymTzyN249P4ub-uU5dkO/view)). *(For convenience, the MD5 checksum of every downloadable file referenced in this README is provided in `downloaded_files.md5`; it is recommended to verify the integrity of each download against it.)* Please place the downloaded files `train.json` and `dev.json` into the `data/ReCoRD/` directory of this repository. We will also use the official ReCoRD evaluation script, so please run the following commands:
```
curl -o record_official_evaluate.py https://sheng-z.github.io/ReCoRD-explorer/evaluation.py
mv record_official_evaluate.py reading_comprehension/src/eval/
```
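If you want to verify the checksums against `downloaded_files.md5` programmatically, the following is a minimal Python sketch (it assumes all listed files sit in the current working directory):
```
import hashlib

def md5_of(path, chunk_size=1 << 20):
    # stream the file so large archives do not have to fit in memory
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

with open('downloaded_files.md5') as f:
    for line in f:
        expected, name = line.split()
        print(name, 'OK' if md5_of(name) == expected else 'MISMATCH')
```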
#### 2. SQuAD v1.1
[SQuAD v1.1](https://rajpurkar.github.io/SQuAD-explorer/) is a well-known extractive MRC dataset that consists of questions posed by crowdworkers on Wikipedia articles. Please run the following commands to download the official dataset and the evaluation script.
```
curl -O https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
curl -O https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
mv train-v1.1.json dev-v1.1.json data/SQuAD/
curl -o squad_v1_official_evaluate.py https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/
mv squad_v1_official_evaluate.py reading_comprehension/src/eval/
```
### Retrieve KB entries
Relevant knowledge should be retrieved and encoded before training the model. In this project, we leveraged two KBs: [WordNet](https://wordnet.princeton.edu/) and [NELL](http://rtw.ml.cmu.edu/rtw/). WordNet records lexical relations between words and NELL stores beliefs about entities. The following procedure describes how we retrieve relevant WordNet synsets and NELL concepts for MRC samples.
#### 1. Named entity recognition (only for SQuAD)
To retrieve NELL concepts about entities, the named entity mentions in the MRC samples must be annotated. For ReCoRD, entity mentions are already provided in the dataset. For SQuAD, named entity recognition (NER) needs to be performed before retrieval. We use [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/index.html) in this step. After CoreNLP is [downloaded](http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip) and unzipped, run the following command from the CoreNLP directory to start the CoreNLP server:
```
java -mx10g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9753 -timeout 20000
```
Then run the command:
```
cd retrieve_concepts/ner_tagging_squad
python3 tagging.py
```
The tagged datasets will be saved to the `retrieve_concepts/ner_tagging_squad/output` directory. We have provided our output files for convenience ([download link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_squad_tagging_output.tar.gz)).
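For reference, `tagging.py` talks to the server through `pycorenlp`; the following is a minimal sketch of the call it makes (the sample sentence is just an illustration, and the port must match the one the server was started with):
```
import urllib.parse
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9753')
text = "Barack Obama was born in Hawaii."
# the text is URL-quoted before annotation, as in tagging.py
result = nlp.annotate(urllib.parse.quote(text),
                      properties={'annotators': 'ner', 'outputFormat': 'json'})
for sent in result['sentences']:
    for token in sent['tokens']:
        print(token['word'], token['ner'], token['characterOffsetBegin'])
```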
#### 2. Tokenization
Tokenization must be performed before retrieval. We use the same tokenizer as [BERT](https://github.com/google-research/bert).
For ReCoRD, run the following command to tokenize the raw dataset (or directly download our output from [link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_tokenize_result_record.tar.gz)):
```
cd retrieve_concepts/tokenization_record
python3 do_tokenization.py
```
For SQuAD, run the following command to process the NER tagged dataset (or directly download our output from [link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_tokenize_result_squad.tar.gz)):
```
cd retrieve_concepts/tokenization_squad
python3 do_tokenization.py
```
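Both `do_tokenization.py` scripts rely on the BERT-style `FullTokenizer` bundled in this repository (basic tokenization followed by WordPiece). Below is a minimal sketch of the interface, assuming the vocabulary file used by the scripts (e.g. `vocab.cased.txt`) is in the working directory:
```
import tokenization  # the tokenizer module shipped with the preprocessing scripts

tokenizer = tokenization.FullTokenizer(vocab_file='vocab.cased.txt', do_lower_case=False)
tokens = tokenizer.basic_tokenizer.tokenize("KT-NET enhances BERT with knowledge.")
subtokens = [st for t in tokens for st in tokenizer.wordpiece_tokenizer.tokenize(t)]
print(tokens)
print(subtokens)
print(tokenizer.convert_tokens_to_ids(subtokens))
```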
#### 3. Retrieve WordNet
This step retrieves the WordNet (WN18) synsets for each non-stopword in the MRC samples.
For ReCoRD, run the command:
```
cd retrieve_concepts/retrieve_wordnet
python3 retrieve.py --train_token ../tokenization_record/tokens/train.tokenization.uncased.data --eval_token ../tokenization_record/tokens/dev.tokenization.uncased.data --output_dir output_record/ --no_stopwords
```
For SQuAD, run the command:
```
cd retrieve_concepts/retrieve_wordnet
python3 retrieve.py --train_token ../tokenization_squad/tokens/train.tokenization.uncased.data --eval_token ../tokenization_squad/tokens/dev.tokenization.uncased.data --output_dir output_squad/ --no_stopwords
```
The outputs are pickled into binary files. We have also provided our output files for convenience ([download link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_wordnet_concepts.tar.gz)).
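Internally, `retrieve_wordnet/retrieve.py` maps each remaining token to WN18 synset names through NLTK and the offset-to-name mapping file `wordnet-mlj12-definitions.txt` used by the script. A simplified sketch of the per-token lookup, assuming the NLTK WordNet corpus has been downloaded:
```
from nltk.corpus import wordnet as wn

# offset -> WN18 synset name, as loaded by retrieve.py
offset_to_wn18name = {}
with open('wordnet-mlj12-definitions.txt') as fin:
    for line in fin:
        offset_str, synset_name = line.strip().split('\t')[:2]
        offset_to_wn18name[offset_str] = synset_name

def wn18_synsets(token):
    # keep only synsets whose 8-digit offset appears in the WN18 subset
    names = []
    for synset in wn.synsets(token):
        offset_str = str(synset.offset()).zfill(8)
        if offset_str in offset_to_wn18name:
            names.append(offset_to_wn18name[offset_str])
    return names

print(wn18_synsets('coach'))
```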
#### 4. Retrieve NELL
Using string matching, this step finds the corresponding NELL entities for each entity mention in a given MRC example and returns their categories as the relevant NELL concepts. The latest NELL beliefs should be downloaded first.
```
wget http://rtw.ml.cmu.edu/resources/results/08m/NELL.08m.1115.esv.csv.gz
gzip -d NELL.08m.1115.esv.csv.gz
mv NELL.08m.1115.esv.csv retrieve_concepts/retrieve_nell
```
For ReCoRD, run the command:
```
cd retrieve_concepts/retrieve_nell
python3 retrieve.py --train_token ../tokenization_record/tokens/train.tokenization.uncased.data --eval_token ../tokenization_record/tokens/dev.tokenization.uncased.data --output_dir output_record/
```
For SQuAD, run the command:
```
cd retrieve_concepts/retrieve_nell
python3 retrieve.py --train_token ../tokenization_squad/tokens/train.tokenization.uncased.data --eval_token ../tokenization_squad/tokens/dev.tokenization.uncased.data --output_dir output_squad/
```
The outputs are pickled into binary files. The output files can also be downloaded from [download link](https://baidu-nlp.bj.bcebos.com/KTNET_preprocess_nell_concepts.tar.gz).
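For reference, `retrieve_nell/retrieve.py` keeps only high-confidence `generalizations` rows from the NELL dump when building its entity-to-category dictionary (the default score threshold is 0.9). A simplified sketch of that filtering step, with the column layout as assumed by the script:
```
nell_ent_to_cpt = {}
with open('NELL.08m.1115.esv.csv') as fin:
    next(fin)  # skip the header line
    for line in fin:
        items = line.rstrip('\n').split('\t')
        # items[0]: entity, items[1]: relation, items[2]: value, items[4]: confidence
        if items[1] == 'generalizations' and float(items[4]) >= 0.9:
            nell_ent_to_cpt.setdefault(items[0], set()).add(items[2])
print(len(nell_ent_to_cpt), 'entities with at least one category')
```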
#### 5. Prepare KB embedding
Following [Yang et al., 2015](https://arxiv.org/pdf/1412.6575.pdf), we leverage their KB embeddings for WordNet synsets and NELL categories, trained with the BILINEAR model.
```
curl -O https://raw.githubusercontent.com/bishanyang/kblstm/master/embeddings/wn_concept2vec.txt
curl -O https://raw.githubusercontent.com/bishanyang/kblstm/master/embeddings/nell_concept2vec.txt
mv wn_concept2vec.txt nell_concept2vec.txt retrieve_concepts/KB_embeddings
```
The 100-dimensional embeddings are stored in the following format:
```
concept:coach -0.123886 0.0477016 0.517474 0.154645 0.32559 ...
```
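For illustration, these files can be loaded into an in-memory lookup table with a few lines of Python (a minimal sketch using NumPy; the concept name below is just the example from above):
```
import numpy as np

def load_concept_embeddings(path):
    # each line: concept name followed by 100 space-separated floats
    embeddings = {}
    with open(path) as fin:
        for line in fin:
            parts = line.rstrip().split()
            if len(parts) < 2:
                continue
            embeddings[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return embeddings

nell_emb = load_concept_embeddings('retrieve_concepts/KB_embeddings/nell_concept2vec.txt')
print(len(nell_emb), nell_emb.get('concept:coach', np.zeros(100))[:5])
```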
For other knowledge bases, please refer to the source code for training the BILINEAR model from [Yang's github repo](https://github.com/bishanyang/kblstm/tree/master/code/models).
### Training KT-NET
#### Prepare BERT checkpoint
The text encoder module of KT-NET is initialized with pretrained BERT-large (cased) parameters. Run the command:
```
cd reading_comprehension
wget https://bert-models.bj.bcebos.com/cased_L-24_H-1024_A-16.tar.gz --no-check-certificate
tar xvf cased_L-24_H-1024_A-16.tar.gz
```
#### Direct fine-tuning
We provide scripts for training and inference of KT-NET. To train a model on the ReCoRD dataset with both WordNet and NELL concepts, simply run:
```
cd reading_comprehension && sh ./run_record_twomemory.sh
```
The hyper-parameters, such as training epochs, learning rate and batch size, can be adjusted in the script. After training and evaluation, the following files and directories will be created:
+ `output/eval_result.json`: the performance of the trained model on the benchmark
+ `output/predictions.json`: the predicted answers for the development set
+ `output/nbest_predictions.json`: n-best predicted answers for the development set
+ `output/step_XXXX`: the directory of model checkpoint
+ `log/train.log`: the logging file
To run with a single KB, replace `run_record_twomemory.sh` with `run_record_wordnet.sh` or `run_record_nell.sh`.
Similarly, for SQuAD, use `run_squad_twomemory.sh`, `run_squad_wordnet.sh` or `run_squad_nell.sh`.
#### Two-staged fine-tuning (Recommended)
In our experiments, we found that a "two-staged" training strategy, which freezes the BERT parameters in the first stage and unfreezes them in the second, yields better model performance. We recommend adopting this strategy to train KT-NET. To run two-staged fine-tuning, first execute the `XXX_pretrain.sh` script and then run `XXX_finetune.sh`. E.g., to train KT-NET on ReCoRD with both KBs, first run
```
cd reading_comprehension && sh ./run_record_twomemory_pretrain.sh
```
and then, once the first stage has finished, run
```
sh ./run_record_twomemory_finetune.sh
```
The resulting `output/` and `log/` directories have the same structure as in direct fine-tuning.
In the first stage, we trained for 10 epochs on ReCoRD and for 1 epoch on SQuAD. For the second stage, we recommend fine-tuning for 2-4 epochs on ReCoRD and 2-3 epochs on SQuAD.
#### Reproduce the paper results
We have released the following checkpoints of trained KT-NET models, which reproduce the performance reported in the paper:
| ReCoRD Model | F1 score | Exact Match | Inference Script |
| :------------- | :---------: | :----------: | :--------- |
| [KT-NET (WordNet)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_record_wordnet.tar.gz) | 72.76 | 70.56 | eval_record_wordnet.sh |
| [KT-NET (NELL)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_record_nell.tar.gz) | 72.52 | 70.54 | eval_record_nell.sh |
| [KT-NET (Both)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_record_both.tar.gz) | 73.62 | 71.60 | eval_record_twomemory.sh |

| SQuAD Model | F1 score | Exact Match | Inference Script |
| :------------- | :---------: | :----------: | :--------- |
| [KT-NET (WordNet)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_squad_wordnet.tar.gz) | 91.70 | 85.16 | eval_squad_wordnet.sh |
| [KT-NET (NELL)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_squad_nell.tar.gz) | 91.70 | 85.02 | eval_squad_nell.sh |
| [KT-NET (Both)](https://baidu-nlp.bj.bcebos.com/KTNET_fine-tuned-model_squad_both.tar.gz) | 91.65 | 84.97 | eval_squad_twomemory.sh |
After downloading and extracting the checkpoint file, please execute the corresponding inference script. E.g.:
```
cd reading_comprehension && sh ./eval_record_twomemory.sh extracted_ckpt_dir_path
```
The following result is expected to be created in the `output/` directory:
```
{
"exact_match": 71.61,
"f1": 73.62396522806482
}
```
## Citation
If you use any source code included in this project in your work, please cite the following paper:
```
@inproceedings{yang-etal-2019-enhancing-pre,
title = {Enhancing Pre-Trained Language Representations with Rich Knowledge for Machine Reading Comprehension},
author = {Yang, An and Wang, Quan and Liu, Jing and Liu, Kai and Lyu, Yajuan and Wu, Hua and She, Qiaoqiao and Li, Sujian},
booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
year = {2019},
publisher = {Association for Computational Linguistics},
pages = {2346--2357},
}
```
## Copyright and License
Copyright 2019 Baidu.com, Inc. All Rights Reserved
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This script performs NER tagging on the raw SQuAD datasets.
# All named entities found in the questions and contexts are recorded, together with their offsets, in the output file.
# CoreNLP is used for NER tagging.
import os
import json
import argparse
import logging
import urllib
import sys
from tqdm import tqdm
from pycorenlp import StanfordCoreNLP
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", default='output', type=str,
help="The output directory to store tagging results.")
parser.add_argument("--train_file", default='../../data/SQuAD/train-v1.1.json', type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default='../../data/SQuAD/dev-v1.1.json', type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
return parser.parse_args()
# transform the CoreNLP tagging output into an entity list
# some questions begin with whitespace that CoreNLP strips, so a begin offset has to be added
def parse_output(text, tagging_output, begin_offset=0):
entities = []
select_states = ['ORGANIZATION', 'PERSON', 'MISC', 'LOCATION']
for sent in tagging_output['sentences']:
state = 'O'
start_pos, end_pos = -1, -1
for token in sent['tokens']:
tag = token['ner']
if tag == 'O' and state != 'O':
if state in select_states:
entities.append({'text': text[begin_offset + start_pos: begin_offset + end_pos], 'start': begin_offset + start_pos, 'end': begin_offset + end_pos - 1})
state = 'O'
elif tag != 'O':
if state == tag:
end_pos = token['characterOffsetEnd']
else:
if state in select_states:
entities.append({'text': text[begin_offset + start_pos: begin_offset + end_pos], 'start': begin_offset + start_pos, 'end': begin_offset + end_pos - 1})
state = tag
start_pos = token['characterOffsetBegin']
end_pos = token['characterOffsetEnd']
if state in select_states:
entities.append({'text': text[begin_offset + start_pos: begin_offset + end_pos], 'start': begin_offset + start_pos, 'end': begin_offset + end_pos - 1})
return entities
def tagging(dataset, nlp):
skip_context_cnt, skip_question_cnt = 0, 0
for article in tqdm(dataset['data']):
for paragraph in tqdm(article['paragraphs']):
context = paragraph['context']
context_tagging_output = nlp.annotate(urllib.parse.quote(context), properties={'annotators': 'ner', 'outputFormat': 'json'})
# assert the context length is not changed
if len(context.strip()) == context_tagging_output['sentences'][-1]['tokens'][-1]['characterOffsetEnd']:
context_entities = parse_output(context, context_tagging_output, len(context) - len(context.lstrip()))
else:
context_entities = []
skip_context_cnt += 1
logger.info('Skipped context due to offset mismatch:')
logger.info(context)
paragraph['context_entities'] = context_entities
for qa in tqdm(paragraph['qas']):
question = qa['question']
question_tagging_output = nlp.annotate(urllib.parse.quote(question), properties={'annotators': 'ner', 'outputFormat': 'json'})
if len(question.strip()) == question_tagging_output['sentences'][-1]['tokens'][-1]['characterOffsetEnd']:
question_entities = parse_output(question, question_tagging_output, len(question) - len(question.lstrip()))
else:
question_entities = []
skip_question_cnt += 1
logger.info('Skipped question due to offset mismatch:')
logger.info(question)
qa['question_entities'] = question_entities
logger.info('In total, {} contexts and {} questions are skipped...'.format(skip_context_cnt, skip_question_cnt))
if __name__ == '__main__':
args = parse_args()
# make output directory if not exist
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
# register corenlp server
nlp = StanfordCoreNLP('http://localhost:9753')
# load train and dev datasets
ftrain = open(args.train_file, 'r', encoding='utf-8')
trainset = json.load(ftrain)
fdev = open(args.predict_file, 'r', encoding='utf-8')
devset = json.load(fdev)
for dataset, path, name in zip((trainset, devset), (args.train_file, args.predict_file), ('train', 'dev')):
tagging(dataset, nlp)
output_path = os.path.join(args.output_dir, "{}.tagged.json".format(os.path.basename(path)[:-5]))
json.dump(dataset, open(output_path, 'w', encoding='utf-8'))
logger.info('Finished tagging {} set'.format(name))
concept:coach
concept:musicfestival
concept:book
concept:professor
concept:dateliteral
concept:mountainrange
concept:wine
concept:flooritem
concept:clothing
concept:mlalgorithm
concept:drug
concept:musicgenre
concept:parlourgame
concept:website
concept:eventoutcome
concept:planet
concept:mammal
concept:organization
concept:female
concept:vehicle
concept:event
concept:legume
concept:weatherphenomenon
concept:perceptionevent
concept:emotion
concept:bombingevent
concept:highway
concept:creativework
concept:comedian
concept:gamescore
concept:software
concept:personcanada
concept:musicalbum
concept:beach
concept:geopoliticalorganization
concept:product
concept:street
concept:astronaut
concept:virus
concept:criminal
concept:trail
concept:roadaccidentevent
concept:physicalaction
concept:archaea
concept:personafrica
concept:personasia
concept:medicalprocedure
concept:monument
concept:tool
concept:politician
concept:conference
concept:insect
concept:restaurant
concept:sportsequipment
concept:politicsblog
concept:physicalcharacteristic
concept:bakedgood
concept:sociopolitical
concept:meetingeventtype
concept:blog
concept:mediacompany
concept:bridge
concept:male
concept:researchproject
concept:traditionalgame
concept:recipe
concept:crustacean
concept:militaryeventtype
concept:color
concept:race
concept:religion
concept:furniture
concept:building
concept:geopoliticallocation
concept:personsouthamerica
concept:beverage
concept:nondiseasecondition
concept:school
concept:politicalparty
concept:politicsbill
concept:zoo
concept:artery
concept:recordlabel
concept:cave
concept:visualartmovement
concept:musicartist
concept:olympics
concept:visualizableattribute
concept:sportsteamposition
concept:boardgame
concept:person
concept:actor
concept:perceptionaction
concept:dayofweek
concept:householditem
concept:fungus
concept:bird
concept:fruit
concept:amphibian
concept:victim
concept:musicsong
concept:newspaper
concept:farm
concept:tradeunion
concept:bone
concept:month
concept:personaustralia
concept:movie
concept:convention
concept:nonneginteger
concept:nerve
concept:highschool
concept:time
concept:lake
concept:placeofworship
concept:mlmetric
concept:visualartform
concept:grandprix
concept:agriculturalproduct
concept:bedroomitem
concept:chemical
concept:muscle
concept:sportsgame
concept:physiologicalcondition
concept:radiostation
concept:televisionstation
concept:personus
concept:coffeedrink
concept:airport
concept:invertebrate
concept:bathroomitem
concept:physicsterm
concept:company
concept:meetingeventtitle
concept:earthquakeevent
concept:judge
concept:skiarea
concept:personeurope
concept:politicsissue
concept:nongovorganization
concept:mlconference
concept:politicaloffice
concept:url
concept:visualartist
concept:hotel
concept:caf_
concept:bacteria
concept:kitchenitem
concept:militaryconflict
concept:protestevent
concept:sportsteam
concept:politicianus
concept:mlauthor
concept:retailstore
concept:architect
concept:location
concept:shoppingmall
concept:sportsevent
concept:politicsgroup
concept:buildingmaterial
concept:televisionshow
concept:consumerelectronicitem
concept:petroleumrefiningcompany
concept:room
concept:academicfield
concept:reptile
concept:wallitem
concept:buildingfeature
concept:programminglanguage
concept:mollusk
concept:monarch
concept:bank
concept:creditunion
concept:park
concept:island
concept:governmentorganization
concept:celltype
concept:game
concept:videogamesystem
concept:automobileengine
concept:biotechcompany
concept:nonprofitorganization
concept:geometricshape
concept:museum
concept:port
concept:cardgame
concept:landscapefeatures
concept:televisionnetwork
concept:musicinstrument
concept:ethnicgroup
concept:language
concept:grain
concept:mlarea
concept:director
concept:weapon
concept:cognitiveactions
concept:mlsoftware
concept:species
concept:fish
concept:athlete
concept:ceo
concept:publication
concept:vertebrate
concept:sportsleague
concept:mediatype
concept:filmfestival
concept:university
concept:stadiumoreventvenue
concept:zipcode
concept:writer
concept:continent
concept:oilgasfield
concept:videogame
concept:country
concept:river
concept:personnorthamerica
concept:currency
concept:nut
concept:hallwayitem
concept:professionalorganization
concept:skyscraper
concept:lymphnode
concept:meat
concept:scientist
concept:tableitem
concept:winery
concept:disease
concept:magazine
concept:condiment
concept:economicsector
concept:visualizablescene
concept:mldataset
concept:mountain
concept:braintissue
concept:chef
concept:vegetable
concept:model
concept:protein
concept:city
concept:personbylocation
concept:arachnid
concept:date
concept:scientificterm
concept:officeitem
concept:automobilemodel
concept:musician
concept:election
concept:automobilemaker
concept:sport
concept:food
concept:attraction
concept:candy
concept:profession
concept:county
concept:celebrity
concept:crimeorcharge
concept:vein
concept:aquarium
concept:year
concept:plant
concept:journalist
concept:bodypart
concept:stateorprovince
concept:refineryproduct
concept:jobposition
concept:personmexico
concept:trainstation
concept:productlaunchevent
concept:awardtrophytournament
concept:officebuildingroom
concept:animal
concept:arthropod
concept:hobby
concept:charactertrait
concept:hospital
concept:transportation
concept:cheese
concept:terroristorganization
concept:personalcareitem
concept:geopoliticalentity
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This script retrieves the related NELL entities and their concepts for each named entity in ReCoRD:
# 1. transform each ReCoRD entity from a word sequence into a string (replace whitespace with _ and drop punctuation)
# 2. preprocess each NELL entity name (strip the leading 'n' when it prefixes digits, and collapse extra _)
# 3. for ReCoRD entities with more than one token, use exact match
# 4. for one-word ReCoRD entities, apply WordNet lemmatization before matching (match both the raw and the morphed form)
# 5. within a passage, if entity A is a suffix of entity B, use B's categories instead
import pickle
import logging
import string
import argparse
import os
import nltk
from collections import namedtuple
from tqdm import tqdm
from nltk.corpus import wordnet as wn
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
# remove the category part of a NELL entity name, the 'n' prefix before digits, and extra '_'
def preprocess_nell_ent_name(raw_name):
ent_name = raw_name.split(':')[-1]
digits = set(string.digits)
if ent_name.startswith('n') and all([char in digits for char in ent_name.split('_')[0][1:]]):
ent_name = ent_name[1:]
ent_name = "_".join(filter(lambda x:len(x) > 0, ent_name.split('_')))
return ent_name
puncs = set(string.punctuation)
def preprocess_record_ent_name(raw_token_seq):
return "_".join(filter(lambda x:x not in puncs, raw_token_seq))
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--train_token', type=str, default='../tokenization_record/tokens/train.tokenization.uncased.data', help='token file of train set')
parser.add_argument('--eval_token', type=str, default='../tokenization_record/tokens/dev.tokenization.uncased.data', help='token file of dev set')
parser.add_argument('--score_threshold', type=float, default=0.9, help='only keep generalizations relations with score >= threshold')
parser.add_argument('--output_dir', type=str, default='output_record/', help='output directory')
args = parser.parse_args()
# make output directory if not exist
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
# load set of concepts with pre-trained embedding
concept_set = set()
with open('nell_concept_list.txt') as fin:
for line in fin:
concept_name = line.strip()
concept_set.add(concept_name)
# read nell csv file and build NELL entity to category dict
logger.info('Begin to read NELL csv...')
fin = open('NELL.08m.1115.esv.csv')
nell_ent_to_cpt = {}
nell_ent_to_fullname = {}
header = True
for line in fin:
if header:
header = False
continue
line = line.strip()
items = line.split('\t')
if items[1] == 'generalizations' and float(items[4]) >= args.score_threshold:
nell_ent_name = preprocess_nell_ent_name(items[0])
category = items[2]
if nell_ent_name not in nell_ent_to_cpt:
nell_ent_to_cpt[nell_ent_name] = set()
nell_ent_to_fullname[nell_ent_name] = set()
nell_ent_to_cpt[nell_ent_name].add(category)
nell_ent_to_fullname[nell_ent_name].add(items[0])
logger.info('Finished reading NELL csv.')
# load record dataset
logger.info('Begin to load tokenization results...')
train_samples = pickle.load(open(args.train_token, 'rb'))
dev_samples = pickle.load(open(args.eval_token, 'rb'))
logger.info('Finished loading tokenization results.')
# build record entity set
record_ent_set = set()
for sample in train_samples + dev_samples:
query_tokens = sample['query_tokens']
document_tokens = sample['document_tokens']
for entity_info in sample['document_entities']:
entity_token_seq = document_tokens[entity_info[1]: entity_info[2] + 1]
record_ent_set.add(preprocess_record_ent_name(entity_token_seq))
for entity_info in sample['query_entities']:
entity_token_seq = query_tokens[entity_info[1]: entity_info[2] + 1]
record_ent_set.add(preprocess_record_ent_name(entity_token_seq))
logger.info('Finished making tokenization results into entity set.')
# do mapping
record_ent_to_cpt = {}
record_ent_to_nell_ent = {}
for record_ent in tqdm(record_ent_set):
cpt, nell_ent = set(), set()
if record_ent in nell_ent_to_cpt:
cpt.update(nell_ent_to_cpt[record_ent])
nell_ent.update(nell_ent_to_fullname[record_ent])
# length is 1, do morphy
if '_' not in record_ent:
for pos_tag in ['n', 'v', 'a', 'r']:
morph = wn.morphy(record_ent, pos_tag)
if morph is not None and morph in nell_ent_to_cpt:
cpt.update(nell_ent_to_cpt[morph])
nell_ent.update(nell_ent_to_fullname[morph])
record_ent_to_cpt[record_ent] = cpt
record_ent_to_nell_ent[record_ent] = nell_ent
logger.info('Finished matching record entities to nell entities.')
# map the record entity in the set back to passage
logger.info('Begin to generate output file...')
_TempRectuple = namedtuple('entity_record', [
'entity_string', 'start', 'end', 'retrieved_concepts', 'retrieved_entities'])
for outfn, samples in zip(('{}.retrieved_nell_concepts.data'.format(prefix) for prefix in ('train', 'dev')), (train_samples, dev_samples)):
all_outputs = []
for sample in tqdm(samples):
doc_entities = []
document_tokens = sample['document_tokens']
for entity_info in sample['document_entities']:
entity_token_seq = document_tokens[entity_info[1]: entity_info[2] + 1]
entity_whitespace_str = " ".join(entity_token_seq)
entity_retrieve_str = preprocess_record_ent_name(
entity_token_seq)
doc_entities.append(_TempRectuple(
entity_whitespace_str, entity_info[1], entity_info[2], record_ent_to_cpt[entity_retrieve_str], record_ent_to_nell_ent[entity_retrieve_str]))
query_entities = []
query_tokens = sample['query_tokens']
for entity_info in sample['query_entities']:
entity_token_seq = query_tokens[entity_info[1]: entity_info[2] + 1]
entity_whitespace_str = " ".join(entity_token_seq)
entity_retrieve_str = preprocess_record_ent_name(
entity_token_seq)
query_entities.append(_TempRectuple(
entity_whitespace_str, entity_info[1], entity_info[2], record_ent_to_cpt[entity_retrieve_str], record_ent_to_nell_ent[entity_retrieve_str]))
# perform suffix replacement rule (eg. use the result of "Donald Trump" to replace "Trump" in the passage)
doc_entities_final, query_entities_final = [], []
for entities, entities_final in zip((doc_entities, query_entities), (doc_entities_final, query_entities_final)):
for trt in entities:
new_nell_cpt_set, new_nell_ent_set = set(), set()
for other_trt in doc_entities + query_entities:
if other_trt.entity_string != trt.entity_string and other_trt.entity_string.endswith(trt.entity_string):
new_nell_cpt_set.update(other_trt.retrieved_concepts)
new_nell_ent_set.update(other_trt.retrieved_entities)
# no need to replace
if len(new_nell_cpt_set) == 0:
new_nell_cpt_set = trt.retrieved_concepts
new_nell_ent_set = trt.retrieved_entities
new_nell_cpt_set = new_nell_cpt_set & concept_set # filter concepts with pretrained embedding
entities_final.append({
'entity_string': trt.entity_string,
'token_start': trt.start,
'token_end': trt.end,
'retrieved_concepts': list(new_nell_cpt_set),
'retrieved_entities': list(new_nell_ent_set),
})
all_outputs.append({
'id': sample['id'],
'document_entities': doc_entities_final,
'query_entities': query_entities_final,
})
pickle.dump(all_outputs, open(os.path.join(args.output_dir, outfn), 'wb'))
logger.info('Output retrieved results have been dumped.')
if __name__ == '__main__':
main()
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import pickle
import argparse
import os
import nltk
import logging
import string
from tqdm import tqdm
from nltk.corpus import wordnet as wn
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--train_token', type=str, default='../tokenization_record/tokens/train.tokenization.uncased.data', help='token file of train set')
parser.add_argument('--eval_token', type=str, default='../tokenization_record/tokens/dev.tokenization.uncased.data', help='token file of dev set')
parser.add_argument('--output_dir', type=str, default='output_record/', help='output directory')
parser.add_argument('--no_stopwords', action='store_true', help='ignore stopwords')
parser.add_argument('--ignore_length', type=int, default=0, help='ignore words with length <= ignore_length')
args = parser.parse_args()
# initialize mapping from offset id to wn18 synset name
offset_to_wn18name_dict = {}
fin = open('wordnet-mlj12-definitions.txt')
for line in fin:
info = line.strip().split('\t')
offset_str, synset_name = info[0], info[1]
offset_to_wn18name_dict[offset_str] = synset_name
logger.info('Finished loading wn18 definition file.')
# load pickled samples
logger.info('Begin to load tokenization results...')
train_samples = pickle.load(open(args.train_token, 'rb'))
dev_samples = pickle.load(open(args.eval_token, 'rb'))
logger.info('Finished loading tokenization results.')
# build token set
all_token_set = set()
for sample in train_samples + dev_samples:
for token in sample['query_tokens'] + sample['document_tokens']:
all_token_set.add(token)
logger.info('Finished making tokenization results into token set.')
# load stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))
logger.info('Finished loading stopwords list.')
# retrieve synsets
logger.info('Begin to retrieve synsets...')
token2synset = dict()
stopword_cnt = 0
punctuation_cnt = 0
for token in tqdm(all_token_set):
if token in set(string.punctuation):
logger.info('{} is punctuation, skipped!'.format(token))
punctuation_cnt += 1
continue
if args.no_stopwords and token in stopwords:
logger.info('{} is stopword, skipped!'.format(token))
stopword_cnt += 1
continue
if args.ignore_length > 0 and len(token) <= args.ignore_length:
logger.info('{} is too short, skipped!'.format(token))
continue
synsets = wn.synsets(token)
wn18synset_names = []
for synset in synsets:
offset_str = str(synset.offset()).zfill(8)
if offset_str in offset_to_wn18name_dict:
wn18synset_names.append(offset_to_wn18name_dict[offset_str])
if len(wn18synset_names) > 0:
token2synset[token] = wn18synset_names
logger.info('Finished retrieving synsets.')
logger.info('{} / {} tokens retrieved at least 1 synset. {} stopwords and {} punctuation tokens skipped.'.format(len(token2synset), len(all_token_set), stopword_cnt, punctuation_cnt))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
with open(os.path.join(args.output_dir, 'retrived_synsets.data'), 'wb') as fout:
pickle.dump(token2synset, fout)
logger.info('Finished dumping retrieved synsets.')
if __name__ == '__main__':
main()
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This script performs the same tokenization process as run_record.py and dumps the tokenization results.
# Compared with v1: query and passage entity spans are added to the output.
import argparse
import logging
import json
import os
import pickle
from tqdm import tqdm, trange
import tokenization
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class ReCoRDExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self,
qas_id,
question_text,
doc_tokens,
passage_entities,
orig_answer_text=None,
start_position=None,
end_position=None):
self.passage_entities = passage_entities
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
return s
# the tokenization process when reading examples
def read_record_examples(input_file, is_training):
"""Read a ReCoRD json file into a list of ReCoRDExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
paragraph_text = entry["passage"]["text"].replace('\xa0', ' ')
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
# load entities in passage
passage_entities = []
for entity in entry['passage']['entities']:
entity_start_offset = entity['start']
entity_end_offset = entity['end']
if entity_end_offset < entity_start_offset: # skip mislabeled entities in the ReCoRD dataset
continue
entity_text = paragraph_text[entity_start_offset: entity_end_offset + 1]
passage_entities.append({'orig_text': entity_text,
'start_position': char_to_word_offset[entity_start_offset],
'end_position': char_to_word_offset[entity_end_offset]})
for qa in entry["qas"]:
qas_id = qa["id"]
question_text = qa["query"].replace('\xa0', ' ')
start_position = None
end_position = None
orig_answer_text = None
if is_training:
# if len(qa["answers"]) != 1:
# raise ValueError(
# "For training, each question should have exactly 1 answer.")
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
example = ReCoRDExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
passage_entities=passage_entities,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
examples.append(example)
return examples
def _improve_entity_span(doc_tokens, input_start, input_end, tokenizer,
orig_entity_text):
"""Returns token-level tokenized entity spans that better match the annotated entity."""
tok_entity_text = " ".join(tokenizer.basic_tokenizer.tokenize(orig_entity_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_entity_text:
return (new_start, new_end)
return (input_start, input_end)
def _is_real_subspan(start, end, other_start, other_end):
return (start >= other_start and end < other_end) or (start > other_start and end <= other_end)
def match_query_entities(query_tokens, document_entities, document_tokens):
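# Build a character-offset -> token-index map over the whitespace-joined query,
# string-match every document entity against it, keep only matches aligned on
# token boundaries, and drop spans strictly contained in another matched span.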
# transform query_tokens list into a whitespace separated string
query_string = " ".join(query_tokens)
offset_to_tid_map = []
tid = 0
for char in query_string:
offset_to_tid_map.append(tid)
if char == ' ':
tid += 1
# transform entity_tokens into whitespace separated strings
entity_strings = set()
for document_entity in document_entities:
entity_tokens = document_tokens[document_entity[1]: document_entity[2] + 1]
entity_strings.add(" ".join(entity_tokens))
# do matching
results = []
for entity_string in entity_strings:
start = 0
while True:
pos = query_string.find(entity_string, start)
if pos == -1:
break
token_start, token_end = offset_to_tid_map[pos], offset_to_tid_map[pos] + entity_string.count(' ')
# ensure the match is not a partial match (e.g. "ville" matching inside "danville")
if " ".join(query_tokens[token_start: token_end + 1]) == entity_string:
results.append((token_start, token_end))
start = pos + len(entity_string)
# filter out a result span if it's a subspan of another span
no_subspan_results = []
for result in results:
if not any([_is_real_subspan(result[0], result[1], other_result[0], other_result[1]) for other_result in results]):
no_subspan_results.append((" ".join(query_tokens[result[0]: result[1] + 1]), result[0], result[1]))
assert len(no_subspan_results) == len(set(no_subspan_results))
return no_subspan_results
# the further tokenization process when generating features
def tokenization_on_examples(examples, tokenizer):
tokenization_result = []
for example in tqdm(examples):
# do tokenization on raw question text
query_subtokens = []
query_sub_to_ori_index = [] # mapping from sub-token index to token index
query_tokens = tokenizer.basic_tokenizer.tokenize(example.question_text)
for index, token in enumerate(query_tokens):
for sub_token in tokenizer.wordpiece_tokenizer.tokenize(token):
query_subtokens.append(sub_token)
query_sub_to_ori_index.append(index)
# do tokenization on whitespace tokenized document
document_tokens = []
document_subtokens = []
document_sub_to_ori_index = []
document_up_to_ori_index = [] # map whitespace-token index (before punctuation splitting) to the index in document_tokens
for unpunc_tokenized_tokens in example.doc_tokens:
tokens = tokenizer.basic_tokenizer.tokenize(unpunc_tokenized_tokens) # do punctuation tokenization
document_up_to_ori_index.append(len(document_tokens))
for token in tokens:
for sub_token in tokenizer.wordpiece_tokenizer.tokenize(token):
document_subtokens.append(sub_token)
document_sub_to_ori_index.append(len(document_tokens))
document_tokens.append(token)
# generate token-level document entity index
document_entities = []
for entity in example.passage_entities:
entity_start_position = document_up_to_ori_index[entity['start_position']]
entity_end_position = None
if entity['end_position'] < len(example.doc_tokens) - 1:
entity_end_position = document_up_to_ori_index[entity['end_position'] + 1] - 1
else:
entity_end_position = len(document_tokens) - 1
(entity_start_position, entity_end_position) = _improve_entity_span(
document_tokens, entity_start_position, entity_end_position, tokenizer, entity['orig_text'])
document_entities.append((entity['orig_text'], entity_start_position, entity_end_position)) # ('Trump', 10, 10)
# match query to passage entities
query_entities = match_query_entities(query_tokens, document_entities, document_tokens) # [('trump', 10, 10)]
tokenization_result.append({
'id': example.qas_id,
'query_tokens': query_tokens,
'query_subtokens': query_subtokens,
'query_sub_to_ori_index': query_sub_to_ori_index,
'query_entities': query_entities,
'document_tokens': document_tokens,
'document_subtokens': document_subtokens,
'document_entities': document_entities,
'document_sub_to_ori_index': document_sub_to_ori_index,
})
return tokenization_result
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", default='tokens', type=str,
help="The output directory to dump tokenization results.")
parser.add_argument("--train_file", default='../../data/ReCoRD/train.json', type=str, help="ReCoRD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default='../../data/ReCoRD/dev.json', type=str,
help="ReCoRD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
# parser.add_argument("--do_lower_case", default=False, action='store_true',
# help="Whether to lower case the input text. Should be True for uncased "
# "models and False for cased models.")
# parser.add_argument('--dump_token', action='store_true', help='whether dump the token-level tokenization result')
# parser.add_argument('--dump_subtoken', action='store_true', help='whether dump the subtoken-level tokenization result, with its mapping with token-level result')
args = parser.parse_args()
# make output directory if not exist
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
# We do both cased and uncased tokenization
for do_lower_case in (True, False):
tokenizer = tokenization.FullTokenizer(
vocab_file='vocab.{}.txt'.format('uncased' if do_lower_case else 'cased'), do_lower_case=do_lower_case)
train_examples = read_record_examples(input_file=args.train_file, is_training=True)
train_tokenization_result = tokenization_on_examples(
examples=train_examples,
tokenizer=tokenizer)
with open(os.path.join(args.output_dir, 'train.tokenization.{}.data'.format('uncased' if do_lower_case else 'cased')), 'wb') as fout:
pickle.dump(train_tokenization_result, fout)
logger.info('Finished {} tokenization for train set.'.format('uncased' if do_lower_case else 'cased'))
eval_examples = read_record_examples(input_file=args.predict_file, is_training=False)
eval_tokenization_result = tokenization_on_examples(
examples=eval_examples,
tokenizer=tokenizer)
with open(os.path.join(args.output_dir, 'dev.tokenization.{}.data'.format('uncased' if do_lower_case else 'cased')), 'wb') as fout:
pickle.dump(eval_tokenization_result, fout)
logger.info('Finished {} tokenization for dev set.'.format('uncased' if do_lower_case else 'cased'))
if __name__ == "__main__":
main()
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r") as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def convert_tokens_to_ids(vocab, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(vocab[token])
return ids
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_tokens_to_ids(self.vocab, tokens)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
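The tokenizer module above is what the preprocessing script below imports. A minimal usage sketch (not part of the repository; "vocab.cased.txt" is a hypothetical local path to a BERT WordPiece vocabulary file):

import tokenization

# Hypothetical vocab path; any BERT WordPiece vocab file can be substituted here.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.cased.txt", do_lower_case=False)

tokens = tokenizer.tokenize("unaffable")
# Per the WordpieceTokenizer docstring, a standard BERT vocab splits this into
# ["un", "##aff", "##able"].
ids = tokenizer.convert_tokens_to_ids(tokens)

FullTokenizer simply chains BasicTokenizer (text cleanup, punctuation splitting, optional lower-casing) with WordpieceTokenizer (greedy longest-match-first subword lookup against the vocabulary).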
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This script performs the same tokenization process as run_squad.py and dumps the tokenization results to disk.
# Compared with v1, it additionally records query and passage entity spans in the output.
import argparse
import logging
import json
import os
import pickle
from tqdm import tqdm, trange
import tokenization
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
class SQuADExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self,
qas_id,
question_text,
question_entities_strset,
doc_tokens,
passage_entities,
orig_answer_text=None,
start_position=None,
end_position=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.passage_entities = passage_entities
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.question_entities_strset = question_entities_strset
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.end_position:
s += ", end_position: %d" % (self.end_position)
return s
# the tokenization process when reading examples
def read_squad_examples(input_file, is_training):
"""Read a SQuAD json file into a list of SQuADExample."""
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
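# Illustrative example (not taken from the data): for paragraph_text "Hi there",
# doc_tokens becomes ["Hi", "there"] and char_to_word_offset becomes
# [0, 0, 0, 1, 1, 1, 1, 1]; every character, including the separating space,
# maps to the index of the whitespace token it belongs to (spaces map to the
# preceding token).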
# load entities in passage
passage_entities = []
for entity in paragraph['context_entities']:
entity_start_offset = entity['start']
entity_end_offset = entity['end']
entity_text = entity['text']
assert entity_text == paragraph_text[entity_start_offset: entity_end_offset + 1]
passage_entities.append({'orig_text': entity_text,
'start_position': char_to_word_offset[entity_start_offset],
'end_position': char_to_word_offset[entity_end_offset]})
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
question_entities_strset = set([entity_info["text"] for entity_info in qa["question_entities"]])
start_position = None
end_position = None
orig_answer_text = None
if is_training:
if len(qa["answers"]) != 1:
raise ValueError(
"For training, each question should have exactly 1 answer.")
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
example = SQuADExample(
qas_id=qas_id,
question_text=question_text,
question_entities_strset=question_entities_strset,
doc_tokens=doc_tokens,
passage_entities=passage_entities,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
examples.append(example)
return examples
def _improve_entity_span(doc_tokens, input_start, input_end, tokenizer,
orig_entity_text):
"""Returns token-level tokenized entity spans that better match the annotated entity."""
tok_entity_text = " ".join(tokenizer.basic_tokenizer.tokenize(orig_entity_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_entity_text:
return (new_start, new_end)
return (input_start, input_end)
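# Illustrative example (not taken from the data): if document_tokens contains
# [..., "Denver", "Broncos", ".", ...] and the whitespace-derived span for the
# entity text "Denver Broncos" covers all three tokens, the search above narrows
# the span so that it ends at "Broncos", dropping the trailing punctuation token.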
def _is_real_subspan(start, end, other_start, other_end):
return (start >= other_start and end < other_end) or (start > other_start and end <= other_end)
def match_query_entities(query_tokens, entities_tokens):
# transform query_tokens list into a whitespace separated string
query_string = " ".join(query_tokens)
offset_to_tid_map = []
tid = 0
for char in query_string:
offset_to_tid_map.append(tid)
if char == ' ':
tid += 1
# transform entity_tokens into whitespace separated strings
entity_strings = set()
for entity_tokens in entities_tokens:
entity_strings.add(" ".join(entity_tokens))
# do matching
results = []
for entity_string in entity_strings:
start = 0
while True:
pos = query_string.find(entity_string, start)
if pos == -1:
break
token_start, token_end = offset_to_tid_map[pos], offset_to_tid_map[pos] + entity_string.count(' ')
# make sure the match is not a partial match (e.g. "ville" matching inside "danville")
if " ".join(query_tokens[token_start: token_end + 1]) == entity_string:
results.append((token_start, token_end))
start = pos + len(entity_string)
# filter out a result span if it's a subspan of another span
no_subspan_results = []
for result in results:
if not any([_is_real_subspan(result[0], result[1], other_result[0], other_result[1]) for other_result in results]):
no_subspan_results.append((" ".join(query_tokens[result[0]: result[1] + 1]), result[0], result[1]))
assert len(no_subspan_results) == len(set(no_subspan_results))
return no_subspan_results
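# Illustrative example (not taken from the data): for
# query_tokens = ["who", "is", "donald", "trump"] and
# entities_tokens = [["donald", "trump"], ["trump"]], the function returns
# [("donald trump", 2, 3)]; the inner match ("trump", 3, 3) is filtered out
# because it is a proper subspan of the longer match.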
# the additional tokenization performed when generating features
def tokenization_on_examples(examples, tokenizer):
tokenization_result = []
for example in tqdm(examples):
# do tokenization on raw question text
query_subtokens = []
query_sub_to_ori_index = [] # mapping from sub-token index to token index
query_tokens = tokenizer.basic_tokenizer.tokenize(example.question_text)
for index, token in enumerate(query_tokens):
for sub_token in tokenizer.wordpiece_tokenizer.tokenize(token):
query_subtokens.append(sub_token)
query_sub_to_ori_index.append(index)
# do tokenization on whitespace tokenized document
document_tokens = []
document_subtokens = []
document_sub_to_ori_index = []
document_up_to_ori_index = []  # map each whitespace-token index to the index of its first basic-tokenized token in document_tokens
for unpunc_tokenized_tokens in example.doc_tokens:
tokens = tokenizer.basic_tokenizer.tokenize(unpunc_tokenized_tokens) # do punctuation tokenization
document_up_to_ori_index.append(len(document_tokens))
for token in tokens:
for sub_token in tokenizer.wordpiece_tokenizer.tokenize(token):
document_subtokens.append(sub_token)
document_sub_to_ori_index.append(len(document_tokens))
document_tokens.append(token)
# generate token-level document entity index
document_entities = []
for entity in example.passage_entities:
entity_start_position = document_up_to_ori_index[entity['start_position']]
entity_end_position = None
if entity['end_position'] < len(example.doc_tokens) - 1:
entity_end_position = document_up_to_ori_index[entity['end_position'] + 1] - 1
else:
entity_end_position = len(document_tokens) - 1
(entity_start_position, entity_end_position) = _improve_entity_span(
document_tokens, entity_start_position, entity_end_position, tokenizer, entity['orig_text'])
document_entities.append((entity['orig_text'], entity_start_position, entity_end_position)) # ('Trump', 10, 10)
# match query entities (including tagged and document entities)
entities_tokens = []
for question_entity_str in example.question_entities_strset:
entities_tokens.append(tokenizer.basic_tokenizer.tokenize(question_entity_str))
for document_entity in document_entities:
entities_tokens.append(document_tokens[document_entity[1]: document_entity[2] + 1])
query_entities = match_query_entities(query_tokens, entities_tokens) # [('trump', 10, 10)]
tokenization_result.append({
'id': example.qas_id,
'query_tokens': query_tokens,
'query_subtokens': query_subtokens,
'query_entities': query_entities,
'query_sub_to_ori_index': query_sub_to_ori_index,
'document_tokens': document_tokens,
'document_subtokens': document_subtokens,
'document_entities': document_entities,
'document_sub_to_ori_index': document_sub_to_ori_index,
})
return tokenization_result
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", default='tokens', type=str,
help="The output directory to dump tokenization results.")
parser.add_argument("--train_file", default='../ner_tagging_squad/output/train-v1.1.tagged.json', type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default='../ner_tagging_squad/output/dev-v1.1.tagged.json', type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
# parser.add_argument("--do_lower_case", default=False, action='store_true',
# help="Whether to lower case the input text. Should be True for uncased "
# "models and False for cased models.")
# parser.add_argument('--dump_token', action='store_true', help='whether dump the token-level tokenization result')
# parser.add_argument('--dump_subtoken', action='store_true', help='whether dump the subtoken-level tokenization result, with its mapping with token-level result')
args = parser.parse_args()
# make output directory if not exist
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
# We do both cased and uncased tokenization
for do_lower_case in (True, False):
tokenizer = tokenization.FullTokenizer(
vocab_file='vocab.{}.txt'.format('uncased' if do_lower_case else 'cased'), do_lower_case=do_lower_case)
train_examples = read_squad_examples(input_file=args.train_file, is_training=True)
train_tokenization_result = tokenization_on_examples(
examples=train_examples,
tokenizer=tokenizer)
with open(os.path.join(args.output_dir, 'train.tokenization.{}.data'.format('uncased' if do_lower_case else 'cased')), 'wb') as fout:
pickle.dump(train_tokenization_result, fout)
logger.info('Finished {} tokenization for train set.'.format('uncased' if do_lower_case else 'cased'))
eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False)
eval_tokenization_result = tokenization_on_examples(
examples=eval_examples,
tokenizer=tokenizer)
with open(os.path.join(args.output_dir, 'dev.tokenization.{}.data'.format('uncased' if do_lower_case else 'cased')), 'wb') as fout:
pickle.dump(eval_tokenization_result, fout)
logger.info('Finished {} tokenization for dev set.'.format('uncased' if do_lower_case else 'cased'))
if __name__ == "__main__":
main()
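The script writes one pickled list per split and per casing into the output directory. A minimal sketch of how the dumped result can be inspected afterwards (the path below follows the script's default --output_dir and naming scheme; the field names are the dict keys built in tokenization_on_examples):

import pickle

# "tokens" is the script's default --output_dir.
with open("tokens/dev.tokenization.cased.data", "rb") as fin:
    records = pickle.load(fin)

first = records[0]
print(first["id"])                      # SQuAD question id
print(first["query_tokens"][:10])       # basic-tokenized question
print(first["document_entities"][:5])   # (orig_text, start_token, end_token) triples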
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r") as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def convert_tokens_to_ids(vocab, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(vocab[token])
return ids
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_tokens_to_ids(self.vocab, tokens)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False