Commit f8658874 authored by Y Yang An, committed by Yibing Liu

Add the workspace of ACL2019-KTNET into PaddleNLP Research Version (#3244)

* add readme for KTNET

* update readme

* update readme

* update readme

* update readme of KTNET

* update readme of KTNET

* add source files for KTNET

* update files for KTNET

* update files for KTNET

* update draft of readme for KTNET

* modified scripts for KTNET

* fix typos in readme.md for KTNET

* update scripts for KTNET

* update scripts for KTNET

* update readme for KTNET

* edit two-staged training scripts for KTNET

* add details in the readme of KTNET

* fix typos in the readme of KTNET

* added eval scripts for KTNET

* rename folders for KTNET

* add copyright in the code and add links in readme for KTNET

* add the remaining download link for KTNET

* add md5sum for KTNET

* final version for KTNET
Parent d6c65111
ad550852cf26241b20e8364e40340a99 train.json
60c70c4a7e8190483f9899a1c9bc4178 dev.json
df45d93b87ca3c47b54a33e03fabf719 record_official_evaluate.py
981b29407e0affa3b1b156f72073b945 train-v1.1.json
3e85deb501d4e538b6bc56f786231552 dev-v1.1.json
afb04912d18ff20696f7f88eed49bea9 squad_v1_official_evaluate.py
64010b964ae2ebf00148b3519a4aafc8 KTNET_preprocess_squad_tagging_output.tar.gz
e9352221127b7620427c18e39bfae7fc KTNET_preprocess_tokenize_result_record.tar.gz
e52da2b1d096e889d32df267b82f9c77 KTNET_preprocess_tokenize_result_squad.tar.gz
89db2f5cfb07f0c44998d7f49098eb90 KTNET_preprocess_wordnet_concepts.tar.gz
fb62db2fe82d88480ec853f3c6fa237a NELL.08m.1115.esv.csv.gz
a68e68f9dcf4524b356163369c7f9f50 KTNET_preprocess_nell_concepts.tar.gz
d9b62183c6367ffac3ee6f864c9425a5 wn_concept2vec.txt
1f69c3d092089b0a0652616b72d61bd8 nell_concept2vec.txt
5405c050e64fee4ffec17ee50f079b64 cased_L-24_H-1024_A-16.tar.gz
4bd6e911cdad39c543ba8922a70580cd KTNET_fine-tuned-model_record_both.tar.gz
43fa464d6aeabe6dc7a15315d4ea8288 KTNET_fine-tuned-model_record_nell.tar.gz
20aaefead331f64e435a94ac8a7b58aa KTNET_fine-tuned-model_record_wordnet.tar.gz
3abdb7be3fc5e3b98633c918acc25af4 KTNET_fine-tuned-model_squad_both.tar.gz
9232cf27adda9d64265ccb315e1b9c81 KTNET_fine-tuned-model_squad_nell.tar.gz
a36fdd6d5c88e3e931bb3b28f9aeb4e2 KTNET_fine-tuned-model_squad_wordnet.tar.gz
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (NELL concepts) on the ReCoRD dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories. The original if/else
# used `rm -r dir/*`, which errors when the directory exists but is
# already empty (the glob does not match); mkdir -p + rm -rf covers
# every case quietly.
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1

python3 src/run_record.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (WordNet + NELL, two-memory variant)
# on the ReCoRD dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1

python3 src/run_record_twomemory.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (WordNet concepts) on the ReCoRD dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
CKPT_DIR=$1

python3 src/run_record.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (NELL concepts) on the SQuAD v1.1 dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1

python3 src/run_squad.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (WordNet + NELL, two-memory variant)
# on the SQuAD v1.1 dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
CKPT_DIR=$1

python3 src/run_squad_twomemory.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Evaluate a fine-tuned KTNET model (WordNet concepts) on the SQuAD v1.1
# dev set.
# Usage: bash <this script> <checkpoint_dir>
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Require the checkpoint directory argument up front instead of failing
# deep inside the Python run with an empty --init_checkpoint.
if [ $# -lt 1 ]; then
    echo "Usage: $0 <checkpoint_dir>" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible evaluation.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
CKPT_DIR=$1

python3 src/run_squad.py \
    --batch_size 6 \
    --do_train false \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --init_checkpoint "$CKPT_DIR" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Train KTNET (NELL concepts) on ReCoRD, starting from pretrained BERT
# parameters, then predict on the dev set.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate log/ and output/ as empty directories. The original if/else
# used `rm -r dir/*`, which errors when the directory exists but is
# already empty (the glob does not match); mkdir -p + rm -rf covers
# every case quietly.
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Second-stage KTNET training (NELL concepts) on ReCoRD: fine-tune all
# parameters starting from the first-stage checkpoint.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

FIRST_STAGE_CKPT=record_nell_first_stage_output/step_41970

# Fail fast if the first-stage checkpoint is missing — run the
# first-stage script before this one.
if [ ! -d "$FIRST_STAGE_CKPT" ]; then
    echo "First-stage checkpoint $FIRST_STAGE_CKPT not found" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_checkpoint "$FIRST_STAGE_CKPT" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# First-stage KTNET training (NELL concepts) on ReCoRD: BERT parameters
# are frozen (--freeze true); only the remaining layers are trained, at
# a higher learning rate and for more epochs.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate the stage-specific log/output directories as empty;
# `rm -r dir/*` in the original errored on an existing-but-empty
# directory (unmatched glob).
mkdir -p record_nell_first_stage_log record_nell_first_stage_output
rm -rf record_nell_first_stage_log/* record_nell_first_stage_output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze true \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.0 \
    --learning_rate 3e-4 \
    --epoch 10 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints record_nell_first_stage_output/ 1>"$PWD_DIR/record_nell_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Train KTNET (WordNet + NELL, two-memory variant) on ReCoRD, starting
# from pretrained BERT parameters, then predict on the dev set.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record_twomemory.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Second-stage KTNET training (WordNet + NELL, two-memory variant) on
# ReCoRD: fine-tune all parameters from the first-stage checkpoint.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

FIRST_STAGE_CKPT=record_both_first_stage_output/step_41970

# Fail fast if the first-stage checkpoint is missing — run the
# first-stage script before this one.
if [ ! -d "$FIRST_STAGE_CKPT" ]; then
    echo "First-stage checkpoint $FIRST_STAGE_CKPT not found" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record_twomemory.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_checkpoint "$FIRST_STAGE_CKPT" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# First-stage KTNET training (WordNet + NELL, two-memory variant) on
# ReCoRD: BERT parameters are frozen (--freeze true); only the remaining
# layers are trained, at a higher learning rate and for more epochs.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate the stage-specific log/output directories as empty;
# `rm -r dir/*` in the original errored on an existing-but-empty
# directory (unmatched glob).
mkdir -p record_both_first_stage_log record_both_first_stage_output
rm -rf record_both_first_stage_log/* record_both_first_stage_output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_record_twomemory.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze true \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.0 \
    --learning_rate 3e-4 \
    --epoch 10 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
    --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --use_nell true \
    --random_seed 45 \
    --checkpoints record_both_first_stage_output/ 1>"$PWD_DIR/record_both_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Train KTNET (WordNet concepts) on ReCoRD, starting from pretrained
# BERT parameters, then predict on the dev set.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Second-stage KTNET training (WordNet concepts) on ReCoRD: fine-tune
# all parameters starting from the first-stage checkpoint.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

FIRST_STAGE_CKPT=record_wn_first_stage_output/step_41970

# Fail fast if the first-stage checkpoint is missing — run the
# first-stage script before this one.
if [ ! -d "$FIRST_STAGE_CKPT" ]; then
    echo "First-stage checkpoint $FIRST_STAGE_CKPT not found" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_checkpoint "$FIRST_STAGE_CKPT" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 4 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# First-stage KTNET training (WordNet concepts) on ReCoRD: BERT
# parameters are frozen (--freeze true); only the remaining layers are
# trained, at a higher learning rate and for more epochs.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate the stage-specific log/output directories as empty;
# `rm -r dir/*` in the original errored on an existing-but-empty
# directory (unmatched glob).
mkdir -p record_wn_first_stage_log record_wn_first_stage_output
rm -rf record_wn_first_stage_log/* record_wn_first_stage_output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt

python3 src/run_record.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --use_ema false \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/ReCoRD/train.json" \
    --predict_file "$DATA/ReCoRD/dev.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze true \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.0 \
    --learning_rate 3e-4 \
    --epoch 10 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_wordnet true \
    --random_seed 45 \
    --checkpoints record_wn_first_stage_output/ 1>"$PWD_DIR/record_wn_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Train KTNET (NELL concepts) on SQuAD v1.1, starting from pretrained
# BERT parameters, then predict on the dev set.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_squad.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_pretraining_params "$BERT_DIR/params" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 3 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Second-stage KTNET training (NELL concepts) on SQuAD v1.1: fine-tune
# all parameters starting from the first-stage checkpoint.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8

FIRST_STAGE_CKPT=sqd_nell_first_stage_output/step_3649

# Fail fast if the first-stage checkpoint is missing — run the
# first-stage script before this one.
if [ ! -d "$FIRST_STAGE_CKPT" ]; then
    echo "First-stage checkpoint $FIRST_STAGE_CKPT not found" >&2
    exit 1
fi

# Recreate log/ and output/ as empty directories; `rm -r dir/*` in the
# original errored on an existing-but-empty directory (unmatched glob).
mkdir -p log output
rm -rf log/* output/*

# Make cuDNN/CPU kernels deterministic for reproducible training.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true

PWD_DIR=$(pwd)   # $(...) instead of legacy backticks
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt

python3 src/run_squad.py \
    --batch_size 6 \
    --do_train true \
    --do_predict true \
    --do_lower_case false \
    --init_checkpoint "$FIRST_STAGE_CKPT" \
    --train_file "$DATA/SQuAD/train-v1.1.json" \
    --predict_file "$DATA/SQuAD/dev-v1.1.json" \
    --vocab_path "$BERT_DIR/vocab.txt" \
    --bert_config_path "$BERT_DIR/bert_config.json" \
    --freeze false \
    --save_steps 4000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --learning_rate 3e-5 \
    --epoch 3 \
    --max_seq_len 384 \
    --doc_stride 128 \
    --concept_embedding_path "$CPT_EMBEDDING_PATH" \
    --use_nell true \
    --random_seed 45 \
    --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty stage-specific log/output directories. mkdir -p is
# idempotent and rm -rf stays silent when the glob matches nothing (the
# previous `rm -r dir/*` errored out on an empty directory).
mkdir -p sqd_nell_first_stage_log sqd_nell_first_stage_output
rm -rf sqd_nell_first_stage_log/* sqd_nell_first_stage_output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
# First-stage KT-NET training with NELL concepts: one epoch with
# --freeze true (presumably freezing the BERT encoder — see run_squad.py),
# no warmup and no EMA, producing the checkpoint consumed by stage two.
python3 src/run_squad.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --use_ema false \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze true \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.0 \
  --learning_rate 3e-5 \
  --epoch 1 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --concept_embedding_path "$CPT_EMBEDDING_PATH" \
  --use_nell true \
  --random_seed 45 \
  --checkpoints sqd_nell_first_stage_output/ 1>"$PWD_DIR/sqd_nell_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty log/ and output/ directories. mkdir -p is idempotent and
# rm -rf stays silent when the glob matches nothing (the previous
# `rm -r dir/*` errored out with a non-zero status on an empty directory).
mkdir -p log output
rm -rf log/* output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
# Single-stage KT-NET training on SQuAD v1.1 using BOTH knowledge bases
# (WordNet + NELL) via the two-memory model.
python3 src/run_squad_twomemory.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze false \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.1 \
  --learning_rate 3e-5 \
  --epoch 3 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
  --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --use_nell true \
  --random_seed 45 \
  --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty log/ and output/ directories. mkdir -p is idempotent and
# rm -rf stays silent when the glob matches nothing (the previous
# `rm -r dir/*` errored out with a non-zero status on an empty directory).
mkdir -p log output
rm -rf log/* output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
# Second-stage two-memory (WordNet + NELL) training: resumes from the
# first-stage checkpoint (step_3649) and fine-tunes the whole model.
python3 src/run_squad_twomemory.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --do_lower_case false \
  --init_checkpoint sqd_both_first_stage_output/step_3649 \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze false \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.1 \
  --learning_rate 3e-5 \
  --epoch 3 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
  --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --use_nell true \
  --random_seed 45 \
  --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty stage-specific log/output directories. mkdir -p is
# idempotent and rm -rf stays silent when the glob matches nothing (the
# previous `rm -r dir/*` errored out on an empty directory).
mkdir -p sqd_both_first_stage_log sqd_both_first_stage_output
rm -rf sqd_both_first_stage_log/* sqd_both_first_stage_output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
WN_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
NELL_CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/nell_concept2vec.txt
# First-stage two-memory (WordNet + NELL) training: one epoch with
# --freeze true (presumably freezing the BERT encoder — see
# run_squad_twomemory.py), no warmup and no EMA, producing the checkpoint
# consumed by stage two.
python3 src/run_squad_twomemory.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --use_ema false \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze true \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.0 \
  --learning_rate 3e-5 \
  --epoch 1 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --wn_concept_embedding_path "$WN_CPT_EMBEDDING_PATH" \
  --nell_concept_embedding_path "$NELL_CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --use_nell true \
  --random_seed 45 \
  --checkpoints sqd_both_first_stage_output/ 1>"$PWD_DIR/sqd_both_first_stage_log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty log/ and output/ directories. mkdir -p is idempotent and
# rm -rf stays silent when the glob matches nothing (the previous
# `rm -r dir/*` errored out with a non-zero status on an empty directory).
mkdir -p log output
rm -rf log/* output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
# Single-stage KT-NET training on SQuAD v1.1 with WordNet concept
# embeddings, starting from the pre-trained cased BERT-large parameters.
python3 src/run_squad.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze false \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.1 \
  --learning_rate 3e-5 \
  --epoch 3 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --concept_embedding_path "$CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --random_seed 45 \
  --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty log/ and output/ directories. mkdir -p is idempotent and
# rm -rf stays silent when the glob matches nothing (the previous
# `rm -r dir/*` errored out with a non-zero status on an empty directory).
mkdir -p log output
rm -rf log/* output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
# Second-stage KT-NET training with WordNet concepts: resumes from the
# first-stage checkpoint (step_3649) and fine-tunes the whole model.
python3 src/run_squad.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --do_lower_case false \
  --init_checkpoint sqd_wn_first_stage_output/step_3649 \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze false \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.1 \
  --learning_rate 3e-5 \
  --epoch 3 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --concept_embedding_path "$CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --random_seed 45 \
  --checkpoints output/ 1>"$PWD_DIR/log/train.log" 2>&1
#!/bin/bash
# ==============================================================================
# Copyright 2019 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LC_CTYPE=en_US.UTF-8
# Recreate empty stage-specific log/output directories. mkdir -p is
# idempotent and rm -rf stays silent when the glob matches nothing (the
# previous `rm -r dir/*` errored out on an empty directory).
mkdir -p sqd_wn_first_stage_log sqd_wn_first_stage_output
rm -rf sqd_wn_first_stage_log/* sqd_wn_first_stage_output/*
# Make cuDNN / CPU kernels deterministic for reproducible runs.
export FLAGS_cudnn_deterministic=true
export FLAGS_cpu_deterministic=true
PWD_DIR=$(pwd)
DATA=../data/
BERT_DIR=cased_L-24_H-1024_A-16
CPT_EMBEDDING_PATH=../retrieve_concepts/KB_embeddings/wn_concept2vec.txt
# First-stage KT-NET training with WordNet concepts: one epoch with
# --freeze true (presumably freezing the BERT encoder — see run_squad.py),
# no warmup and no EMA, producing the checkpoint consumed by stage two.
python3 src/run_squad.py \
  --batch_size 6 \
  --do_train true \
  --do_predict true \
  --use_ema false \
  --do_lower_case false \
  --init_pretraining_params "$BERT_DIR/params" \
  --train_file "$DATA/SQuAD/train-v1.1.json" \
  --predict_file "$DATA/SQuAD/dev-v1.1.json" \
  --vocab_path "$BERT_DIR/vocab.txt" \
  --bert_config_path "$BERT_DIR/bert_config.json" \
  --freeze true \
  --save_steps 4000 \
  --weight_decay 0.01 \
  --warmup_proportion 0.0 \
  --learning_rate 3e-5 \
  --epoch 1 \
  --max_seq_len 384 \
  --doc_stride 128 \
  --concept_embedding_path "$CPT_EMBEDDING_PATH" \
  --use_wordnet true \
  --random_seed 45 \
  --checkpoints sqd_wn_first_stage_output/ 1>"$PWD_DIR/sqd_wn_first_stage_log/train.log" 2>&1
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
    """
    Apply masked-LM corruption to a batch of token-id sentences, IN PLACE.

    For every token, one pre-drawn uniform probability decides its fate:
      * prob > 0.15                  -> untouched, not predicted
      * 0.03  < prob <= 0.15  (12%)  -> replaced by MASK, predicted
      * 0.015 < prob <= 0.03  (1.5%) -> replaced by a random id, predicted
      * prob <= 0.015         (1.5%) -> kept unchanged but still predicted
    [CLS] and [SEP] tokens are never corrupted. If a sentence ends up with
    no corrupted token at all, one random non-special position is
    force-masked.

    Args:
        batch_tokens: list of token-id lists; MUTATED in place.
        total_token_num: total token count across the batch; sizes the
            pre-drawn probability and replacement-id arrays.
        vocab_size: exclusive upper bound for random replacement ids.
        CLS, SEP, MASK: ids of the special tokens.

    Returns:
        (batch_tokens, mask_label, mask_pos): the mutated batch, the
        original ids of all predicted tokens as int64 [-1, 1], and their
        flattened positions sent_index * max_len + token_index, i.e.
        positions valid AFTER the batch is padded to its max length.
    """
    max_len = max([len(sent) for sent in batch_tokens])
    mask_label = []
    mask_pos = []
    # One uniform draw per token of the whole batch, drawn up front.
    prob_mask = np.random.rand(total_token_num)
    # Note: the first token is [CLS], so [low=1]
    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
    pre_sent_len = 0
    prob_index = 0
    for sent_index, sent in enumerate(batch_tokens):
        mask_flag = False
        # Advance to this sentence's slice of the pre-drawn arrays.
        prob_index += pre_sent_len
        for token_index, token in enumerate(sent):
            prob = prob_mask[prob_index + token_index]
            if prob > 0.15:
                continue
            elif 0.03 < prob <= 0.15:
                # mask
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = MASK
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            elif 0.015 < prob <= 0.03:
                # random replace
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = replace_ids[prob_index + token_index]
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            else:
                # keep the original token (still predicted; mask_flag stays
                # False because nothing was corrupted)
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    mask_pos.append(sent_index * max_len + token_index)
        pre_sent_len = len(sent)
        # ensure at least mask one word in a sentence
        # NOTE(review): this loop assumes len(sent) > 2 and that some index
        # in [1, len(sent)-2] holds a non-special token; otherwise it never
        # terminates — confirm upstream inputs always satisfy this.
        while not mask_flag:
            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
            if sent[token_index] != SEP and sent[token_index] != CLS:
                mask_label.append(sent[token_index])
                sent[token_index] = MASK
                mask_flag = True
                mask_pos.append(sent_index * max_len + token_index)
    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
    return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
                       total_token_num,
                       voc_size=0,
                       pad_id=None,
                       cls_id=None,
                       sep_id=None,
                       mask_id=None,
                       return_input_mask=True,
                       return_max_len=True,
                       return_num_token=False,
                       max_concept_length=50):
    """Convert a list of instances into padded, batch-shaped numpy tensors.

    Each instance is (src_ids, sent_ids, pos_ids, concept_ids, *labels);
    the trailing label fields (e.g. SQuAD start/end positions or unique
    ids) are packed into int64 [-1, 1] arrays. When mask_id >= 0, MLM
    corruption is applied before padding and the mask label/position
    tensors are appended to the output list.
    """
    src_ids = [item[0] for item in insts]
    sent_ids = [item[1] for item in insts]
    pos_ids = [item[2] for item in insts]
    concept_id_lists = [item[3] for item in insts]
    # Fields from index 4 onward are per-example labels / ids; keep them
    # compatible with SQuAD-style instances of varying arity.
    labels_list = [
        np.array([item[idx] for item in insts]).astype("int64").reshape([-1, 1])
        for idx in range(4, len(insts[0]))
    ]
    # Step 1: masked-LM corruption (before padding) when a mask id is given.
    if mask_id >= 0:
        out, mask_label, mask_pos = mask(
            src_ids,
            total_token_num,
            vocab_size=voc_size,
            CLS=cls_id,
            SEP=sep_id,
            MASK=mask_id)
    else:
        out = src_ids
    # Step 2: pad everything to the fixed sequence length.
    src_id, self_input_mask = pad_batch_data(
        out, pad_idx=pad_id, return_input_mask=True)
    pos_id = pad_batch_data(
        pos_ids, pad_idx=pad_id, return_pos=False, return_input_mask=False)
    sent_id = pad_batch_data(
        sent_ids, pad_idx=pad_id, return_pos=False, return_input_mask=False)
    # Concept ids are lists-of-lists; padded with all-zero concept rows.
    concept_ids = pad_batch_data(
        concept_id_lists, pad_idx=[], max_concept_length=max_concept_length)
    base = [src_id, pos_id, sent_id, concept_ids, self_input_mask]
    if mask_id >= 0:
        return_list = base + [mask_label, mask_pos] + labels_list
    else:
        return_list = base + labels_list
    return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False,
                   max_concept_length=50):
    """Pad each instance to the fixed sequence length and build side tensors.

    Returns the padded id tensor, plus (in order, when requested) position
    ids, a float attention input mask, the max length, and the total
    unpadded token count. A single-element result is unwrapped.
    """
    outputs = []
    # Sequence length is pinned to 384 (matching --max_seq_len in the run
    # scripts) rather than the per-batch maximum.
    seq_len = 384
    # Any pad token works: padding loss is masked out downstream, so it has
    # no effect on gradients.
    if type(pad_idx) == list:  # list sentinel => concept ids, pad with zero rows
        padded = np.array([
            inst + [[0] * max_concept_length
                    for _ in range(seq_len - len(inst))] for inst in insts
        ])
        outputs.append(
            padded.astype("int64").reshape(
                [-1, seq_len, max_concept_length, 1]))
    else:
        padded = np.array([
            list(inst) + [pad_idx] * (seq_len - len(inst)) for inst in insts
        ])
        outputs.append(padded.astype("int64").reshape([-1, seq_len, 1]))
    if return_pos:
        # Position ids 0..len-1, padded with pad_idx.
        positions = np.array([
            list(range(len(inst))) + [pad_idx] * (seq_len - len(inst))
            for inst in insts
        ])
        outputs.append(positions.astype("int64").reshape([-1, seq_len, 1]))
    if return_input_mask:
        # 1.0 over real tokens, 0.0 over padding — blocks attention on pads.
        attn = np.array([
            [1] * len(inst) + [0] * (seq_len - len(inst)) for inst in insts
        ])
        outputs.append(np.expand_dims(attn, axis=-1).astype("float32"))
    if return_max_len:
        outputs.append(seq_len)
    if return_num_token:
        outputs.append(sum(len(inst) for inst in insts))
    return outputs if len(outputs) > 1 else outputs[0]
if __name__ == "__main__":
    # Library module: nothing to run when executed directly.
    pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
    """
    Apply masked-LM corruption to a batch of token-id sentences, IN PLACE.

    For every token, one pre-drawn uniform probability decides its fate:
      * prob > 0.15                  -> untouched, not predicted
      * 0.03  < prob <= 0.15  (12%)  -> replaced by MASK, predicted
      * 0.015 < prob <= 0.03  (1.5%) -> replaced by a random id, predicted
      * prob <= 0.015         (1.5%) -> kept unchanged but still predicted
    [CLS] and [SEP] tokens are never corrupted. If a sentence ends up with
    no corrupted token at all, one random non-special position is
    force-masked.

    Args:
        batch_tokens: list of token-id lists; MUTATED in place.
        total_token_num: total token count across the batch; sizes the
            pre-drawn probability and replacement-id arrays.
        vocab_size: exclusive upper bound for random replacement ids.
        CLS, SEP, MASK: ids of the special tokens.

    Returns:
        (batch_tokens, mask_label, mask_pos): the mutated batch, the
        original ids of all predicted tokens as int64 [-1, 1], and their
        flattened positions sent_index * max_len + token_index, i.e.
        positions valid AFTER the batch is padded to its max length.
    """
    max_len = max([len(sent) for sent in batch_tokens])
    mask_label = []
    mask_pos = []
    # One uniform draw per token of the whole batch, drawn up front.
    prob_mask = np.random.rand(total_token_num)
    # Note: the first token is [CLS], so [low=1]
    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
    pre_sent_len = 0
    prob_index = 0
    for sent_index, sent in enumerate(batch_tokens):
        mask_flag = False
        # Advance to this sentence's slice of the pre-drawn arrays.
        prob_index += pre_sent_len
        for token_index, token in enumerate(sent):
            prob = prob_mask[prob_index + token_index]
            if prob > 0.15:
                continue
            elif 0.03 < prob <= 0.15:
                # mask
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = MASK
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            elif 0.015 < prob <= 0.03:
                # random replace
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = replace_ids[prob_index + token_index]
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            else:
                # keep the original token (still predicted; mask_flag stays
                # False because nothing was corrupted)
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    mask_pos.append(sent_index * max_len + token_index)
        pre_sent_len = len(sent)
        # ensure at least mask one word in a sentence
        # NOTE(review): this loop assumes len(sent) > 2 and that some index
        # in [1, len(sent)-2] holds a non-special token; otherwise it never
        # terminates — confirm upstream inputs always satisfy this.
        while not mask_flag:
            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
            if sent[token_index] != SEP and sent[token_index] != CLS:
                mask_label.append(sent[token_index])
                sent[token_index] = MASK
                mask_flag = True
                mask_pos.append(sent_index * max_len + token_index)
    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
    return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
                       total_token_num,
                       voc_size=0,
                       pad_id=None,
                       cls_id=None,
                       sep_id=None,
                       mask_id=None,
                       return_input_mask=True,
                       return_max_len=True,
                       return_num_token=False,
                       max_wn_concept_length=50,
                       max_nell_concept_length=50):
    """Convert a list of instances into padded, batch-shaped numpy tensors.

    Two-memory variant: each instance is (src_ids, sent_ids, pos_ids,
    wn_concept_ids, nell_concept_ids, *labels). Trailing label fields
    (e.g. SQuAD start/end positions or unique ids) are packed into int64
    [-1, 1] arrays. When mask_id >= 0, MLM corruption is applied before
    padding and the mask label/position tensors are appended.
    """
    src_ids = [item[0] for item in insts]
    sent_ids = [item[1] for item in insts]
    pos_ids = [item[2] for item in insts]
    wn_concept_lists = [item[3] for item in insts]
    nell_concept_lists = [item[4] for item in insts]
    # Fields from index 5 onward are per-example labels / ids; keep them
    # compatible with SQuAD-style instances of varying arity.
    labels_list = [
        np.array([item[idx] for item in insts]).astype("int64").reshape([-1, 1])
        for idx in range(5, len(insts[0]))
    ]
    # Step 1: masked-LM corruption (before padding) when a mask id is given.
    if mask_id >= 0:
        out, mask_label, mask_pos = mask(
            src_ids,
            total_token_num,
            vocab_size=voc_size,
            CLS=cls_id,
            SEP=sep_id,
            MASK=mask_id)
    else:
        out = src_ids
    # Step 2: pad everything to the fixed sequence length.
    src_id, self_input_mask = pad_batch_data(
        out, pad_idx=pad_id, return_input_mask=True)
    pos_id = pad_batch_data(
        pos_ids, pad_idx=pad_id, return_pos=False, return_input_mask=False)
    sent_id = pad_batch_data(
        sent_ids, pad_idx=pad_id, return_pos=False, return_input_mask=False)
    # Concept ids are lists-of-lists; padded with all-zero concept rows.
    wn_concept_ids = pad_batch_data(
        wn_concept_lists, pad_idx=[],
        max_concept_length=max_wn_concept_length)
    nell_concept_ids = pad_batch_data(
        nell_concept_lists, pad_idx=[],
        max_concept_length=max_nell_concept_length)
    base = [src_id, pos_id, sent_id, wn_concept_ids, nell_concept_ids,
            self_input_mask]
    if mask_id >= 0:
        return_list = base + [mask_label, mask_pos] + labels_list
    else:
        return_list = base + labels_list
    return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False,
                   max_concept_length=50):
    """Pad each instance to the fixed sequence length and build side tensors.

    Returns the padded id tensor, plus (in order, when requested) position
    ids, a float attention input mask, the max length, and the total
    unpadded token count. A single-element result is unwrapped.
    """
    outputs = []
    # Sequence length is pinned to 384 (matching --max_seq_len in the run
    # scripts) rather than the per-batch maximum.
    seq_len = 384
    # Any pad token works: padding loss is masked out downstream, so it has
    # no effect on gradients.
    if type(pad_idx) == list:  # list sentinel => concept ids, pad with zero rows
        padded = np.array([
            inst + [[0] * max_concept_length
                    for _ in range(seq_len - len(inst))] for inst in insts
        ])
        outputs.append(
            padded.astype("int64").reshape(
                [-1, seq_len, max_concept_length, 1]))
    else:
        padded = np.array([
            list(inst) + [pad_idx] * (seq_len - len(inst)) for inst in insts
        ])
        outputs.append(padded.astype("int64").reshape([-1, seq_len, 1]))
    if return_pos:
        # Position ids 0..len-1, padded with pad_idx.
        positions = np.array([
            list(range(len(inst))) + [pad_idx] * (seq_len - len(inst))
            for inst in insts
        ])
        outputs.append(positions.astype("int64").reshape([-1, seq_len, 1]))
    if return_input_mask:
        # 1.0 over real tokens, 0.0 over padding — blocks attention on pads.
        attn = np.array([
            [1] * len(inst) + [0] * (seq_len - len(inst)) for inst in insts
        ])
        outputs.append(np.expand_dims(attn, axis=-1).astype("float32"))
    if return_max_len:
        outputs.append(seq_len)
    if return_num_token:
        outputs.append(sum(len(inst) for inst in insts))
    return outputs if len(outputs) > 1 else outputs[0]
if __name__ == "__main__":
    # Library module: nothing to run when executed directly.
    pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import logging
import numpy as np
import paddle.fluid as fluid
from model.transformer_encoder import encoder, pre_process_layer
# Configure root logging once at import time; every module logger inherits
# INFO with a timestamped "<time> - <level> - <name> - <message>" format.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
class BertConfig(object):
    """Thin wrapper around a JSON BERT config file with dict-style access."""

    def __init__(self, config_path):
        # Parse eagerly so a broken config fails at construction time.
        self._config_dict = self._parse(config_path)

    def _parse(self, config_path):
        """Load the JSON file at config_path; raise IOError on any failure."""
        try:
            with open(config_path) as json_file:
                parsed = json.load(json_file)
        except Exception:
            raise IOError("Error in parsing bert model config file '%s'" %
                          config_path)
        return parsed

    def __getitem__(self, key):
        return self._config_dict[key]

    def print_config(self):
        """Log every configuration entry in sorted key order."""
        for arg, value in sorted(six.iteritems(self._config_dict)):
            logger.info('%s: %s' % (arg, value))
        logger.info('------------------------------------------------')
class BertModel(object):
    def __init__(self,
                 src_ids,
                 position_ids,
                 sentence_ids,
                 input_mask,
                 config,
                 weight_sharing=True,
                 use_fp16=False):
        """Read hyper-parameters from `config` and build the forward graph.

        Args:
            src_ids: token-id input tensor.
            position_ids: position-id input tensor.
            sentence_ids: sentence/segment-id input tensor.
            input_mask: float mask tensor, nonzero over real tokens.
            config: BertConfig-style object supporting config[key] access.
            weight_sharing: share the word-embedding matrix with the
                masked-LM output projection (see get_pretraining_output).
            use_fp16: compute in float16 instead of float32.
        """
        self._emb_size = config['hidden_size']
        self._n_layer = config['num_hidden_layers']
        self._n_head = config['num_attention_heads']
        self._voc_size = config['vocab_size']
        self._max_position_seq_len = config['max_position_embeddings']
        self._sent_types = config['type_vocab_size']
        self._hidden_act = config['hidden_act']
        self._prepostprocess_dropout = config['hidden_dropout_prob']
        self._attention_dropout = config['attention_probs_dropout_prob']
        self._weight_sharing = weight_sharing
        # Parameter names used to look the embeddings up again later
        # (e.g. for weight sharing in the masked-LM head).
        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._dtype = "float16" if use_fp16 else "float32"
        # Initialize all weights by truncated normal initializer, and all
        # biases will be initialized by constant zero by default.
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config['initializer_range'])
        self._build_model(src_ids, position_ids, sentence_ids, input_mask)
    def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
        """Assemble the BERT forward graph: summed embeddings -> dropout/norm
        -> transformer encoder. Stores the encoder output in self._enc_out.
        """
        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name, initializer=self._param_initializer),
            is_sparse=False)
        position_emb_out = fluid.layers.embedding(
            input=position_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name, initializer=self._param_initializer))
        sent_emb_out = fluid.layers.embedding(
            sentence_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name, initializer=self._param_initializer))
        # Token + position + segment embeddings, then dropout/layer-norm
        # ('nd' = norm + dropout processing in pre_process_layer).
        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out
        emb_out = pre_process_layer(
            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
        if self._dtype == "float16":
            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
        # self_attn_mask = fluid.layers.matmul(
        #     x=input_mask, y=input_mask, transpose_y=True)
        # Broadcast the per-token mask to a [batch, 384, seq] attention mask.
        # NOTE(review): the 384 here is hard-coded to match --max_seq_len in
        # the run scripts — confirm if other sequence lengths are ever used.
        self_attn_mask = fluid.layers.expand(fluid.layers.transpose(input_mask, [0, 2, 1]), [1, 384, 1])
        # bias_after_scale=False computes (mask - 1) * 10000: 0 for real
        # tokens, -10000 for padding, i.e. an additive attention bias.
        self_attn_mask = fluid.layers.scale(
            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
        # Replicate the bias once per attention head.
        n_head_self_attn_mask = fluid.layers.stack(
            x=[self_attn_mask] * self._n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True
        self._enc_out = encoder(
            enc_input=emb_out,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd="",
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name='encoder')
    def get_sequence_output(self):
        """Return the final encoder output (one feature vector per token)."""
        return self._enc_out
    def get_pooled_output(self):
        """Get the first feature of each sequence for classification.

        Slices the [CLS] (position 0) feature out of the encoder output and
        projects it through a tanh-activated fully-connected layer.
        """
        next_sent_feat = fluid.layers.slice(
            input=self._enc_out, axes=[1], starts=[0], ends=[1])
        next_sent_feat = fluid.layers.fc(
            input=next_sent_feat,
            size=self._emb_size,
            act="tanh",
            param_attr=fluid.ParamAttr(
                name="pooled_fc.w_0", initializer=self._param_initializer),
            bias_attr="pooled_fc.b_0")
        return next_sent_feat
def get_pretraining_output(self, mask_label, mask_pos, labels):
    """Get the loss & accuracy for pretraining.

    Builds the two BERT pretraining heads on top of the encoder output:
    masked-LM (cross-entropy over masked token positions) and
    next-sentence prediction (binary classification on the pooled [CLS]
    feature).

    :param mask_label: gold token ids at the masked positions.
    :param mask_pos: flat indices of masked tokens into the
        [batch*seq, emb] reshaped encoder output — assumes positions are
        pre-flattened by the caller (TODO confirm against data reader).
    :param labels: next-sentence labels.
    :return: (next_sent_acc, mean_mask_lm_loss, total_loss).
    """
    mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
    # extract the first token feature in each sentence
    next_sent_feat = self.get_pooled_output()
    # Flatten [batch, seq, emb] -> [batch*seq, emb] so gather() can pick
    # masked tokens with flat indices.
    reshaped_emb_out = fluid.layers.reshape(
        x=self._enc_out, shape=[-1, self._emb_size])
    # extract masked tokens' feature
    mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
    # transform: fc
    mask_trans_feat = fluid.layers.fc(
        input=mask_feat,
        size=self._emb_size,
        act=self._hidden_act,
        param_attr=fluid.ParamAttr(
            name='mask_lm_trans_fc.w_0',
            initializer=self._param_initializer),
        bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
    # transform: layer norm
    mask_trans_feat = pre_process_layer(
        mask_trans_feat, 'n', name='mask_lm_trans')
    mask_lm_out_bias_attr = fluid.ParamAttr(
        name="mask_lm_out_fc.b_0",
        initializer=fluid.initializer.Constant(value=0.0))
    if self._weight_sharing:
        # Tie the output projection to the input word-embedding matrix
        # (transposed), adding only a fresh output bias.
        fc_out = fluid.layers.matmul(
            x=mask_trans_feat,
            y=fluid.default_main_program().global_block().var(
                self._word_emb_name),
            transpose_y=True)
        fc_out += fluid.layers.create_parameter(
            shape=[self._voc_size],
            dtype=self._dtype,
            attr=mask_lm_out_bias_attr,
            is_bias=True)
    else:
        # Independent output projection over the vocabulary.
        fc_out = fluid.layers.fc(input=mask_trans_feat,
                                 size=self._voc_size,
                                 param_attr=fluid.ParamAttr(
                                     name="mask_lm_out_fc.w_0",
                                     initializer=self._param_initializer),
                                 bias_attr=mask_lm_out_bias_attr)
    mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
        logits=fc_out, label=mask_label)
    mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
    # Next-sentence prediction head: 2-way classifier on pooled feature.
    next_sent_fc_out = fluid.layers.fc(
        input=next_sent_feat,
        size=2,
        param_attr=fluid.ParamAttr(
            name="next_sent_fc.w_0", initializer=self._param_initializer),
        bias_attr="next_sent_fc.b_0")
    next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
        logits=next_sent_fc_out, label=labels, return_softmax=True)
    next_sent_acc = fluid.layers.accuracy(
        input=next_sent_softmax, label=labels)
    mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
    # Total pretraining objective = NSP loss + masked-LM loss.
    loss = mean_next_sent_loss + mean_mask_lm_loss
    return next_sent_acc, mean_mask_lm_loss, loss
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""bert model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import sys
import six
import logging
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.layers import shape
from model.transformer_encoder import encoder, pre_process_layer
# Configure root logging once at import time: timestamped format shared by
# every logger obtained via getLogger(__name__) in this module.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def dynamic_expand(dynamic_tensor, smaller_tensor):
    """Broadcast `smaller_tensor` to the (possibly dynamic) shape of
    `dynamic_tensor`.

    Implemented by adding `smaller_tensor` to an all-zeros tensor with
    `dynamic_tensor`'s shape, so Paddle's elementwise broadcasting does
    the expansion even when some dims are only known at run time.

    :param dynamic_tensor: tensor whose shape is the broadcast target.
    :param smaller_tensor: tensor with strictly fewer dims; its trailing
        dims must align with `dynamic_tensor`'s trailing dims.
    :return: `smaller_tensor` expanded to `dynamic_tensor`'s shape.
    """
    assert len(dynamic_tensor.shape) > len(smaller_tensor.shape)
    # Sanity-check dim compatibility of the trailing axes.
    # BUG FIX: the original code iterated `smaller_tensor.shape` directly
    # (unpacking bare ints -> TypeError) and called len() on the tensors
    # instead of on their shapes; use enumerate() over the shape list and
    # compare shape lengths instead.
    if isinstance(smaller_tensor.shape, list):
        for dim_idx, dim in enumerate(smaller_tensor.shape):
            dynamic_tensor_dim_idx = (
                len(dynamic_tensor.shape) - len(smaller_tensor.shape) + dim_idx)
            assert dynamic_tensor.shape[dynamic_tensor_dim_idx] % dim == 0
    elif isinstance(smaller_tensor.shape, int):
        assert dynamic_tensor.shape[-1] % smaller_tensor.shape == 0
    memory_embs_zero = fluid.layers.scale(dynamic_tensor, scale=0.0)
    smaller_tensor = fluid.layers.elementwise_add(memory_embs_zero, smaller_tensor)
    return smaller_tensor
def print_tensor(tensor, message, print_runtime=False):
    """Log a tensor's static (graph-time) shape under a label.

    :param tensor: Paddle variable to inspect.
    :param message: label prefixed to the log line.
    :param print_runtime: if True, also insert a Print op into the graph
        so the first few values are dumped when the program executes.
    """
    logger.info("{}: {}".format(message, tensor.shape))
    if print_runtime:
        fluid.layers.Print(tensor, summarize=10, message=message)
class MemoryLayer(object):
    """Knowledge-memory attention layer (KTNET).

    For every token, attends over that token's candidate knowledge-base
    concept embeddings (plus a learned "sentinel" slot that lets the
    model attend to nothing), producing a knowledge summary vector which
    is combined with the BERT output according to `mem_method`.
    """

    def __init__(self, bert_config, concept_size, mem_emb_size, mem_method='cat', prefix=None):
        # bert_config is dict-like (loaded BERT config JSON — TODO confirm).
        self.initializer_range = bert_config['initializer_range']
        self.bert_size = bert_config['hidden_size']
        # Max number of candidate concepts per token.
        self.concept_size = concept_size
        # Dimensionality of each concept embedding.
        self.mem_emb_size = mem_emb_size
        assert mem_method in ['add', 'cat', 'raw']
        # How the knowledge summary is merged with the BERT output.
        self.mem_method = mem_method
        # Optional name prefix so two MemoryLayers can coexist in one graph.
        self.prefix = prefix

    def forward(self, bert_output, memory_embs, mem_length, ignore_no_memory_token=True):
        """
        :param bert_output: [batch_size, seq_size, bert_size]
        :param memory_embs: [batch_size, seq_size, concept_size, mem_emb_size]
        :param mem_length: [batch_size, sent_size, 1] — number of valid
            concepts per token; positions past this count are masked out.
        :param ignore_no_memory_token: if True, zero the summary for
            tokens that have no concepts at all (mem_length == 0).
        :return: merged output (shape depends on mem_method).
        """
        bert_size = self.bert_size
        concept_size = self.concept_size
        mem_emb_size = self.mem_emb_size
        print_tensor(bert_output, "bert_output")
        print_tensor(memory_embs, "memory_embs")
        print_tensor(mem_length, "mem_length")
        # Project BERT hidden states into the concept-embedding space so
        # dot-product attention scores are dimensionally consistent.
        projected_bert = fluid.layers.fc(bert_output, size=mem_emb_size, num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                name='{}_memory_layer_projection.w_0'.format(self.prefix) if self.prefix else 'memory_layer_projection.w_0',
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=self.initializer_range)),
            bias_attr=False)  # [batch_size *seq_size, mem_emb_size]
        logger.info("projected_bert: {}".format(projected_bert.shape))
        expanded_bert = fluid.layers.unsqueeze(projected_bert, axes=[2])  # [batch_size, seq_size, 1, mem_emb_size]
        # Prepend the learned sentinel slot and score all memory slots.
        extended_memory, memory_score = self.add_sentinel(expanded_bert, memory_embs, mem_emb_size)
        # extended_memory: [batch_size, seq_size, 1+concept_size, mem_emb_size]
        # memory_score: [batch_size, seq_size, 1+concept_size]
        concept_ordinal = self.get_concept_oridinal(concept_size, memory_score)  # [bs,sq,1+cs]
        # True where the slot index exceeds the valid concept count, i.e.
        # slots to be masked out of the softmax.
        memory_reverse_mask = fluid.layers.less_than(
            fluid.layers.expand(mem_length, expand_times=[1, 1, 1 + concept_size]),
            concept_ordinal)
        # [batch_size, seq_size, 1+concept_size]
        memory_reverse_mask = fluid.layers.cast(memory_reverse_mask, dtype="float32")
        print_tensor(memory_reverse_mask, "memory_reverse_mask")
        # -1e6 at masked slots -> effectively zero attention after softmax.
        memory_reverse_masked_infinity = fluid.layers.scale(memory_reverse_mask, scale=-1e6)
        # [batch_size, seq_size, 1+concept_size]
        print_tensor(memory_reverse_masked_infinity, "memory_reverse_masked_infinity")
        memory_score = fluid.layers.elementwise_add(memory_score, memory_reverse_masked_infinity)
        # [batch_size, seq_size, 1+concept_size]
        logger.info("memory_score:{}".format(memory_score.shape))
        memory_att = fluid.layers.softmax(memory_score)  # [batch_size, seq_size, 1+concept_size]
        memory_att = fluid.layers.unsqueeze(memory_att, axes=[2])  # [batch_size, seq_size, 1, 1+concept_size]
        logger.info("memory_att: {}".format(memory_att.shape))
        logger.info("extended_memory: {}".format(extended_memory.shape))
        # Attention-weighted sum over memory slots.
        summ = fluid.layers.matmul(memory_att, extended_memory)  # [batch_size, seq_size,1, mem_emb_size]
        summ = fluid.layers.squeeze(summ, axes=[2])  # [batch_size, seq_size,mem_emb_size]
        if ignore_no_memory_token:
            # condition is True only where mem_length > 0 (0 < mem_length).
            condition = fluid.layers.less_than(
                dynamic_expand(mem_length, fluid.layers.zeros([1], "float32")),
                mem_length)  # [bs, sq]
            # Zero the summary for tokens with no attached concepts.
            summ = fluid.layers.elementwise_mul(
                summ,
                fluid.layers.cast(condition, "float32"))  # [bs, sq, ms]
            print_tensor(summ, "summ")
        if self.mem_method == "add":
            # Project summary back to BERT size and add residually.
            summ_transform = fluid.layers.fc(summ, size=bert_size, num_flatten_dims=2)  # [batch_size, seq_size, bert_size]
            output = fluid.layers.sums(input=[summ_transform, bert_output])  # [batch_size, seq_size, bert_size]
        elif self.mem_method == "cat":
            logger.info("bert_output: {}".format(bert_output.shape))
            logger.info("summ: {}".format(summ.shape))
            output = fluid.layers.concat(input=[bert_output, summ], axis=2)  # [batch_size, seq_size, bert_size + mem_emb_size]
        elif self.mem_method == "raw":
            logger.info("bert_output: {}".format(bert_output.shape))
            logger.info("summ: {}".format(summ.shape))
            output = summ  # [batch_size, seq_size, mem_emb_size]
        else:
            raise ValueError("mem_method not supported")
        logger.info("output: {}".format(output.shape))
        return output

    def get_concept_oridinal(self, concept_size, memory_score):
        """Build a per-slot index tensor [0, 1, ..., concept_size] broadcast
        to memory_score's shape; used to mask slots past each token's
        valid concept count.  (Name keeps the original "oridinal" typo —
        it is part of the class's public surface.)

        :param concept_size:
        :param memory_score: [batch_size, seq_size, 1+concept_size]
        :return: [batch_size, seq_size, 1+concept_size]
        """
        concept_ordinal = fluid.layers.create_tensor(dtype="float32")
        fluid.layers.assign(np.arange(start=0, stop=(1 + concept_size), step=1, dtype=np.float32),
                            concept_ordinal)  # [1+cs]
        print_tensor(concept_ordinal, "concept_ordinal")
        print_tensor(memory_score, "memory_score")
        concept_ordinal = dynamic_expand(memory_score, concept_ordinal)  # [bs,sq,1+cs]
        logger.info("concept_ordinal: {}".format(concept_ordinal.shape))
        return concept_ordinal

    def add_sentinel(self, expanded_bert, memory_embs, mem_emb_size):
        """Prepend a learned sentinel vector to the memory and score every
        slot (sentinel + concepts) against the projected BERT feature.

        :param expanded_bert: [batch_size, seq_size, 1, mem_emb_size]
        :param memory_embs: [batch_size, seq_size, concept_size, mem_emb_size]
        :param mem_emb_size:
        :return: (extended_memory [bs, sq, 1+cs, ms],
                  memory_score [bs, sq, 1+cs])
        """
        sentinel = fluid.layers.create_parameter(
            name='{}_memory_layer_sentinel'.format(self.prefix) if self.prefix else 'memory_layer_sentinel',
            dtype="float32",
            shape=[mem_emb_size],
            default_initializer=fluid.initializer.ConstantInitializer(0))  # [mem_emb_size]
        print_tensor(sentinel, "sentinel")
        # Slice one concept slot only to obtain a correctly-shaped target
        # for broadcasting the sentinel.
        memory_embs_squeeze = fluid.layers.slice(memory_embs, axes=[2], starts=[0],
                                                 ends=[1])  # [bs,sq,1,ms]
        print_tensor(memory_embs_squeeze, "memory_embs_squeeze")
        sentinel = dynamic_expand(memory_embs_squeeze, sentinel)  # [bs,sq,1,ms]
        print_tensor(sentinel, "sentinel")
        print_tensor(memory_embs, "memory_embs")
        extended_memory = fluid.layers.concat([sentinel, memory_embs],
                                              axis=2)  # [batch_size, seq_size, 1+concept_size, mem_emb_size]
        extended_memory = fluid.layers.transpose(extended_memory, perm=[0, 1, 3, 2])
        # [batch_size, seq_size, mem_emb_size, 1+concept_size]
        logger.info("extended_memory: {}".format(extended_memory.shape))
        # Dot-product score of the token feature against every memory slot.
        memory_score = fluid.layers.matmul(expanded_bert,
                                           extended_memory)  # [batch_size, seq_size, 1, 1+concept_size]
        memory_score = fluid.layers.squeeze(memory_score, axes=[2])
        # [batch_size, seq_size, 1+concept_size]
        extended_memory = fluid.layers.transpose(extended_memory, perm=[0, 1, 3, 2])
        # [batch_size, seq_size, 1+concept_size, mem_emb_size]
        return extended_memory, memory_score
class TriLinearTwoTimeSelfAttentionLayer(object):
    """Tri-linear self-matching attention (BiDAF-style) with optional
    "two-time" (attention-of-attention) features.

    Similarity between positions i and k is
    w1·h_i + w2·h_k + w_mul·(h_i * h_k) + bias, computed with broadcast
    matmuls.  The output concatenates the input with the attention
    summary and any of the optional interaction features enabled by the
    cat_* flags.
    """

    def __init__(self, hidden_size, dropout_rate=0.0,
                 cat_mul=False, cat_sub=False, cat_twotime=False, cat_twotime_mul=False, cat_twotime_sub=False):
        # NOTE(review): dropout_rate is stored but never applied in
        # forward() — confirm whether dropout was intended here.
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        # Flags controlling which extra feature tensors are concatenated.
        self.cat_mul = cat_mul
        self.cat_sub = cat_sub
        self.cat_twotime = cat_twotime
        self.cat_twotime_mul = cat_twotime_mul
        self.cat_twotime_sub = cat_twotime_sub

    def forward(self, hidden_emb, sequence_mask):
        """
        :param hidden_emb: [batch_size, seq_size, hidden_size]
        :param sequence_mask: [batch_size, seq_size, 1] — 1 for real
            tokens, 0 for padding (masked out of the softmax).
        :return: [batch_size, seq_size, k*hidden_size] where k depends on
            the enabled cat_* flags (at least 2).
        """
        assert len(hidden_emb.shape) == 3 and len(sequence_mask.shape) == 3 \
            and sequence_mask.shape[-1] == 1
        assert hidden_emb.shape[:2] == sequence_mask.shape[:2]
        hidden_size = self.hidden_size
        bias = fluid.layers.create_parameter(name='self_matching_layer_bias', shape=[1], dtype="float32",
            default_initializer=fluid.initializer.ConstantInitializer(0))
        # w1: scores the "query" position of each pair.
        weight_1 = fluid.layers.create_parameter(name='self_matching_layer_weight1', shape=[hidden_size], dtype="float32",
            default_initializer=fluid.initializer.XavierInitializer(uniform=True, fan_in=1, fan_out=hidden_size))  # [HS]
        # One-position slice used only as a broadcast-shape template.
        bs_1_hs = fluid.layers.slice(hidden_emb, axes=[1], starts=[0], ends=[1])  # [bs, 1, hs]
        print_tensor(bs_1_hs, "bs_1_hs")
        bs_hs_1 = fluid.layers.transpose(bs_1_hs, perm=[0, 2, 1])  # [bs, hs, 1]
        print_tensor(bs_hs_1, "bs_hs_1")
        print_tensor(weight_1, "weight_1")
        weight_1 = dynamic_expand(bs_1_hs, weight_1)  # [BS, 1, HS] (a)jk
        weight_1 = fluid.layers.transpose(weight_1, perm=[0, 2, 1])
        print_tensor(hidden_emb, "hidden_emb")
        print_tensor(weight_1, "weight_1")
        r1 = fluid.layers.matmul(hidden_emb, weight_1)  # [BS, SQ, 1] aik
        print_tensor(r1, "r1")
        # w2: scores the "key" position of each pair.
        weight_2 = fluid.layers.create_parameter(name='self_matching_layer_weight2', shape=[hidden_size], dtype="float32",
            default_initializer=fluid.initializer.XavierInitializer(uniform=True, fan_in=1, fan_out=hidden_size))  # [HS]
        weight_2 = dynamic_expand(bs_1_hs, weight_2)  # # [BS, 1, HS] (a)jk
        hidden_emb_transpose = fluid.layers.transpose(hidden_emb, perm=[0, 2, 1])  # [BS, HS, SQ] aji
        r2 = fluid.layers.matmul(weight_2, hidden_emb_transpose)  # [BS, 1, SQ] aki
        print_tensor(r2, "r2")
        # w_mul: weights the elementwise-product interaction term.
        weight_mul = fluid.layers.create_parameter(name='self_matching_layer_weightmul', shape=[hidden_size], dtype="float32",
            default_initializer=fluid.initializer.XavierInitializer(uniform=True))  # [HS]
        weight_mul = dynamic_expand(hidden_emb, weight_mul)
        rmul_1 = fluid.layers.elementwise_mul(hidden_emb, weight_mul)  # for "hidden * self.weight_mul". [bs, sq(i), hs(j)]
        print_tensor(rmul_1, "rmul_1")
        rmul_2 = fluid.layers.matmul(rmul_1, hidden_emb_transpose)  # [bs, sq(i), hs(j)] mul [bs, hs(j), sq(k)] = [bs, sq(i), sq(k)]
        print_tensor(rmul_2, "rmul_2")
        # Broadcast r1 across columns and r2 across rows via dynamic_expand
        # (transposes position the axis being expanded at the front).
        r1 = fluid.layers.squeeze(r1, axes=[2])  # [BS, SQ] aik
        r1 = dynamic_expand(
            fluid.layers.transpose(rmul_2, [1, 0, 2]),  # [sq, bs, sq]
            r1)  # [ SQ(from 1), bs, SQ]
        r1 = fluid.layers.transpose(r1, [1, 2, 0])  # [bs, sq, sq(from 1)]
        r2 = fluid.layers.squeeze(r2, axes=[1])  # [BS, SQ] aik
        r2 = dynamic_expand(
            fluid.layers.transpose(rmul_2, [1, 0, 2]),  # [sq, bs, sq]
            r2)  # [ SQ(from 1), bs, SQ]
        r2 = fluid.layers.transpose(r2, [1, 0, 2])  # [bs,sq(from 1),sq]
        bias = dynamic_expand(rmul_2, bias)  # [BS, SQ, SQ]
        sim_score = fluid.layers.sums(input=[r1, r2, rmul_2, bias])
        # [bs,sq,1]+[bs,1,sq]+[bs,sq,sq]+[bs,sq,sq]=[BS,SQ,SQ]
        print_tensor(sim_score, "sim_score")
        # Build an additive mask: -1e6 at padded key positions.
        sequence_mask = fluid.layers.cast(sequence_mask, dtype="float32")  # [BS,SQ,1]
        softmax_mask = fluid.layers.elementwise_sub(
            sequence_mask,
            fluid.layers.fill_constant([1], "float32", 1))  # [BS,SQ,1]
        softmax_mask = fluid.layers.scale(softmax_mask, -1)
        very_negative_number = fluid.layers.fill_constant([1], value=-1e6, dtype="float32")
        logger.info("softmax_mask: {}".format(softmax_mask.shape))
        logger.info("very_negative_number: {}".format(very_negative_number.shape))
        softmax_mask = fluid.layers.elementwise_mul(softmax_mask, very_negative_number)  # [BS,SQ,1]
        softmax_mask = fluid.layers.squeeze(softmax_mask, axes=[2])  # [BS,SQ]
        softmax_mask = dynamic_expand(fluid.layers.transpose(sim_score, perm=[2, 0, 1]), softmax_mask)  # [sq(1),bs,sq]
        softmax_mask = fluid.layers.transpose(softmax_mask, perm=[1, 0, 2])  # [BS,sq(1),SQ]
        print_tensor(softmax_mask, "softmax_mask")
        sim_score = fluid.layers.elementwise_add(sim_score, softmax_mask)  # [bs,sq,sq]+[bs,sq(1),sq]=[BS,SQ,SQ]
        print_tensor(sim_score, "sim_score")
        attn_prob = fluid.layers.softmax(sim_score)  # [BS,SQ,SQ]
        weighted_sum = fluid.layers.matmul(attn_prob, hidden_emb)  # [bs,sq,sq]*[bs,sq,hs]=[BS,SQ,HS]
        if any([self.cat_twotime, self.cat_twotime_mul, self.cat_twotime_sub]):
            # "Two-time" attention: square the attention matrix so each
            # position also aggregates its neighbors' attention targets.
            twotime_att_prob = fluid.layers.matmul(attn_prob, attn_prob)  # [bs,sq,sq]*[bs,sq,sq]=[BS,SQ,SQ]
            twotime_weited_sum = fluid.layers.matmul(twotime_att_prob, hidden_emb)  # [BS,SQ,HS]
        out_tensors = [hidden_emb, weighted_sum]
        if self.cat_mul:
            out_tensors.append(fluid.layers.elementwise_mul(hidden_emb, weighted_sum))
        if self.cat_sub:
            out_tensors.append(fluid.layers.elementwise_sub(hidden_emb, weighted_sum))
        if self.cat_twotime:
            out_tensors.append(twotime_weited_sum)
        if self.cat_twotime_mul:
            out_tensors.append(fluid.layers.elementwise_mul(hidden_emb, twotime_weited_sum))
        if self.cat_twotime_sub:
            out_tensors.append(fluid.layers.elementwise_sub(hidden_emb, twotime_weited_sum))
        output = fluid.layers.concat(out_tensors, axis=2)  # [BS,SQ, HS+HS+....]
        print_tensor(output, "output")
        return output
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from functools import partial, reduce
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layer_helper import LayerHelper
def layer_norm(x, begin_norm_axis=1, epsilon=1e-12, param_attr=None, bias_attr=None):
    """
    Replace build-in layer_norm op with this function
    """
    # NOTE: the helper must be created before any other locals are bound,
    # because **locals() should capture only the function arguments.
    helper = LayerHelper('layer_norm', **locals())
    # Normalize over the dims starting at begin_norm_axis.
    mu = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
    centered = layers.elementwise_sub(x=x, y=mu, axis=0)
    var = layers.reduce_mean(layers.square(centered), dim=begin_norm_axis, keep_dim=True)
    inv_std = layers.rsqrt(var + epsilon)
    normalized = layers.elementwise_mul(x=centered, y=inv_std, axis=0)
    # One scale/bias element per normalized feature.
    elem_count = reduce(lambda a, b: a * b, normalized.shape[begin_norm_axis:])
    param_shape = [elem_count]
    param_dtype = normalized.dtype
    scale = helper.create_parameter(
        attr=param_attr,
        shape=param_shape,
        dtype=param_dtype,
        default_initializer=fluid.initializer.Constant(1.))
    bias = helper.create_parameter(
        attr=bias_attr,
        shape=param_shape,
        dtype=param_dtype,
        is_bias=True,
        default_initializer=fluid.initializer.Constant(0.))
    scaled = layers.elementwise_mul(x=normalized, y=scale, axis=-1)
    return layers.elementwise_add(x=scaled, y=bias, axis=-1)
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         param_initializer=None,
                         name='multi_head_att'):
    """
    Multi-Head Attention. Note that attn_bias is added to the logit before
    computing softmax activiation to mask certain selected positions so that
    they will not considered in attention weights.

    :param queries: [batch, q_len, d_model]; keys/values default to
        `queries` (self-attention) when passed as None.
    :param attn_bias: additive bias on the attention logits (large
        negative values mask positions), or a falsy value to skip.
    :param cache: optional dict with "k"/"v" entries for incremental
        decoding; mutated in place with the concatenated states.
    :return: [batch, q_len, d_model] projected context.
    """
    # Self-attention shortcuts: missing keys/values fall back to queries.
    keys = queries if keys is None else keys
    values = keys if values is None else values
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: quries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_query_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_query_fc.b_0')
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_key_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_key_fc.b_0')
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_value_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_value_fc.b_0')
        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of inpunt tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(
            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
        # permuate the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of inpunt tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) == 3: return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")
        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        # Scale queries by 1/sqrt(d_key) before the dot product.
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights,
                dropout_prob=dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
    if cache is not None:  # use cache and concat time steps
        # Since the inplace reshape in __split_heads changes the shape of k and
        # v, which is the cache input for next time step, reshape the cache
        # input from the previous time step first.
        k = cache["k"] = layers.concat(
            [layers.reshape(
                cache["k"], shape=[0, 0, d_model]), k], axis=1)
        v = cache["v"] = layers.concat(
            [layers.reshape(
                cache["v"], shape=[0, 0, d_model]), v], axis=1)
    q = __split_heads(q, n_head)
    k = __split_heads(k, n_head)
    v = __split_heads(v, n_head)
    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                                  dropout_rate)
    out = __combine_heads(ctx_multiheads)
    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=2,
                         param_attr=fluid.ParamAttr(
                             name=name + '_output_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_output_fc.b_0')
    return proj_out
def positionwise_feed_forward(x,
                              d_inner_hid,
                              d_hid,
                              dropout_rate,
                              hidden_act,
                              param_initializer=None,
                              name='ffn'):
    """
    Position-wise Feed-Forward Networks.
    This module consists of two linear transformations with a ReLU activation
    in between, which is applied to each position separately and identically.
    """
    # Expand to the inner dimension with the configured activation.
    inner = layers.fc(input=x,
                      size=d_inner_hid,
                      num_flatten_dims=2,
                      act=hidden_act,
                      param_attr=fluid.ParamAttr(
                          name=name + '_fc_0.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_fc_0.b_0')
    if dropout_rate:
        inner = layers.dropout(
            inner,
            dropout_prob=dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=False)
    # Project back down to the model dimension (no activation).
    projected = layers.fc(input=inner,
                          size=d_hid,
                          num_flatten_dims=2,
                          param_attr=fluid.ParamAttr(
                              name=name + '_fc_1.w_0', initializer=param_initializer),
                          bias_attr=name + '_fc_1.b_0')
    return projected
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
                           name=''):
    """
    Add residual connection, layer normalization and droput to the out tensor
    optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.

    :param prev_out: residual input, or None (as bound by pre_process_layer).
    :param process_cmd: string of commands applied in order —
        "a" residual add, "n" layer norm, "d" dropout.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            # FIX: compare against None explicitly instead of relying on
            # truthiness — a Paddle Variable must not be used in a boolean
            # context (only None should disable the residual add).
            out = out + prev_out if prev_out is not None else out
        elif cmd == "n":  # add layer normalization
            # layer_norm runs in fp32 even under fp16 training for
            # numerical stability, then casts back.
            out_dtype = out.dtype
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=False)
    return out
# Pre-processing has no residual input, so bind prev_out=None.
pre_process_layer = partial(pre_post_process_layer, None)
# Post-processing passes the residual input explicitly as the first argument.
post_process_layer = pre_post_process_layer
def encoder_layer(enc_input,
                  attn_bias,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  prepostprocess_dropout,
                  attention_dropout,
                  relu_dropout,
                  hidden_act,
                  preprocess_cmd="n",
                  postprocess_cmd="da",
                  param_initializer=None,
                  name=''):
    """The encoder layers that can be stacked to form a deep encoder.
    This module consits of a multi-head (self) attention followed by
    position-wise feed-forward networks and both the two components companied
    with the post_process_layer to add residual connection, layer normalization
    and droput.

    :param preprocess_cmd/postprocess_cmd: command strings for
        pre_post_process_layer ("a" add residual, "n" layer norm,
        "d" dropout), applied before/after each sub-layer.
    """
    # Self-attention sub-layer: keys/values are None -> self-attention
    # over the (pre-processed) input.
    attn_output = multi_head_attention(
        pre_process_layer(
            enc_input,
            preprocess_cmd,
            prepostprocess_dropout,
            name=name + '_pre_att'),
        None,
        None,
        attn_bias,
        d_key,
        d_value,
        d_model,
        n_head,
        attention_dropout,
        param_initializer=param_initializer,
        name=name + '_multi_head_att')
    # Residual/norm/dropout around the attention output, with the raw
    # enc_input as the residual branch.
    attn_output = post_process_layer(
        enc_input,
        attn_output,
        postprocess_cmd,
        prepostprocess_dropout,
        name=name + '_post_att')
    # Position-wise feed-forward sub-layer.
    ffd_output = positionwise_feed_forward(
        pre_process_layer(
            attn_output,
            preprocess_cmd,
            prepostprocess_dropout,
            name=name + '_pre_ffn'),
        d_inner_hid,
        d_model,
        relu_dropout,
        hidden_act,
        param_initializer=param_initializer,
        name=name + '_ffn')
    return post_process_layer(
        attn_output,
        ffd_output,
        postprocess_cmd,
        prepostprocess_dropout,
        name=name + '_post_ffn')
def encoder(enc_input,
            attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd="n",
            postprocess_cmd="da",
            param_initializer=None,
            name=''):
    """
    The encoder is composed of a stack of identical layers returned by calling
    encoder_layer.  Each layer's output feeds the next; a final pre-process
    step (normalization) is applied to the last layer's output.
    """
    for layer_idx in range(n_layer):
        block_out = encoder_layer(
            enc_input,
            attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd,
            postprocess_cmd,
            param_initializer=param_initializer,
            name=name + '_layer_' + str(layer_idx))
        # Chain: this layer's output becomes the next layer's input.
        enc_input = block_out
    return pre_process_layer(
        block_out, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from utils.fp16 import create_master_params_grads, master_param_to_train_param
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
    """ Applies linear warmup of learning rate from 0 and decay to 0.

    NOTE(review): the `warmup_steps` parameter is NOT used below — the
    warmup phase is hard-coded to the first 10% of `num_train_steps`
    (global_step < num_train_steps * 0.1).  Confirm whether this KTNET
    modification of the stock BERT scheduler is intentional.
    """
    with fluid.default_main_program()._lr_schedule_guard():
        # Persistent scalar holding the current LR; assigned each step.
        lr = fluid.layers.tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="scheduled_learning_rate")
        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(global_step < num_train_steps * 0.1):
                # Linear ramp from 0 to learning_rate over the warmup phase.
                warmup_lr = learning_rate * (global_step / (num_train_steps * 0.1))
                fluid.layers.tensor.assign(warmup_lr, lr)
            with switch.default():
                # After warmup: linear (power=1.0) decay to 0 at
                # num_train_steps.
                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
                    learning_rate=learning_rate,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.tensor.assign(decayed_lr, lr)
        return lr
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 loss_scaling=1.0):
    """Build the Adam training step with optional warmup scheduling,
    gradient clipping, manual (AdamW-style) weight decay, and fp16
    master-weight handling.

    :param loss: scalar loss variable to minimize.
    :param scheduler: 'noam_decay' or 'linear_warmup_decay'; only used
        when warmup_steps > 0.
    :param loss_scaling: static loss scale for fp16 training.
    :return: the scheduled learning-rate variable (or the plain float
        learning_rate when no warmup is used).
    """
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
                .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
                            warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr, epsilon=1e-6)
    else:
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, epsilon=1e-6)
        scheduled_lr = learning_rate
    clip_norm_thres = 1.0
    # When using mixed precision training, scale the gradient clip threshold
    # by loss_scaling
    if use_fp16 and loss_scaling > 1.0:
        clip_norm_thres *= loss_scaling
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))

    def exclude_from_weight_decay(name):
        # NOTE(review): defined but currently unused — the calls below are
        # commented out in favor of excluding only the concept embedding
        # matrices.  Kept for reference.
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    # Snapshot parameter values BEFORE the optimizer step so the decay
    # term uses pre-update weights (decoupled / AdamW-style decay).
    param_list = dict()
    if use_fp16:
        # fp16 path: maintain fp32 master copies, step on those, then copy
        # back to the fp16 training parameters.
        param_grads = optimizer.backward(loss)
        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)
        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True
        optimizer.apply_gradients(master_param_grads)
        if weight_decay > 0:
            for param, grad in master_param_grads:
                # The pretrained KB concept embeddings are deliberately
                # exempt from weight decay.
                if param.name == 'concept_emb_mat' or param.name == 'wn_concept_emb_mat' or param.name == 'nell_concept_emb_mat':
                    continue
                with param.block.program._optimized_guard(
                        [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)
        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)
    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True
        _, param_grads = optimizer.minimize(loss)
        if weight_decay > 0:
            for param, grad in param_grads:
                # Same exemption as above for concept embedding matrices.
                if param.name == 'concept_emb_mat' or param.name == 'wn_concept_emb_mat' or param.name == 'nell_concept_emb_mat':
                    continue
                with param.block.program._optimized_guard(
                        [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)
    return scheduled_lr
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import argparse
import logging
# Configure root logging at import time so argument dumps share the module's
# timestamped format.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def str2bool(v):
    """Parse a command-line string into a boolean.

    argparse cannot parse "True"/"False" strings as booleans on its own,
    so (case-insensitively) "true", "t" and "1" map to True and any
    other value maps to False.
    """
    normalized = v.lower()
    return normalized in ("true", "t", "1")
class ArgumentGroup(object):
    """Thin wrapper around an argparse argument group that appends the
    default value to every help string and parses bool flags through
    the module-level str2bool helper."""

    def __init__(self, parser, title, des):
        self._group = parser.add_argument_group(title=title, description=des)

    def add_arg(self, name, type, default, help, **kwargs):
        # argparse's bool() treats any non-empty string as True, so bool
        # arguments are routed through str2bool instead.
        arg_type = str2bool if type == bool else type
        self._group.add_argument(
            "--" + name,
            default=default,
            type=arg_type,
            help=help + ' Default: %(default)s.',
            **kwargs)
def print_arguments(args):
    """Log every parsed argument as ``name: value``, sorted by name.

    Args:
        args: an ``argparse.Namespace`` (or any object accepted by ``vars()``).
    """
    logger.info('----------- Configuration Arguments -----------')
    # vars(args).items() replaces six.iteritems(): identical output on both
    # Python 2 and 3, and drops the six dependency from this function.
    for arg, value in sorted(vars(args).items()):
        logger.info('%s: %s' % (arg, value))
    logger.info('------------------------------------------------')
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def cast_fp16_to_fp32(i, o, prog):
    """Append a ``cast`` op to ``prog`` converting fp16 var ``i`` into fp32 var ``o``."""
    src_dtype = fluid.core.VarDesc.VarType.FP16
    dst_dtype = fluid.core.VarDesc.VarType.FP32
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={"in_dtype": src_dtype,
               "out_dtype": dst_dtype})
def cast_fp32_to_fp16(i, o, prog):
    """Append a ``cast`` op to ``prog`` converting fp32 var ``i`` into fp16 var ``o``."""
    src_dtype = fluid.core.VarDesc.VarType.FP32
    dst_dtype = fluid.core.VarDesc.VarType.FP16
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={"in_dtype": src_dtype,
               "out_dtype": dst_dtype})
def copy_to_master_param(p, block):
    """Create an fp32 "master" copy of parameter ``p`` inside ``block``.

    The master parameter mirrors every attribute of the block's variable of
    the same name, except dtype (forced to fp32) and name (suffixed with
    ``.master``).

    Raises:
        ValueError: if ``block`` holds no variable named ``p.name``.
    """
    src_var = block.vars.get(p.name, None)
    if src_var is None:
        raise ValueError("no param name %s found!" % p.name)
    return fluid.framework.Parameter(
        block=block,
        shape=src_var.shape,
        dtype=fluid.core.VarDesc.VarType.FP32,
        type=src_var.type,
        lod_level=src_var.lod_level,
        stop_gradient=p.stop_gradient,
        trainable=p.trainable,
        optimize_attr=p.optimize_attr,
        regularizer=p.regularizer,
        gradient_clip_attr=p.gradient_clip_attr,
        error_clip=p.error_clip,
        name=src_var.name + ".master")
def create_master_params_grads(params_grads, main_prog, startup_prog,
                               loss_scaling):
    """Build fp32 master (param, grad) pairs for mixed-precision training.

    For each (param, grad): a ``.master`` fp32 copy of the param is created
    in ``main_prog`` and initialized in ``startup_prog`` by casting the fp16
    startup value. Gradients are cast to fp32 and unscaled by
    ``loss_scaling`` (when > 1). Params whose gradient name contains
    "layer_norm" are passed through as-is (only unscaled), since they are
    not cast to fp16 elsewhere in this module.
    """
    master_pairs = []
    saved_role = main_prog._current_role
    op_role = fluid.core.op_proto_and_checker_maker.OpRole
    # Ops appended below (casts) must be tagged as backward-phase ops.
    main_prog._current_role = op_role.Backward
    for param, grad in params_grads:
        master_param = copy_to_master_param(param, main_prog.global_block())
        startup_master = startup_prog.global_block()._clone_variable(
            master_param)
        startup_param = startup_prog.global_block().var(param.name)
        cast_fp16_to_fp32(startup_param, startup_master, startup_prog)
        if "layer_norm" in grad.name:
            # Keep the original param; just undo the loss scaling.
            if loss_scaling > 1:
                master_pairs.append([param, grad / float(loss_scaling)])
            else:
                master_pairs.append([param, grad])
        else:
            # Cast the fp16 gradient to fp32 before applying it.
            fp32_grad = fluid.layers.cast(grad, "float32")
            if loss_scaling > 1:
                fp32_grad = fp32_grad / float(loss_scaling)
            master_pairs.append([master_param, fp32_grad])
    main_prog._current_role = saved_role
    return master_pairs
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
    """Copy updated fp32 master params back into their fp16 training params.

    The two lists are assumed to be index-aligned. Params whose name
    contains "layer_norm" were never cast to a master copy, so they are
    skipped.
    """
    for idx, (m_param, m_grad) in enumerate(master_params_grads):
        train_param, _ = params_grads[idx]
        if "layer_norm" in train_param.name:
            continue
        with main_prog._optimized_guard([m_param, m_grad]):
            cast_fp32_to_fp16(m_param, train_param, main_prog)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import ast
import copy
import logging
import numpy as np
import paddle.fluid as fluid
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def cast_fp32_to_fp16(exe, main_program):
    """Convert loaded fp32 parameter tensors to fp16 in the global scope.

    ``.master`` parameters are left untouched; params whose name contains
    "layer_norm" keep fp32 values (only their master copy, if any, is
    refreshed). fp16 data is stored via a uint16 view of the float16 array.
    """
    logger.info("Cast parameters to float16 data format.")
    for param in main_program.global_block().all_parameters():
        if param.name.endswith(".master"):
            continue
        tensor = fluid.global_scope().find_var(param.name).get_tensor()
        fp32_values = np.array(tensor)
        if "layer_norm" not in param.name:
            tensor.set(np.float16(fp32_values).view(np.uint16), exe.place)
        master_var = fluid.global_scope().find_var(param.name + ".master")
        if master_var is not None:
            # Keep the fp32 master copy in sync with the loaded values.
            master_var.get_tensor().set(fp32_values, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
    """Restore persistable variables of ``main_program`` from a checkpoint dir.

    Only variables that are persistable AND have a matching file inside
    ``init_checkpoint_path`` are loaded. When ``use_fp16`` is set, loaded
    fp32 parameters are afterwards cast to fp16.
    """
    assert os.path.exists(
        init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path

    def _saved_persistable(var):
        # Skip non-persistables and variables absent from the checkpoint.
        return fluid.io.is_persistable(var) and os.path.exists(
            os.path.join(init_checkpoint_path, var.name))

    fluid.io.load_vars(
        exe,
        init_checkpoint_path,
        main_program=main_program,
        predicate=_saved_persistable)
    logger.info("Load model from {}".format(init_checkpoint_path))
    if use_fp16:
        cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
                            pretraining_params_path,
                            main_program,
                            use_fp16=False):
    """Load pretrained parameter values into ``main_program``.

    Unlike ``init_checkpoint`` this restores only ``Parameter`` variables
    (not every persistable), and only those with a matching file under
    ``pretraining_params_path``. When ``use_fp16`` is set, loaded fp32
    parameters are afterwards cast to fp16.
    """
    assert os.path.exists(pretraining_params_path
                          ), "[%s] cann't be found." % pretraining_params_path

    def _saved_parameter(var):
        # Skip non-parameters and parameters absent from the params dir.
        return isinstance(var, fluid.framework.Parameter) and os.path.exists(
            os.path.join(pretraining_params_path, var.name))

    fluid.io.load_vars(
        exe,
        pretraining_params_path,
        main_program=main_program,
        predicate=_saved_parameter)
    logger.info("Load pretraining parameters from {}.".format(
        pretraining_params_path))
    if use_fp16:
        cast_fp32_to_fp16(exe, main_program)
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册