Merge branch 'develop' of https://github.com/PaddlePaddle/models into fix_python3_tab_bug

f0ea0ef5 · shippingwang · 08236d32 · a8cd574b · f0ea0ef5 · f0ea0ef5
24 changed file
--- a/PaddleCV/image_classification/models/resnet.py
+++ b/PaddleCV/image_classification/models/resnet.py
@@ -105,7 +105,7 @@ class ResNet():
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
-            padding=(filter_size - 1) / 2,
+            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),

--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -40,7 +40,6 @@ add_arg('lr_strategy',      str,   "piecewise_decay",    "Set the learning rate
 add_arg('model',            str,   "SE_ResNeXt50_32x4d", "Set the network to use.")
 add_arg('enable_ce',        bool,  False,                "If set True, enable continuous evaluation job.")
 add_arg('data_dir',         str,   "./data/ILSVRC2012",  "The ImageNet dataset root dir.")
-add_arg('model_category',   str,   "models_name",        "Whether to use models_name or not, valid value:'models','models_name'." )
 add_arg('fp16',             bool,  False,                "Enable half precision training with fp16." )
 add_arg('scale_loss',       float, 1.0,                  "Scale loss for fp16." )
 add_arg('l2_decay',         float, 1e-4,                 "L2_decay parameter.")

--- a/PaddleNLP/ELMO/README.md
+++ b/PaddleNLP/ELMO/README.md
+<h1 align="center">ELMO</h1>
+
+## 介绍
+ELMO(Embeddings from Language Models)是一种新型深度语境化词表征，可对词进行复杂特征(如句法和语义)和词在语言语境中的变化进行建模(即对多义词进行建模)。PaddlePaddle版本该模型支持多卡训练，训练速度比主流实现快约1倍，并在中文词法分析任务上验证f1值提升0.68%。
+
+ELMO在大语料上以language model为训练目标，训练出bidirectional LSTM模型，利用LSTM产生词语的表征，对下游NLP任务(如问答、分类、命名实体识别等)进行微调。
+
+## 基本配置及第三方安装包
+
+Python==2.7
+
+PaddlePaddle latest版本
+
+numpy ==1.15.1
+
+six==1.11.0
+
+glob
+
+
+## 预训练模型
+
+1. 把文档文件切分成句子，并基于词表（参考vocabulary_min5k.txt）对句子进行切词。把文件切分成训练集trainset和测试集testset。
+
+```
+本 书 介绍 了 中国 经济 发展 的 内外 平衡 问题 、 亚洲 金融 危机 十 周年 回顾 与 反思 、 实践 中 的 城乡 统筹 发展 、 未来 十 年 中国 需要 研究 的 重大 课题 、 科学 发展 与 新型 工业 化 等 方面 。
+```
+```
+吴 敬 琏 曾经 提出 中国 股市 “ 赌场 论 ” ， 主张 维护 市场 规则 ， 保护 草根 阶层 生计 ， 被 誉 为 “ 中国 经济 学界 良心 ” ， 是 媒体 和 公众 眼中 的 学术 明星 
+```
+
+2. 训练模型
+
+```shell
+sh run.sh
+```
+
+3. 把checkpoint结果写入文件中。
+
+
+## 单机多卡训练
+
+模型支持单机多卡训练，需要在run.sh里export CUDA_VISIBLE_DEVICES设置指定卡,如下所示：
+```shell
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+```
+
+## 如何利用ELMO做微调
+
+1. 下载ELMO Paddle官方发布Checkpoint文件
+
+[PaddlePaddle官方发布Checkpoint文件下载地址](https://dureader.gz.bcebos.com/elmo/elmo_chinese_checkpoint.tar.gz)
+
+2. 在train部分中加载ELMO checkpoint文件
+
+```python
+src_pretrain_model_path = '490001' #490001为ELMO checkpoint文件
+def if_exist(var):
+    path = os.path.join(src_pretrain_model_path, var.name)
+    exist = os.path.exists(path)
+    if exist:
+        print('Load model: %s' % path)
+    return exist
+
+fluid.io.load_vars(executor=exe, dirname=src_pretrain_model_path, predicate=if_exist, main_program=main_program) 
+```
+
+3. 在下游NLP任务文件夹中加入bilm.py文件
+
+4. 基于elmo词表（参考vocabulary_min5k.txt）对输入的句子或段落进行切词，并把切词的词转化为id,放入feed_dict中。
+
+5. 在下游任务网络定义中embedding部分加入ELMO网络的定义
+
+```python
+#引入 bilm.py embedding部分和encoder部分
+from bilm import elmo_encoder
+from bilm import emb
+
+#word为输入elmo部分切词后的字典
+elmo_embedding = emb(word)
+elmo_enc= elmo_encoder(elmo_embedding)
+
+#与NLP任务中生成词向量word_embedding做连接操作
+word_embedding=layers.concat(input=[elmo_enc, word_embedding], axis=1)
+
+```
+
+
+## 参考论文
+[Deep contextualized word representations](https://arxiv.org/abs/1802.05365)
--- a/PaddleNLP/ELMO/args.py
+++ b/PaddleNLP/ELMO/args.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+
+
+def _str2bool(value):
+    """Parse a command-line boolean such as ``true``/``False``/``1``/``0``.
+
+    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
+    non-empty value (including ``"False"``) becomes ``True``.  This helper
+    maps the usual spellings explicitly instead.
+
+    Raises:
+        argparse.ArgumentTypeError: if the value is not a recognised boolean.
+    """
+    if isinstance(value, bool):
+        return value
+    lowered = value.strip().lower()
+    if lowered in ('true', 't', 'yes', 'y', '1'):
+        return True
+    # The empty string stays False, as it was under ``bool("")``.
+    if lowered in ('false', 'f', 'no', 'n', '0', ''):
+        return False
+    raise argparse.ArgumentTypeError(
+        'Boolean value expected, got %r' % value)
+
+
+def parse_args():
+    """Build the ELMO command-line parser and parse ``sys.argv``.
+
+    Returns:
+        argparse.Namespace: the parsed options (model sizes, data paths,
+        optimizer settings, logging/saving intervals, ...).
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--load_dir",
+        type=str,
+        default="",
+        help="Specify the path to load trained models.")
+    parser.add_argument(
+        "--load_pretraining_params",
+        type=str,
+        default="",
+        help="Specify the path to load pretrained model parameters, NOT including moment and learning_rate")
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=128,
+        help="The sequence number of a mini-batch data. (default: %(default)d)")
+    parser.add_argument(
+        "--embed_size",
+        type=int,
+        default=512,
+        help="The dimension of embedding table. (default: %(default)d)")
+    parser.add_argument(
+        "--hidden_size",
+        type=int,
+        default=4096,
+        help="The size of rnn hidden unit. (default: %(default)d)")
+    parser.add_argument(
+        "--num_layers",
+        type=int,
+        default=2,
+        help="The size of rnn layers. (default: %(default)d)")
+    parser.add_argument(
+        "--num_steps",
+        type=int,
+        default=20,
+        help="The size of sequence len. (default: %(default)d)")
+    parser.add_argument(
+        "--data_path", type=str, help="all the data for train,valid,test")
+    parser.add_argument("--vocab_path", type=str, help="vocab file path")
+    # Boolean options use _str2bool so that "--use_gpu False" is really False.
+    parser.add_argument(
+        '--use_gpu', type=_str2bool, default=False, help='whether using gpu')
+    parser.add_argument('--enable_ce', action='store_true')
+    parser.add_argument('--test_nccl', action='store_true')
+    parser.add_argument('--optim', default='adagrad', help='optimizer type')
+    parser.add_argument('--sample_softmax', action='store_true')
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=0.2,
+        help="Learning rate used to train the model. (default: %(default)f)")
+    parser.add_argument(
+        "--log_interval",
+        type=int,
+        default=100,
+        help="log the train loss every n batches. "
+        "(default: %(default)d)")
+    parser.add_argument(
+        "--save_interval",
+        type=int,
+        default=10000,
+        help="save the model every n batches. "
+        "(default: %(default)d)")
+    parser.add_argument(
+        "--dev_interval",
+        type=int,
+        default=10000,
+        help="cal dev loss every n batches. "
+        "(default: %(default)d)")
+    parser.add_argument('--dropout', type=float, default=0.1)
+    parser.add_argument('--max_grad_norm', type=float, default=10.0)
+    parser.add_argument('--proj_clip', type=float, default=3.0)
+    parser.add_argument('--cell_clip', type=float, default=3.0)
+    # NOTE(review): max_epoch is parsed as float; kept for compatibility.
+    parser.add_argument('--max_epoch', type=float, default=10)
+    parser.add_argument('--local', type=_str2bool, default=False)
+    parser.add_argument('--shuffle', type=_str2bool, default=False)
+    parser.add_argument(
+        '--use_custom_samples', type=_str2bool, default=False)
+    parser.add_argument('--para_save_dir', type=str, default='model_new')
+    parser.add_argument('--train_path', type=str, default='')
+    parser.add_argument('--test_path', type=str, default='')
+    parser.add_argument('--update_method', type=str, default='nccl2')
+    parser.add_argument('--random_seed', type=int, default=0)
+    parser.add_argument('--n_negative_samples_batch', type=int, default=8000)
+    args = parser.parse_args()
+
+    return args
--- a/PaddleNLP/ELMO/bilm.py
+++ b/PaddleNLP/ELMO/bilm.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is used for fine-tuning.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy
+import paddle.fluid.layers as layers
+import paddle.fluid as fluid
+import numpy as np
+
+# If you use our released weights, do not override these values via args.
+cell_clip = 3.0
+proj_clip = 3.0
+hidden_size = 4096
+vocab_size = 52445
+embed_size = 512
+# According to the original paper, dropout needs to be modified when
+# fine-tuning.
+# NOTE(review): modify_dropout is not read anywhere in the visible code.
+modify_dropout = 1
+proj_size = 512
+num_layers = 2
+random_seed = 0
+dropout_rate = 0.5
+
+
+def dropout(input):
+    """Apply dropout to `input` using the module-level settings.
+
+    Uses `dropout_rate` as the drop probability and `random_seed` as the
+    seed, with the "upscale_in_train" implementation.  `is_test` is always
+    False here, so dropout is applied on every call.
+    """
+    return layers.dropout(
+        input,
+        dropout_prob=dropout_rate,
+        dropout_implementation="upscale_in_train",
+        seed=random_seed,
+        is_test=False)
+
+def lstmp_encoder(input_seq, gate_size, h_0, c_0, para_name):
+    """Build one projected-LSTM layer on top of `input_seq`.
+
+    Args:
+        input_seq: input variable fed to the gate projection.
+        gate_size: LSTM gate size; the fc output is 4 * gate_size.
+        h_0: initial hidden state, or None.
+        c_0: initial cell state, or None.
+        para_name: prefix for the gate weight name (`<para_name>_gate_w`).
+
+    Returns:
+        (hidden, cell, input_proj): the two outputs of `dynamic_lstmp` and
+        the gate projection that was fed into it.
+    """
+    # A lstm encoder implementation with projection.
+    # Linear transformation part for input gate, output gate, forget gate
+    # and cell activation vectors need be done outside of dynamic_lstm.
+    # So the output size is 4 times of gate_size.
+
+    # The original passed `initializer=init`, but `init` is not defined in
+    # this module (NameError at graph-build time).  None selects the default
+    # initializer, matching lm_model.py.
+    input_proj = layers.fc(input=input_seq,
+                           param_attr=fluid.ParamAttr(
+                               name=para_name + '_gate_w', initializer=None),
+                           size=gate_size * 4,
+                           act=None,
+                           bias_attr=False)
+    hidden, cell = layers.dynamic_lstmp(
+        input=input_proj,
+        size=gate_size * 4,
+        proj_size=proj_size,
+        h_0=h_0,
+        c_0=c_0,
+        use_peepholes=False,
+        proj_clip=proj_clip,
+        cell_clip=cell_clip,
+        proj_activation="identity",
+        param_attr=fluid.ParamAttr(initializer=None),
+        bias_attr=fluid.ParamAttr(initializer=None))
+    return hidden, cell, input_proj
+
+
+def encoder(x_emb,
+            init_hidden=None,
+            init_cell=None,
+            para_name=''):
+    """Stack `num_layers` projected-LSTM layers and mix their outputs.
+
+    Args:
+        x_emb: embedded input sequence.
+        init_hidden: optional initial hidden states, sliced per layer on
+            axis 0.
+        init_cell: optional initial cell states, sliced per layer on axis 0.
+        para_name: parameter-name prefix; layer i uses
+            `<para_name>layer<i+1>`.
+
+    Returns:
+        (output_layer, rnn_outs_ori): the average of the first two layer
+        outputs, each scaled by a learnable scalar ("gamma1"/"gamma2"), and
+        the list of raw per-layer LSTM outputs.
+    """
+    rnn_input = x_emb
+    rnn_outs = []
+    rnn_outs_ori = []
+    # NOTE(review): cells and projs are never filled or returned here.
+    cells = []
+    projs = []
+    for i in range(num_layers):
+        if init_hidden and init_cell:
+            # Take layer i's slice and drop the leading layer axis.
+            h0 = layers.squeeze(
+                layers.slice(
+                    init_hidden, axes=[0], starts=[i], ends=[i + 1]),
+                axes=[0])
+            c0 = layers.squeeze(
+                layers.slice(
+                    init_cell, axes=[0], starts=[i], ends=[i + 1]),
+                axes=[0])
+        else:
+            h0 = c0 = None
+        rnn_out, cell, input_proj = lstmp_encoder(
+            rnn_input, hidden_size, h0, c0,
+            para_name + 'layer{}'.format(i + 1))
+        rnn_out_ori = rnn_out
+        if i > 0:
+            # Residual connection for layers after the first.
+            rnn_out = rnn_out + rnn_input
+        # NOTE(review): rnn_input is never reassigned in this loop, so every
+        # layer reads x_emb rather than the previous layer's output --
+        # confirm this matches how the checkpoint was trained.
+        rnn_out.stop_gradient = True
+        rnn_outs.append(rnn_out)
+        rnn_outs_ori.append(rnn_out_ori)
+    # add weight layers for fine-tuning
+    a1 = layers.create_parameter(
+        [1], dtype="float32", name="gamma1")
+    a2 = layers.create_parameter(
+        [1], dtype="float32", name="gamma2")
+    # NOTE(review): indexing rnn_outs[1] assumes num_layers >= 2.
+    rnn_outs[0].stop_gradient = True
+    rnn_outs[1].stop_gradient = True
+    num_layer1 = rnn_outs[0] * a1
+    num_layer2 = rnn_outs[1] * a2
+    output_layer = num_layer1 * 0.5 + num_layer2 * 0.5
+    return output_layer, rnn_outs_ori
+
+
+def emb(x):
+    """Look up embeddings for the ids in `x`.
+
+    The table has size [vocab_size, embed_size] and is stored under the
+    parameter name 'embedding_para'.
+    """
+    x_emb = layers.embedding(
+        input=x,
+        size=[vocab_size, embed_size],
+        dtype='float32',
+        is_sparse=False,
+        param_attr=fluid.ParamAttr(name='embedding_para'))
+    return x_emb
+
+
+def elmo_encoder(x_emb):
+    """Build the bidirectional ELMO representation for `x_emb`.
+
+    Runs `encoder` on the input (prefix 'fw_') and on its sequence-reversed
+    copy (prefix 'bw_'), concatenates both outputs on axis 1, applies
+    dropout, and scales the result by the learnable scalar "gamma".
+    """
+    x_emb_r = fluid.layers.sequence_reverse(x_emb, name=None)
+    fw_hiddens, fw_hiddens_ori = encoder(
+        x_emb,
+        para_name='fw_')
+    bw_hiddens, bw_hiddens_ori = encoder(
+        x_emb_r,
+        para_name='bw_')
+    # NOTE(review): bw_hiddens is not reversed back before the concat --
+    # confirm the forward/backward positions are meant to line up this way.
+    embedding = layers.concat(input=[fw_hiddens, bw_hiddens], axis=1)
+    # add dropout on fine-tuning
+    embedding = dropout(embedding)
+    a = layers.create_parameter(
+        [1], dtype="float32", name="gamma")
+    embedding = embedding * a
+    return embedding
--- a/PaddleNLP/ELMO/data.py
+++ b/PaddleNLP/ELMO/data.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import random
+
+import numpy as np
+import io
+import six
+
+class Vocabulary(object):
+    '''
+    Token vocabulary: maps tokens to integer ids and back, and encodes
+    text as id sequences framed by the sentence boundary ids.
+    '''
+
+    def __init__(self, filename, validate_file=False):
+        '''
+        filename = path of a flat text file with one (normalized) token per
+            line.  The special tokens <S>, </S>, <UNK> (case sensitive) are
+            expected to be present; '!!!MAXTERMID' lines are skipped.
+        validate_file = if True, raise ValueError when a special token is
+            missing.
+        '''
+        self._id_to_word = []
+        self._word_to_id = {}
+        self._unk = -1
+        self._bos = -1
+        self._eos = -1
+
+        # Attribute that records the id of each special token.
+        special_attrs = {'<S>': '_bos', '</S>': '_eos', '<UNK>': '_unk'}
+
+        with io.open(filename, 'r', encoding='utf-8') as vocab_file:
+            for raw_line in vocab_file:
+                token = raw_line.strip()
+                if token == '!!!MAXTERMID':
+                    continue
+
+                # The next id is always the number of tokens kept so far.
+                position = len(self._id_to_word)
+                if token in special_attrs:
+                    setattr(self, special_attrs[token], position)
+                self._id_to_word.append(token)
+                self._word_to_id[token] = position
+
+        # check to ensure file has special tokens
+        if validate_file and -1 in (self._bos, self._eos, self._unk):
+            raise ValueError("Ensure the vocabulary file has "
+                             "<S>, </S>, <UNK> tokens")
+
+    @property
+    def bos(self):
+        return self._bos
+
+    @property
+    def eos(self):
+        return self._eos
+
+    @property
+    def unk(self):
+        return self._unk
+
+    @property
+    def size(self):
+        return len(self._id_to_word)
+
+    def word_to_id(self, word):
+        """Return the id of `word`, or the <UNK> id when it is unknown."""
+        return self._word_to_id.get(word, self.unk)
+
+    def id_to_word(self, cur_id):
+        """Return the token stored at index `cur_id`."""
+        return self._id_to_word[cur_id]
+
+    def decode(self, cur_ids):
+        """Join the tokens for `cur_ids` into one space-separated string."""
+        return ' '.join(self.id_to_word(one_id) for one_id in cur_ids)
+
+    def encode(self, sentence, reverse=False, split=True):
+        """Encode a sentence as an int32 array of ids with boundary ids.
+
+        With split=True the sentence is a whitespace-separated string;
+        otherwise it is iterated directly as a sequence of tokens.
+
+        If reverse, the sentence is assumed to be already reversed, so the
+        EOS id is placed first and the BOS id last."""
+        tokens = sentence.split() if split else sentence
+        word_ids = [self.word_to_id(token) for token in tokens]
+
+        if reverse:
+            first, last = self.eos, self.bos
+        else:
+            first, last = self.bos, self.eos
+        return np.array([first] + word_ids + [last], dtype=np.int32)
+
+
+class UnicodeCharsVocabulary(Vocabulary):
+    """Vocabulary containing character-level and word level information.
+
+    Has a word vocabulary that is used to lookup word ids and
+    a character id that is used to map words to arrays of character ids.
+
+    The character ids are the byte values of word.encode('utf-8'),
+    which limits the total number of possible char ids to 256.
+    To this we add 5 additional special ids: begin sentence, end sentence,
+        begin word, end word and padding.
+
+    WARNING: for prediction, we add +1 to the output ids from this
+    class to create a special padding id (=0).  As a result, we suggest
+    you use the `Batcher`, `TokenBatcher`, and `LMDataset` classes instead
+    of this lower level class.  If you are using this lower level class,
+    then be sure to add the +1 appropriately, otherwise embeddings computed
+    from the pre-trained model will be useless.
+    """
+
+    def __init__(self, filename, max_word_length, **kwargs):
+        """Load the word vocabulary and precompute per-word char ids.
+
+        filename = the vocabulary file (see Vocabulary).
+        max_word_length = number of char-id slots per word, including the
+            begin-word and end-word markers.
+        kwargs = forwarded to Vocabulary.__init__.
+        """
+        super(UnicodeCharsVocabulary, self).__init__(filename, **kwargs)
+        self._max_word_length = max_word_length
+
+        # char ids 0-255 come from utf-8 encoding bytes
+        # assign 256-300 to special chars
+        self.bos_char = 256  # <begin sentence>
+        self.eos_char = 257  # <end sentence>
+        self.bow_char = 258  # <begin word>
+        self.eow_char = 259  # <end word>
+        self.pad_char = 260  # <padding>
+
+        num_words = len(self._id_to_word)
+
+        self._word_char_ids = np.zeros(
+            [num_words, max_word_length], dtype=np.int32)
+
+        # the character representation of the begin/end of sentence characters
+        def _make_bos_eos(c):
+            r = np.zeros([self.max_word_length], dtype=np.int32)
+            r[:] = self.pad_char
+            r[0] = self.bow_char
+            r[1] = c
+            r[2] = self.eow_char
+            return r
+
+        self.bos_chars = _make_bos_eos(self.bos_char)
+        self.eos_chars = _make_bos_eos(self.eos_char)
+
+        for i, word in enumerate(self._id_to_word):
+            self._word_char_ids[i] = self._convert_word_to_char_ids(word)
+
+        # When the vocabulary file lacks <S> or </S> the id is -1; writing
+        # to row -1 would silently overwrite the last real word.
+        if self.bos != -1:
+            self._word_char_ids[self.bos] = self.bos_chars
+        if self.eos != -1:
+            self._word_char_ids[self.eos] = self.eos_chars
+
+    @property
+    def word_char_ids(self):
+        return self._word_char_ids
+
+    @property
+    def max_word_length(self):
+        return self._max_word_length
+
+    def _convert_word_to_char_ids(self, word):
+        """Return the int32 char-id row for `word`.
+
+        Layout: begin-word marker, up to (max_word_length - 2) utf-8 bytes,
+        end-word marker, then padding.
+        """
+        code = np.zeros([self.max_word_length], dtype=np.int32)
+        code[:] = self.pad_char
+
+        # bytearray yields integer byte values on both Python 2 and 3.
+        # ord() on the elements of a Python 3 bytes object raises TypeError.
+        word_bytes = bytearray(
+            word.encode('utf-8', 'ignore')[:(self.max_word_length - 2)])
+        n_bytes = len(word_bytes)
+        code[0] = self.bow_char
+        if n_bytes:
+            code[1:n_bytes + 1] = list(word_bytes)
+        # Computed from the length, so an empty word also gets its marker
+        # (the old loop variable was unbound in that case).
+        code[n_bytes + 1] = self.eow_char
+
+        return code
+
+    def word_to_char_ids(self, word):
+        """Return the char-id row of `word`, computing it when unknown."""
+        if word in self._word_to_id:
+            return self._word_char_ids[self._word_to_id[word]]
+        else:
+            return self._convert_word_to_char_ids(word)
+
+    def encode_chars(self, sentence, reverse=False, split=True):
+        '''
+        Encode the sentence as a matrix of char ids, one row per token,
+        framed by the begin/end-of-sentence rows (swapped when reverse).
+        With split=True the sentence is a white space delimited string;
+        otherwise it is iterated directly as a sequence of tokens.
+        '''
+        if split:
+            chars_ids = [
+                self.word_to_char_ids(cur_word)
+                for cur_word in sentence.split()
+            ]
+        else:
+            chars_ids = [
+                self.word_to_char_ids(cur_word) for cur_word in sentence
+            ]
+        if reverse:
+            return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars])
+        else:
+            return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars])
+
+
+class Batcher(object):
+    '''
+    Turn batches of tokenized sentences into character id tensors.
+    '''
+
+    def __init__(self, lm_vocab_file, max_token_length):
+        '''
+        lm_vocab_file = the language model vocabulary file (one line per
+            token)
+        max_token_length = the maximum number of characters in each token
+        '''
+        token_length = int(max_token_length)
+        self._max_token_length = token_length
+        self._lm_vocab = UnicodeCharsVocabulary(lm_vocab_file, token_length)
+
+    def batch_sentences(self, sentences):
+        '''
+        Return an int64 array (n_sentences, longest + 2, max_token_length)
+        of character ids.
+        Each sentence is a list of tokens without <s> or </s>, e.g.
+        [['The', 'first', 'sentence', '.'], ['Second', '.']]
+        '''
+        # +2 leaves room for the begin/end-of-sentence rows.
+        padded_length = max(map(len, sentences)) + 2
+        batch = np.zeros(
+            (len(sentences), padded_length, self._max_token_length),
+            dtype=np.int64)
+
+        for row, tokens in enumerate(sentences):
+            # shift by one so that 0 is the mask value
+            shifted = self._lm_vocab.encode_chars(tokens, split=False) + 1
+            batch[row, :len(tokens) + 2, :] = shifted
+
+        return batch
+
+
+class TokenBatcher(object):
+    '''
+    Turn batches of tokenized sentences into token id matrices.
+    '''
+
+    def __init__(self, lm_vocab_file):
+        '''
+        lm_vocab_file = the language model vocabulary file (one line per
+            token)
+        '''
+        self._lm_vocab = Vocabulary(lm_vocab_file)
+
+    def batch_sentences(self, sentences):
+        '''
+        Return an int64 array (n_sentences, longest + 2) of token ids.
+        Each sentence is a list of tokens without <s> or </s>, e.g.
+        [['The', 'first', 'sentence', '.'], ['Second', '.']]
+        '''
+        # +2 leaves room for the begin/end-of-sentence ids.
+        padded_length = max(map(len, sentences)) + 2
+        batch = np.zeros((len(sentences), padded_length), dtype=np.int64)
+
+        for row, tokens in enumerate(sentences):
+            # shift by one so that 0 is the mask value
+            shifted = self._lm_vocab.encode(tokens, split=False) + 1
+            batch[row, :len(tokens) + 2] = shifted
+
+        return batch
+
+
+##### for training
+def _get_batch(generator, batch_size, num_steps, max_word_length):
+    """Read batches of input.
+
+    Keeps one sentence stream per batch row and packs consecutive sentences
+    into fixed-length rows of `num_steps` tokens.  Each item pulled from
+    `generator` is indexed as (token_ids, char_ids).
+
+    Yields dicts with keys 'token_ids', 'tokens_characters' (None when
+    `max_word_length` is None) and 'next_token_id' (the inputs shifted by
+    one token).  Stops once `generator` is exhausted; the partially filled
+    batch is dropped.
+    """
+    cur_stream = [None] * batch_size
+
+    no_more_data = False
+    while True:
+        inputs = np.zeros([batch_size, num_steps], np.int32)
+        if max_word_length is not None:
+            char_inputs = np.zeros([batch_size, num_steps, max_word_length],
+                                   np.int32)
+        else:
+            char_inputs = None
+        targets = np.zeros([batch_size, num_steps], np.int32)
+
+        for i in range(batch_size):
+            cur_pos = 0
+
+            while cur_pos < num_steps:
+                # A stream with one token left has no next-token target,
+                # so fetch a new sentence.
+                if cur_stream[i] is None or len(cur_stream[i][0]) <= 1:
+                    try:
+                        cur_stream[i] = list(next(generator))
+                    except StopIteration:
+                        # No more data, exhaust current streams and quit
+                        no_more_data = True
+                        break
+
+                # Copy as many tokens as fit in the rest of this row.
+                how_many = min(len(cur_stream[i][0]) - 1, num_steps - cur_pos)
+                next_pos = cur_pos + how_many
+
+                inputs[i, cur_pos:next_pos] = cur_stream[i][0][:how_many]
+                if max_word_length is not None:
+                    char_inputs[i, cur_pos:next_pos] = cur_stream[i][
+                        1][:how_many]
+                targets[i, cur_pos:next_pos] = cur_stream[i][0][1:how_many + 1]
+
+                cur_pos = next_pos
+
+                # Keep the last consumed target as the next first input.
+                cur_stream[i][0] = cur_stream[i][0][how_many:]
+                if max_word_length is not None:
+                    cur_stream[i][1] = cur_stream[i][1][how_many:]
+
+        if no_more_data:
+            # There is no more data.  Note: this will not return data
+            # for the incomplete batch
+            break
+
+        X = {
+            'token_ids': inputs,
+            'tokens_characters': char_inputs,
+            'next_token_id': targets
+        }
+
+        yield X
+
+
+class LMDataset(object):
+    """
+    Hold a language model dataset.
+
+    A dataset is a list of tokenized files.  Each file contains one sentence
+        per line.  Each sentence is pre-tokenized and white space joined.
+    """
+
+    def __init__(self,
+                 filepattern,
+                 vocab,
+                 reverse=False,
+                 test=False,
+                 shuffle_on_load=False):
+        '''
+        filepattern = a glob string that specifies the list of files.
+        vocab = an instance of Vocabulary or UnicodeCharsVocabulary
+        reverse = if True, then iterate over tokens in each sentence in reverse
+        test = if True, then iterate through all data once then stop.
+            Otherwise, iterate forever.
+        shuffle_on_load = if True, then shuffle the sentences after loading.
+        '''
+        self._vocab = vocab
+        self._all_shards = glob.glob(filepattern)
+        print('Found %d shards at %s' % (len(self._all_shards), filepattern))
+        if test:
+            # Test mode draws 4 shards; np.random.choice samples with
+            # replacement by default, so a shard may appear more than once.
+            self._all_shards = list(np.random.choice(self._all_shards, size=4))
+            print('sampled %d shards at %s' % (len(self._all_shards), filepattern))
+        self._shards_to_choose = []
+
+        self._reverse = reverse
+        self._test = test
+        self._shuffle_on_load = shuffle_on_load
+        self._use_char_inputs = hasattr(vocab, 'encode_chars')
+
+        self._ids = self._load_random_shard()
+
+    def _choose_random_shard(self):
+        """Pop the next shard from a shuffled pass over all shards."""
+        if len(self._shards_to_choose) == 0:
+            self._shards_to_choose = list(self._all_shards)
+            random.shuffle(self._shards_to_choose)
+        shard_name = self._shards_to_choose.pop()
+        return shard_name
+
+    def _load_random_shard(self):
+        """Randomly select a file and read it.
+
+        In test mode each shard is consumed once; StopIteration is raised
+        when none are left.
+        """
+        if self._test:
+            if len(self._all_shards) == 0:
+                # we've loaded all the data; get_sentence turns this into
+                # a normal end of iteration
+                raise StopIteration
+            else:
+                shard_name = self._all_shards.pop()
+        else:
+            # just pick a random shard
+            shard_name = self._choose_random_shard()
+
+        ids = self._load_shard(shard_name)
+        self._i = 0
+        self._nids = len(ids)
+        return ids
+
+    def _load_shard(self, shard_name):
+        """Read one file and convert to ids.
+
+        Args:
+            shard_name: file path.
+
+        Returns:
+            list of (id, char_id) tuples.
+        """
+        print('Loading data from: %s' % shard_name)
+        with io.open(shard_name, 'r', encoding='utf-8') as f:
+            sentences_raw = f.readlines()
+
+        if self._reverse:
+            sentences = []
+            for sentence in sentences_raw:
+                splitted = sentence.split()
+                splitted.reverse()
+                sentences.append(' '.join(splitted))
+        else:
+            sentences = sentences_raw
+
+        if self._shuffle_on_load:
+            print('shuffle sentences')
+            random.shuffle(sentences)
+
+        ids = [
+            self.vocab.encode(sentence, self._reverse)
+            for sentence in sentences
+        ]
+        if self._use_char_inputs:
+            chars_ids = [
+                self.vocab.encode_chars(sentence, self._reverse)
+                for sentence in sentences
+            ]
+        else:
+            chars_ids = [None] * len(ids)
+
+        print('Loaded %d sentences.' % len(ids))
+        print('Finished loading')
+        return list(zip(ids, chars_ids))
+
+    def get_sentence(self):
+        """Yield (ids, char_ids) pairs, loading new shards as needed.
+
+        Ends once all shards are consumed in test mode; otherwise it
+        iterates forever.
+        """
+        while True:
+            if self._i == self._nids:
+                try:
+                    self._ids = self._load_random_shard()
+                except StopIteration:
+                    # PEP 479 (Python 3.7+): a StopIteration escaping a
+                    # generator body becomes RuntimeError, so end the
+                    # generator explicitly.
+                    return
+            ret = self._ids[self._i]
+            self._i += 1
+            yield ret
+
+    @property
+    def max_word_length(self):
+        if self._use_char_inputs:
+            return self._vocab.max_word_length
+        else:
+            return None
+
+    def iter_batches(self, batch_size, num_steps):
+        """Yield batch dicts produced by _get_batch for this dataset."""
+        for X in _get_batch(self.get_sentence(), batch_size, num_steps,
+                            self.max_word_length):
+
+            # token_ids = (batch_size, num_steps)
+            # char_inputs = (batch_size, num_steps, 50) of character ids
+            # targets = word ID of next word (batch_size, num_steps)
+            yield X
+
+    @property
+    def vocab(self):
+        return self._vocab
+
+
+class BidirectionalLMDataset(object):
+    """Pair a forward and a reversed LMDataset over the same files."""
+
+    def __init__(self, filepattern, vocab, test=False, shuffle_on_load=False):
+        '''
+        Bidirectional version of LMDataset: builds one LMDataset with
+        reverse=False and one with reverse=True from the same arguments.
+        '''
+        self._data_forward = LMDataset(
+            filepattern,
+            vocab,
+            reverse=False,
+            test=test,
+            shuffle_on_load=shuffle_on_load)
+        self._data_reverse = LMDataset(
+            filepattern,
+            vocab,
+            reverse=True,
+            test=test,
+            shuffle_on_load=shuffle_on_load)
+
+    def iter_batches(self, batch_size, num_steps):
+        """Yield forward batches extended with '<key>_reverse' entries."""
+        max_word_length = self._data_forward.max_word_length
+
+        # Step the forward and reverse batch generators in lockstep.
+        for X, Xr in six.moves.zip(
+                _get_batch(self._data_forward.get_sentence(), batch_size,
+                           num_steps, max_word_length),
+                _get_batch(self._data_reverse.get_sentence(), batch_size,
+                           num_steps, max_word_length)):
+
+            # Merge the reverse batch into the forward one.
+            for k, v in Xr.items():
+                X[k + '_reverse'] = v
+
+            yield X
+
+
+# NOTE(review): not raised anywhere in the visible part of this file.
+class InvalidNumberOfCharacters(Exception):
+    pass
--- a/PaddleNLP/ELMO/data/vocabulary_min5k.txt
+++ b/PaddleNLP/ELMO/data/vocabulary_min5k.txt
--- a/PaddleNLP/ELMO/lm_model.py
+++ b/PaddleNLP/ELMO/lm_model.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid.layers as layers
+import paddle.fluid as fluid
+import numpy as np
+
+
+def dropout(input, test_mode, args):
+    """Apply training-time dropout to `input`.
+
+    Returns `input` unchanged when `args.dropout` is falsy or `test_mode`
+    is set; otherwise applies `layers.dropout` with probability
+    `args.dropout`, seed `args.random_seed` and the "upscale_in_train"
+    implementation.
+    """
+    # Guard clause: nothing to do without a rate or in test mode.
+    if not args.dropout or test_mode:
+        return input
+
+    return layers.dropout(
+        input,
+        dropout_prob=args.dropout,
+        dropout_implementation="upscale_in_train",
+        seed=args.random_seed,
+        is_test=False)
+
+
+def lstmp_encoder(input_seq, gate_size, h_0, c_0, para_name, proj_size, test_mode, args):
+    """Build one projected-LSTM layer with input dropout.
+
+    Args:
+        input_seq: input variable; dropout is applied before the projection.
+        gate_size: LSTM gate size; the fc output is 4 * gate_size.
+        h_0: initial hidden state, or None.
+        c_0: initial cell state, or None.
+        para_name: prefix for the gate weight name (`<para_name>_gate_w`).
+        proj_size: projection size passed to `dynamic_lstmp`.
+        test_mode: passed to `dropout`, which is a no-op when it is set.
+        args: options object; reads dropout, random_seed, proj_clip and
+            cell_clip.
+
+    Returns:
+        (hidden, cell, input_proj): the two outputs of `dynamic_lstmp` and
+        the gate projection that was fed into it.
+    """
+    # A lstm encoder implementation with projection.
+    # Linear transformation part for input gate, output gate, forget gate
+    # and cell activation vectors need be done outside of dynamic_lstm.
+    # So the output size is 4 times of gate_size.
+
+    input_seq = dropout(input_seq, test_mode, args)
+    input_proj = layers.fc(input=input_seq,
+                           param_attr=fluid.ParamAttr(
+                               name=para_name + '_gate_w', initializer=None),
+                           size=gate_size * 4,
+                           act=None,
+                           bias_attr=False)
+    hidden, cell = layers.dynamic_lstmp(
+        input=input_proj,
+        size=gate_size * 4,
+        proj_size=proj_size,
+        h_0=h_0,
+        c_0=c_0,
+        use_peepholes=False,
+        proj_clip=args.proj_clip,
+        cell_clip=args.cell_clip,
+        proj_activation="identity",
+        param_attr=fluid.ParamAttr(initializer=None),
+        bias_attr=fluid.ParamAttr(initializer=None))
+
+    return hidden, cell, input_proj
+
+
+def encoder(x,
+            y,
+            vocab_size,
+            emb_size,
+            init_hidden=None,
+            init_cell=None,
+            para_name='',
+            custom_samples=None,
+            custom_probabilities=None,
+            test_mode=False,
+            args=None):
+    """Build one direction of the language model together with its loss.
+
+    The input ids are embedded, passed through ``args.num_layers`` stacked
+    ``lstmp_encoder`` layers (every layer after the first adds its input to
+    its output), projected onto the vocabulary with the ``softmax_weight``
+    and ``softmax_bias`` parameters and scored against ``y``.
+
+    Args:
+        x: input word ids.
+        y: target word ids.
+        vocab_size: number of rows of the embedding and softmax tables.
+        emb_size: embedding width; also passed on as LSTM projection size.
+        init_hidden: optional initial hidden states, sliced per layer along
+            axis 0. Used only when ``init_cell`` is given as well.
+        init_cell: optional initial cell states, sliced the same way.
+        para_name: prefix of the per-layer parameter names.
+        custom_samples: accepted for interface compatibility; not used by
+            this function.
+        custom_probabilities: accepted for interface compatibility; not
+            used by this function.
+        test_mode: when true, dropout and the sampled softmax are skipped.
+        args: options object. Despite the ``None`` default it is required:
+            ``num_layers``, ``hidden_size``, ``sample_softmax``,
+            ``n_negative_samples_batch`` and ``random_seed`` are read here.
+
+    Returns:
+        ``([x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs)``
+        where the four lists hold one entry per layer: the layer output
+        after the residual add and dropout, the raw layer output, the cell
+        output after dropout, and the gate input projection.
+    """
+    # The embedding and softmax parameters have fixed names that do not
+    # include para_name. NOTE(review): presumably this shares them between
+    # the forward and the backward encoder -- confirm.
+    x_emb = layers.embedding(
+        input=x,
+        size=[vocab_size, emb_size],
+        dtype='float32',
+        is_sparse=False,
+        param_attr=fluid.ParamAttr(name='embedding_para'))
+    rnn_input = x_emb
+    rnn_outs = []
+    rnn_outs_ori = []
+    cells = []
+    projs = []
+    # Test for None explicitly instead of relying on the truthiness of
+    # graph variables.
+    has_init_state = init_hidden is not None and init_cell is not None
+    for i in range(args.num_layers):
+        rnn_input = dropout(rnn_input, test_mode, args)
+        if has_init_state:
+            h0 = layers.squeeze(
+                layers.slice(
+                    init_hidden, axes=[0], starts=[i], ends=[i + 1]),
+                axes=[0])
+            c0 = layers.squeeze(
+                layers.slice(
+                    init_cell, axes=[0], starts=[i], ends=[i + 1]),
+                axes=[0])
+        else:
+            h0 = c0 = None
+        rnn_out, cell, input_proj = lstmp_encoder(
+            rnn_input, args.hidden_size, h0, c0,
+            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
+        rnn_out_ori = rnn_out
+        if i > 0:
+            # Residual connection from the layer input to its output.
+            rnn_out = rnn_out + rnn_input
+        rnn_out = dropout(rnn_out, test_mode, args)
+        cell = dropout(cell, test_mode, args)
+        rnn_outs.append(rnn_out)
+        rnn_outs_ori.append(rnn_out_ori)
+        rnn_input = rnn_out
+        cells.append(cell)
+        projs.append(input_proj)
+
+    softmax_weight = layers.create_parameter(
+        [vocab_size, emb_size], dtype="float32", name="softmax_weight")
+    softmax_bias = layers.create_parameter(
+        [vocab_size], dtype="float32", name='softmax_bias')
+    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
+    projection = layers.elementwise_add(projection, softmax_bias)
+
+    projection = layers.reshape(projection, shape=[-1, vocab_size])
+
+    if args.sample_softmax and (not test_mode):
+        loss = layers.sampled_softmax_with_cross_entropy(
+            logits=projection,
+            label=y,
+            num_samples=args.n_negative_samples_batch,
+            seed=args.random_seed)
+    else:
+        label = layers.one_hot(input=y, depth=vocab_size)
+        loss = layers.softmax_with_cross_entropy(
+            logits=projection, label=label, soft_label=True)
+    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
+
+
+class LanguageModel(object):
+    """Graph builder for the bidirectional language model.
+
+    ``build`` declares the input layers, runs the forward and the backward
+    ``encoder`` and stores the results on the instance: ``loss``,
+    ``grad_vars``, ``grad_vars_name``, ``feed_order``, ``last_hidden`` and
+    ``last_cell``.
+    """
+
+    def __init__(self, args, vocab_size, test_mode):
+        self.args = args
+        self.vocab_size = vocab_size
+        self.test_mode = test_mode
+
+    @staticmethod
+    def _layer_var_names(num_layers, suffix=''):
+        """Return the debug names of the per-layer encoder variables.
+
+        Names are grouped as all ``rnn_out*``, then all ``cell*``, then all
+        ``xproj*``. The first layer carries no index and layer ``k`` (k >= 2)
+        carries ``k``, e.g. ``rnn_out``, ``rnn_out2`` for two layers.
+        ``suffix`` is appended to every name.
+        """
+        names = []
+        for base in ('rnn_out', 'cell', 'xproj'):
+            for layer in range(num_layers):
+                index = '' if layer == 0 else str(layer + 1)
+                names.append(base + index + suffix)
+        return names
+
+    def build(self):
+        """Construct the network and populate the instance attributes."""
+        args = self.args
+        emb_size = args.embed_size
+        proj_size = args.embed_size
+        hidden_size = args.hidden_size
+        num_layers = args.num_layers
+
+        x_f = layers.data(name="x", shape=[1], dtype='int64', lod_level=1)
+        y_f = layers.data(name="y", shape=[1], dtype='int64', lod_level=1)
+
+        x_b = layers.data(name="x_r", shape=[1], dtype='int64', lod_level=1)
+        y_b = layers.data(name="y_r", shape=[1], dtype='int64', lod_level=1)
+
+        init_hiddens_ = layers.data(
+            name="init_hiddens", shape=[1], dtype='float32')
+        init_cells_ = layers.data(
+            name="init_cells", shape=[1], dtype='float32')
+
+        # The initial states arrive flat and are reshaped so that axis 0
+        # indexes the layers: first the forward ones, then the backward ones.
+        init_hiddens = layers.reshape(
+            init_hiddens_, shape=[2 * num_layers, -1, proj_size])
+        init_cells = layers.reshape(
+            init_cells_, shape=[2 * num_layers, -1, hidden_size])
+
+        init_hidden = layers.slice(
+            init_hiddens, axes=[0], starts=[0], ends=[num_layers])
+        init_cell = layers.slice(
+            init_cells, axes=[0], starts=[0], ends=[num_layers])
+        init_hidden_r = layers.slice(
+            init_hiddens, axes=[0], starts=[num_layers],
+            ends=[2 * num_layers])
+        init_cell_r = layers.slice(
+            init_cells, axes=[0], starts=[num_layers], ends=[2 * num_layers])
+
+        if args.use_custom_samples:
+            custom_samples = layers.data(
+                name="custom_samples",
+                shape=[args.n_negative_samples_batch + 1],
+                dtype='int64',
+                lod_level=1)
+            custom_samples_r = layers.data(
+                name="custom_samples_r",
+                shape=[args.n_negative_samples_batch + 1],
+                dtype='int64',
+                lod_level=1)
+            custom_probabilities = layers.data(
+                name="custom_probabilities",
+                shape=[args.n_negative_samples_batch + 1],
+                dtype='float32',
+                lod_level=1)
+        else:
+            custom_samples = None
+            custom_samples_r = None
+            custom_probabilities = None
+
+        forward, fw_hiddens, fw_hiddens_ori, fw_cells, fw_projs = encoder(
+            x_f,
+            y_f,
+            self.vocab_size,
+            emb_size,
+            init_hidden,
+            init_cell,
+            para_name='fw_',
+            custom_samples=custom_samples,
+            custom_probabilities=custom_probabilities,
+            test_mode=self.test_mode,
+            args=args)
+        backward, bw_hiddens, bw_hiddens_ori, bw_cells, bw_projs = encoder(
+            x_b,
+            y_b,
+            self.vocab_size,
+            emb_size,
+            init_hidden_r,
+            init_cell_r,
+            para_name='bw_',
+            custom_samples=custom_samples_r,
+            custom_probabilities=custom_probabilities,
+            test_mode=self.test_mode,
+            args=args)
+
+        losses = layers.concat([forward[-1], backward[-1]])
+        self.loss = layers.reduce_mean(losses)
+        self.loss.persistable = True
+        self.grad_vars = [x_f, y_f, x_b, y_b, self.loss]
+        self.grad_vars_name = ['x', 'y', 'x_r', 'y_r', 'final_loss']
+
+        # Per-layer names are generated from num_layers so that they always
+        # line up with the per-layer variable lists returned by encoder().
+        fw_vars_name = [
+            'x_emb', 'proj', 'loss', 'init_hidden', 'init_cell'
+        ] + self._layer_var_names(num_layers)
+        bw_vars_name = [
+            'x_emb_r', 'proj_r', 'loss_r', 'init_hidden_r', 'init_cell_r'
+        ] + self._layer_var_names(num_layers, '_r')
+        fw_vars = forward + [init_hidden, init_cell
+                             ] + fw_hiddens + fw_cells + fw_projs
+        bw_vars = backward + [init_hidden_r, init_cell_r
+                              ] + bw_hiddens + bw_cells + bw_projs
+        # Forward and backward entries are interleaved pairwise.
+        for fw_var, bw_var, fw_name, bw_name in zip(
+                fw_vars, bw_vars, fw_vars_name, bw_vars_name):
+            self.grad_vars.append(fw_var)
+            self.grad_vars.append(bw_var)
+            self.grad_vars_name.append(fw_name)
+            self.grad_vars_name.append(bw_name)
+        if args.use_custom_samples:
+            self.feed_order = [
+                'x', 'y', 'x_r', 'y_r', 'custom_samples', 'custom_samples_r',
+                'custom_probabilities'
+            ]
+        else:
+            self.feed_order = ['x', 'y', 'x_r', 'y_r']
+        self.last_hidden = [
+            fluid.layers.sequence_last_step(input=x)
+            for x in fw_hiddens_ori + bw_hiddens_ori
+        ]
+        self.last_cell = [
+            fluid.layers.sequence_last_step(input=x)
+            for x in fw_cells + bw_cells
+        ]
+        # Persist the last-step states of every layer.
+        # NOTE(review): presumably fed back as init_hiddens / init_cells by
+        # the training loop -- confirm against the caller.
+        self.last_hidden = layers.concat(self.last_hidden, axis=0)
+        self.last_hidden.persistable = True
+        self.last_cell = layers.concat(self.last_cell, axis=0)
+        self.last_cell.persistable = True
--- a/PaddleNLP/ELMO/reader.py
+++ b/PaddleNLP/ELMO/reader.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import sys
+import numpy as np
+
+# True when the interpreter's major version is 3.
+Py3 = sys.version_info[0] == 3
+
+
+def listDir(rootDir):
+    """Return the paths of the regular files directly inside ``rootDir``.
+
+    Each name from ``os.listdir`` is joined onto ``rootDir``; entries that
+    are not files (e.g. sub-directories) are left out.
+    """
+    candidates = (os.path.join(rootDir, entry)
+                  for entry in os.listdir(rootDir))
+    return [path for path in candidates if os.path.isfile(path)]
+
+
+# Module-level ids of the special tokens; -1 is the initial placeholder.
+# NOTE(review): presumably meant to hold the vocabulary ids of <UNK>, <S>
+# and </S> -- confirm against _load_vocab.
+_unk = -1
+_bos = -1
+_eos = -1
+
+
+def _read_words(filename):
+    """Return the whitespace-separated tokens of a UTF-8 text file.
+
+    Every newline is replaced by the literal ``<eos>`` before the text is
+    split. No space is inserted around it, so ``<eos>`` only becomes a
+    token of its own when the line ends are surrounded by whitespace.
+    """
+    # Read bytes and decode explicitly: ``str`` has no ``decode`` on
+    # Python 3, so decoding the result of a text-mode read only worked on
+    # Python 2.
+    with open(filename, "rb") as f:
+        return f.read().decode("utf-8").replace("\n", "<eos>").split()
+
+
+def _build_vocab(filename):
+    """Build a word -> id mapping from the tokens of ``filename``.
+
+    Words are ranked by descending frequency, ties broken by the word
+    itself, and numbered from 0 in that order. The vocabulary size is
+    printed as a side effect.
+    """
+    frequencies = collections.Counter(_read_words(filename))
+    ranked = sorted(frequencies.items(), key=lambda pair: (-pair[1], pair[0]))
+
+    words, _ = list(zip(*ranked))
+
+    print("vocab word num", len(words))
+    return {word: index for index, word in enumerate(words)}
+
+
+def _load_vocab(filename):
+    """Load a vocabulary file and return its word -> id mapping.
+
+    The file is read as UTF-8 and split on whitespace; each word gets its
+    position in the file as id. The ids of ``<S>``, ``</S>`` and ``<UNK>``
+    are stored in the module-level ``_bos``, ``_eos`` and ``_unk``.
+
+    Raises:
+        KeyError: if one of the three special tokens is missing.
+    """
+    # Without the ``global`` statement the assignments below would only
+    # create locals and the module-level ids would stay at -1.
+    global _unk, _bos, _eos
+    # Read bytes and decode explicitly so this works on Python 2 and 3.
+    with open(filename, "rb") as f:
+        words = f.read().decode("utf-8").replace("\n", " ").split()
+    word_to_id = dict(zip(words, range(len(words))))
+    _bos = word_to_id['<S>']
+    _eos = word_to_id['</S>']
+    _unk = word_to_id['<UNK>']
+    return word_to_id
+
+
+def _file_to_word_ids(filenames, word_to_id):
+    """Yield the ids of all known words of ``filenames``, file by file.
+
+    Words that are not keys of ``word_to_id`` are skipped.
+    """
+    for path in filenames:
+        known_ids = [
+            word_to_id[token] for token in _read_words(path)
+            if token in word_to_id
+        ]
+        for word_id in known_ids:
+            yield word_id
+
+
+def ptb_raw_data(data_path=None, vocab_path=None, args=None):
+    """Create word-id streams for the train, validation and test data.
+
+    Args:
+        data_path: directory containing a ``dev`` sub-directory and, unless
+            ``args.train_path`` is set, a ``train`` sub-directory. All
+            regular files directly inside them are read.
+        vocab_path: vocabulary file passed to ``_load_vocab``. Required.
+        args: optional options object. When it has a truthy ``train_path``
+            attribute, that single path is used as the training file
+            instead of ``<data_path>/train``.
+
+    Returns:
+        ``(train_data, valid_data, test_data, vocabulary)`` where the first
+        three are generators of word ids and ``vocabulary`` is the number
+        of entries in the loaded vocabulary.
+
+    Raises:
+        ValueError: if ``vocab_path`` is empty or ``None``.
+    """
+    if not vocab_path:
+        # Fail with a clear message instead of hitting an unbound local.
+        raise ValueError("vocab_path is required to map words to ids")
+    word_to_id = _load_vocab(vocab_path)
+
+    # ``args`` defaults to None, so read the override defensively.
+    train_override = getattr(args, 'train_path', None)
+    if not train_override:
+        train_files = listDir(os.path.join(data_path, "train"))
+    else:
+        train_files = [train_override]
+    train_data = _file_to_word_ids(train_files, word_to_id)
+
+    # NOTE(review): validation and test data are both read from "dev" --
+    # confirm that no separate test split is intended.
+    valid_path = os.path.join(data_path, "dev")
+    test_path = os.path.join(data_path, "dev")
+    valid_data = _file_to_word_ids(listDir(valid_path), word_to_id)
+    test_data = _file_to_word_ids(listDir(test_path), word_to_id)
+    vocabulary = len(word_to_id)
+    return train_data, valid_data, test_data, vocabulary
+
+
+def get_data_iter(raw_data, batch_size, num_steps):
+    """Return a batch generator factory over a stream of word ids.
+
+    Args:
+        raw_data: iterable of word ids (an iterator or any other iterable).
+        batch_size: number of rows per batch.
+        num_steps: number of columns per batch.
+
+    Returns:
+        A zero-argument function. Calling it gives a generator of ``(x, y)``
+        pairs of ``int64`` arrays of shape ``(batch_size, num_steps)``,
+        where ``y`` is ``x`` shifted by one position in the stream. The
+        last id of a batch is reused as the first id of the next one, and
+        ids left over at the end of the stream are dropped.
+    """
+    window = num_steps * batch_size
+
+    def __impl__():
+        buf = []
+        # Use the iterator protocol instead of the Python-2-only ``.next()``
+        # method, so the function works on Python 3 and with plain
+        # iterables.
+        for word_id in raw_data:
+            buf.append(word_id)
+            if len(buf) >= window + 1:
+                x = np.asarray(
+                    buf[:-1], dtype='int64').reshape((batch_size, num_steps))
+                y = np.asarray(
+                    buf[1:], dtype='int64').reshape((batch_size, num_steps))
+                yield (x, y)
+                # Keep the final id: it is the first input of the next batch.
+                buf = [buf[-1]]
+
+    return __impl__
--- a/PaddleNLP/ELMO/run.sh
+++ b/PaddleNLP/ELMO/run.sh
+# Launch train.py on GPU 0. Extra command-line arguments are forwarded
+# unchanged; "$@" is quoted so arguments containing spaces or glob
+# characters are not split or expanded again.
+export CUDA_VISIBLE_DEVICES=0
+python  train.py \
+--train_path='baike/train/sentence_file_*'  \
+--test_path='baike/dev/sentence_file_*'  \
+--vocab_path baike/vocabulary_min5k.txt \
+--learning_rate 0.2 \
+--use_gpu True \
+--local True "$@"
--- a/PaddleNLP/ELMO/train.py
+++ b/PaddleNLP/ELMO/train.py
--- a/PaddleNLP/machine_reading_comprehension/README.md
+++ b/PaddleNLP/machine_reading_comprehension/README.md
-# Abstract
-Dureader is an end-to-end neural network model for machine reading comprehension style question answering, which aims to answer questions from given passages. We first match the question and passages with a bidireactional attention flow network to obtrain the question-aware passages represenation. Then we employ a pointer network to locate the positions of answers from passages. Our experimental evalutions show that DuReader model achieves the state-of-the-art results in DuReader Dadaset.
-# Dataset
-DuReader Dataset is a new large-scale real-world and human sourced MRC dataset in Chinese. DuReader focuses on real-world open-domain question answering. The advantages of DuReader over existing datasets are concluded as follows:
- - Real question
- - Real article
- - Real answer
- - Real application scenario
- - Rich annotation
+DuReader是一个端到端的机器阅读理解神经网络模型，能够在给定文档和问题的情况下，定位文档中问题的答案。我们首先利用双向注意力网络获得文档和问题的相同向量空间的表示，然后使用`pointer network` 定位文档中答案的位置。实验显示，我们的模型能够获得在DuReader数据集上SOTA的结果。

-# Network
-DuReader model is inspired by 3 classic reading comprehension models([BiDAF](https://arxiv.org/abs/1611.01603), [Match-LSTM](https://arxiv.org/abs/1608.07905), [R-NET](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf)).
+# 算法介绍
+DuReader模型主要实现了论文[BiDAF](https://arxiv.org/abs/1611.01603)， [Match-LSTM](https://arxiv.org/abs/1608.07905)中的模型结构。

-DuReader model is a hierarchical multi-stage process and consists of five layers
+模型在层次上可以分为5层：

- **Word Embedding Layer** maps each word to a vector using a pre-trained word embedding model.
- **Encoding Layer** extracts context infomation for each position in question and passages with a bi-directional LSTM network.
- **Attention Flow Layer** couples the query and context vectors and produces a set of query-aware feature vectors for each word in the context. Please refer to [BiDAF](https://arxiv.org/abs/1611.01603) for more details.
- **Fusion Layer** employs a layer of bi-directional LSTM to capture the interaction among context words independent of the query.
- **Decode Layer** employs an answer point network with attention pooling of the quesiton to locate the positions of answers from passages. Please refer to [Match-LSTM](https://arxiv.org/abs/1608.07905) and [R-NET](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf) for more details.
+- **词嵌入层** 将每一个词映射到一个向量(这个向量可以是预训练好的)。
+- **编码层** 使用双向LSTM网络获得问题和文档的每一个词的上下文信息。
+- **注意力层** 通过双向注意力网络获得文档的问题向量空间表示。更多参考[BiDAF](https://arxiv.org/abs/1611.01603)。
+- **融合层** 通过双向LSTM网络获得文档向量表示中上下文的关联性信息。
+- **输出层** 通过`pointer network`预测答案在文档中的位置。更多参考 [Match-LSTM](https://arxiv.org/abs/1608.07905)。

-## How to Run
-### Download the Dataset
-To Download DuReader dataset:
+## 数据准备
+### 下载数据集
+通过如下脚本下载数据集:
 ```
 cd data && bash download.sh
 ```
-For more details about DuReader dataset please refer to [DuReader Dataset Homepage](https://ai.baidu.com//broad/subordinate?dataset=dureader).
+模型默认使用DuReader数据集，是百度开源的真实阅读理解数据，更多参考[DuReader Dataset Homepage](https://ai.baidu.com//broad/subordinate?dataset=dureader)

-### Download Thirdparty Dependencies
-We use Bleu and Rouge as evaluation metrics, the calculation of these metrics relies on the scoring scripts under [coco-caption](https://github.com/tylin/coco-caption), to download them, run:
+### 下载第三方依赖
+我们使用Bleu和Rouge作为度量指标， 这些度量指标的源码位于[coco-caption](https://github.com/tylin/coco-caption)， 可以使用如下命令下载源码:

 ```
 cd utils && bash download_thirdparty.sh
 ```
-### Environment Requirements
-For now we've only tested on PaddlePaddle v1.0, to install PaddlePaddle and for more details about PaddlePaddle, see [PaddlePaddle Homepage](http://paddlepaddle.org).
-
-### Preparation
-Before training the model, we have to make sure that the data is ready. For preparation, we will check the data files, make directories and extract a vocabulary for later use. You can run the following command to do this with a specified task name:
+### 环境依赖
+当前模型是在paddlepaddle 1.2版本上测试， 因此建议在1.2版本上使用本模型。关于PaddlePaddle的安装可以参考[PaddlePaddle Homepage](http://paddlepaddle.org)。

+## 模型训练
+### 段落抽取
+在段落抽取阶段，主要是使用文档相关性score对文档内容进行优化， 抽取的结果将会放到`data/extracted/`目录下。如果你用demo数据测试，可以跳过这一步。如果你用dureader数据，需要指定抽取的数据目录，命令如下： 
 ```
-sh run.sh --prepare
+sh run.sh --para_extraction --trainset data/preprocessed/trainset/zhidao.train.json data/preprocessed/trainset/search.train.json --devset data/preprocessed/devset/zhidao.dev.json data/preprocessed/devset/search.dev.json --testset data/preprocessed/testset/zhidao.test.json data/preprocessed/testset/search.test.json
 ```
-You can specify the files for train/dev/test by setting the `trainset`/`devset`/`testset`.
-### Training
-To train the model and you can also set the hyper-parameters such as the learning rate by using `--learning_rate NUM`. For example, to train the model for 10 passes, you can run:
-
+其中参数 `trainset`/`devset`/`testset`分别对应训练、验证和测试数据集(下同)。
+### 词典准备
+在训练模型之前，我们应该确保数据已经准备好。在准备阶段，通过全部数据文件生成一个词典，这个词典会在后续的训练和预测中用到。你可以通过如下命令生成词典：
 ```
-sh run.sh --train --pass_num 10
+sh run.sh --prepare
 ```
-
-The training process includes an evaluation on the dev set after each training epoch. By default, the model with the least Bleu-4 score on the dev set will be saved.
-
-### Evaluation
-To conduct a single evaluation on the dev set with the the model already trained, you can run the following command:
-
+上面的命令默认使用demo数据，如果想使用dureader数据集，应该按照如下方式指定：
+```
+sh run.sh --prepare --trainset data/extracted/trainset/zhidao.train.json data/extracted/trainset/search.train.json --devset data/extracted/devset/zhidao.dev.json data/extracted/devset/search.dev.json --testset data/extracted/testset/zhidao.test.json data/extracted/testset/search.test.json
 ```
-sh run.sh --evaluate  --load_dir models/1
+其中参数 `trainset`/`devset`/`testset`分别对应训练、验证和测试数据集。
+### 模型训练
+训练模型的启动命令如下：
 ```
+sh run.sh --train
+```
+可以通过设置超参数更改训练的配置，比如通过`--learning_rate NUM`更改学习率，通过`--pass_num NUM`更改训练的轮数
+训练的过程中，每隔一定迭代周期，会测试在验证集上的性能指标， 通过`--dev_interval NUM`设置周期大小

-### Prediction
-You can also predict answers for the samples in some files using the following command:
+### 模型评测
+在模型训练结束后，如果想使用训练好的模型进行评测，获得度量指标，可以使用如下命令:
+```
+sh run.sh --evaluate  --load_dir data/models/1
+```
+其中，`--load_dir data/models/1`是模型的checkpoint目录

+### 预测
+使用训练好的模型，对问答文档数据直接预测结果，获得答案，可以使用如下命令:
 ```
-sh run.sh --predict --load_dir models/1 --testset ../data/preprocessed/testset/search.dev.json
+sh run.sh --predict --load_dir data/models/1 --testset data/extracted/testset/search.dev.json
 ```
+其中`--testset`指定了预测用的数据集，生成的问题答案默认会放到`data/results/` 目录，你可以通过参数`--result_dir DIR_PATH`更改配置
+
+### 实验结果
+验证集 ROUGE-L:47.65，测试集 ROUGE-L:54.58。
+
+这是在P40上，使用4卡GPU，batch size=4*32的训练结果，如果使用单卡，指标可能会略有降低，但在验证集上的ROUGE-L也不小于47。
+
+## 参考文献
+[Machine Comprehension Using Match-LSTM and Answer Pointer](https://arxiv.org/abs/1608.07905)

-By default, the results are saved at `../data/results/` folder. You can change this by specifying `--result_dir DIR_PATH`.
+[Bidirectional Attention Flow for Machine Comprehension](https://arxiv.org/abs/1611.01603)
--- a/PaddleNLP/machine_reading_comprehension/args.py
+++ b/PaddleNLP/machine_reading_comprehension/args.py
@@ -27,102 +27,65 @@ def parse_args():
        action='store_true',
        help='create the directories, prepare the vocabulary and embeddings')
    parser.add_argument('--train', action='store_true', help='train the model')
-    parser.add_argument(
-        '--evaluate', action='store_true', help='evaluate the model on dev set')
-    parser.add_argument(
-        '--predict',
-        action='store_true',
-        help='predict the answers for test set with trained model')
-    parser.add_argument(
-        "--embed_size",
-        type=int,
-        default=300,
-        help="The dimension of embedding table. (default: %(default)d)")
-    parser.add_argument(
-        "--hidden_size",
-        type=int,
-        default=300,
-        help="The size of rnn hidden unit. (default: %(default)d)")
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=32,
-        help="The sequence number of a mini-batch data. (default: %(default)d)")
-    parser.add_argument(
-        "--pass_num",
-        type=int,
-        default=5,
-        help="The pass number to train. (default: %(default)d)")
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=0.001,
-        help="Learning rate used to train the model. (default: %(default)f)")
-    parser.add_argument(
-        "--weight_decay",
-        type=float,
-        default=0.0001,
-        help="Weight decay. (default: %(default)f)")
-    parser.add_argument(
-        "--use_gpu",
-        type=distutils.util.strtobool,
-        default=True,
-        help="Whether to use gpu. (default: %(default)d)")
-    parser.add_argument(
-        "--save_dir",
-        type=str,
-        default="model",
-        help="Specify the path to save trained models.")
-    parser.add_argument(
-        "--load_dir",
-        type=str,
-        default="",
-        help="Specify the path to load trained models.")
-    parser.add_argument(
-        "--save_interval",
-        type=int,
-        default=1,
-        help="Save the trained model every n passes."
-        "(default: %(default)d)")
-    parser.add_argument(
-        "--log_interval",
-        type=int,
-        default=50,
-        help="log the train loss every n batches."
-        "(default: %(default)d)")
-    parser.add_argument(
-        "--dev_interval",
-        type=int,
-        default=1000,
-        help="cal dev loss every n batches."
-        "(default: %(default)d)")
+    parser.add_argument('--evaluate', action='store_true', help='evaluate the model on dev set')
+    parser.add_argument('--predict', action='store_true',
+                        help='predict the answers for test set with trained model')
+
+    parser.add_argument("--embed_size", type=int, default=300,
+                        help="The dimension of embedding table. (default: %(default)d)")
+    parser.add_argument("--hidden_size", type=int, default=150,
+                        help="The size of rnn hidden unit. (default: %(default)d)")
+    parser.add_argument("--learning_rate", type=float, default=0.001,
+                        help="Learning rate used to train the model. (default: %(default)f)")
    parser.add_argument('--optim', default='adam', help='optimizer type')
-    parser.add_argument('--trainset', nargs='+', help='train dataset')
-    parser.add_argument('--devset', nargs='+', help='dev dataset')
-    parser.add_argument('--testset', nargs='+', help='test dataset')
-    parser.add_argument('--vocab_dir', help='dict')
+    parser.add_argument("--weight_decay", type=float, default=0.0001,
+                        help="Weight decay. (default: %(default)f)")
+
+    parser.add_argument('--drop_rate', type=float, default=0.0, help="Dropout probability")
+    parser.add_argument('--random_seed', type=int, default=123)
+    parser.add_argument("--batch_size", type=int, default=32,
+                        help="The sequence number of a mini-batch data. (default: %(default)d)")
+    parser.add_argument("--pass_num", type=int, default=5,
+                        help="The number epochs to train. (default: %(default)d)")
+    parser.add_argument("--use_gpu", type=distutils.util.strtobool, default=True,
+                        help="Whether to use gpu. (default: %(default)d)")
+    parser.add_argument("--log_interval", type=int, default=50,
+                        help="log the train loss every n batches. (default: %(default)d)")
+
    parser.add_argument('--max_p_num', type=int, default=5)
    parser.add_argument('--max_a_len', type=int, default=200)
    parser.add_argument('--max_p_len', type=int, default=500)
-    parser.add_argument('--max_q_len', type=int, default=9)
+    parser.add_argument('--max_q_len', type=int, default=60)
    parser.add_argument('--doc_num', type=int, default=5)
-    parser.add_argument('--para_print', action='store_true')
-    parser.add_argument('--drop_rate', type=float, default=0.0)
-    parser.add_argument('--random_seed', type=int, default=123)
-    parser.add_argument(
-        '--log_path',
-        help='path of the log file. If not set, logs are printed to console')
-    parser.add_argument(
-        '--result_dir',
-        default='../data/results/',
-        help='the dir to output the results')
-    parser.add_argument(
-        '--result_name',
-        default='test_result',
-        help='the file name of the results')
-    parser.add_argument(
-        "--enable_ce",
-        action='store_true',
-        help="If set, run the task with continuous evaluation logs.")
+
+    parser.add_argument('--vocab_dir', default='data/vocab', help='vocabulary')
+    parser.add_argument("--save_dir", type=str, default="data/models",
+                        help="Specify the path to save trained models.")
+    parser.add_argument("--save_interval", type=int, default=1,
+                        help="Save the trained model every n passes. (default: %(default)d)")
+    parser.add_argument("--load_dir", type=str, default="",
+                        help="Specify the path to load trained models.")
+    parser.add_argument('--log_path',
+                        help='path of the log file. If not set, logs are printed to console')
+    parser.add_argument('--result_dir', default='data/results/',
+                        help='the dir to output the results')
+    parser.add_argument('--result_name', default='test_result',
+                        help='the file name of the predicted results')
+
+    parser.add_argument('--trainset', nargs='+',
+                        default=['data/demo/trainset/search.train.json'],
+                        help='train dataset')
+    parser.add_argument('--devset', nargs='+',
+                        default=['data/demo/devset/search.dev.json'],
+                        help='dev dataset')
+    parser.add_argument('--testset', nargs='+',
+                        default=['data/demo/testset/search.test.json'],
+                        help='test dataset')
+
+    parser.add_argument("--enable_ce", action='store_true',
+                        help="If set, run the task with continuous evaluation logs.")
+    parser.add_argument('--para_print', action='store_true', help="Print debug info")
+    parser.add_argument("--dev_interval", type=int, default=-1,
+                        help="evaluate on dev set loss every n batches. (default: %(default)d)")
    args = parser.parse_args()
    return args
--- a/PaddleNLP/machine_reading_comprehension/data/download.sh
+++ b/PaddleNLP/machine_reading_comprehension/data/download.sh
@@ -20,11 +20,14 @@ if [[ -d preprocessed ]] && [[ -d raw ]]; then
    echo "data exist"
    exit 0
 else
-    wget -c --no-check-certificate http://dureader.gz.bcebos.com/dureader_preprocessed.zip 
-    wget -c --no-check-certificate http://dureader.gz.bcebos.com/demo.tgz 
+    wget -c http://dureader.gz.bcebos.com/demo.zip
+    wget -c https://aipedataset.cdn.bcebos.com/dureader/dureader_raw.zip
+    wget -c https://aipedataset.cdn.bcebos.com/dureader/dureader_preprocessed.zip
 fi

 if md5sum --status -c md5sum.txt; then
+    unzip demo.zip
+    unzip dureader_raw.zip
    unzip dureader_preprocessed.zip
 else
    echo "download data error!" >> /dev/stderr

--- a/PaddleNLP/machine_reading_comprehension/data/md5sum.txt
+++ b/PaddleNLP/machine_reading_comprehension/data/md5sum.txt
-7a4c28026f7dc94e8135d17203c63664  dureader_preprocessed.zip
+0ca0510fa625d35d902b73033c4ba9d8  demo.zip
+dc7658b8cdf4f94b8714d130b7d15196  dureader_raw.zip
+3db9a32e5a7c5375a604a70687b45479  dureader_preprocessed.zip
--- a/PaddleNLP/machine_reading_comprehension/dataset.py
+++ b/PaddleNLP/machine_reading_comprehension/dataset.py
@@ -157,7 +157,8 @@ class BRCDataset(object):
            passade_idx_offset = sum(batch_data['passage_num'])
            batch_data['passage_num'].append(count)
            gold_passage_offset = 0
-            if 'answer_passages' in sample and len(sample['answer_passages']):
+            if 'answer_passages' in sample and len(sample['answer_passages']) and \
+                    sample['answer_passages'][0] < len(sample['documents']):
                for i in range(sample['answer_passages'][0]):
                    gold_passage_offset += len(batch_data['passage_token_ids'][
                        passade_idx_offset + i])

--- a/PaddleNLP/machine_reading_comprehension/paragraph_extraction.py
+++ b/PaddleNLP/machine_reading_comprehension/paragraph_extraction.py
+#!/usr/bin/python
+#-*- coding:utf-8 -*-
+
+import sys
+if sys.version[0] == '2':
+    reload(sys)
+    sys.setdefaultencoding("utf-8")
+import json
+import copy
+from preprocess import metric_max_over_ground_truths, f1_score
+
+
+def compute_paragraph_score(sample):
+    """
+    Score every paragraph of every document against the question.
+
+    For each doc in sample['documents'] a fresh list is stored under
+    doc['segmented_paragraphs_scores'] with one entry per element of
+    doc['segmented_paragraphs']: the value returned by
+    metric_max_over_ground_truths(f1_score, paragraph, question), or 0.0
+    when sample['segmented_question'] is empty.
+    Args:
+        sample: a sample in the dataset.
+    Returns:
+        None
+    Raises:
+        None
+    """
+    question_tokens = sample["segmented_question"]
+    for doc in sample['documents']:
+        scores = []
+        doc['segmented_paragraphs_scores'] = scores
+        for para_tokens in doc['segmented_paragraphs']:
+            if len(question_tokens) == 0:
+                # nothing to compare against
+                scores.append(0.0)
+                continue
+            scores.append(
+                metric_max_over_ground_truths(f1_score, para_tokens, question_tokens))
+
+
+def dup_remove(doc):
+    """
+    Remove the duplicated paragraphs of a document, in place.
+
+    Two paragraphs are duplicates when their tokens join to the same string;
+    the first occurrence is kept. doc['paragraphs_length'] is always
+    (re)built. When something is removed, doc['segmented_paragraphs'],
+    doc['segmented_paragraphs_scores'] and doc['paragraphs_length'] are
+    shrunk, doc['paragraphs'] is rebuilt, and doc['most_related_para'] (if
+    the doc has one) is re-pointed at the surviving copy of the same text.
+    Args:
+        doc: a doc in the sample
+    Returns:
+        bool: True if at least one paragraph was removed, otherwise False
+    Raises:
+        None
+    """
+    paragraphs_his = {}
+    del_ids = []
+    # None when the doc carries no 'most_related_para'.
+    para_id = doc.get('most_related_para')
+    doc['paragraphs_length'] = []
+    # zip() keeps paragraphs and scores aligned; the score itself is unused.
+    for p_idx, (segmented_paragraph, _) in \
+            enumerate(zip(doc["segmented_paragraphs"], doc["segmented_paragraphs_scores"])):
+        doc['paragraphs_length'].append(len(segmented_paragraph))
+        paragraph = ''.join(segmented_paragraph)
+        if paragraph in paragraphs_his:
+            del_ids.append(p_idx)
+            if p_idx == para_id:
+                # the most related paragraph is a duplicate: follow the kept copy
+                para_id = paragraphs_his[paragraph]
+            continue
+        paragraphs_his[paragraph] = p_idx
+    if not del_ids:
+        return False
+    # Delete from the back so that the remaining indices stay valid.
+    for p_idx in reversed(del_ids):
+        del doc["segmented_paragraphs"][p_idx]
+        del doc["segmented_paragraphs_scores"][p_idx]
+        del doc['paragraphs_length'][p_idx]
+    if para_id is not None:
+        # Shift the index left by the number of paragraphs removed before it.
+        # The None guard matters: Python 3 raises TypeError on `int < None`.
+        prev_del_num = sum(1 for p_idx in del_ids if p_idx < para_id)
+        doc['most_related_para'] = para_id - prev_del_num
+    doc['paragraphs'] = [
+        ''.join(segmented_para) for segmented_para in doc["segmented_paragraphs"]
+    ]
+    return True
+
+
+def paragraph_selection(sample, mode):
+    """
+    For each document, select paragraphs that include as much information as possible
+
+    Every document that has 'segmented_paragraphs_scores' is de-duplicated and
+    then collapsed into a single paragraph made of the title followed by the
+    chosen paragraphs, each preceded by a splitter token. If the whole
+    document fits in MAX_P_LEN tokens all paragraphs are kept; otherwise the
+    topN best-scored paragraphs are used (plus, in "train" mode, the most
+    related paragraph of the answer document). For the answer document,
+    sample['answer_spans'][0] is shifted so that it indexes into the merged
+    paragraph. The sample is modified in place.
+    Args:
+        sample: a sample in the dataset.
+        mode: string of ("train", "dev", "test"), indicate the type of dataset to process.
+    Returns:
+        None
+    Raises:
+        None
+    """
+    # predefined maximum length of paragraph
+    MAX_P_LEN = 500
+    # predefined splitter
+    splitter = u'<splitter>'
+    # topN of related paragraph to choose
+    topN = 3
+    doc_id = None
+    if 'answer_docs' in sample and len(sample['answer_docs']) > 0:
+        doc_id = sample['answer_docs'][0]
+        if doc_id >= len(sample['documents']):
+            # Data error, answer doc ID > number of documents, this sample
+            # will be filtered by dataset.py
+            return
+    for d_idx, doc in enumerate(sample['documents']):
+        if 'segmented_paragraphs_scores' not in doc:
+            continue
+        # Also fills doc['paragraphs_length'], which is read below.
+        dup_remove(doc)
+        segmented_title = doc["segmented_title"]
+        title_len = len(segmented_title)
+        para_id = None
+        if doc_id is not None:
+            para_id = sample['documents'][doc_id]['most_related_para']
+        # para_id is only consulted when this is the answer document.
+        is_answer_doc = doc_id == d_idx
+        total_len = title_len + sum(doc['paragraphs_length'])
+        # add splitter
+        para_num = len(doc["segmented_paragraphs"])
+        total_len += para_num
+        if total_len <= MAX_P_LEN:
+            # The whole document fits: keep every paragraph.
+            incre_len = title_len
+            total_segmented_content = copy.deepcopy(segmented_title)
+            for p_idx, segmented_para in enumerate(doc["segmented_paragraphs"]):
+                if is_answer_doc and para_id > p_idx:
+                    # splitter + paragraph placed before the answer paragraph
+                    incre_len += 1 + len(segmented_para)
+                if is_answer_doc and para_id == p_idx:
+                    # splitter in front of the answer paragraph itself
+                    incre_len += 1
+                total_segmented_content += [splitter] + segmented_para
+            if is_answer_doc:
+                answer_start = incre_len + sample['answer_spans'][0][0]
+                answer_end = incre_len + sample['answer_spans'][0][1]
+                sample['answer_spans'][0][0] = answer_start
+                sample['answer_spans'][0][1] = answer_end
+            doc["segmented_paragraphs"] = [total_segmented_content]
+            doc["segmented_paragraphs_scores"] = [1.0]
+            doc['paragraphs_length'] = [total_len]
+            doc['paragraphs'] = [''.join(total_segmented_content)]
+            doc['most_related_para'] = 0
+            continue
+        # find topN paragraph id
+        para_infos = []
+        for p_idx, (para_tokens, para_scores) in \
+                enumerate(zip(doc['segmented_paragraphs'], doc['segmented_paragraphs_scores'])):
+            para_infos.append((para_tokens, para_scores, len(para_tokens), p_idx))
+        # highest score first; shorter paragraph wins a tie
+        para_infos.sort(key=lambda x: (-x[1], x[2]))
+        topN_idx = [para_info[-1] for para_info in para_infos[:topN]]
+        final_idx = []
+        total_len = title_len
+        if is_answer_doc and mode == "train":
+            # always keep the paragraph that holds the answer when training
+            final_idx.append(para_id)
+            total_len = title_len + 1 + doc['paragraphs_length'][para_id]
+        for sel_idx in topN_idx:
+            if total_len > MAX_P_LEN:
+                break
+            if is_answer_doc and sel_idx == para_id and mode == "train":
+                # already added above
+                continue
+            total_len += 1 + doc['paragraphs_length'][sel_idx]
+            final_idx.append(sel_idx)
+        total_segmented_content = copy.deepcopy(segmented_title)
+        # restore document order
+        final_idx.sort()
+        incre_len = title_len
+        for sel_idx in final_idx:
+            if is_answer_doc and sel_idx < para_id:
+                incre_len += 1 + doc['paragraphs_length'][sel_idx]
+            if is_answer_doc and sel_idx == para_id:
+                incre_len += 1
+            total_segmented_content += [splitter] + doc['segmented_paragraphs'][sel_idx]
+        if is_answer_doc:
+            answer_start = incre_len + sample['answer_spans'][0][0]
+            answer_end = incre_len + sample['answer_spans'][0][1]
+            sample['answer_spans'][0][0] = answer_start
+            sample['answer_spans'][0][1] = answer_end
+        doc["segmented_paragraphs"] = [total_segmented_content]
+        doc["segmented_paragraphs_scores"] = [1.0]
+        doc['paragraphs_length'] = [total_len]
+        doc['paragraphs'] = [''.join(total_segmented_content)]
+        doc['most_related_para'] = 0
+
+
+if __name__ == "__main__":
+    # Reads one JSON sample per line from stdin and writes the processed
+    # sample, one JSON object per line, to stdout.
+    # mode="train"/"dev"/"test"
+    mode = sys.argv[1]
+    for line in sys.stdin:
+        line = line.strip()
+        if line == "":
+            continue
+        try:
+            # No `encoding` argument: UTF-8 is already the default on
+            # Python 2, and Python 3.9+ rejects the keyword.
+            sample = json.loads(line)
+        except ValueError:
+            # json.JSONDecodeError is a ValueError subclass on Python 3.
+            # sys.stderr.write works on both Python 2 and Python 3, unlike
+            # the `print >>` statement.
+            sys.stderr.write(
+                "Invalid input json format - '{}' will be ignored\n".format(line))
+            continue
+        compute_paragraph_score(sample)
+        paragraph_selection(sample, mode)
+        # json.dumps has no `encoding` parameter on Python 3.
+        print(json.dumps(sample, ensure_ascii=False))
+
--- a/PaddleNLP/machine_reading_comprehension/preprocess.py
+++ b/PaddleNLP/machine_reading_comprehension/preprocess.py
+###############################################################################
+# ==============================================================================
+# Copyright 2017 Baidu.com, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+This module finds the most related paragraph of each document according to recall.
+"""
+
+import sys
+if sys.version[0] == '2':
+    reload(sys)
+    sys.setdefaultencoding("utf-8")
+import json
+from collections import Counter
+
+
+def precision_recall_f1(prediction, ground_truth):
+    """
+    Compute token-overlap precision, recall and f1-score.
+
+    An argument that is not a list is split on whitespace first. The overlap
+    is the multiset intersection of the two token sequences.
+    Args:
+        prediction: prediction string or list to be matched
+        ground_truth: golden string or list reference
+    Returns:
+        tuple (p, r, f1); (0, 0, 0) when no token is shared
+    Raises:
+        None
+    """
+    pred_tokens = prediction if isinstance(prediction, list) else prediction.split()
+    gold_tokens = ground_truth if isinstance(ground_truth, list) else ground_truth.split()
+    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
+    if overlap == 0:
+        return 0, 0, 0
+    precision = 1.0 * overlap / len(pred_tokens)
+    rec = 1.0 * overlap / len(gold_tokens)
+    f1 = (2 * precision * rec) / (precision + rec)
+    return precision, rec, f1
+
+
+def recall(prediction, ground_truth):
+    """
+    Return only the recall component of precision_recall_f1.
+    Args:
+        prediction: prediction string or list to be matched
+        ground_truth: golden string or list reference
+    Returns:
+        recall value
+    Raises:
+        None
+    """
+    _, rec, _ = precision_recall_f1(prediction, ground_truth)
+    return rec
+
+
+def f1_score(prediction, ground_truth):
+    """
+    Return only the f1 component of precision_recall_f1.
+    Args:
+        prediction: prediction string or list to be matched
+        ground_truth: golden string or list reference
+    Returns:
+        f1 value
+    Raises:
+        None
+    """
+    _, _, f1 = precision_recall_f1(prediction, ground_truth)
+    return f1
+
+
+def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+    """
+    Return the best score of the prediction against any of the references.
+    Args:
+        metric_fn: callable taking (prediction, ground_truth) and returning a score.
+        prediction: prediction string or list to be matched
+        ground_truths: iterable of golden references, each passed to metric_fn in turn
+    Returns:
+        the maximum of the scores returned by metric_fn
+    Raises:
+        ValueError: if ground_truths is empty (raised by max()).
+    """
+    return max(metric_fn(prediction, ground_truth) for ground_truth in ground_truths)
+
+
+def find_best_question_match(doc, question, with_score=False):
+    """
+    Pick the paragraph of a document that best matches the question.
+
+    Each paragraph in doc['segmented_paragraphs'] is scored with
+    metric_max_over_ground_truths(recall, paragraph, question), or 0 when the
+    question is empty. A higher score wins; on an equal score the shorter
+    paragraph wins. Index 0 is returned when no paragraph is selected.
+    Args:
+        doc: The document object.
+        question: The question tokens.
+        with_score: If True then the match score will be returned,
+            otherwise False.
+    Returns:
+        The index of the best match paragraph, if with_score=False,
+        otherwise returns a tuple of the index of the best match paragraph
+        and the match score of that paragraph.
+    """
+    best_idx, best_score, best_len = -1, 0, 0
+    for idx, tokens in enumerate(doc['segmented_paragraphs']):
+        if len(question) > 0:
+            score = metric_max_over_ground_truths(recall, tokens, question)
+        else:
+            score = 0
+        # Skip anything that is neither strictly better nor a shorter tie.
+        if score < best_score or (score == best_score and len(tokens) >= best_len):
+            continue
+        best_idx, best_score, best_len = idx, score, len(tokens)
+    # -1 means nothing was selected; fall back to the first paragraph.
+    best_idx = max(best_idx, 0)
+    return (best_idx, best_score) if with_score else best_idx
+
+
+def find_fake_answer(sample):
+    """
+    For each document, finds the most related paragraph based on recall,
+    then finds a span that maximize the f1_score compared with the gold answers
+    and uses this span as a fake answer span
+
+    The sample is modified in place: every doc gets 'most_related_para', and
+    the sample gets 'answer_docs', 'answer_spans', 'fake_answers' and
+    'match_scores', each of which is either empty or holds the single best
+    match found over the documents whose 'is_selected' is true.
+    Args:
+        sample: a sample in the dataset
+    Returns:
+        None
+    Raises:
+        None
+    """
+    # Step 1: per document, pick the paragraph with the highest recall against
+    # the gold answers; on equal recall the shorter paragraph wins.
+    for doc in sample['documents']:
+        most_related_para = -1
+        most_related_para_len = 999999
+        max_related_score = 0
+        for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']):
+            if len(sample['segmented_answers']) > 0:
+                related_score = metric_max_over_ground_truths(recall,
+                                                              para_tokens,
+                                                              sample['segmented_answers'])
+            else:
+                # no gold answer: leave most_related_para at -1
+                continue
+            if related_score > max_related_score \
+                    or (related_score == max_related_score
+                        and len(para_tokens) < most_related_para_len):
+                most_related_para = p_idx
+                most_related_para_len = len(para_tokens)
+                max_related_score = related_score
+        doc['most_related_para'] = most_related_para
+
+    sample['answer_docs'] = []
+    sample['answer_spans'] = []
+    sample['fake_answers'] = []
+    sample['match_scores'] = []
+
+    # Step 2: search the selected documents for the span with the best f1.
+    best_match_score = 0
+    best_match_d_idx, best_match_span = -1, [-1, -1]
+    best_fake_answer = None
+    # Union of all gold answer tokens, used to prune span start positions.
+    answer_tokens = set()
+    for segmented_answer in sample['segmented_answers']:
+        answer_tokens = answer_tokens | set([token for token in segmented_answer])
+    for d_idx, doc in enumerate(sample['documents']):
+        if not doc['is_selected']:
+            continue
+        if doc['most_related_para'] == -1:
+            # nothing was picked in step 1: fall back to the first paragraph
+            doc['most_related_para'] = 0
+        # Only the first 1000 tokens of the paragraph are searched.
+        most_related_para_tokens = doc['segmented_paragraphs'][doc['most_related_para']][:1000]
+        for start_tidx in range(len(most_related_para_tokens)):
+            # A span may only start on a token that occurs in some gold answer.
+            if most_related_para_tokens[start_tidx] not in answer_tokens:
+                continue
+            # Try end positions from the longest span down to the single token;
+            # end_tidx is inclusive.
+            for end_tidx in range(len(most_related_para_tokens) - 1, start_tidx - 1, -1):
+                span_tokens = most_related_para_tokens[start_tidx: end_tidx + 1]
+                if len(sample['segmented_answers']) > 0:
+                    match_score = metric_max_over_ground_truths(f1_score, span_tokens,
+                                                                sample['segmented_answers'])
+                else:
+                    match_score = 0
+                if match_score == 0:
+                    # stop trying shorter spans from this start position
+                    break
+                if match_score > best_match_score:
+                    best_match_d_idx = d_idx
+                    best_match_span = [start_tidx, end_tidx]
+                    best_match_score = match_score
+                    best_fake_answer = ''.join(span_tokens)
+    # Record a result only when some span actually overlaps a gold answer.
+    if best_match_score > 0:
+        sample['answer_docs'].append(best_match_d_idx)
+        sample['answer_spans'].append(best_match_span)
+        sample['fake_answers'].append(best_fake_answer)
+        sample['match_scores'].append(best_match_score)
+
+
+if __name__ == '__main__':
+    # Reads one JSON sample per line from stdin, annotates it with a fake
+    # answer span and writes it back to stdout as one JSON object per line.
+    for line in sys.stdin:
+        sample = json.loads(line)
+        find_fake_answer(sample)
+        # json.dumps has no `encoding` parameter on Python 3 (passing it
+        # raises TypeError); on Python 2 UTF-8 is already the default.
+        print(json.dumps(sample, ensure_ascii=False))
--- a/PaddleNLP/machine_reading_comprehension/rc_model.py
+++ b/PaddleNLP/machine_reading_comprehension/rc_model.py
@@ -22,6 +22,7 @@ import numpy as np


 def dropout(input, args):
+    """Dropout function"""
    if args.drop_rate:
        return layers.dropout(
            input,
@@ -33,10 +34,12 @@ def dropout(input, args):


 def bi_lstm_encoder(input_seq, gate_size, para_name, args):
-    # A bi-directional lstm encoder implementation.
-    # Linear transformation part for input gate, output gate, forget gate
-    # and cell activation vectors need be done outside of dynamic_lstm.
-    # So the output size is 4 times of gate_size.
+    """
+    A bi-directional lstm encoder implementation.
+    Linear transformation part for input gate, output gate, forget gate
+    and cell activation vectors need be done outside of dynamic_lstm.
+    So the output size is 4 times of gate_size.
+    """

    input_forward_proj = layers.fc(
        input=input_seq,
@@ -75,6 +78,7 @@ def get_data(input_name, lod_level, args):


 def embedding(input_ids, shape, args):
+    """Embedding layer"""
    input_embedding = layers.embedding(
        input=input_ids,
        size=shape,
@@ -85,6 +89,7 @@ def embedding(input_ids, shape, args):


 def encoder(input_embedding, para_name, hidden_size, args):
+    """Encoding layer"""
    encoder_out = bi_lstm_encoder(
        input_seq=input_embedding,
        gate_size=hidden_size,
@@ -94,6 +99,7 @@ def encoder(input_embedding, para_name, hidden_size, args):


 def attn_flow(q_enc, p_enc, p_ids_name, args):
+    """Bidirectional Attention layer"""
    tag = p_ids_name + "::"
    drnn = layers.DynamicRNN()
    with drnn.block():
@@ -123,7 +129,15 @@ def attn_flow(q_enc, p_enc, p_ids_name, args):
    return dropout(g, args)


+def fusion(g, args):
+    """Fusion layer: bi_lstm_encoder over g (para_name 'fusion'), then dropout"""
+    fused = bi_lstm_encoder(
+        args=args, para_name='fusion', gate_size=args.hidden_size, input_seq=g)
+    return dropout(fused, args)
+
+
 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size, para_name, args):
+    """Util function for pointer network"""
    def linear(inputs, para_name, args):
        return layers.fc(input=inputs,
                         size=size,
@@ -150,8 +164,8 @@ def lstm_step(x_t, hidden_t_prev, cell_t_prev, size, para_name, args):
    return hidden_t, cell_t


-#point network
 def point_network_decoder(p_vec, q_vec, hidden_size, args):
+    """Output layer - pointer network"""
    tag = 'pn_decoder:'
    init_random = fluid.initializer.Normal(loc=0.0, scale=1.0)

@@ -258,20 +272,15 @@ def point_network_decoder(p_vec, q_vec, hidden_size, args):
    return start_prob, end_prob


-def fusion(g, args):
-    m = bi_lstm_encoder(
-        input_seq=g, gate_size=args.hidden_size, para_name='fusion', args=args)
-    return dropout(m, args)
-
-
 def rc_model(hidden_size, vocab, args):
+    """This function build the whole BiDAF network"""
    emb_shape = [vocab.size(), vocab.embed_dim]
    start_labels = layers.data(
        name="start_lables", shape=[1], dtype='float32', lod_level=1)
    end_labels = layers.data(
        name="end_lables", shape=[1], dtype='float32', lod_level=1)

-    # stage 1:encode 
+    # stage 1:setup input data, embedding table & encode
    q_id0 = get_data('q_id0', 1, args)

    q_ids = get_data('q_ids', 2, args)
@@ -302,6 +311,7 @@ def rc_model(hidden_size, vocab, args):
    start_probs, end_probs = point_network_decoder(
        p_vec=p_vec, q_vec=q_vec, hidden_size=hidden_size, args=args)

+    # calculate model loss
    cost0 = layers.sequence_pool(
        layers.cross_entropy(
            input=start_probs, label=start_labels, soft_label=True),

--- a/PaddleNLP/machine_reading_comprehension/run.py
+++ b/PaddleNLP/machine_reading_comprehension/run.py
@@ -133,6 +133,7 @@ def LodTensor_Array(lod_tensor):


 def print_para(train_prog, train_exe, logger, args):
+    """Print para info for debug purpose"""
    if args.para_print:
        param_list = train_prog.block(0).all_parameters()
        param_name_list = [p.name for p in param_list]
@@ -171,7 +172,8 @@ def find_best_answer_for_passage(start_probs, end_probs, passage_len):
    return (best_start, best_end), max_prob


-def find_best_answer_for_inst(sample, start_prob, end_prob, inst_lod):
+def find_best_answer_for_inst(sample, start_prob, end_prob, inst_lod,
+                              para_prior_scores=(0.44, 0.23, 0.15, 0.09, 0.07)):
    """
    Finds the best answer for a sample given start_prob and end_prob for each position.
    This will call find_best_answer_for_passage because there are multiple passages in a sample
@@ -190,6 +192,10 @@ def find_best_answer_for_inst(sample, start_prob, end_prob, inst_lod):
        answer_span, score = find_best_answer_for_passage(
            start_prob[passage_start:passage_end],
            end_prob[passage_start:passage_end], passage_len)
+        if para_prior_scores is not None:
+            # the Nth prior score = the Number of training samples whose gold answer comes
+            #  from the Nth paragraph / the number of the training samples
+            score *= para_prior_scores[p_idx]
        if score > best_score:
            best_score = score
            best_p_idx = p_idx
@@ -205,16 +211,12 @@ def find_best_answer_for_inst(sample, start_prob, end_prob, inst_lod):
 def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order,
               place, dev_count, vocab, brc_data, logger, args):
    """
-        
+    do inference with given inference_program
    """
-    build_strategy = fluid.BuildStrategy()
-    build_strategy.enable_inplace = False
-    build_strategy.memory_optimize = False
    parallel_executor = fluid.ParallelExecutor(
        main_program=inference_program,
        use_cuda=bool(args.use_gpu),
-        loss_name=avg_cost.name,
-        build_strategy=build_strategy)
+        loss_name=avg_cost.name)
    print_para(inference_program, parallel_executor, logger, args)

    # Use test set as validation each pass
@@ -277,7 +279,7 @@ def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order,
                    'question_type': sample['question_type'],
                    'answers': [best_answer],
                    'entity_answers': [[]],
-                    'yesno_answers': [best_span]
+                    'yesno_answers': []
                }
                pred_answers.append(pred)
                if 'answers' in sample:
@@ -296,7 +298,7 @@ def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order,
    if result_dir is not None and result_prefix is not None:
        if not os.path.exists(args.result_dir):
            os.makedirs(args.result_dir)
-        result_file = os.path.join(result_dir, result_prefix + 'json')
+        result_file = os.path.join(result_dir, result_prefix + '.json')
        with open(result_file, 'w') as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
@@ -328,6 +330,7 @@ def l2_loss(train_prog):


 def train(logger, args):
+    """train a model"""
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        if six.PY2:
@@ -489,6 +492,7 @@ def train(logger, args):


 def evaluate(logger, args):
+    """evaluate a specific model using devset"""
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
@@ -527,8 +531,8 @@ def evaluate(logger, args):

            inference_program = main_program.clone(for_test=True)
            eval_loss, bleu_rouge = validation(
-                inference_program, avg_cost, s_probs, e_probs, match,
-                feed_order, place, dev_count, vocab, brc_data, logger, args)
+                inference_program, avg_cost, s_probs, e_probs, match, feed_order,
+                place, dev_count, vocab, brc_data, logger, args)
            logger.info('Dev eval loss {}'.format(eval_loss))
            logger.info('Dev eval result: {}'.format(bleu_rouge))
            logger.info('Predicted answers are saved to {}'.format(
@@ -536,6 +540,7 @@ def evaluate(logger, args):


 def predict(logger, args):
+    """do inference on the test dataset """
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)

--- a/PaddleNLP/machine_reading_comprehension/run.sh
+++ b/PaddleNLP/machine_reading_comprehension/run.sh
-export CUDA_VISIBLE_DEVICES=0
-python run.py   \
--trainset 'data/preprocessed/trainset/search.train.json' \
-           'data/preprocessed/trainset/zhidao.train.json' \
--devset 'data/preprocessed/devset/search.dev.json' \
-         'data/preprocessed/devset/zhidao.dev.json' \
--testset 'data/preprocessed/testset/search.test.json' \
-          'data/preprocessed/testset/zhidao.test.json' \
--vocab_dir 'data/vocab' \
--use_gpu true \
--save_dir ./models \
--pass_num 10 \
--learning_rate 0.001 \
--batch_size 32 \
--embed_size 300 \
--hidden_size 150 \
--max_p_num 5 \
--max_p_len 500 \
--max_q_len 60 \
--max_a_len 200 \
--weight_decay 0.0001 \
--drop_rate 0.2 $@\
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=1
+
+paragraph_extraction ()
+{
+    # Usage: paragraph_extraction SOURCE_DIR TARGET_DIR
+    # Runs paragraph_extraction.py over the search/zhidao files of the
+    # train, dev and test sets found under SOURCE_DIR and writes each result
+    # to the same relative path under TARGET_DIR.
+    SOURCE_DIR=$1
+    TARGET_DIR=$2
+    echo "Start paragraph extraction, this may take a few hours"
+    echo "Source dir: $SOURCE_DIR"
+    echo "Target dir: $TARGET_DIR"
+
+    for split in train dev test; do
+        # Every expansion is quoted so that paths containing spaces work.
+        mkdir -p "$TARGET_DIR/${split}set"
+        echo "Processing ${split}set"
+        for domain in search zhidao; do
+            # Redirect the input file directly instead of piping it through cat.
+            python paragraph_extraction.py "$split" \
+                < "$SOURCE_DIR/${split}set/${domain}.${split}.json" \
+                > "$TARGET_DIR/${split}set/${domain}.${split}.json"
+        done
+    done
+    echo "Paragraph extraction done!"
+}
+
+
+# Dispatch on the first command-line argument.
+PROCESS_NAME="$1"
+case $PROCESS_NAME in
+    --para_extraction)
+    # Start paragraph extraction
+    if [ ! -d data/preprocessed ]; then
+        echo "Please download the preprocessed data first (See README - Preprocess)"
+        exit 1
+    fi
+    paragraph_extraction data/preprocessed data/extracted
+    ;;
+    --prepare|--train|--evaluate|--predict)
+        # Start Paddle baseline
+        # "$@" forwards every argument unchanged, even ones containing spaces.
+        python run.py "$@"
+    ;;
+    *)
+        # Unknown or missing option: print the usage and report failure.
+        echo "Usage: $0 {--para_extraction|--prepare|--train|--evaluate|--predict}"
+        exit 1
+    ;;
+esac
--- a/PaddleNLP/machine_reading_comprehension/vocab.py
+++ b/PaddleNLP/machine_reading_comprehension/vocab.py
@@ -37,9 +37,10 @@ class Vocab(object):

        self.pad_token = '<blank>'
        self.unk_token = '<unk>'
+        self.split_token = '<splitter>'

        self.initial_tokens = initial_tokens if initial_tokens is not None else []
-        self.initial_tokens.extend([self.pad_token, self.unk_token])
+        self.initial_tokens.extend([self.pad_token, self.unk_token, self.split_token])
        for token in self.initial_tokens:
            self.add(token)

@@ -137,7 +138,7 @@ class Vocab(object):
        """
        self.embed_dim = embed_dim
        self.embeddings = np.random.rand(self.size(), embed_dim)
-        for token in [self.pad_token, self.unk_token]:
+        for token in [self.pad_token, self.unk_token, self.split_token]:
            self.embeddings[self.get_id(token)] = np.zeros([self.embed_dim])

    def load_pretrained_embeddings(self, embedding_path):

--- a/PaddleRec/README.md
+++ b/PaddleRec/README.md
@@ -13,3 +13,6 @@ PaddleRec
 - [SequenceSemanticRetrieval](https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/ssr)
 - [DeepCTR](https://github.com/PaddlePaddle/models/blob/develop/PaddleRec/ctr/README.cn.md)
 - [Multiview-Simnet](https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/multiview_simnet)
+- [Word2Vec](https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/word2vec)
+- [GraphNeuralNetwork](https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/gnn)
+- [DeepInterestNetwork](https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/din)
--- a/PaddleRec/gnn/README.md
+++ b/PaddleRec/gnn/README.md
@@ -78,10 +78,10 @@ CUDA_VISIBLE_DEVICES=1 python -u train.py --use_cuda 1 > log.txt 2>&1 &

 cpu 单机训练
 ``` bash
-python -u train.py --use_cuda 0 > log.txt 2>&1 &
+CPU_NUM=1 python -u train.py --use_cuda 0 > log.txt 2>&1 &
 ```

-值得注意的是上述单卡训练可以通过加--parallel 1参数使用Parallel Executor来进行加速
+值得注意的是上述单卡训练可以通过加--use_parallel 1参数使用Parallel Executor来进行加速。


 ## 训练结果示例