#!/usr/bin/env python
#coding=utf-8

import paddle.v2 as paddle
from paddle.v2.layer import parse_network
import basic_modules
from config import ModelConfig

__all__ = ["GNR"]


def build_pretrained_embedding(name, data_type, emb_dim, emb_drop=0.):
    """create word a embedding layer which loads pre-trained embeddings.

    Arguments:
        - name:       The name of the data layer which accepts one-hot input.
        - data_type:  PaddlePaddle's data type for data layer.
        - emb_dim:    The path to the data files.
    """

    return paddle.layer.embedding(
        input=paddle.layer.data(name=name, type=data_type),
        size=emb_dim,
        param_attr=paddle.attr.Param(name="GloveVectors", is_static=True),
        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=emb_drop), )
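
# NOTE (a sketch, not part of the original model code): the "GloveVectors"
# parameter above is static (is_static=True) and is expected to be filled
# with pre-trained vectors before training or inference, for example:
#
#   parameters = paddle.parameters.create(GNR(ModelConfig))
#   parameters.set("GloveVectors", glove_matrix)
#
# where `glove_matrix` is a hypothetical numpy array of shape
# [vocab_size, emb_dim] loaded from a GloVe file.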


def encode_question(input_embedding,
                    lstm_hidden_dim,
                    depth,
                    passage_indep_embedding_dim,
                    prefix=""):
    """build question encoding by using bidirectional LSTM.

    Each question word is encoded by running a stack of bidirectional LSTMs
    over the question word embeddings, producing hidden states. The hidden
    states are then used to compute a passage-independent question embedding.

    The final question encoding is constructed by concatenating the final
    hidden states of the forward and backward LSTMs and the passage-independent
    embedding.

    Arguments:
        - input_embedding:    The question word embeddings.
        - lstm_hidden_dim:  The dimension of bi-directional LSTM.
        - depth:  The depth of stacked bi-directional LSTM.
        - passage_indep_embedding_dim:  The dimension of passage-independent
                                        embedding.
        - prefix:    A string which will be added to the name of each layer
                     created in this function. Each layer in a network must
                     have a unique name, so the prefix allows this function to
                     be called multiple times.
    """
    # stacked bi-directional LSTM to process question embeddings.
    lstm_final, lstm_outs = basic_modules.stacked_bidirectional_lstm(
        input_embedding, lstm_hidden_dim, depth, 0., prefix)

    # compute passage-independent embeddings.
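    # Each question word's hidden state is projected into the
    # passage-independent space ("candidates") and also scored by a
    # sequence-level softmax over the question ("weights"); the weighted
    # candidates are then sum-pooled into a single fixed-size vector.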
    candidates = paddle.layer.fc(
        input=lstm_outs,
        bias_attr=False,
        size=passage_indep_embedding_dim,
        act=paddle.activation.Linear())
    weights = paddle.layer.fc(
        input=lstm_outs,
        size=1,
        bias_attr=False,
        act=paddle.activation.SequenceSoftmax())
    weighted_candidates = paddle.layer.scaling(input=candidates, weight=weights)
    passage_indep_embedding = paddle.layer.pooling(
        input=weighted_candidates, pooling_type=paddle.pooling.Sum())

    return paddle.layer.concat(
        input=[lstm_final, passage_indep_embedding]), lstm_outs


def question_aligned_passage_embedding(question_lstm_outs, document_embeddings,
                                       passage_aligned_embedding_dim):
    """create question aligned passage embedding.

    Arguments:
        - question_lstm_outs:    The outputs of the LSTM that processes the
                                 question word embeddings.
        - document_embeddings:   The document embeddings.
        - passage_aligned_embedding_dim:    The dimension of passage aligned
                                            embedding.
    """

    def outer_sentence_step(document_embeddings, question_lstm_outs,
                            passage_aligned_embedding_dim):
        """step function for PaddlePaddle's recurrent_group.

        In this function, the original input document_embeddings is scattered
        from a nested sequence into a sequence by PaddlePaddle's
        recurrent_group. The step function iterates over each sentence in the
        document.

        Arguments:
            - document_embeddings:   The word embeddings of the document.
            - question_lstm_outs:    The outputs of the LSTM that processes
                                     the question word embeddings.
            - passage_aligned_embedding_dim:    The dimension of passage aligned
                                                embedding.
        """

        def inner_word_step(word_embedding, question_lstm_outs,
                            question_outs_proj, passage_aligned_embedding_dim):
            """
            In this recurrent_group, the sentence embedding has been scattered
            into word embeddings. The step function iterates over each word of
            one sentence in the document.

            Arguments:
                - word_embedding:    The embedding of the current document
                                     word.
                - question_lstm_outs:    The outputs of the LSTM that
                                         processes the question word
                                         embeddings.
                - question_outs_proj:    The projection of question_lstm_outs
                                         into a new hidden space.
                - passage_aligned_embedding_dim:    The dimension of passage
                                                    aligned embedding.
            """

            doc_word_expand = paddle.layer.expand(
                input=word_embedding,
                expand_as=question_lstm_outs,
                expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)

            weights = paddle.layer.fc(
                input=[question_lstm_outs, doc_word_expand],
                size=1,
                bias_attr=False,
                act=paddle.activation.SequenceSoftmax())
            weighted_candidates = paddle.layer.scaling(
                input=question_outs_proj, weight=weights)
            return paddle.layer.pooling(
                input=weighted_candidates, pooling_type=paddle.pooling.Sum())

        question_outs_proj = paddle.layer.fc(
            input=question_lstm_outs,
            bias_attr=False,
            size=passage_aligned_embedding_dim)
        return paddle.layer.recurrent_group(
            input=[
                paddle.layer.SubsequenceInput(document_embeddings),
                paddle.layer.StaticInput(question_lstm_outs),
                paddle.layer.StaticInput(question_outs_proj),
                passage_aligned_embedding_dim,
            ],
            step=inner_word_step,
            name="iter_over_word")

    return paddle.layer.recurrent_group(
        input=[
            paddle.layer.SubsequenceInput(document_embeddings),
            paddle.layer.StaticInput(question_lstm_outs),
            passage_aligned_embedding_dim
        ],
        step=outer_sentence_step,
        name="iter_over_sen")


def encode_documents(input_embedding, same_as_question, question_vector,
                     question_lstm_outs, passage_indep_embedding_dim, prefix):
    """Build the final question-aware document embeddings.

    Each word in the document is represented as the concatenation of its word
    vector, the question vector, boolean features indicating whether the word
    appears in the question or is repeated, and a question-aligned embedding.

    Arguments:
        - input_embedding:   The word embeddings of the document.
        - same_as_question:  The boolean features indicating if a word appears
                             in the question or is repeated.
        - question_vector:   The final question encoding returned by
                             encode_question.
        - question_lstm_outs: The outputs of the LSTM that processes the
                              question word embeddings.
        - passage_indep_embedding_dim:  The dimension of the passage-independent
                                        embedding.
        - prefix:    The prefix which will be added to the name of each layer
                     created in this function.
    """

    question_expanded = paddle.layer.expand(
        input=question_vector,
        expand_as=input_embedding,
        expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)
    question_aligned_embedding = question_aligned_passage_embedding(
        question_lstm_outs, input_embedding, passage_indep_embedding_dim)
    return paddle.layer.concat(input=[
        input_embedding, question_expanded, same_as_question,
        question_aligned_embedding
    ])


def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
                  is_infer):
    """Search the answer from the document.

    The search process of this layer begins with selecting a target sequence
    from a nested sequence by using paddle.layer.kmax_seq_score and
    paddle.layer.sub_nested_seq. In the first search step, the top beam_size
    sequences with the highest scores, the indices of these top-k sequences in
    the original nested sequence, and the ground truth (also called gold)
    together form a triple that makes up the first beam.

    Then, start and end positions are searched. In each of these searches, the
    top-k positions with the highest scores are selected, and the
    sub-sequences from the selected start positions to the ends of the
    sentences are sliced out by paddle.layer.seq_slice for the next search
    step.

    Finally, paddle.layer.cross_entropy_over_beam takes all the beam
    expansions, which contain the candidate targets found along the three-step
    search, and computes the cross entropy over the expanded beams, with all
    the candidates in the beam serving as the normalization factor.

    Note that if the gold falls off the beam at search step t, the cost is
    calculated over the beam at step t.

    Arguments:
        - doc_lstm_outs:    The outputs of the LSTM that processes the
                            document words.
        - sentence_idx:    Ground truth indicating the sentence index of the
                           answer in the document.
        - start_idx:    Ground truth indicating the start index of the answer
                        span in the sentence.
        - end_idx:    Ground truth indicating the end index of the answer span
                      in the sentence.
        - config:    The model configuration.
        - is_infer:    A boolean indicating whether the network is built for
                       inference or for training.
    """

    last_state_of_sentence = paddle.layer.last_seq(
        input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
    sentence_scores = paddle.layer.fc(
        input=last_state_of_sentence,
        size=1,
        bias_attr=False,
        act=paddle.activation.Linear())
    topk_sentence_ids = paddle.layer.kmax_seq_score(
        input=sentence_scores, beam_size=config.beam_size)
    topk_sen = paddle.layer.sub_nested_seq(
        input=doc_lstm_outs, selected_indices=topk_sentence_ids)

    # expand beam to search start positions on selected sentences
    start_pos_scores = paddle.layer.fc(
        input=topk_sen,
        size=1,
        layer_attr=paddle.attr.ExtraLayerAttribute(
            error_clipping_threshold=5.0),
        bias_attr=False,
        act=paddle.activation.Linear())
    topk_start_pos_ids = paddle.layer.kmax_seq_score(
        input=start_pos_scores, beam_size=config.beam_size)
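    # slice each selected sentence from the chosen start position to its end,
    # so that end positions are searched only within these start spans.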
    topk_start_spans = paddle.layer.seq_slice(
        input=topk_sen, starts=topk_start_pos_ids, ends=None)

    # expand beam to search end positions on selected start spans
    _, end_span_embedding = basic_modules.stacked_bidirectional_lstm(
        topk_start_spans, config.lstm_hidden_dim, config.lstm_depth,
        config.lstm_hidden_droprate, "__end_span_embeddings__")
    end_pos_scores = paddle.layer.fc(
        input=end_span_embedding,
        size=1,
        bias_attr=False,
        act=paddle.activation.Linear())
    topk_end_pos_ids = paddle.layer.kmax_seq_score(
        input=end_pos_scores, beam_size=config.beam_size)

    if is_infer:
        return [
            sentence_scores, topk_sentence_ids, start_pos_scores,
            topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
        ]
    else:
        return paddle.layer.cross_entropy_over_beam(input=[
            paddle.layer.BeamInput(sentence_scores, topk_sentence_ids,
                                   sentence_idx),
            paddle.layer.BeamInput(start_pos_scores, topk_start_pos_ids,
                                   start_idx),
            paddle.layer.BeamInput(end_pos_scores, topk_end_pos_ids, end_idx)
        ])


def GNR(config, is_infer=False):
    """Build the globally normalized reader model.

    Arguments:
        - config:    The model configuration.
        - is_infer:    A boolean indicating whether the network is built for
                       inference or for training.
    """

    # encode question words
    question_embeddings = build_pretrained_embedding(
        "question",
        paddle.data_type.integer_value_sequence(config.vocab_size),
        config.embedding_dim, config.embedding_droprate)
    question_vector, question_lstm_outs = encode_question(
        question_embeddings, config.lstm_hidden_dim, config.lstm_depth,
        config.passage_indep_embedding_dim, "__ques")

    # encode document words
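    # "documents" is a nested sequence: each document is a sequence of
    # sentences and each sentence a sequence of word ids; same_as_question
    # provides a per-word indicator feature.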
    document_embeddings = build_pretrained_embedding(
        "documents",
        paddle.data_type.integer_value_sub_sequence(config.vocab_size),
        config.embedding_dim, config.embedding_droprate)
    same_as_question = paddle.layer.data(
        name="same_as_question",
        type=paddle.data_type.dense_vector_sub_sequence(1))

    document_words_encoding = encode_documents(
        document_embeddings, same_as_question, question_vector,
        question_lstm_outs, config.passage_indep_embedding_dim, "__doc")

    doc_lstm_outs = basic_modules.stacked_bidirectional_lstm_by_nested_seq(
        document_words_encoding, config.lstm_depth, config.lstm_hidden_dim,
        "__doc_lstm")

    # search the answer.
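    # sen_idx, start_idx and end_idx hold the gold sentence, start and end
    # positions consumed by cross_entropy_over_beam during training.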
    sentence_idx = paddle.layer.data(
        name="sen_idx", type=paddle.data_type.integer_value(1))
    start_idx = paddle.layer.data(
        name="start_idx", type=paddle.data_type.integer_value(1))
    end_idx = paddle.layer.data(
        name="end_idx", type=paddle.data_type.integer_value(1))
    return search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx,
                         config, is_infer)


if __name__ == "__main__":
    print(parse_network(GNR(ModelConfig)))
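
# A minimal training sketch (an assumption, not part of the original file):
# `train_reader` below is a hypothetical reader yielding, per sample, the
# fields of the data layers declared above ("question", "documents",
# "same_as_question", "sen_idx", "start_idx", "end_idx"), and the loop uses
# the standard paddle.v2 trainer:
#
#   paddle.init(use_gpu=False, trainer_count=1)
#   cost = GNR(ModelConfig)
#   parameters = paddle.parameters.create(cost)
#   optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
#   trainer = paddle.trainer.SGD(
#       cost=cost, parameters=parameters, update_equation=optimizer)
#   trainer.train(
#       reader=paddle.batch(train_reader, batch_size=32), num_passes=10)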