#!/usr/bin/env python
#coding=utf-8

import paddle.v2 as paddle
from paddle.v2.layer import parse_network
import basic_modules
from config import ModelConfig

__all__ = ["GNR"]


def build_pretrained_embedding(name, data_type, emb_dim, emb_drop=0.):
    """create word a embedding layer which loads pre-trained embeddings.

    Arguments:
        - name:       The name of the data layer which accepts one-hot input.
        - data_type:  PaddlePaddle's data type for data layer.
        - emb_dim:    The path to the data files.
    """

    return paddle.layer.embedding(
        input=paddle.layer.data(name=name, type=data_type),
        size=emb_dim,
        param_attr=paddle.attr.Param(name="GloveVectors", is_static=True),
        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=emb_drop), )
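
# NOTE (a sketch, not part of the original model code): the "GloveVectors"
# parameter above is static (is_static=True) and is expected to be filled
# with pre-trained vectors before training or inference, for example:
#
#   parameters = paddle.parameters.create(GNR(ModelConfig))
#   parameters.set("GloveVectors", glove_matrix)
#
# where `glove_matrix` is a hypothetical numpy array of shape
# [vocab_size, emb_dim] loaded from a GloVe file.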


def encode_question(input_embedding,
                    lstm_hidden_dim,
                    depth,
                    passage_indep_embedding_dim,
                    prefix=""):
    """build question encoding by using bidirectional LSTM.

    Each question word is encoded by running a stack of bidirectional LSTMs
    over the question word embeddings, producing hidden states. The hidden
    states are then used to compute a passage-independent question embedding.

    The final question encoding is constructed by concatenating the final
    hidden states of the forward and backward LSTMs and the passage-independent
    embedding.

    Arguments:
        - input_embedding:    The question word embeddings.
        - lstm_hidden_dim:  The dimension of bi-directional LSTM.
        - depth:  The depth of stacked bi-directional LSTM.
        - passage_indep_embedding_dim:  The dimension of passage-independent
                                        embedding.
        - prefix:    A string which will be added to the name of each layer
                     created in this function. Each layer in a network must
                     have a unique name, so the prefix allows this function to
                     be called multiple times.
    """
    # stacked bi-directional LSTM to process question embeddings.
    lstm_final, lstm_outs = basic_modules.stacked_bidirectional_lstm(
        input_embedding, lstm_hidden_dim, depth, 0., prefix)

    # compute passage-independent embeddings.
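    # Each question word's hidden state is projected into the
    # passage-independent space ("candidates") and also scored by a
    # sequence-level softmax over the question ("weights"); the weighted
    # candidates are then sum-pooled into a single fixed-size vector.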
    candidates = paddle.layer.fc(
        input=lstm_outs,
        bias_attr=False,
        size=passage_indep_embedding_dim,
        act=paddle.activation.Linear())
    weights = paddle.layer.fc(
        input=lstm_outs,
        size=1,
        bias_attr=False,
        act=paddle.activation.SequenceSoftmax())
    weighted_candidates = paddle.layer.scaling(input=candidates, weight=weights)
    passage_indep_embedding = paddle.layer.pooling(
        input=weighted_candidates, pooling_type=paddle.pooling.Sum())

    return paddle.layer.concat(
        input=[lstm_final, passage_indep_embedding]), lstm_outs


def question_aligned_passage_embedding(question_lstm_outs, document_embeddings,
                                       passage_aligned_embedding_dim):
    """create question aligned passage embedding.

    Arguments:
        - question_lstm_outs:    The outputs of the LSTM that processes the
                                 question word embeddings.
        - document_embeddings:   The document embeddings.
        - passage_aligned_embedding_dim:    The dimension of passage aligned
                                            embedding.
    """

    def outer_sentence_step(document_embeddings, question_lstm_outs,
                            passage_aligned_embedding_dim):
        """step function for PaddlePaddle's recurrent_group.

        In this function, the original input document_embeddings is scattered
        from a nested sequence into a sequence by PaddlePaddle's
        recurrent_group. The step function iterates over each sentence in the
        document.

        Arguments:
            - document_embeddings:   The word embeddings of the document.
            - question_lstm_outs:    The outputs of the LSTM that processes
                                     the question word embeddings.
            - passage_aligned_embedding_dim:    The dimension of passage aligned
                                                embedding.
        """

        def inner_word_step(word_embedding, question_lstm_outs,
                            question_outs_proj, passage_aligned_embedding_dim):
            """
            In this recurrent_group, the sentence embedding has been scattered
            into word embeddings. The step function iterates over each word of
            one sentence in the document.

            Arguments:
                - word_embedding:    The embedding of the current document
                                     word.
                - question_lstm_outs:    The outputs of the LSTM that
                                         processes the question word
                                         embeddings.
                - question_outs_proj:    The projection of question_lstm_outs
                                         into a new hidden space.
                - passage_aligned_embedding_dim:    The dimension of passage
                                                    aligned embedding.
            """

            doc_word_expand = paddle.layer.expand(
                input=word_embedding,
                expand_as=question_lstm_outs,
                expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)

            weights = paddle.layer.fc(
                input=[question_lstm_outs, doc_word_expand],
                size=1,
                bias_attr=False,
                act=paddle.activation.SequenceSoftmax())
            weighted_candidates = paddle.layer.scaling(
                input=question_outs_proj, weight=weights)
            return paddle.layer.pooling(
                input=weighted_candidates, pooling_type=paddle.pooling.Sum())

        question_outs_proj = paddle.layer.fc(
            input=question_lstm_outs,
            bias_attr=False,
            size=passage_aligned_embedding_dim)
        return paddle.layer.recurrent_group(
            input=[
                paddle.layer.SubsequenceInput(document_embeddings),
                paddle.layer.StaticInput(question_lstm_outs),
                paddle.layer.StaticInput(question_outs_proj),
                passage_aligned_embedding_dim,
            ],
            step=inner_word_step,
            name="iter_over_word")

    return paddle.layer.recurrent_group(
        input=[
            paddle.layer.SubsequenceInput(document_embeddings),
            paddle.layer.StaticInput(question_lstm_outs),
            passage_aligned_embedding_dim
        ],
        step=outer_sentence_step,
        name="iter_over_sen")


def encode_documents(input_embedding, same_as_question, question_vector,
                     question_lstm_outs, passage_indep_embedding_dim, prefix):
    """Build the final question-aware document embeddings.

    Each word in the document is represented as the concatenation of its word
    vector, the question vector, boolean features indicating whether the word
    appears in the question or is repeated, and a question-aligned embedding.

    Arguments:
        - input_embedding:   The word embeddings of the document.
        - same_as_question:  The boolean features indicating if a word appears
                             in the question or is repeated.
        - question_vector:   The final question encoding returned by
                             encode_question.
        - question_lstm_outs: The outputs of the LSTM that processes the
                              question word embeddings.
        - passage_indep_embedding_dim:  The dimension of the passage-independent
                                        embedding.
        - prefix:    The prefix which will be added to the name of each layer
                     created in this function.
    """

    question_expanded = paddle.layer.expand(
        input=question_vector,
        expand_as=input_embedding,
        expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)
    question_aligned_embedding = question_aligned_passage_embedding(
        question_lstm_outs, input_embedding, passage_indep_embedding_dim)
    return paddle.layer.concat(input=[
        input_embedding, question_expanded, same_as_question,
        question_aligned_embedding
    ])


def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
                  is_infer):
    """Search the answer from the document.

    The search process of this layer begins with selecting a target sequence
    from a nested sequence by using paddle.layer.kmax_seq_score and
    paddle.layer.sub_nested_seq. In the first search step, the top beam_size
    sequences with the highest scores, the indices of these top-k sequences in
    the original nested sequence, and the ground truth (also called gold)
    together form a triple that makes up the first beam.

    Then, start and end positions are searched. In each of these searches, the
    top-k positions with the highest scores are selected, and the
    sub-sequences from the selected start positions to the ends of the
    sentences are sliced out by paddle.layer.seq_slice for the next search
    step.

    Finally, paddle.layer.cross_entropy_over_beam takes all the beam
    expansions, which contain the candidate targets found along the three-step
    search, and computes the cross entropy over the expanded beams, with all
    the candidates in the beam serving as the normalization factor.

    Note that if the gold falls off the beam at search step t, the cost is
    calculated over the beam at step t.

    Arguments:
        - doc_lstm_outs:    The outputs of the LSTM that processes the
                            document words.
        - sentence_idx:    Ground truth indicating the sentence index of the
                           answer in the document.
        - start_idx:    Ground truth indicating the start index of the answer
                        span in the sentence.
        - end_idx:    Ground truth indicating the end index of the answer span
                      in the sentence.
        - config:    The model configuration.
        - is_infer:    A boolean indicating whether the network is built for
                       inference or for training.
    """

    last_state_of_sentence = paddle.layer.last_seq(
        input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
    sentence_scores = paddle.layer.fc(
        input=last_state_of_sentence,
        size=1,
        bias_attr=False,
        act=paddle.activation.Linear())
    topk_sentence_ids = paddle.layer.kmax_seq_score(
        input=sentence_scores, beam_size=config.beam_size)
    topk_sen = paddle.layer.sub_nested_seq(
        input=doc_lstm_outs, selected_indices=topk_sentence_ids)

    # expand beam to search start positions on selected sentences
    start_pos_scores = paddle.layer.fc(
        input=topk_sen,
        size=1,
        layer_attr=paddle.attr.ExtraLayerAttribute(
            error_clipping_threshold=5.0),
        bias_attr=False,
        act=paddle.activation.Linear())
    topk_start_pos_ids = paddle.layer.kmax_seq_score(
        input=start_pos_scores, beam_size=config.beam_size)
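    # slice each selected sentence from the chosen start position to its end,
    # so that end positions are searched only within these start spans.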
    topk_start_spans = paddle.layer.seq_slice(
        input=topk_sen, starts=topk_start_pos_ids, ends=None)

    # expand beam to search end positions on selected start spans
    _, end_span_embedding = basic_modules.stacked_bidirectional_lstm(
        topk_start_spans, config.lstm_hidden_dim, config.lstm_depth,
        config.lstm_hidden_droprate, "__end_span_embeddings__")
    end_pos_scores = paddle.layer.fc(
        input=end_span_embedding,
        size=1,
        bias_attr=False,
        act=paddle.activation.Linear())
    topk_end_pos_ids = paddle.layer.kmax_seq_score(
        input=end_pos_scores, beam_size=config.beam_size)

    if is_infer:
        return [
            sentence_scores, topk_sentence_ids, start_pos_scores,
            topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
        ]
    else:
        return paddle.layer.cross_entropy_over_beam(input=[
            paddle.layer.BeamInput(sentence_scores, topk_sentence_ids,
                                   sentence_idx),
            paddle.layer.BeamInput(start_pos_scores, topk_start_pos_ids,
                                   start_idx),
            paddle.layer.BeamInput(end_pos_scores, topk_end_pos_ids, end_idx)
        ])


def GNR(config, is_infer=False):
    """Build the globally normalized reader model.

    Arguments:
        - config:    The model configuration.
        - is_infer:    A boolean indicating whether the network is built for
                       inference or for training.
    """

    # encode question words
    question_embeddings = build_pretrained_embedding(
        "question",
        paddle.data_type.integer_value_sequence(config.vocab_size),
        config.embedding_dim, config.embedding_droprate)
    question_vector, question_lstm_outs = encode_question(
        question_embeddings, config.lstm_hidden_dim, config.lstm_depth,
        config.passage_indep_embedding_dim, "__ques")

    # encode document words
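    # "documents" is a nested sequence: each document is a sequence of
    # sentences and each sentence a sequence of word ids; same_as_question
    # provides a per-word indicator feature.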
    document_embeddings = build_pretrained_embedding(
        "documents",
        paddle.data_type.integer_value_sub_sequence(config.vocab_size),
        config.embedding_dim, config.embedding_droprate)
    same_as_question = paddle.layer.data(
        name="same_as_question",
        type=paddle.data_type.dense_vector_sub_sequence(1))

    document_words_encoding = encode_documents(
        document_embeddings, same_as_question, question_vector,
        question_lstm_outs, config.passage_indep_embedding_dim, "__doc")

    doc_lstm_outs = basic_modules.stacked_bidirectional_lstm_by_nested_seq(
        document_words_encoding, config.lstm_depth, config.lstm_hidden_dim,
        "__doc_lstm")

    # search the answer.
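    # sen_idx, start_idx and end_idx hold the gold sentence, start and end
    # positions consumed by cross_entropy_over_beam during training.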
    sentence_idx = paddle.layer.data(
        name="sen_idx", type=paddle.data_type.integer_value(1))
    start_idx = paddle.layer.data(
        name="start_idx", type=paddle.data_type.integer_value(1))
    end_idx = paddle.layer.data(
        name="end_idx", type=paddle.data_type.integer_value(1))
    return search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx,
                         config, is_infer)


if __name__ == "__main__":
    print(parse_network(GNR(ModelConfig)))
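
# A minimal training sketch (an assumption, not part of the original file):
# `train_reader` below is a hypothetical reader yielding, per sample, the
# fields of the data layers declared above ("question", "documents",
# "same_as_question", "sen_idx", "start_idx", "end_idx"), and the loop uses
# the standard paddle.v2 trainer:
#
#   paddle.init(use_gpu=False, trainer_count=1)
#   cost = GNR(ModelConfig)
#   parameters = paddle.parameters.create(cost)
#   optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
#   trainer = paddle.trainer.SGD(
#       cost=cost, parameters=parameters, update_equation=optimizer)
#   trainer.train(
#       reader=paddle.batch(train_reader, batch_size=32), num_passes=10)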