#coding=utf-8
import math

import paddle.v2 as paddle

__all__ = ["conv_seq2seq"]


def gated_conv_with_batchnorm(input,
                              size,
                              context_len,
                              context_start=None,
                              learning_rate=1.0,
                              drop_rate=0.):
    """
    Definition of the convolution block.

    :param input: The input of this block.
    :type input: LayerOutput
    :param size: The dimension of the block's output.
    :type size: int
    :param context_len: The context width of the convolution.
    :type context_len: int
    :param context_start: The start position of the context.
    :type context_start: int
    :param learning_rate: The learning rate factor of the parameters in the
                          block. The actual learning rate is the product of
                          the global learning rate and this factor.
    :type learning_rate: float
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :return: The output of the convolution block.
    :rtype: LayerOutput
    """
    input = paddle.layer.dropout(input=input, dropout_rate=drop_rate)

    context = paddle.layer.mixed(
        size=input.size * context_len,
        input=paddle.layer.context_projection(
            input=input, context_len=context_len,
            context_start=context_start))

    raw_conv = paddle.layer.fc(
        input=context,
        size=size * 2,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(
            initial_mean=0.,
            initial_std=math.sqrt(4.0 * (1.0 - drop_rate) / context.size),
            learning_rate=learning_rate),
        bias_attr=False)

    batch_norm_conv = paddle.layer.batch_norm(
        input=raw_conv,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(learning_rate=learning_rate))

    with paddle.layer.mixed(size=size) as conv:
        conv += paddle.layer.identity_projection(
            batch_norm_conv, size=size, offset=0)

    with paddle.layer.mixed(size=size, act=paddle.activation.Sigmoid()) as gate:
        gate += paddle.layer.identity_projection(
            batch_norm_conv, size=size, offset=size)

    with paddle.layer.mixed(size=size) as gated_conv:
        gated_conv += paddle.layer.dotmul_operator(conv, gate)

    return gated_conv
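
# The block above is a gated linear unit (GLU) on top of a batch-normalized
# 1-D convolution: the 2 * size projection is split into a linear half and a
# sigmoid gate half, and the two halves are multiplied element-wise.
# A minimal sketch of feeding the block with an embedded word sequence
# follows; the layer name, dictionary size and dimensions are illustrative
# assumptions, not values prescribed by this module:
#
#     words = paddle.layer.data(
#         name='demo_word',
#         type=paddle.data_type.integer_value_sequence(10000))
#     word_emb = paddle.layer.embedding(input=words, size=256)
#     hidden = gated_conv_with_batchnorm(
#         input=word_emb, size=256, context_len=3, drop_rate=0.1)
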

def encoder(token_emb,
            pos_emb,
            conv_blocks=[(256, 3)] * 5,
            num_attention=3,
            drop_rate=0.1):
    """
    Definition of the encoder.

    :param token_emb: The embedding vector of the input token.
    :type token_emb: LayerOutput
    :param pos_emb: The embedding vector of the input token's position.
    :type pos_emb: LayerOutput
    :param conv_blocks: The scale list of the convolution blocks. Each element
                        of the list is a tuple of the output dimension and the
                        context length of the corresponding convolution block.
    :type conv_blocks: list of tuple
    :param num_attention: The total number of the attention modules used
                          in the decoder.
    :type num_attention: int
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :return: The source token encoding and the sum of the encoding and
             the embedding.
    :rtype: tuple of LayerOutput
    """
    embedding = paddle.layer.addto(
        input=[token_emb, pos_emb],
        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))

    proj_size = conv_blocks[0][0]
    block_input = paddle.layer.fc(
        input=embedding,
        size=proj_size,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(
            initial_mean=0.,
            initial_std=math.sqrt((1.0 - drop_rate) / embedding.size),
            learning_rate=1.0 / (2.0 * num_attention)),
        bias_attr=True)

    for (size, context_len) in conv_blocks:
        # Residual branch: project only when the dimensions differ.
        if block_input.size == size:
            res = block_input
        else:
            res = paddle.layer.fc(
                input=block_input,
                size=size,
                act=paddle.activation.Linear(),
                param_attr=paddle.attr.Param(
                    learning_rate=1.0 / (2.0 * num_attention)),
                bias_attr=True)

        gated_conv = gated_conv_with_batchnorm(
            input=block_input,
            size=size,
            context_len=context_len,
            learning_rate=1.0 / (2.0 * num_attention),
            drop_rate=drop_rate)

        with paddle.layer.mixed(size=size) as block_output:
            block_output += paddle.layer.identity_projection(res)
            block_output += paddle.layer.identity_projection(gated_conv)

        # Scale the residual sum by sqrt(0.5) to keep its variance stable.
        block_output = paddle.layer.slope_intercept(
            input=block_output, slope=math.sqrt(0.5))

        block_input = block_output

    emb_dim = embedding.size
    encoded_vec = paddle.layer.fc(
        input=block_output,
        size=emb_dim,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(
            learning_rate=1.0 / (2.0 * num_attention)),
        bias_attr=True)

    encoded = paddle.layer.addto(input=[encoded_vec, embedding])
    encoded = paddle.layer.slope_intercept(input=encoded, slope=math.sqrt(0.5))

    return encoded_vec, encoded


def attention(decoder_state, cur_embedding, encoded_vec, encoded):
    """
    Definition of the attention.

    :param decoder_state: The hidden state of the decoder.
    :type decoder_state: LayerOutput
    :param cur_embedding: The embedding vector of the current token.
    :type cur_embedding: LayerOutput
    :param encoded_vec: The source token encoding.
    :type encoded_vec: LayerOutput
    :param encoded: The sum of the source token's encoding and embedding.
    :type encoded: LayerOutput
    :return: A context vector.
    :rtype: LayerOutput
    """
    res = decoder_state
    state_size = decoder_state.size
    emb_dim = cur_embedding.size

    with paddle.layer.mixed(size=emb_dim, bias_attr=True) as state_summary:
        state_summary += paddle.layer.full_matrix_projection(decoder_state)
        state_summary += paddle.layer.identity_projection(cur_embedding)

    state_summary = paddle.layer.slope_intercept(
        input=state_summary, slope=math.sqrt(0.5))

    expanded = paddle.layer.expand(input=state_summary, expand_as=encoded_vec)

    m = paddle.layer.linear_comb(weights=expanded, vectors=encoded_vec)

    attention_weight = paddle.layer.fc(
        input=m,
        size=1,
        act=paddle.activation.SequenceSoftmax(),
        bias_attr=False)

    scaled = paddle.layer.scaling(weight=attention_weight, input=encoded)
    attended = paddle.layer.pooling(
        input=scaled, pooling_type=paddle.pooling.Sum())

    attended_proj = paddle.layer.fc(
        input=attended,
        size=state_size,
        act=paddle.activation.Linear(),
        bias_attr=True)

    # TODO scaled by length
    attention_result = paddle.layer.addto(input=[attended_proj, res])
    attention_result = paddle.layer.slope_intercept(
        input=attention_result, slope=math.sqrt(0.5))

    return attention_result
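
# The layer above computes a dot-product attention over the source positions,
# in the style of the convolutional seq2seq model. In comment-form notation,
# with z_i the encoding of source position i and e_i its embedding:
#
#     d   = sqrt(0.5) * (W * decoder_state + b + cur_embedding)   # state_summary
#     a_i = SequenceSoftmax_i(w * (d . z_i))                      # attention_weight
#     c   = sum_i a_i * (z_i + e_i)                               # attended
#
# The context vector c is projected back to the decoder state size, added to
# the decoder state, and the sum is scaled by sqrt(0.5) like the other
# residual sums in this module.
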

def decoder(token_emb,
            pos_emb,
            encoded_vec,
            encoded,
            dict_size,
            conv_blocks=[(256, 3)] * 3,
            drop_rate=0.1):
    """
    Definition of the decoder.

    :param token_emb: The embedding vector of the input token.
    :type token_emb: LayerOutput
    :param pos_emb: The embedding vector of the input token's position.
    :type pos_emb: LayerOutput
    :param encoded_vec: The source token encoding.
    :type encoded_vec: LayerOutput
    :param encoded: The sum of the source token's encoding and embedding.
    :type encoded: LayerOutput
    :param dict_size: The size of the target dictionary.
    :type dict_size: int
    :param conv_blocks: The scale list of the convolution blocks. Each element
                        of the list is a tuple of the output dimension and the
                        context length of the corresponding convolution block.
    :type conv_blocks: list of tuple
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :return: The probability of the predicted token.
    :rtype: LayerOutput
    """

    def attention_step(decoder_state, cur_embedding, encoded_vec, encoded):
        conditional = attention(
            decoder_state=decoder_state,
            cur_embedding=cur_embedding,
            encoded_vec=encoded_vec,
            encoded=encoded)
        return conditional

    def softmax_step(input):
        return paddle.layer.fc(
            input=input,
            size=dict_size,
            act=paddle.activation.Softmax(),
            param_attr=paddle.attr.Param(
                initial_mean=0.,
                initial_std=math.sqrt((1.0 - drop_rate) / input.size)),
            bias_attr=True)

    embedding = paddle.layer.addto(
        input=[token_emb, pos_emb],
        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))

    proj_size = conv_blocks[0][0]
    block_input = paddle.layer.fc(
        input=embedding,
        size=proj_size,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(
            initial_mean=0.,
            initial_std=math.sqrt((1.0 - drop_rate) / embedding.size)),
        bias_attr=True)

    for (size, context_len) in conv_blocks:
        # Residual branch: project only when the dimensions differ.
        if block_input.size == size:
            res = block_input
        else:
            res = paddle.layer.fc(
                input=block_input,
                size=size,
                act=paddle.activation.Linear(),
                bias_attr=True)

        decoder_state = gated_conv_with_batchnorm(
            input=block_input,
            size=size,
            context_len=context_len,
            context_start=0,
            drop_rate=drop_rate)

        group_inputs = [
            decoder_state,
            embedding,
            paddle.layer.StaticInput(input=encoded_vec),
            paddle.layer.StaticInput(input=encoded),
        ]

        conditional = paddle.layer.recurrent_group(
            step=attention_step, input=group_inputs)

        block_output = paddle.layer.addto(input=[conditional, res])

        # Scale the residual sum by sqrt(0.5) to keep its variance stable.
        block_output = paddle.layer.slope_intercept(
            input=block_output, slope=math.sqrt(0.5))

        block_input = block_output

    out_emb_dim = embedding.size
    block_output = paddle.layer.fc(
        input=block_output,
        size=out_emb_dim,
        act=paddle.activation.Linear(),
        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))

    decoder_out = paddle.layer.recurrent_group(
        step=softmax_step, input=[block_output])

    return decoder_out
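
# A minimal sketch of wiring the decoder to the encoder outputs. The names
# src_emb, src_pos_emb, trg_emb, trg_pos_emb and trg_dict_size below are
# placeholders; the complete pipeline, including the data layers and the
# embeddings, is assembled in conv_seq2seq below:
#
#     encoded_vec, encoded = encoder(token_emb=src_emb, pos_emb=src_pos_emb)
#     prob = decoder(
#         token_emb=trg_emb,
#         pos_emb=trg_pos_emb,
#         encoded_vec=encoded_vec,
#         encoded=encoded,
#         dict_size=trg_dict_size)
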

def conv_seq2seq(src_dict_size,
                 trg_dict_size,
                 pos_size,
                 emb_dim,
                 enc_conv_blocks=[(256, 3)] * 5,
                 dec_conv_blocks=[(256, 3)] * 3,
                 drop_rate=0.1,
                 is_infer=False):
    """
    Definition of the convolutional sequence-to-sequence network.

    :param src_dict_size: The size of the source dictionary.
    :type src_dict_size: int
    :param trg_dict_size: The size of the target dictionary.
    :type trg_dict_size: int
    :param pos_size: The total number of the position indexes, which means
                     the maximum value of the index is pos_size - 1.
    :type pos_size: int
    :param emb_dim: The dimension of the embedding vector.
    :type emb_dim: int
    :param enc_conv_blocks: The scale list of the encoder's convolution
                            blocks. Each element of the list is a tuple of
                            the output dimension and the context length of
                            the corresponding convolution block.
    :type enc_conv_blocks: list of tuple
    :param dec_conv_blocks: The scale list of the decoder's convolution
                            blocks. Each element of the list is a tuple of
                            the output dimension and the context length of
                            the corresponding convolution block.
    :type dec_conv_blocks: list of tuple
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :param is_infer: Whether the network is built for inference.
    :type is_infer: bool
    :return: The cost layer for training, or the output layer for inference.
    :rtype: LayerOutput
    """
    src = paddle.layer.data(
        name='src_word',
        type=paddle.data_type.integer_value_sequence(src_dict_size))
    src_pos = paddle.layer.data(
        name='src_word_pos',
        type=paddle.data_type.integer_value_sequence(
            pos_size + 1))  # one for padding

    src_emb = paddle.layer.embedding(
        input=src,
        size=emb_dim,
        name='src_word_emb',
        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
    src_pos_emb = paddle.layer.embedding(
        input=src_pos,
        size=emb_dim,
        name='src_pos_emb',
        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))

    num_attention = len(dec_conv_blocks)
    encoded_vec, encoded = encoder(
        token_emb=src_emb,
        pos_emb=src_pos_emb,
        conv_blocks=enc_conv_blocks,
        num_attention=num_attention,
        drop_rate=drop_rate)

    trg = paddle.layer.data(
        name='trg_word',
        type=paddle.data_type.integer_value_sequence(
            trg_dict_size + 1))  # one for padding
    trg_pos = paddle.layer.data(
        name='trg_word_pos',
        type=paddle.data_type.integer_value_sequence(
            pos_size + 1))  # one for padding

    trg_emb = paddle.layer.embedding(
        input=trg,
        size=emb_dim,
        name='trg_word_emb',
        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
    trg_pos_emb = paddle.layer.embedding(
        input=trg_pos,
        size=emb_dim,
        name='trg_pos_emb',
        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))

    decoder_out = decoder(
        token_emb=trg_emb,
        pos_emb=trg_pos_emb,
        encoded_vec=encoded_vec,
        encoded=encoded,
        dict_size=trg_dict_size,
        conv_blocks=dec_conv_blocks,
        drop_rate=drop_rate)

    if is_infer:
        return decoder_out

    trg_next_word = paddle.layer.data(
        name='trg_next_word',
        type=paddle.data_type.integer_value_sequence(trg_dict_size))
    cost = paddle.layer.classification_cost(
        input=decoder_out, label=trg_next_word)

    return cost
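
# A minimal usage sketch, guarded so that importing this module stays
# side-effect free. The dictionary sizes, pos_size and embedding dimension
# below are illustrative assumptions, not values prescribed by this module;
# a real setup still needs data readers, parameters and a trainer around the
# returned cost layer.
if __name__ == "__main__":
    paddle.init(use_gpu=False, trainer_count=1)

    # Build the training topology (returns the classification cost layer).
    cost = conv_seq2seq(
        src_dict_size=30000,
        trg_dict_size=30000,
        pos_size=100,
        emb_dim=256,
        enc_conv_blocks=[(256, 3)] * 5,
        dec_conv_blocks=[(256, 3)] * 3,
        drop_rate=0.1,
        is_infer=False)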