#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import paddle.fluid as fluid
from paddle.fluid import ParamAttr
import numpy as np


class ShiftingAttentionModel(object):
    """Shifting Attention Model.

    Builds ``n_att`` attention heads over the segment axis of the input.
    Each head takes the attention-weighted sum of the input features,
    applies a learned scalar scale and shift, L2-normalizes the result and
    divides it by ``sqrt(n_att)``.  The per-head outputs are concatenated.
    """

    def __init__(self, input_dim, seg_num, n_att, name):
        """Store the model configuration.

        Args:
          input_dim: feature dimension F of the input; used as the expand
            factor that broadcasts attention weights over the features.
          seg_num: row length used when flattening scores for the softmax.
          n_att: number of attention heads C; must be positive.
          name: prefix for the names of all parameters created in forward.

        Raises:
          ValueError: if n_att is not positive.
        """
        # Fail fast: n_att <= 0 would give gnorm == 0 (or NaN) and an
        # infinite initializer scale, and forward would have nothing to
        # concatenate.
        if n_att <= 0:
            raise ValueError(
                "n_att must be a positive integer, got {!r}".format(n_att))
        self.n_att = n_att
        self.input_dim = input_dim
        self.seg_num = seg_num
        self.name = name
        # Every head output is divided by sqrt(n_att) in forward.
        self.gnorm = np.sqrt(n_att)

    def softmax_m1(self, x):
        """Apply softmax to x after flattening it to rows of length seg_num.

        Args:
          x: input variable; assumes its last dimension equals seg_num so
            that each flattened row is one last-axis slice -- TODO confirm
            against callers.

        Returns:
          The softmax result reshaped back to the shape of x.
        """
        # Runtime shape of x, used to restore the original shape after the
        # flattened softmax; it must not take part in back-propagation.
        x_shape = fluid.layers.shape(x)
        x_shape.stop_gradient = True
        flat_x = fluid.layers.reshape(x, shape=(-1, self.seg_num))
        flat_softmax = fluid.layers.softmax(flat_x)
        return fluid.layers.reshape(
            flat_softmax, shape=x.shape, actual_shape=x_shape)

    def glorot(self, n):
        """Return sqrt(1 / sqrt(n)), i.e. n ** -0.25.

        forward passes this value as the scale of the Normal initializer
        for the shift parameters.

        NOTE(review): despite the name, this is not the usual Glorot/Xavier
        formula -- confirm the intended scale.
        """
        return np.sqrt(1.0 / np.sqrt(n))

    def forward(self, x):
        """Forward shifting attention model.

        Args:
          x: input features in shape of [N, L, F].

        Returns:
          out: output features in shape of [N, F * C]
        """

        # [N, L, F] -> [N, F, L] -> [N, F, L, 1] so that a 1x1 conv2d can
        # compute one score per attention head at every segment position.
        trans_x = fluid.layers.transpose(x, perm=[0, 2, 1])
        trans_x = fluid.layers.unsqueeze(trans_x, [-1])
        scores = fluid.layers.conv2d(
            trans_x,
            self.n_att,
            filter_size=1,
            param_attr=ParamAttr(
                name=self.name + ".conv.weight",
                initializer=fluid.initializer.MSRA(uniform=False)),
            bias_attr=ParamAttr(
                name=self.name + ".conv.bias",
                initializer=fluid.initializer.MSRA()))
        # scores and weights in shape [N, C, L], sum(weights, -1) = 1
        scores = fluid.layers.squeeze(scores, [-1])
        weights = self.softmax_m1(scores)

        # One learned scalar scale (w) and shift (b) per attention head.
        glrt = self.glorot(self.n_att)
        self.w = fluid.layers.create_parameter(
            shape=(self.n_att, ),
            dtype=x.dtype,
            attr=ParamAttr(name=self.name + ".shift_w"),
            default_initializer=fluid.initializer.Normal(0.0, glrt))
        self.b = fluid.layers.create_parameter(
            shape=(self.n_att, ),
            dtype=x.dtype,
            attr=ParamAttr(name=self.name + ".shift_b"),
            default_initializer=fluid.initializer.Normal(0.0, glrt))

        outs = []
        for i in range(self.n_att):
            # slice the i-th head's weights ([N, 1, L]), move the segment
            # axis forward ([N, L, 1]) and expand to shape [N, L, F]
            weight = fluid.layers.slice(
                weights, axes=[1], starts=[i], ends=[i + 1])
            weight = fluid.layers.transpose(weight, perm=[0, 2, 1])
            weight = fluid.layers.expand(weight, [1, 1, self.input_dim])

            # attention-weighted sum over segments ([N, F]), then the
            # head's scalar scale and shift
            w_i = fluid.layers.slice(self.w, axes=[0], starts=[i], ends=[i + 1])
            b_i = fluid.layers.slice(self.b, axes=[0], starts=[i], ends=[i + 1])
            shift = fluid.layers.reduce_sum(x * weight, dim=1) * w_i + b_i

            l2_norm = fluid.layers.l2_normalize(shift, axis=-1)
            outs.append(l2_norm / self.gnorm)

        # concatenate the C head outputs along the feature axis
        out = fluid.layers.concat(outs, axis=1)
        return out