# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import math
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers


def network(items_num, hidden_size, step, bs):
    stdv = 1.0 / math.sqrt(hidden_size)

    # Graph inputs. uniq_max: number of unique items in a session;
    # seq_max: padded session length.
    items = fluid.data(
        name="items", shape=[bs, -1],
        dtype="int64")  # [batch_size, uniq_max]
    seq_index = fluid.data(
        name="seq_index", shape=[bs, -1, 2],
        dtype="int32")  # [batch_size, seq_max, 2]
    last_index = fluid.data(
        name="last_index", shape=[bs, 2], dtype="int32")  # [batch_size, 2]
    adj_in = fluid.data(
        name="adj_in", shape=[bs, -1, -1],
        dtype="float32")  # [batch_size, seq_max, seq_max]
    adj_out = fluid.data(
        name="adj_out", shape=[bs, -1, -1],
        dtype="float32")  # [batch_size, seq_max, seq_max]
    mask = fluid.data(
        name="mask", shape=[bs, -1, 1],
        dtype="float32")  # [batch_size, seq_max, 1]
    label = fluid.data(
        name="label", shape=[bs, 1], dtype="int64")  # [batch_size, 1]

    datas = [items, seq_index, last_index, adj_in, adj_out, mask, label]
    py_reader = fluid.io.DataLoader.from_generator(
        capacity=256, feed_list=datas, iterable=False)
    feed_datas = datas

    items_emb = fluid.embedding(
        input=items,
        param_attr=fluid.ParamAttr(
            name="emb",
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [batch_size, uniq_max, h]

    # Gated graph propagation over the session graph for `step` iterations.
    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in,
                                     state_in)  # [batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out,
                                      state_out)  # [batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(
            input=gru_input,
            name="gru_fc",
            size=3 * hidden_size,
            bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(
            input=gru_fc,
            hidden=layers.reshape(
                x=pre_state, shape=[-1, hidden_size]),
            size=3 * hidden_size)

    final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
    seq = layers.gather_nd(final_state, seq_index)
    last = layers.gather_nd(final_state, last_index)

    # Soft attention over the session items, conditioned on the last item.
    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
    last_fc = layers.fc(
        input=last,
        name="last_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    seq_fc_t = layers.transpose(
        seq_fc, perm=[1, 0, 2])  # [seq_max, batch_size, h]
    add = layers.elementwise_add(
        seq_fc_t, last_fc)  # [seq_max, batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
    add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]

    add_sigmoid = layers.sigmoid(add)  # [seq_max, batch_size, h]
    add_sigmoid = layers.transpose(
        add_sigmoid, perm=[1, 0, 2])  # [batch_size, seq_max, h]

    weight = layers.fc(
        input=add_sigmoid,
        name="weight_fc",
        size=1,
        act=None,
        num_flatten_dims=2,
        bias_attr=False,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(
        seq, weight, axis=0)  # [batch_size, seq_max, h]
    global_attention = layers.reduce_sum(
        weight_mask, dim=1)  # [batch_size, h]

    final_attention = layers.concat(
        [global_attention, last], axis=1)  # [batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="final_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    # Score every candidate item by dot product with the shared "emb" table.
    all_vocab = layers.create_global_var(
        shape=[items_num - 1],
        value=0,
        dtype="int64",
        persistable=True,
        name="all_vocab")
    all_emb = fluid.embedding(
        input=all_vocab,
        param_attr=fluid.ParamAttr(
            name="emb",
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [all_vocab, h]

    logits = layers.matmul(
        x=final_attention_fc, y=all_emb,
        transpose_y=True)  # [batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(
        logits=logits, label=label)  # [batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    acc = layers.accuracy(input=logits, label=label, k=20)
    return loss, acc, py_reader, feed_datas
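

# Minimal usage sketch (illustrative, not part of the original training
# script): builds the network inside a fresh fluid program and attaches an
# Adam optimizer. The hyper-parameter values below (items_num, hidden_size,
# step, bs, learning_rate) are assumptions chosen only for demonstration.
if __name__ == "__main__":
    # Needed when running this fluid-style code under Paddle 2.x.
    if hasattr(paddle, "enable_static"):
        paddle.enable_static()

    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        loss, acc, loader, feeds = network(
            items_num=1000, hidden_size=100, step=1, bs=100)
        fluid.optimizer.Adam(learning_rate=0.001).minimize(loss)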