# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from itertools import chain
from collections import namedtuple

from paddle import fluid
import paddle.fluid.dygraph as dg
import numpy as np

from parakeet.modules import conv
from parakeet.modules.modules import Embedding, PositionEmbedding
from parakeet.modules.modules import FC, Conv1D, Conv1DGLU, Conv1DTranspose

ConvSpec = namedtuple("ConvSpec", ["out_channels", "filter_size", "dilation"])
WindowRange = namedtuple("WindowRange", ["backward", "ahead"])


def expand_speaker_embed(x, speaker_embed, tdim=-1):
    """
    Expand speaker embeddings for multiple timesteps.

    Args:
        x (Variable): A reference Variable used to determine the number of
            timesteps.
        speaker_embed (Variable): Shape(B, C), speaker embeddings, where B
            means batch_size and C means the speaker embedding size.
        tdim (int, optional): The index of the time dimension in x. Defaults
            to -1, which means the last dimension is the time dimension.

    Returns:
        Variable: Shape(B, C, 1, T), the expanded speaker embeddings, where
            T = x.shape[tdim] is the number of timesteps.
    """
    speaker_embed = fluid.layers.reshape(
        speaker_embed, shape=speaker_embed.shape + [1, 1])
    time_steps = x.shape[tdim]
    speaker_embed_bc1t = fluid.layers.expand(
        speaker_embed, expand_times=[1, 1, 1, time_steps])
    return speaker_embed_bc1t


def gen_mask2(valid_lengths, max_len, dtype="float32"):
    """
    Generate a mask tensor from valid lengths. Note that it returns a
    *reversed* mask: indices within the valid lengths correspond to 0, and
    those within the padding area correspond to 1.

    Assume that valid_lengths = [2, 5, 7] and max_len = 7; the generated
    mask is
    [[0, 0, 1, 1, 1, 1, 1],
     [0, 0, 0, 0, 0, 1, 1],
     [0, 0, 0, 0, 0, 0, 0]].

    Args:
        valid_lengths (Variable): Shape(B), dtype: int64. A 1D-Tensor
            containing the valid lengths (timesteps) of each example, where
            B means batch_size.
        max_len (int): The length (number of timesteps) of the mask.
        dtype (str, optional): A string that specifies the data type of the
            returned mask.

    Returns:
        mask (Variable): A mask computed from valid lengths.
    """
    batch_size = valid_lengths.shape[0]
    mask = fluid.layers.sequence_mask(
        valid_lengths, maxlen=max_len, dtype=dtype)
    mask = 1 - mask
    return mask


def expand_mask(mask, attn):
    """
    Expand a mask for multiple time steps. This function is used by the
    AttentionLayer in the Decoder to expand a mask for every timestep in
    the decoder.

    Args:
        mask (Variable): Shape(B, T_enc), a mask generated with valid text
            lengths, where T_enc means the encoder length (time steps).
        attn (Variable): Shape(B, T_dec, T_enc), the alignment tensor
            between the encoder and the decoder, where T_dec means the
            decoder length (time steps).

    Returns:
        mask_btc (Variable): Shape(B, T_dec, T_enc), the expanded mask.
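
    Example:
        Shapes are illustrative: if mask has shape (2, 7) and attn has
        shape (2, 3, 7) (3 decoder timesteps), the returned mask_btc has
        shape (2, 3, 7), i.e. the same per-text mask repeated for every
        decoder timestep.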
""" decoder_length = attn.shape[1] mask = fluid.layers.reshape(mask, [mask.shape[0], 1, mask.shape[1]]) mask_btc = fluid.layers.expand(mask, expand_times=[1, decoder_length, 1]) return mask_btc class Encoder(dg.Layer): def __init__(self, name_scope, n_vocab, embed_dim, n_speakers, speaker_dim, padding_idx=None, embedding_weight_std=0.1, convolutions=(ConvSpec(64, 5, 1)) * 7, max_positions=512, dropout=0.1, dtype="float32"): super(Encoder, self).__init__(name_scope, dtype=dtype) self.dropout = dropout self.embedding_weight_std = embedding_weight_std self.embed = Embedding( self.full_name(), n_vocab, embed_dim, padding_idx=padding_idx, std=embedding_weight_std, dtype=dtype) if n_speakers > 1: self.sp_proj1 = Conv1D( self.full_name(), speaker_dim, embed_dim, filter_size=1, std_mul=1.0, dropout=dropout, act="softsign", dtype=dtype) self.sp_proj2 = Conv1D( self.full_name(), speaker_dim, embed_dim, filter_size=1, std_mul=1.0, dropout=dropout, act="softsign", dtype=dtype) self.n_speakers = n_speakers self.convolutions = [] in_channels = embed_dim std_mul = 1.0 for (out_channels, filter_size, dilation) in convolutions: # 1 * 1 convolution & relu if in_channels != out_channels: self.convolutions.append( Conv1D( self.full_name(), in_channels, out_channels, filter_size=1, std_mul=std_mul, act="relu", dtype=dtype)) in_channels = out_channels std_mul = 2.0 self.convolutions.append( Conv1DGLU( self.full_name(), n_speakers, speaker_dim, in_channels, out_channels, filter_size, dilation, std_mul=std_mul, dropout=dropout, causal=False, residual=True, dtype=dtype)) in_channels = out_channels std_mul = 4.0 self.convolutions.append( Conv1D( self.full_name(), in_channels, embed_dim, filter_size=1, std_mul=std_mul, dropout=dropout, dtype=dtype)) for i, layer in enumerate(self.convolutions): self.add_sublayer("convolution_{}".format(i), layer) def forward(self, x, speaker_embed=None): """ Encode text sequence. Args: x (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe input text indices. T_enc means the timesteps of decoder input x. speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim), dtype: float32. Speaker embeddings. This arg is not None only when the model is a multispeaker model. Returns: keys (Variable), Shape(B, C_emb, 1, T_enc), the encoded representation for keys, where C_emb menas the text embedding size. values (Variable), Shape(B, C_embed, 1, T_enc), the encoded representation for values. 
""" x = self.embed(x) x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") x = fluid.layers.transpose( fluid.layers.reshape( x, shape=x.shape + [1]), perm=[0, 2, 3, 1]) speaker_embed_bc1t = None if speaker_embed is not None: speaker_embed_bc1t = expand_speaker_embed(x, speaker_embed, tdim=3) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") x = x + self.sp_proj1(speaker_embed_bc1t) input_embed = x for layer in self.convolutions: if isinstance(layer, Conv1DGLU): x = layer(x, speaker_embed_bc1t) else: x = layer(x) if speaker_embed is not None: x = x + self.sp_proj2(speaker_embed_bc1t) keys = x values = fluid.layers.scale(input_embed + x, scale=np.sqrt(0.5)) return keys, values def freeze_embedding(self): """Fix text embedding while training.""" for param in self.embed.parameters(): param.trainable = False class AttentionLayer(dg.Layer): def __init__(self, name_scope, conv_channels, embed_dim, dropout=0.0, window_range=WindowRange(-1, 3), key_projection=True, value_projection=True, dtype="float32"): super(AttentionLayer, self).__init__(name_scope, dtype=dtype) self.query_proj = Conv1D( self.full_name(), conv_channels, embed_dim, filter_size=1, dtype=dtype) if key_projection: self.key_proj = Conv1D( self.full_name(), embed_dim, embed_dim, filter_size=1, dtype=dtype) if value_projection: self.value_proj = Conv1D( self.full_name(), embed_dim, embed_dim, filter_size=1, dtype=dtype) self.out_proj = Conv1D( self.full_name(), embed_dim, conv_channels, filter_size=1, dtype=dtype) self.key_projection = key_projection self.value_projection = value_projection self.dropout = dropout self.window_range = window_range def forward(self, query, encoder_out, mask=None, last_attended=None): """ Compute pooled context representation and alignment scores. Args: query (Variable): shape(B, C_q, 1, T_dec), the query tensor, where C_q means the channel of query. encoder_out (Tuple(Variable, Variable)): keys (Variable): shape(B, C_emb, 1, T_enc), the key representation from an encoder, where C_emb means text embedding size. values (Variable): shape(B, C_emb, 1, T_enc), the value representation from an encoder, where C_emb means text embedding size. mask (Variable, optional): Shape(B, T_enc), mask generated with valid text lengths. last_attended (int, optional): The position that received most attention at last timestep. This is only used at decoding. Outpus: x (Variable): Shape(B, C_q, 1, T_dec), the context representation pooled from attention mechanism. attn_scores (Variable): shape(B, T_dec, T_enc), the alignment tensor, where T_dec means the number of decoder time steps and T_enc means number the number of decoder time steps. """ keys, values = encoder_out residual = query if self.value_projection: values = self.value_proj(values) if self.key_projection: keys = self.key_proj(keys) x = self.query_proj(query) batch_size, conv_channels, _, decoder_length = query.shape encoder_length = keys.shape[-1] embed_dim = keys.shape[1] x = fluid.layers.matmul( fluid.layers.reshape( x, shape=[batch_size, embed_dim, decoder_length]), fluid.layers.reshape( keys, shape=[batch_size, embed_dim, encoder_length]), transpose_x=True) mask_value = -1.0e30 if mask is not None: mask = expand_mask(mask, x) neg_inf_mask = fluid.layers.scale(mask, mask_value) x = x + neg_inf_mask # if last_attended is provided, focus only on a window range around it # to enforce monotonic attention. 
        if last_attended is not None:
            locality_mask = np.ones(shape=x.shape, dtype=np.float32)
            backward, ahead = self.window_range
            backward = last_attended + backward
            ahead = last_attended + ahead
            if backward < 0:
                backward = 0
            if ahead > x.shape[-1]:
                ahead = x.shape[-1]
            locality_mask[:, :, backward:ahead] = 0.

            locality_mask = dg.to_variable(locality_mask)
            neg_inf_mask = fluid.layers.scale(locality_mask, mask_value)
            x = x + neg_inf_mask

        x = fluid.layers.softmax(x)
        attn_scores = x

        x = fluid.layers.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = fluid.layers.matmul(
            fluid.layers.reshape(
                values, shape=[batch_size, embed_dim, encoder_length]),
            x,
            transpose_y=True)
        x = fluid.layers.reshape(x, [batch_size, embed_dim, 1, decoder_length])
        x = fluid.layers.scale(x,
                               encoder_length * np.sqrt(1.0 / encoder_length))

        x = self.out_proj(x)
        x = fluid.layers.scale((x + residual), np.sqrt(0.5))
        return x, attn_scores


class Decoder(dg.Layer):
    def __init__(self,
                 name_scope,
                 n_speakers,
                 speaker_dim,
                 embed_dim,
                 mel_dim=80,
                 r=5,
                 max_positions=512,
                 padding_idx=None,
                 preattention=(ConvSpec(128, 5, 1), ) * 4,
                 convolutions=(ConvSpec(128, 5, 1), ) * 4,
                 attention=True,
                 dropout=0.1,
                 use_memory_mask=False,
                 force_monotonic_attention=False,
                 query_position_rate=1.0,
                 key_position_rate=1.29,
                 window_range=WindowRange(-1, 3),
                 key_projection=True,
                 value_projection=True,
                 dtype="float32"):
        super(Decoder, self).__init__(name_scope, dtype=dtype)

        self.dropout = dropout
        self.mel_dim = mel_dim
        self.r = r
        self.query_position_rate = query_position_rate
        self.key_position_rate = key_position_rate
        self.window_range = window_range
        self.n_speakers = n_speakers

        conv_channels = convolutions[0].out_channels
        self.embed_query_positions = PositionEmbedding(
            self.full_name(),
            max_positions,
            conv_channels,
            padding_idx=padding_idx,
            dtype=dtype)
        self.embed_keys_positions = PositionEmbedding(
            self.full_name(),
            max_positions,
            embed_dim,
            padding_idx=padding_idx,
            dtype=dtype)

        # Used to compute multiplier for position rate
        if n_speakers > 1:
            self.speaker_proj1 = FC(self.full_name(),
                                    speaker_dim,
                                    1,
                                    act="sigmoid",
                                    dropout=dropout,
                                    dtype=dtype)
            self.speaker_proj2 = FC(self.full_name(),
                                    speaker_dim,
                                    1,
                                    act="sigmoid",
                                    dropout=dropout,
                                    dtype=dtype)

        # prenet
        self.prenet = []
        in_channels = mel_dim * r
        std_mul = 1.0
        for (out_channels, filter_size, dilation) in preattention:
            if in_channels != out_channels:
                # conv1d & relu
                self.prenet.append(
                    Conv1D(
                        self.full_name(),
                        in_channels,
                        out_channels,
                        filter_size=1,
                        std_mul=std_mul,
                        act="relu"))
                in_channels = out_channels
                std_mul = 2.0
            self.prenet.append(
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    in_channels,
                    out_channels,
                    filter_size,
                    dilation,
                    std_mul=std_mul,
                    dropout=dropout,
                    causal=True,
                    residual=True,
                    dtype=dtype))
            in_channels = out_channels
            std_mul = 4.0
        for i, layer in enumerate(self.prenet):
            self.add_sublayer("prenet_{}".format(i), layer)

        self.use_memory_mask = use_memory_mask
        if isinstance(attention, bool):
            self.attention = [attention] * len(convolutions)
        else:
            self.attention = attention

        if isinstance(force_monotonic_attention, bool):
            self.force_monotonic_attention = [force_monotonic_attention
                                              ] * len(convolutions)
        else:
            self.force_monotonic_attention = force_monotonic_attention

        # causal convolution & attention
        self.conv_attn = []
        for use_attention, (out_channels, filter_size,
                            dilation) in zip(self.attention, convolutions):
            assert (
                in_channels == out_channels
            ), "the stack of convolution & attention does not change channels"
            conv_layer = Conv1DGLU(
                self.full_name(),
                n_speakers,
                speaker_dim,
                in_channels,
                out_channels,
                filter_size,
                dilation,
                std_mul=std_mul,
                dropout=dropout,
                causal=True,
                residual=False,
                dtype=dtype)
            attn_layer = (AttentionLayer(
                self.full_name(),
                out_channels,
                embed_dim,
                dropout=dropout,
                window_range=window_range,
                key_projection=key_projection,
                value_projection=value_projection,
                dtype=dtype) if use_attention else None)
            in_channels = out_channels
            std_mul = 4.0
            self.conv_attn.append((conv_layer, attn_layer))
        for i, (conv_layer, attn_layer) in enumerate(self.conv_attn):
            self.add_sublayer("conv_{}".format(i), conv_layer)
            if attn_layer is not None:
                self.add_sublayer("attn_{}".format(i), attn_layer)

        # 1 * 1 conv to transform channels
        self.last_conv = Conv1D(
            self.full_name(),
            in_channels,
            mel_dim * r,
            filter_size=1,
            std_mul=std_mul,
            dropout=dropout,
            dtype=dtype)

        # mel (before sigmoid) to done hat
        self.fc = Conv1D(
            self.full_name(), mel_dim * r, 1, filter_size=1, dtype=dtype)

        # decoding configs
        self.max_decoder_steps = 200
        self.min_decoder_steps = 10

    def freeze_positional_encoding(self):
        for param in self.embed_query_positions.parameters():
            param.trainable = False
        for param in self.embed_keys_positions.parameters():
            param.trainable = False

    def forward(self, encoder_out, lengths, inputs, text_positions,
                frame_positions, speaker_embed=None):
        """
        Compute decoder outputs with ground truth mel spectrogram.

        Args:
            encoder_out (Tuple(Variable, Variable)):
                keys (Variable): Shape(B, C_emb, 1, T_enc), the key
                    representation from an encoder, where C_emb means the
                    text embedding size.
                values (Variable): Shape(B, C_emb, 1, T_enc), the value
                    representation from an encoder, where C_emb means the
                    text embedding size.
            lengths (Variable): Shape(B,), dtype: int64, valid lengths of
                text inputs for each example.
            inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth
                mel-spectrogram, which is used as decoder inputs when
                training.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
                int64. Position indices for each decoder time step.
            speaker_embed (Variable, optional): Shape(B, speaker_dim),
                speaker embedding, only used for the multispeaker model.

        Returns:
            outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder
                outputs, where C_mel means the channels of mel-spectrogram,
                r means the outputs per decoder step, T_mel means the
                length (time steps) of mel spectrogram. Note that, when
                r > 1, the decoder outputs r frames of mel spectrogram per
                step.
            alignments (Variable): Shape(N, B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, where
                N means the number of AttentionLayers, T_mel means the
                length of mel spectrogram, r means the outputs per decoder
                step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
            decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r),
                decoder hidden states, where C_dec means the channels of
                decoder states.
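
        Example:
            A worked shape example (numbers are illustrative): with
            mel_dim=80 and r=4, ground truth inputs of shape
            (B, 80, 1, 400) are packed into (B, 320, 1, 100) before the
            prenet, so the decoder runs for 100 steps, outputs has shape
            (B, 320, 1, 100) and done has shape (B, 1, 1, 100).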
""" # pack multiple frames if necessary B, _, _, T = inputs.shape if self.r > 1 and inputs.shape[1] == self.mel_dim: if T % self.r != 0: inputs = fluid.layers.slice( inputs, axes=[3], starts=[0], ends=[T - T % self.r]) inputs = fluid.layers.transpose(inputs, [0, 3, 2, 1]) inputs = fluid.layers.reshape( inputs, shape=[B, -1, 1, self.mel_dim * self.r]) inputs = fluid.layers.transpose(inputs, [0, 3, 2, 1]) assert inputs.shape[3] == T // self.r if speaker_embed is not None: speaker_embed_bc1t = expand_speaker_embed(inputs, speaker_embed) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") else: speaker_embed_bc1t = None keys, values = encoder_out if self.use_memory_mask and lengths is not None: mask = gen_mask2(lengths, keys.shape[-1]) else: mask = None if text_positions is not None: w = self.key_position_rate if self.n_speakers > 1: w = w * fluid.layers.reshape( self.speaker_proj1(speaker_embed), [B, -1]) text_pos_embed = self.embed_keys_positions(text_positions, w) text_pos_embed = fluid.layers.transpose( fluid.layers.reshape( text_pos_embed, shape=text_pos_embed.shape + [1]), perm=[0, 2, 3, 1]) keys = keys + text_pos_embed if frame_positions is not None: w = self.query_position_rate if self.n_speakers > 1: w = w * fluid.layers.reshape( self.speaker_proj2(speaker_embed), [B, -1]) frame_pos_embed = self.embed_query_positions(frame_positions, w) frame_pos_embed = fluid.layers.transpose( fluid.layers.reshape( frame_pos_embed, shape=frame_pos_embed.shape + [1]), perm=[0, 2, 3, 1]) else: frame_pos_embed = None x = inputs x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") # Prenet for layer in self.prenet: x = (layer(x, speaker_embed_bc1t) if isinstance(layer, Conv1DGLU) else layer(x)) # Convolution & Multi-hop Attention alignments = [] for conv, attn in self.conv_attn: residual = x x = conv(x, speaker_embed_bc1t) if attn is not None: if frame_pos_embed is not None: x = x + frame_pos_embed x, attn_scores = attn(x, (keys, values), mask) alignments.append(attn_scores) x = fluid.layers.scale(residual + x, scale=np.sqrt(0.5)) alignments = fluid.layers.stack(alignments) decoder_states = x x = self.last_conv(x) outputs = fluid.layers.sigmoid(x) done = fluid.layers.sigmoid(self.fc(x)) return outputs, alignments, done, decoder_states def decode(self, encoder_out, text_positions, speaker_embed=None, initial_input=None, test_inputs=None): """ Decode without ground truth mel spectrogram. Args: encoder_out (Tuple(Variable, Variable)): keys (Variable): shape(B, C_emb, 1, T_enc), the key representation from an encoder, where C_emb means text embedding size. values (Variable): shape(B, C_emb, 1, T_enc), the value representation from an encoder, where C_emb means text embedding size. text_positions (Variable): Shape(B, T_enc, 1), dtype: int64. Positions indices for text inputs for the encoder, where T_enc means the encoder timesteps. speaker_embed (Variable): Shape(B, C_sp), where C_sp means speaker embedding size. It is only used for multispeaker model. initial_input (Variable, optional): Shape(B, C_mel * r, 1, 1). The input for the first time step of the decoder. If r > 0, it is a packed r frames of mel spectrograms. test_inputs (Variable, optional): Shape(B, C_mel, 1, T_test), where T_test means the time steps of test inputs. This is only used for testing this method, the user should just leave it None. Returns: outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). 
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of the output mel
                spectrogram. Note that, when r > 1, the decoder outputs r
                frames of mel spectrogram per step.
            alignments (Variable): Shape(B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, T_mel
                means the length of the output mel spectrogram, r means the
                outputs per decoder step, T_enc means the encoder time
                steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
            decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r),
                decoder hidden states, where C_dec means the channels of
                decoder states.
        """
        self.start_new_sequence()
        keys, values = encoder_out
        B = keys.shape[0]
        assert B == 1, "now only supports single instance inference"
        mask = None  # no mask because we use single instance decoding

        w = self.key_position_rate
        if speaker_embed is not None:
            if self.n_speakers > 1:
                w = w * fluid.layers.reshape(
                    self.speaker_proj1(speaker_embed), shape=[B, -1])
            speaker_embed_bc11 = fluid.layers.reshape(
                speaker_embed, shape=[B, speaker_embed.shape[1], 1, 1])
        else:
            speaker_embed_bc11 = None

        if text_positions is not None:
            text_pos_embed = self.embed_keys_positions(text_positions, w)
            text_pos_embed = fluid.layers.transpose(
                fluid.layers.reshape(
                    text_pos_embed, shape=text_pos_embed.shape + [1]),
                perm=[0, 2, 3, 1])
            keys = keys + text_pos_embed

        # start decoding, init accumulators
        decoder_states = []
        outputs = []
        alignments = []
        dones = []

        last_attended = [None] * len(self.conv_attn)
        for idx, monotonic_attn in enumerate(self.force_monotonic_attention):
            if monotonic_attn:
                last_attended[idx] = 0

        t = 0  # decoder time step
        if initial_input is None:
            initial_input = fluid.layers.zeros(
                shape=[B, self.mel_dim * self.r, 1, 1], dtype=keys.dtype)
        current_input = initial_input
        while True:
            frame_pos = fluid.layers.fill_constant(
                shape=[B, 1, 1], value=t + 1, dtype="int64")
            w = self.query_position_rate
            if self.n_speakers > 1:
                w = w * fluid.layers.reshape(
                    self.speaker_proj2(speaker_embed), shape=[B, -1])
            frame_pos_embed = self.embed_query_positions(frame_pos, w)
            frame_pos_embed = fluid.layers.transpose(
                fluid.layers.reshape(
                    frame_pos_embed, shape=frame_pos_embed.shape + [1]),
                perm=[0, 2, 3, 1])

            if test_inputs is not None:
                if t >= test_inputs.shape[3]:
                    break
                current_input = fluid.layers.reshape(
                    test_inputs[:, :, :, t],
                    shape=[B, test_inputs.shape[1], 1, 1])
            else:
                if t > 0:
                    current_input = outputs[-1]

            x = current_input
            x = fluid.layers.dropout(
                x, self.dropout, dropout_implementation="upscale_in_train")

            # Prenet
            for layer in self.prenet:
                x = (layer.add_input(x, speaker_embed_bc11)
                     if isinstance(layer, Conv1DGLU) else layer.add_input(x))

            step_attn_scores = []
            # Causal convolutions + Multi-hop attentions
            for i, (conv, attn) in enumerate(self.conv_attn):
                residual = x
                x = conv.add_input(x, speaker_embed_bc11)
                if attn is not None:
                    if frame_pos_embed is not None:
                        x = x + frame_pos_embed
                    x, attn_scores = attn(x, (keys, values), mask,
                                          last_attended[i])
                    step_attn_scores.append(attn_scores)
                    # update last attended when necessary
                    if self.force_monotonic_attention[i]:
                        last_attended[i] = np.argmax(
                            attn_scores.numpy(), axis=-1)[0][0]
                x = fluid.layers.scale(residual + x, scale=np.sqrt(0.5))
            if len(step_attn_scores):
                average_attn_scores = fluid.layers.reduce_mean(
                    fluid.layers.stack(step_attn_scores), dim=0)
            else:
                average_attn_scores = None

            decoder_state = x
            x = self.last_conv.add_input(x)

            output = fluid.layers.sigmoid(x)  # (B, r * C_mel, 1, 1)
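
            # "done" below is a per-step stop probability predicted from the
            # same pre-sigmoid features; decoding stops once its minimum
            # exceeds 0.5 (after min_decoder_steps) or when max_decoder_steps
            # is reached.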
            done = fluid.layers.sigmoid(self.fc(x))  # (B, 1, 1, 1)

            decoder_states.append(decoder_state)
            outputs.append(output)
            if average_attn_scores is not None:
                alignments.append(average_attn_scores)
            dones.append(done)

            t += 1

            if test_inputs is None:
                if (fluid.layers.reduce_min(done).numpy()[0] > 0.5 and
                        t > self.min_decoder_steps):
                    break
                elif t > self.max_decoder_steps:
                    break

        outputs = fluid.layers.concat(outputs, axis=3)
        if len(alignments):
            alignments = fluid.layers.concat(alignments, axis=1)
        else:
            alignments = None
        dones = fluid.layers.concat(dones, axis=3)
        decoder_states = fluid.layers.concat(decoder_states, axis=3)

        return outputs, alignments, dones, decoder_states

    def start_new_sequence(self):
        for layer in self.sublayers():
            if isinstance(layer, conv.Conv1D):
                layer.start_new_sequence()


class Converter(dg.Layer):
    """
    Vocoder-like postnet that transforms mel spectrogram (or decoder hidden
    states) to linear spectrogram.
    """

    def __init__(self,
                 name_scope,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 linear_dim,
                 convolutions=(ConvSpec(256, 5, 1), ) * 4,
                 time_upsampling=1,
                 dropout=0.1,
                 dtype="float32"):
        super(Converter, self).__init__(name_scope, dtype=dtype)

        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.in_channels = in_channels
        self.linear_dim = linear_dim
        self.time_upsampling = time_upsampling
        self.dropout = dropout

        target_channels = convolutions[0][0]

        # conv proj to target channels
        self.first_conv_proj = Conv1D(
            self.full_name(),
            in_channels,
            target_channels,
            filter_size=1,
            std_mul=1.0,
            dtype=dtype)

        # Idea from nyanko
        # upsampling convolutions
        if time_upsampling == 4:
            self.upsampling_convolutions = [
                Conv1DTranspose(
                    self.full_name(),
                    target_channels,
                    target_channels,
                    filter_size=2,
                    padding=0,
                    stride=2,
                    std_mul=1.0,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=1,
                    std_mul=1.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=3,
                    std_mul=4.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
                Conv1DTranspose(
                    self.full_name(),
                    target_channels,
                    target_channels,
                    filter_size=2,
                    padding=0,
                    stride=2,
                    std_mul=4.0,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=1,
                    std_mul=1.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=3,
                    std_mul=4.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
            ]
        elif time_upsampling == 2:
            self.upsampling_convolutions = [
                Conv1DTranspose(
                    self.full_name(),
                    target_channels,
                    target_channels,
                    filter_size=2,
                    padding=0,
                    stride=2,
                    std_mul=1.0,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=1,
                    std_mul=1.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=3,
                    std_mul=4.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
            ]
        elif time_upsampling == 1:
            self.upsampling_convolutions = [
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=3,
                    std_mul=4.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype)
            ]
        else:
            raise ValueError("Not supported.")

        for i, layer in enumerate(self.upsampling_convolutions):
self.add_sublayer("upsampling_convolutions_{}".format(i), layer) # post conv layers std_mul = 4.0 in_channels = target_channels self.convolutions = [] for (out_channels, filter_size, dilation) in convolutions: if in_channels != out_channels: self.convolutions.append( Conv1D( self.full_name(), in_channels, out_channels, filter_size=1, std_mul=std_mul, act="relu", dtype=dtype)) in_channels = out_channels std_mul = 2.0 self.convolutions.append( Conv1DGLU( self.full_name(), n_speakers, speaker_dim, in_channels, out_channels, filter_size=filter_size, dilation=dilation, std_mul=std_mul, dropout=dropout, causal=False, residual=True, dtype=dtype)) in_channels = out_channels std_mul = 4.0 for i, layer in enumerate(self.convolutions): self.add_sublayer("convolutions_{}".format(i), layer) # final conv proj, channel transformed to linear dim self.last_conv_proj = Conv1D( self.full_name(), in_channels, linear_dim, filter_size=1, std_mul=std_mul, dropout=dropout, act="sigmoid", dtype=dtype) def forward(self, x, speaker_embed=None): """ Convert mel spectrogram or decoder hidden states to linear spectrogram. Args: x (Variable): Shape(B, C_in, 1, T_mel), converter inputs, where C_in means the input channel for the converter. Note that it can be either C_mel (channel of mel spectrogram) or C_dec // r. When use mel_spectrogram as the input of converter, C_in = C_mel; and when use decoder states as the input of converter, C_in = C_dec // r. In this scenario, decoder hidden states are treated as if they were r outputs per decoder step and are unpacked before passing to the converter. speaker_embed (Variable, optional): shape(B, C_sp), speaker embedding, where C_sp means the speaker embedding size. Returns: out (Variable): Shape(B, C_lin, 1, T_lin), the output linear spectrogram, where C_lin means the channel of linear spectrogram and T_linear means the length(time steps) of linear spectrogram. T_line = time_upsampling * T_mel, which depends on the time_upsampling converter. 
""" speaker_embed_bc1t = None if speaker_embed is not None: speaker_embed_bc1t = expand_speaker_embed(x, speaker_embed, tdim=-1) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") x = self.first_conv_proj(x) for layer in chain(self.upsampling_convolutions, self.convolutions): # time_steps may change when timt_upsampling > 1 if (speaker_embed_bc1t is not None and speaker_embed_bc1t.shape[3] != x.shape[3]): speaker_embed_bc1t = expand_speaker_embed( x, speaker_embed, tdim=3) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") x = (layer(x, speaker_embed_bc1t) if isinstance(layer, Conv1DGLU) else layer(x)) out = self.last_conv_proj(x) return out class DeepVoiceTTS(dg.Layer): def __init__(self, name_scope, n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab, embed_dim, text_padding_idx, text_embedding_weight_std, freeze_text_embedding, encoder_convolutions, max_positions, position_padding_idx, trainable_positional_encodings, mel_dim, r, prenet_convolutions, attentive_convolutions, attention, use_memory_mask, force_monotonic_attention, query_position_rate, key_position_rate, window_range, key_projection, value_projection, linear_dim, postnet_convolutions, time_upsampling, dropout, use_decoder_state_for_postnet_input, dtype): super(DeepVoiceTTS, self).__init__(name_scope, dtype) self.n_speakers = n_speakers self.speaker_dim = speaker_dim if n_speakers > 1: self.speaker_embedding = Embedding( self.full_name(), n_speakers, speaker_dim, padding_idx=None, std=speaker_embedding_weight_std, dtype=dtype) self.embed_dim = embed_dim self.mel_dim = mel_dim self.r = r self.seq2seq = ConvS2S( self.full_name(), n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab, embed_dim, text_padding_idx, text_embedding_weight_std, freeze_text_embedding, encoder_convolutions, max_positions, position_padding_idx, trainable_positional_encodings, mel_dim, r, prenet_convolutions, attentive_convolutions, attention, use_memory_mask, force_monotonic_attention, query_position_rate, key_position_rate, window_range, key_projection, value_projection, dropout, dtype) self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input if use_decoder_state_for_postnet_input: assert ( attentive_convolutions[-1].out_channels % self.r == 0 ), "when using decoder states as converter input, you must assure the decoder state channels can be divided by r" converter_input_channels = attentive_convolutions[ -1].out_channels // r else: converter_input_channels = mel_dim self.converter_input_channels = converter_input_channels self.linear_dim = linear_dim self.converter = Converter( self.full_name(), n_speakers, speaker_dim, converter_input_channels, linear_dim, convolutions=postnet_convolutions, time_upsampling=time_upsampling, dropout=dropout, dtype=dtype) def forward(self, text_sequences, valid_lengths, mel_inputs, speaker_indices=None, text_positions=None, frame_positions=None): """ Encode text sequence and decode with ground truth mel spectrogram. Args: text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe input text indices. T_enc means the timesteps of text_sequences. valid_lengths (Variable): shape(batch_size,), dtype: int64, valid lengths for each example in text_sequences. mel_inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth mel-spectrogram, which is used as decoder inputs when training. 
            speaker_indices (Variable, optional): Shape(B, 1), dtype:
                int64. Speaker index for each example. This arg is not None
                only when the model is a multispeaker model.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
                int64. Position indices for each decoder time step.

        Returns:
            mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r).
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of mel spectrogram.
                Note that, when r > 1, the decoder outputs r frames of mel
                spectrogram per step.
            linear_outputs (Variable): Shape(B, C_lin, 1, T_lin), the
                output linear spectrogram, where C_lin means the channel of
                linear spectrogram and T_lin means the length (time steps)
                of the linear spectrogram, T_lin = time_upsampling * T_mel,
                which depends on the converter's time_upsampling.
            alignments (Variable): Shape(N, B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, where
                N means the number of AttentionLayers, T_mel means the
                length of mel spectrogram, r means the outputs per decoder
                step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
        """
        batch_size = text_sequences.shape[0]
        if self.n_speakers == 1:
            assert speaker_indices is None, "this model does not support multi-speaker"

        if speaker_indices is not None:
            speaker_embed = self.speaker_embedding(speaker_indices)
        else:
            speaker_embed = None

        mel_outputs, alignments, done, decoder_states = self.seq2seq(
            text_sequences, valid_lengths, mel_inputs, speaker_embed,
            text_positions, frame_positions)

        # unpack multi frames
        if self.r > 1:
            mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])
            mel_outputs = fluid.layers.reshape(
                mel_outputs, [batch_size, -1, 1, self.mel_dim])
            mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])

        if self.use_decoder_state_for_postnet_input:
            postnet_input = fluid.layers.transpose(decoder_states,
                                                   [0, 3, 2, 1])
            postnet_input = fluid.layers.reshape(
                postnet_input,
                [batch_size, -1, 1, self.converter_input_channels])
            postnet_input = fluid.layers.transpose(postnet_input,
                                                   [0, 3, 2, 1])
        else:
            postnet_input = mel_outputs

        linear_outputs = self.converter(postnet_input, speaker_embed)

        return mel_outputs, linear_outputs, alignments, done

    def transduce(self, text_sequences, text_positions, speaker_indices=None):
        """
        Encode text sequence and decode without ground truth mel
        spectrogram.

        Args:
            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64.
                The input text indices. T_enc means the timesteps of
                text_sequences.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            speaker_indices (Variable, optional): Shape(B, 1), dtype:
                int64. Speaker index for each example. This arg is not None
                only when the model is a multispeaker model.

        Returns:
            mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r).
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of mel spectrogram.
                Note that, when r > 1, the decoder outputs r frames of mel
                spectrogram per step.
            linear_outputs (Variable): Shape(B, C_lin, 1, T_lin), the
                output linear spectrogram, where C_lin means the channel of
                linear spectrogram and T_lin means the length (time steps)
                of the linear spectrogram, T_lin = time_upsampling * T_mel,
                which depends on the converter's time_upsampling.
            alignments (Variable): Shape(B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, T_mel
                means the length of mel spectrogram, r means the outputs
                per decoder step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
        """
        batch_size = text_sequences.shape[0]
        if speaker_indices is not None:
            speaker_embed = self.speaker_embedding(speaker_indices)
        else:
            speaker_embed = None

        mel_outputs, alignments, done, decoder_states = self.seq2seq.transduce(
            text_sequences, text_positions, speaker_embed)

        if self.r > 1:
            mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])
            mel_outputs = fluid.layers.reshape(
                mel_outputs, [batch_size, -1, 1, self.mel_dim])
            mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])

        if self.use_decoder_state_for_postnet_input:
            postnet_input = fluid.layers.transpose(decoder_states,
                                                   [0, 3, 2, 1])
            postnet_input = fluid.layers.reshape(
                postnet_input,
                [batch_size, -1, 1, self.converter_input_channels])
            postnet_input = fluid.layers.transpose(postnet_input,
                                                   [0, 3, 2, 1])
        else:
            postnet_input = mel_outputs

        linear_outputs = self.converter(postnet_input, speaker_embed)

        return mel_outputs, linear_outputs, alignments, done


class ConvS2S(dg.Layer):
    def __init__(self, name_scope, n_speakers, speaker_dim,
                 speaker_embedding_weight_std, n_vocab, embed_dim,
                 text_padding_idx, text_embedding_weight_std,
                 freeze_text_embedding, encoder_convolutions, max_positions,
                 position_padding_idx, trainable_positional_encodings,
                 mel_dim, r, prenet_convolutions, attentive_convolutions,
                 attention, use_memory_mask, force_monotonic_attention,
                 query_position_rate, key_position_rate, window_range,
                 key_projection, value_projection, dropout, dtype):
        super(ConvS2S, self).__init__(name_scope, dtype)

        self.freeze_text_embedding = freeze_text_embedding
        self.trainable_positional_encodings = trainable_positional_encodings

        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim

        self.embed_dim = embed_dim
        self.encoder = Encoder(
            self.full_name(),
            n_vocab,
            embed_dim,
            n_speakers,
            speaker_dim,
            padding_idx=None,
            embedding_weight_std=text_embedding_weight_std,
            convolutions=encoder_convolutions,
            max_positions=max_positions,
            dropout=dropout,
            dtype=dtype)
        if freeze_text_embedding:
            self.encoder.freeze_embedding()

        self.mel_dim = mel_dim
        self.r = r
        self.decoder = Decoder(
            self.full_name(),
            n_speakers,
            speaker_dim,
            embed_dim,
            mel_dim,
            r,
            max_positions,
            position_padding_idx,
            preattention=prenet_convolutions,
            convolutions=attentive_convolutions,
            attention=attention,
            dropout=dropout,
            use_memory_mask=use_memory_mask,
            force_monotonic_attention=force_monotonic_attention,
            query_position_rate=query_position_rate,
            key_position_rate=key_position_rate,
            window_range=window_range,
            key_projection=key_projection,
            value_projection=value_projection,
            dtype=dtype)
        if not trainable_positional_encodings:
            self.decoder.freeze_positional_encoding()

    def forward(self,
                text_sequences,
                valid_lengths,
                mel_inputs,
                speaker_embed=None,
                text_positions=None,
                frame_positions=None):
        """
        Encode text sequence and decode with ground truth mel spectrogram.

        Args:
            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64.
                The input text indices. T_enc means the timesteps of
                text_sequences.
            valid_lengths (Variable): Shape(B,), dtype: int64, valid
                lengths for each example in text_sequences.
            mel_inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth
                mel-spectrogram, which is used as decoder inputs when
                training.
            speaker_embed (Variable, optional): Shape(B, speaker_dim),
                dtype: float32. Speaker embeddings. This arg is not None
                only when the model is a multispeaker model.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
                int64. Position indices for each decoder time step.

        Returns:
            mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r).
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of mel spectrogram.
                Note that, when r > 1, the decoder outputs r frames of mel
                spectrogram per step.
            alignments (Variable): Shape(N, B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, where
                N means the number of AttentionLayers, T_mel means the
                length of mel spectrogram, r means the outputs per decoder
                step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
            decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r),
                decoder hidden states, where C_dec means the channels of
                decoder states.
        """
        keys, values = self.encoder(text_sequences, speaker_embed)
        mel_outputs, alignments, done, decoder_states = self.decoder(
            (keys, values), valid_lengths, mel_inputs, text_positions,
            frame_positions, speaker_embed)
        return mel_outputs, alignments, done, decoder_states

    def transduce(self, text_sequences, text_positions, speaker_embed=None):
        """
        Encode text sequence and decode without ground truth mel
        spectrogram.

        Args:
            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64.
                The input text indices. T_enc means the timesteps of
                text_sequences.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            speaker_embed (Variable, optional): Shape(B, speaker_dim),
                dtype: float32. Speaker embeddings. This arg is not None
                only when the model is a multispeaker model.

        Returns:
            mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r).
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of mel spectrogram.
                Note that, when r > 1, the decoder outputs r frames of mel
                spectrogram per step.
            alignments (Variable): Shape(B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, T_mel
                means the length of mel spectrogram, r means the outputs
                per decoder step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
            decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r),
                decoder hidden states, where C_dec means the channels of
                decoder states.
        """
        keys, values = self.encoder(text_sequences, speaker_embed)
        mel_outputs, alignments, done, decoder_states = self.decoder.decode(
            (keys, values), text_positions, speaker_embed)
        return mel_outputs, alignments, done, decoder_states
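

# A rough shape walk-through of DeepVoiceTTS.forward (values are illustrative
# and assume T_mel % r == 0; they are not taken from any config in this repo):
# with mel_dim=80, r=4, linear_dim=513 and time_upsampling=1,
#   text_sequences: (B, T_enc, 1)   int64
#   mel_inputs:     (B, 80, 1, T_mel)
# produce
#   mel_outputs:    (B, 80, 1, T_mel)    unpacked from (B, 320, 1, T_mel // 4)
#   linear_outputs: (B, 513, 1, T_mel)
#   alignments:     (N, B, T_mel // 4, T_enc)
#   done:           (B, 1, 1, T_mel // 4)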