# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Callable, List, Tuple, Union

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlehub.module.module import moduleinfo
import paddlehub.vision.segmentation_transforms as T
from paddlehub.module.cv_module import ImageSegmentationModule

import fastscnn_cityscapes.layers as layers


@moduleinfo(
    name="fastscnn_cityscapes",
    type="CV/semantic_segmentation",
    author="paddlepaddle",
    author_email="",
    summary="fastscnn_cityscapes is a segmentation model.",
    version="1.0.0",
    meta=ImageSegmentationModule)
class FastSCNN(nn.Layer):
    """
    The FastSCNN implementation based on PaddlePaddle.

    As mentioned in the original paper, FastSCNN is a real-time segmentation algorithm
    (123.5fps) even for high resolution images (1024x2048).

    The original article refers to
    Poudel, Rudra PK, et al. "Fast-scnn: Fast semantic segmentation network"
    (https://arxiv.org/pdf/1902.04502.pdf).

    Args:
        num_classes (int): The unique number of target classes, default is 19.
        align_corners (bool): An argument of F.interpolate. It should be set to False
            when the output size of feature is even, e.g. 1024x512, otherwise it is
            True, e.g. 769x769. Default: False.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
    """

    def __init__(self, num_classes: int = 19, align_corners: bool = False, pretrained: str = None):
        super(FastSCNN, self).__init__()

        self.learning_to_downsample = LearningToDownsample(32, 48, 64)
        # NOTE: align_corners is intentionally hard-coded to True for the global
        # feature extractor (matches the reference implementation); the constructor
        # flag only controls the fusion module and the final upsampling.
        self.global_feature_extractor = GlobalFeatureExtractor(
            in_channels=64,
            block_channels=[64, 96, 128],
            out_channels=128,
            expansion=6,
            num_blocks=[3, 3, 3],
            align_corners=True)
        self.feature_fusion = FeatureFusionModule(64, 128, 128, align_corners)
        self.classifier = Classifier(128, num_classes)
        self.align_corners = align_corners
        self.transforms = T.Compose([T.Normalize()])

        if pretrained is not None:
            # Load user-supplied weights.
            model_dict = paddle.load(pretrained)
            self.set_dict(model_dict)
            print("load custom parameters success")
        else:
            # Fall back to the weights shipped with the hub module.
            checkpoint = os.path.join(self.directory, 'fastscnn_model.pdparams')
            model_dict = paddle.load(checkpoint)
            self.set_dict(model_dict)
            print("load pretrained parameters success")

    def transform(self, img: Union[np.ndarray, str]) -> Union[np.ndarray, str]:
        """Apply the preprocessing pipeline (normalization) to an image or image path."""
        return self.transforms(img)

    def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
        """Run the network and return a single-element list with the upsampled logits."""
        logit_list = []
        input_size = paddle.shape(x)[2:]
        higher_res_features = self.learning_to_downsample(x)
        x = self.global_feature_extractor(higher_res_features)
        x = self.feature_fusion(higher_res_features, x)
        logit = self.classifier(x)
        # Upsample the class logits back to the input spatial resolution.
        logit = F.interpolate(logit, input_size, mode='bilinear', align_corners=self.align_corners)
        logit_list.append(logit)
        return logit_list


class LearningToDownsample(nn.Layer):
    """
    Learning to downsample module.

    This module consists of three downsampling blocks (one conv and two separable conv).

    Args:
        dw_channels1 (int, optional): The input channels of the first sep conv. Default: 32.
        dw_channels2 (int, optional): The input channels of the second sep conv. Default: 48.
        out_channels (int, optional): The output channels of LearningToDownsample module.
            Default: 64.
    """

    def __init__(self, dw_channels1: int = 32, dw_channels2: int = 48, out_channels: int = 64):
        super(LearningToDownsample, self).__init__()

        self.conv_bn_relu = layers.ConvBNReLU(
            in_channels=3, out_channels=dw_channels1, kernel_size=3, stride=2)
        self.dsconv_bn_relu1 = layers.SeparableConvBNReLU(
            in_channels=dw_channels1,
            out_channels=dw_channels2,
            kernel_size=3,
            stride=2,
            padding=1)
        self.dsconv_bn_relu2 = layers.SeparableConvBNReLU(
            in_channels=dw_channels2,
            out_channels=out_channels,
            kernel_size=3,
            stride=2,
            padding=1)

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        # Three stride-2 stages: overall 8x spatial downsampling.
        x = self.conv_bn_relu(x)
        x = self.dsconv_bn_relu1(x)
        x = self.dsconv_bn_relu2(x)
        return x


class GlobalFeatureExtractor(nn.Layer):
    """
    Global feature extractor module.

    This module consists of three InvertedBottleneck blocks (like inverted residual
    introduced by MobileNetV2) and a PPModule (introduced by PSPNet).

    Args:
        in_channels (int): The number of input channels to the module.
        block_channels (tuple): A tuple represents output channels of each bottleneck block.
        out_channels (int): The number of output channels of the module.
        expansion (int): The expansion factor in bottleneck.
        num_blocks (tuple): It indicates the repeat time of each bottleneck.
        align_corners (bool): An argument of F.interpolate. It should be set to False
            when the output size of feature is even, e.g. 1024x512, otherwise it is
            True, e.g. 769x769.
    """

    def __init__(self,
                 in_channels: int,
                 block_channels: Tuple[int, ...],
                 out_channels: int,
                 expansion: int,
                 num_blocks: Tuple[int, ...],
                 align_corners: bool):
        super(GlobalFeatureExtractor, self).__init__()

        self.bottleneck1 = self._make_layer(InvertedBottleneck, in_channels, block_channels[0],
                                            num_blocks[0], expansion, 2)
        self.bottleneck2 = self._make_layer(InvertedBottleneck, block_channels[0],
                                            block_channels[1], num_blocks[1], expansion, 2)
        self.bottleneck3 = self._make_layer(InvertedBottleneck, block_channels[1],
                                            block_channels[2], num_blocks[2], expansion, 1)
        self.ppm = layers.PPModule(
            block_channels[2],
            out_channels,
            bin_sizes=(1, 2, 3, 6),
            dim_reduction=True,
            align_corners=align_corners)

    def _make_layer(self,
                    block: Callable,
                    in_channels: int,
                    out_channels: int,
                    blocks: int,
                    expansion: int = 6,
                    stride: int = 1) -> nn.Sequential:
        """Stack `blocks` bottlenecks; only the first one may change stride/channels."""
        # Named `block_list` (not `layers`) to avoid shadowing the imported module.
        block_list = [block(in_channels, out_channels, expansion, stride)]
        for _ in range(1, blocks):
            block_list.append(block(out_channels, out_channels, expansion, 1))
        return nn.Sequential(*block_list)

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self.bottleneck1(x)
        x = self.bottleneck2(x)
        x = self.bottleneck3(x)
        x = self.ppm(x)
        return x


class InvertedBottleneck(nn.Layer):
    """
    Single Inverted bottleneck implementation.

    Args:
        in_channels (int): The number of input channels to bottleneck block.
        out_channels (int): The number of output channels of bottleneck block.
        expansion (int, optional): The expansion factor in bottleneck. Default: 6.
        stride (int, optional): The stride used in depth-wise conv. Default: 2.
    """

    def __init__(self, in_channels: int, out_channels: int, expansion: int = 6, stride: int = 2):
        super().__init__()

        # Residual shortcut is only valid when the spatial size and channels match.
        self.use_shortcut = stride == 1 and in_channels == out_channels

        expand_channels = in_channels * expansion
        self.block = nn.Sequential(
            # pw
            layers.ConvBNReLU(
                in_channels=in_channels,
                out_channels=expand_channels,
                kernel_size=1,
                bias_attr=False),
            # dw
            layers.ConvBNReLU(
                in_channels=expand_channels,
                out_channels=expand_channels,
                kernel_size=3,
                stride=stride,
                padding=1,
                groups=expand_channels,
                bias_attr=False),
            # pw-linear
            layers.ConvBN(
                in_channels=expand_channels,
                out_channels=out_channels,
                kernel_size=1,
                bias_attr=False))

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        out = self.block(x)
        if self.use_shortcut:
            out = x + out
        return out


class FeatureFusionModule(nn.Layer):
    """
    Feature Fusion Module Implementation.

    This module fuses high-resolution feature and low-resolution feature.

    Args:
        high_in_channels (int): The channels of high-resolution feature
            (output of LearningToDownsample).
        low_in_channels (int): The channels of low-resolution feature
            (output of GlobalFeatureExtractor).
        out_channels (int): The output channels of this module.
        align_corners (bool): An argument of F.interpolate. It should be set to False
            when the output size of feature is even, e.g. 1024x512, otherwise it is
            True, e.g. 769x769.
    """

    def __init__(self, high_in_channels: int, low_in_channels: int, out_channels: int,
                 align_corners: bool):
        super().__init__()

        # Depth-wise conv on the upsampled low-resolution branch.
        # groups is tied to low_in_channels (one group per input channel) instead of
        # the previous hard-coded 128, which only worked for the default 128-channel
        # configuration; behavior is identical there.
        self.dwconv = layers.ConvBNReLU(
            in_channels=low_in_channels,
            out_channels=out_channels,
            kernel_size=3,
            padding=1,
            groups=low_in_channels,
            bias_attr=False)
        self.conv_low_res = layers.ConvBN(out_channels, out_channels, 1)
        self.conv_high_res = layers.ConvBN(high_in_channels, out_channels, 1)
        self.align_corners = align_corners

    def forward(self, high_res_input: paddle.Tensor,
                low_res_input: paddle.Tensor) -> paddle.Tensor:
        # Bring the low-resolution branch up to the high-resolution spatial size.
        low_res_input = F.interpolate(
            low_res_input,
            paddle.shape(high_res_input)[2:],
            mode='bilinear',
            align_corners=self.align_corners)
        low_res_input = self.dwconv(low_res_input)
        low_res_input = self.conv_low_res(low_res_input)
        high_res_input = self.conv_high_res(high_res_input)
        x = high_res_input + low_res_input
        return F.relu(x)


class Classifier(nn.Layer):
    """
    The Classifier module implementation.

    This module consists of two depth-wise conv and one conv.

    Args:
        input_channels (int): The input channels to this module.
        num_classes (int): The unique number of target classes.
    """

    def __init__(self, input_channels: int, num_classes: int):
        super().__init__()

        self.dsconv1 = layers.SeparableConvBNReLU(
            in_channels=input_channels, out_channels=input_channels, kernel_size=3, padding=1)
        self.dsconv2 = layers.SeparableConvBNReLU(
            in_channels=input_channels, out_channels=input_channels, kernel_size=3, padding=1)
        self.conv = nn.Conv2D(in_channels=input_channels, out_channels=num_classes, kernel_size=1)
        self.dropout = nn.Dropout(p=0.1)  # dropout_prob

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self.dsconv1(x)
        x = self.dsconv2(x)
        # Dropout is applied before the final 1x1 projection to class logits.
        x = self.dropout(x)
        x = self.conv(x)
        return x