# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import numpy as np
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn.initializer import MSRA
from paddle.nn import Conv2d, BatchNorm, Linear, Dropout
from paddle.nn import AdaptiveAvgPool2d, MaxPool2d, AvgPool2d
from paddle.regularizer import L2Decay

__all__ = ["ResNeSt50_fast_1s1x64d", "ResNeSt50"]


class ConvBNLayer(nn.Layer):
    """Conv2d followed by BatchNorm; the BN scale and offset are excluded
    from weight decay."""

    def __init__(self,
                 num_channels,
                 num_filters,
                 filter_size,
                 stride=1,
                 dilation=1,
                 groups=1,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()

        bn_decay = 0.0

        self._conv = Conv2d(
            in_channels=num_channels,
            out_channels=num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            dilation=dilation,
            groups=groups,
            weight_attr=ParamAttr(name=name + "_weight"),
            bias_attr=False)
        self._batch_norm = BatchNorm(
            num_filters,
            act=act,
            param_attr=ParamAttr(
                name=name + "_scale", regularizer=L2Decay(bn_decay)),
            bias_attr=ParamAttr(
                name + "_offset", regularizer=L2Decay(bn_decay)),
            moving_mean_name=name + "_mean",
            moving_variance_name=name + "_variance")

    def forward(self, x):
        x = self._conv(x)
        x = self._batch_norm(x)
        return x


class rSoftmax(nn.Layer):
    """Radix-wise softmax over the split-attention logits; falls back to
    sigmoid when radix == 1."""

    def __init__(self, radix, cardinality):
        super(rSoftmax, self).__init__()
        self.radix = radix
        self.cardinality = cardinality

    def forward(self, x):
        cardinality = self.cardinality
        radix = self.radix

        batch, r, h, w = x.shape
        if self.radix > 1:
            # group channels as [batch, cardinality, radix, rest] and apply
            # softmax over the radix axis
            x = paddle.reshape(
                x=x,
                shape=[
                    0, cardinality, radix,
                    int(r * h * w / cardinality / radix)
                ])
            x = paddle.transpose(x=x, perm=[0, 2, 1, 3])
            x = nn.functional.softmax(x, axis=1)
            x = paddle.reshape(x=x, shape=[0, r * h * w])
        else:
            x = nn.functional.sigmoid(x)
        return x


class SplatConv(nn.Layer):
    """Split-Attention convolution: the grouped conv output is split into
    `radix` branches which are re-weighted by attention computed from the
    pooled sum of the branches."""

    def __init__(self,
                 in_channels,
                 channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 radix=2,
                 reduction_factor=4,
                 rectify_avg=False,
                 name=None):
        super(SplatConv, self).__init__()

        self.radix = radix

        self.conv1 = ConvBNLayer(
            num_channels=in_channels,
            num_filters=channels * radix,
            filter_size=kernel_size,
            stride=stride,
            groups=groups * radix,
            act="relu",
            name=name + "_splat1")

        self.avg_pool2d = AdaptiveAvgPool2d(1)

        inter_channels = int(max(in_channels * radix // reduction_factor, 32))

        # to calc gap
        self.conv2 = ConvBNLayer(
            num_channels=channels,
            num_filters=inter_channels,
            filter_size=1,
            stride=1,
            groups=groups,
            act="relu",
            name=name + "_splat2")

        # to calc atten
        self.conv3 = Conv2d(
            in_channels=inter_channels,
            out_channels=channels * radix,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=groups,
            weight_attr=ParamAttr(
                name=name + "_splat_weights", initializer=MSRA()),
            bias_attr=False)

        self.rsoftmax = rSoftmax(radix=radix, cardinality=groups)

    def forward(self, x):
        x = self.conv1(x)

        if self.radix > 1:
            splited = paddle.split(x, num_or_sections=self.radix, axis=1)
            # sum over the radix splits before global pooling
            gap = paddle.add_n(splited)
        else:
            gap = x

        gap = self.avg_pool2d(gap)
        gap = self.conv2(gap)
        atten = self.conv3(gap)
        atten = self.rsoftmax(atten)
        atten = paddle.reshape(x=atten, shape=[-1, atten.shape[1], 1, 1])

        if self.radix > 1:
            attens = paddle.split(atten, num_or_sections=self.radix, axis=1)
            y = paddle.add_n(
                [att * split for (att, split) in zip(attens, splited)])
        else:
            y = atten * x

        return y


class BottleneckBlock(nn.Layer):
    """ResNeSt bottleneck block: 1x1 conv, split-attention 3x3 conv and 1x1
    conv, with optional average-pool downsampling (avd) and an average-pool
    shortcut (avg_down)."""

    def __init__(self,
                 inplanes,
                 planes,
                 stride=1,
                 radix=1,
                 cardinality=1,
                 bottleneck_width=64,
                 avd=False,
                 avd_first=False,
                 dilation=1,
                 is_first=False,
                 rectify_avg=False,
                 last_gamma=False,
                 avg_down=False,
                 name=None):
        super(BottleneckBlock, self).__init__()

        self.inplanes = inplanes
        self.planes = planes
        self.stride = stride
        self.radix = radix
        self.cardinality = cardinality
        self.avd = avd
        self.avd_first = avd_first
        self.dilation = dilation
        self.is_first = is_first
        self.rectify_avg = rectify_avg
        self.last_gamma = last_gamma
        self.avg_down = avg_down

        group_width = int(planes * (bottleneck_width / 64.)) * cardinality

        self.conv1 = ConvBNLayer(
            num_channels=self.inplanes,
            num_filters=group_width,
            filter_size=1,
            stride=1,
            groups=1,
            act="relu",
            name=name + "_conv1")

        if avd and avd_first and (stride > 1 or is_first):
            self.avg_pool2d_1 = AvgPool2d(
                kernel_size=3, stride=stride, padding=1)

        if radix >= 1:
            self.conv2 = SplatConv(
                in_channels=group_width,
                channels=group_width,
                kernel_size=3,
                stride=1,
                padding=dilation,
                dilation=dilation,
                groups=cardinality,
                bias=False,
                radix=radix,
                rectify_avg=rectify_avg,
                name=name + "_splatconv")
        else:
            self.conv2 = ConvBNLayer(
                num_channels=group_width,
                num_filters=group_width,
                filter_size=3,
                stride=1,
                dilation=dilation,
                groups=cardinality,
                act="relu",
                name=name + "_conv2")

        if avd and not avd_first and (stride > 1 or is_first):
            self.avg_pool2d_2 = AvgPool2d(
                kernel_size=3, stride=stride, padding=1)

        self.conv3 = ConvBNLayer(
            num_channels=group_width,
            num_filters=planes * 4,
            filter_size=1,
            stride=1,
            groups=1,
            act=None,
            name=name + "_conv3")

        # projection shortcut (ResNet-D style when avg_down is enabled)
        if stride != 1 or self.inplanes != self.planes * 4:
            if avg_down:
                if dilation == 1:
                    self.avg_pool2d_3 = AvgPool2d(
                        kernel_size=stride, stride=stride, padding=0)
                else:
                    self.avg_pool2d_3 = AvgPool2d(
                        kernel_size=1, stride=1, padding=0, ceil_mode=True)

                self.conv4 = Conv2d(
                    in_channels=self.inplanes,
                    out_channels=planes * 4,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                    groups=1,
                    weight_attr=ParamAttr(
                        name=name + "_weights", initializer=MSRA()),
                    bias_attr=False)
            else:
                self.conv4 = Conv2d(
                    in_channels=self.inplanes,
                    out_channels=planes * 4,
                    kernel_size=1,
                    stride=stride,
                    padding=0,
                    groups=1,
                    weight_attr=ParamAttr(
                        name=name + "_shortcut_weights", initializer=MSRA()),
                    bias_attr=False)

            bn_decay = 0.0
            self._batch_norm = BatchNorm(
                planes * 4,
                act=None,
                param_attr=ParamAttr(
                    name=name + "_shortcut_scale",
                    regularizer=L2Decay(bn_decay)),
                bias_attr=ParamAttr(
                    name + "_shortcut_offset",
                    regularizer=L2Decay(bn_decay)),
                moving_mean_name=name + "_shortcut_mean",
                moving_variance_name=name + "_shortcut_variance")

    def forward(self, x):
        short = x

        x = self.conv1(x)
        if self.avd and self.avd_first and (self.stride > 1 or self.is_first):
            x = self.avg_pool2d_1(x)

        x = self.conv2(x)
        if self.avd and not self.avd_first and (self.stride > 1 or
                                                self.is_first):
            x = self.avg_pool2d_2(x)

        x = self.conv3(x)

        if self.stride != 1 or self.inplanes != self.planes * 4:
            if self.avg_down:
                short = self.avg_pool2d_3(short)
            short = self.conv4(short)
            short = self._batch_norm(short)

        y = paddle.elementwise_add(x=short, y=x, act="relu")
        return y


class ResNeStLayer(nn.Layer):
    """Stack of `blocks` BottleneckBlocks; the first block performs the
    stride / dilation / shortcut transition, the rest keep the resolution."""

    def __init__(self,
                 inplanes,
                 planes,
                 blocks,
                 radix,
                 cardinality,
                 bottleneck_width,
                 avg_down,
                 avd,
                 avd_first,
                 rectify_avg,
                 last_gamma,
                 stride=1,
                 dilation=1,
                 is_first=True,
                 name=None):
        super(ResNeStLayer, self).__init__()
        self.inplanes = inplanes
        self.planes = planes
        self.blocks = blocks
        self.radix = radix
        self.cardinality = cardinality
        self.bottleneck_width = bottleneck_width
        self.avg_down = avg_down
        self.avd = avd
        self.avd_first = avd_first
        self.rectify_avg = rectify_avg
        self.last_gamma = last_gamma
        self.is_first = is_first

        if dilation == 1 or dilation == 2:
            bottleneck_func = self.add_sublayer(
                name + "_bottleneck_0",
                BottleneckBlock(
                    inplanes=self.inplanes,
                    planes=planes,
                    stride=stride,
                    radix=radix,
                    cardinality=cardinality,
                    bottleneck_width=bottleneck_width,
                    avg_down=self.avg_down,
                    avd=avd,
                    avd_first=avd_first,
                    dilation=1,
                    is_first=is_first,
                    rectify_avg=rectify_avg,
                    last_gamma=last_gamma,
                    name=name + "_bottleneck_0"))
        elif dilation == 4:
            bottleneck_func = self.add_sublayer(
                name + "_bottleneck_0",
                BottleneckBlock(
                    inplanes=self.inplanes,
                    planes=planes,
                    stride=stride,
                    radix=radix,
                    cardinality=cardinality,
                    bottleneck_width=bottleneck_width,
                    avg_down=self.avg_down,
                    avd=avd,
                    avd_first=avd_first,
                    dilation=2,
                    is_first=is_first,
                    rectify_avg=rectify_avg,
                    last_gamma=last_gamma,
                    name=name + "_bottleneck_0"))
        else:
            raise RuntimeError("=>unknown dilation size")

        self.inplanes = planes * 4
        self.bottleneck_block_list = [bottleneck_func]
        for i in range(1, blocks):
            name = name + "_bottleneck_" + str(i)
            bottleneck_func = self.add_sublayer(
                name,
                BottleneckBlock(
                    inplanes=self.inplanes,
                    planes=planes,
                    radix=radix,
                    cardinality=cardinality,
                    bottleneck_width=bottleneck_width,
                    avg_down=self.avg_down,
                    avd=avd,
                    avd_first=avd_first,
                    dilation=dilation,
                    rectify_avg=rectify_avg,
                    last_gamma=last_gamma,
                    name=name))
            self.bottleneck_block_list.append(bottleneck_func)

    def forward(self, x):
        for bottleneck_block in self.bottleneck_block_list:
            x = bottleneck_block(x)
        return x


class ResNeSt(nn.Layer):
    """ResNeSt backbone: (deep) stem, max-pool, four ResNeSt stages, global
    average pooling and a fully connected classification head."""

    def __init__(self,
                 layers,
                 radix=1,
                 groups=1,
                 bottleneck_width=64,
                 dilated=False,
                 dilation=1,
                 deep_stem=False,
                 stem_width=64,
                 avg_down=False,
                 rectify_avg=False,
                 avd=False,
                 avd_first=False,
                 final_drop=0.0,
                 last_gamma=False,
                 class_dim=1000):
        super(ResNeSt, self).__init__()

        self.cardinality = groups
        self.bottleneck_width = bottleneck_width
        # ResNet-D params
        self.inplanes = stem_width * 2 if deep_stem else 64
        self.avg_down = avg_down
        self.last_gamma = last_gamma
        # ResNeSt params
        self.radix = radix
        self.avd = avd
        self.avd_first = avd_first

        self.deep_stem = deep_stem
        self.stem_width = stem_width
        self.layers = layers
        self.final_drop = final_drop
        self.dilated = dilated
        self.dilation = dilation
        self.rectify_avg = rectify_avg

        if self.deep_stem:
            self.stem = nn.Sequential(
                ("conv1", ConvBNLayer(
                    num_channels=3,
                    num_filters=stem_width,
                    filter_size=3,
                    stride=2,
                    act="relu",
                    name="conv1")),
                ("conv2", ConvBNLayer(
                    num_channels=stem_width,
                    num_filters=stem_width,
                    filter_size=3,
                    stride=1,
                    act="relu",
                    name="conv2")),
                ("conv3", ConvBNLayer(
                    num_channels=stem_width,
                    num_filters=stem_width * 2,
                    filter_size=3,
                    stride=1,
                    act="relu",
                    name="conv3")))
        else:
            self.stem = ConvBNLayer(
                num_channels=3,
                num_filters=stem_width,
                filter_size=7,
                stride=2,
                act="relu",
                name="conv1")

        self.max_pool2d = MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = ResNeStLayer(
            inplanes=self.stem_width * 2 if self.deep_stem else self.stem_width,
            planes=64,
            blocks=self.layers[0],
            radix=radix,
            cardinality=self.cardinality,
            bottleneck_width=bottleneck_width,
            avg_down=self.avg_down,
            avd=avd,
            avd_first=avd_first,
            rectify_avg=rectify_avg,
            last_gamma=last_gamma,
            stride=1,
            dilation=1,
            is_first=False,
            name="layer1")

        self.layer2 = ResNeStLayer(
            inplanes=256,
            planes=128,
            blocks=self.layers[1],
            radix=radix,
            cardinality=self.cardinality,
            bottleneck_width=bottleneck_width,
            avg_down=self.avg_down,
            avd=avd,
            avd_first=avd_first,
            rectify_avg=rectify_avg,
            last_gamma=last_gamma,
            stride=2,
            name="layer2")

        if self.dilated or self.dilation == 4:
            self.layer3 = ResNeStLayer(
                inplanes=512,
                planes=256,
                blocks=self.layers[2],
                radix=radix,
                cardinality=self.cardinality,
                bottleneck_width=bottleneck_width,
                avg_down=self.avg_down,
                avd=avd,
                avd_first=avd_first,
                rectify_avg=rectify_avg,
                last_gamma=last_gamma,
                stride=1,
                dilation=2,
                name="layer3")
            self.layer4 = ResNeStLayer(
                inplanes=1024,
                planes=512,
                blocks=self.layers[3],
                radix=radix,
                cardinality=self.cardinality,
                bottleneck_width=bottleneck_width,
                avg_down=self.avg_down,
                avd=avd,
                avd_first=avd_first,
                rectify_avg=rectify_avg,
                last_gamma=last_gamma,
                stride=1,
                dilation=4,
                name="layer4")
        elif self.dilation == 2:
            self.layer3 = ResNeStLayer(
                inplanes=512,
                planes=256,
                blocks=self.layers[2],
                radix=radix,
                cardinality=self.cardinality,
                bottleneck_width=bottleneck_width,
                avg_down=self.avg_down,
                avd=avd,
                avd_first=avd_first,
                rectify_avg=rectify_avg,
                last_gamma=last_gamma,
                stride=2,
                dilation=1,
                name="layer3")
            self.layer4 = ResNeStLayer(
                inplanes=1024,
                planes=512,
                blocks=self.layers[3],
                radix=radix,
                cardinality=self.cardinality,
                bottleneck_width=bottleneck_width,
                avg_down=self.avg_down,
                avd=avd,
                avd_first=avd_first,
                rectify_avg=rectify_avg,
                last_gamma=last_gamma,
                stride=1,
                dilation=2,
                name="layer4")
        else:
            self.layer3 = ResNeStLayer(
                inplanes=512,
                planes=256,
                blocks=self.layers[2],
                radix=radix,
                cardinality=self.cardinality,
                bottleneck_width=bottleneck_width,
                avg_down=self.avg_down,
                avd=avd,
                avd_first=avd_first,
                rectify_avg=rectify_avg,
                last_gamma=last_gamma,
                stride=2,
                name="layer3")
            self.layer4 = ResNeStLayer(
                inplanes=1024,
                planes=512,
                blocks=self.layers[3],
                radix=radix,
                cardinality=self.cardinality,
                bottleneck_width=bottleneck_width,
                avg_down=self.avg_down,
                avd=avd,
                avd_first=avd_first,
                rectify_avg=rectify_avg,
                last_gamma=last_gamma,
                stride=2,
                name="layer4")

        self.pool2d_avg = AdaptiveAvgPool2d(1)

        self.out_channels = 2048

        stdv = 1.0 / math.sqrt(self.out_channels * 1.0)

        self.out = Linear(
            self.out_channels,
            class_dim,
            weight_attr=ParamAttr(
                initializer=nn.initializer.Uniform(-stdv, stdv),
                name="fc_weights"),
            bias_attr=ParamAttr(name="fc_offset"))

    def forward(self, x):
        x = self.stem(x)
        x = self.max_pool2d(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.pool2d_avg(x)
        x = paddle.reshape(x, shape=[-1, self.out_channels])
        x = self.out(x)
        return x


def ResNeSt50_fast_1s1x64d(**args):
    model = ResNeSt(
        layers=[3, 4, 6, 3],
        radix=1,
        groups=1,
        bottleneck_width=64,
        deep_stem=True,
        stem_width=32,
        avg_down=True,
        avd=True,
        avd_first=True,
        final_drop=0.0,
        **args)
    return model


def ResNeSt50(**args):
    model = ResNeSt(
        layers=[3, 4, 6, 3],
        radix=2,
        groups=1,
        bottleneck_width=64,
        deep_stem=True,
        stem_width=32,
        avg_down=True,
        avd=True,
        avd_first=False,
        final_drop=0.0,
        **args)
    return model
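

# Minimal smoke-test sketch (not part of the upstream module): it builds both
# exported variants and runs one forward pass on a random batch. It assumes
# the same Paddle 2.0-rc era API that the rest of this file targets
# (Conv2d / MaxPool2d style names); the 224 x 224 input size and batch size
# of 1 are only illustrative choices.
if __name__ == "__main__":
    for build_fn in (ResNeSt50_fast_1s1x64d, ResNeSt50):
        model = build_fn(class_dim=1000)
        model.eval()
        dummy = paddle.to_tensor(
            np.random.rand(1, 3, 224, 224).astype("float32"))
        logits = model(dummy)
        # expected output shape: [1, 1000]
        print(build_fn.__name__, logits.shape)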