# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import NormalInitializer, MSRAInitializer, ConstantInitializer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from genotypes import PRIMITIVES
from operations import *


def channel_shuffle(x, groups):
    """Interleave the channels of a 4-D tensor across ``groups`` groups.

    The channel axis (axis 1) is viewed as ``[groups, channels_per_group]``,
    those two axes are swapped, and the result is flattened back to the
    original 4-D shape.

    Args:
        x: Variable whose ``shape`` unpacks into
            ``(batch, channels, height, width)``.
        groups: Number of channel groups. The reshape presumably requires
            ``channels`` to be divisible by ``groups`` -- confirm at callers.

    Returns:
        A Variable with the same shape as ``x`` and shuffled channels.
    """
    n, c, h, w = x.shape
    per_group = c // groups

    # Split the channel axis into (groups, per_group), then swap the two.
    grouped = fluid.layers.reshape(x, [n, groups, per_group, h, w])
    swapped = fluid.layers.transpose(grouped, [0, 2, 1, 3, 4])

    # Collapse back to the original layout.
    return fluid.layers.reshape(swapped, [n, c, h, w])


class MixedOp(fluid.dygraph.Layer):
    """Weighted mixture of every candidate operation in ``PRIMITIVES``.

    With ``method == "PC-DARTS"`` only the first ``1 / k`` of the input
    channels (``k = 4``) is routed through the candidate operations; the
    remaining channels bypass them. For any other method ``k = 1`` and all
    channels go through the candidates.

    Args:
        c_cur: Channel count of the tensor fed to this op.
        stride: Stride handed to every candidate operation.
        method: Search method name; ``"PC-DARTS"`` enables the
            partial-channel path.
    """

    def __init__(self, c_cur, stride, method):
        super(MixedOp, self).__init__()
        self._method = method
        if self._method == "PC-DARTS":
            self._k = 4
        else:
            self._k = 1
        # Used in forward() to downsample the bypassed channels when the
        # candidate ops changed the spatial size.
        self.mp = Pool2D(pool_size=2, pool_stride=2, pool_type='max')

        branch_channels = c_cur // self._k
        candidates = []
        for primitive in PRIMITIVES:
            candidate = OPS[primitive](branch_channels, stride, False)
            if 'pool' in primitive:
                # Pooling candidates are followed by a BatchNorm whose scale
                # and shift are constant (1 and 0) and not trainable.
                frozen_scale = ParamAttr(
                    initializer=fluid.initializer.Constant(value=1),
                    trainable=False)
                frozen_shift = ParamAttr(
                    initializer=fluid.initializer.Constant(value=0),
                    trainable=False)
                norm = BatchNorm(
                    branch_channels,
                    param_attr=frozen_scale,
                    bias_attr=frozen_shift)
                candidate = fluid.dygraph.Sequential(candidate, norm)
            candidates.append(candidate)
        self._ops = fluid.dygraph.LayerList(candidates)

    def forward(self, x, weights):
        """Return the ``weights``-weighted sum of all candidate outputs.

        Args:
            x: Input Variable; axis 1 is sliced as the channel axis.
            weights: Indexable by candidate position; ``weights[i]`` scales
                the output of the i-th candidate.
        """
        if self._method != "PC-DARTS":
            return fluid.layers.sums(
                [weights[i] * op(x) for i, op in enumerate(self._ops)])

        split = x.shape[1] // self._k
        active = x[:, :split, :, :]
        bypass = x[:, split:, :, :]

        mixed = fluid.layers.sums(
            [weights[i] * op(active) for i, op in enumerate(self._ops)])

        # If the candidates changed the height, pool the bypassed channels
        # so both halves can be concatenated.
        if mixed.shape[2] != x.shape[2]:
            bypass = self.mp(bypass)
        merged = fluid.layers.concat([mixed, bypass], axis=1)
        return channel_shuffle(merged, self._k)


class Cell(fluid.dygraph.Layer):
    """One search cell: two preprocessed inputs followed by ``steps``
    intermediate nodes, each the sum of ``MixedOp`` outputs over all
    earlier states.

    Args:
        steps: Number of intermediate nodes.
        multiplier: Number of trailing states concatenated into the output.
        c_prev_prev: Channel count of the first input (``s0``).
        c_prev: Channel count of the second input (``s1``).
        c_cur: Channel count used inside the cell.
        reduction: If true, edges from the two input states use stride 2.
        reduction_prev: If true, ``s0`` is preprocessed with
            ``FactorizedReduce`` instead of ``ReLUConvBN``.
        method: Search method name; ``"PC-DARTS"`` enables per-edge scaling
            by ``weights2`` in ``forward``.
    """

    def __init__(self, steps, multiplier, c_prev_prev, c_prev, c_cur,
                 reduction, reduction_prev, method):
        super(Cell, self).__init__()
        self.reduction = reduction
        self._steps = steps
        self._multiplier = multiplier
        self._method = method

        if reduction_prev:
            self.preprocess0 = FactorizedReduce(c_prev_prev, c_cur, False)
        else:
            self.preprocess0 = ReLUConvBN(c_prev_prev, c_cur, 1, 1, 0, False)
        self.preprocess1 = ReLUConvBN(c_prev, c_cur, 1, 1, 0, affine=False)

        # Node ``node`` has one edge from each of its ``2 + node``
        # predecessors; edges are stored flat, node after node.
        edge_ops = [
            MixedOp(c_cur, 2 if reduction and src < 2 else 1, method)
            for node in range(self._steps)
            for src in range(2 + node)
        ]
        self._ops = fluid.dygraph.LayerList(edge_ops)

    def forward(self, s0, s1, weights, weights2=None):
        """Run the cell and concatenate its last ``multiplier`` states.

        Args:
            s0: First input state.
            s1: Second input state.
            weights: Indexable by flat edge index; ``weights[e]`` is passed
                to the ``MixedOp`` on edge ``e``.
            weights2: Indexable by flat edge index; only read when the
                method is ``"PC-DARTS"``, where ``weights2[e]`` scales the
                output of edge ``e``.

        Returns:
            The last ``multiplier`` states concatenated along axis 1.
        """
        states = [self.preprocess0(s0), self.preprocess1(s1)]
        edge_scaled = self._method == "PC-DARTS"

        base = 0
        for _ in range(self._steps):
            contributions = []
            for j, h in enumerate(states):
                edge = base + j
                if edge_scaled:
                    term = weights2[edge] * self._ops[edge](h, weights[edge])
                else:
                    term = self._ops[edge](h, weights[edge])
                contributions.append(term)
            # Advance past this node's edges before the new state is added.
            base += len(states)
            states.append(fluid.layers.sums(contributions))

        return fluid.layers.concat(input=states[-self._multiplier:], axis=1)


class Network(fluid.dygraph.Layer):
    """Searchable network: conv stem, a stack of ``Cell`` layers, global
    average pooling and a linear classifier, together with the architecture
    parameters (alphas, and betas for PC-DARTS) that weight the cells.

    Args:
        c_in: Base channel count. The stem outputs ``stem_multiplier * c_in``
            channels and the first cell works with ``c_in`` channels.
        num_classes: Output dimension of the classifier.
        layers: Number of cells. Cells at indices ``layers // 3`` and
            ``2 * layers // 3`` are reduction cells and double the channels.
        method: Search method name; ``"PC-DARTS"`` additionally creates and
            uses per-edge beta parameters.
        steps: Number of intermediate nodes per cell.
        multiplier: Number of cell states concatenated into a cell's output.
        stem_multiplier: Channel multiplier applied to ``c_in`` in the stem.
    """

    def __init__(self,
                 c_in,
                 num_classes,
                 layers,
                 method,
                 steps=4,
                 multiplier=4,
                 stem_multiplier=3):
        super(Network, self).__init__()
        self._c_in = c_in
        self._num_classes = num_classes
        self._layers = layers
        self._steps = steps
        self._multiplier = multiplier
        # Stored so that new() can rebuild a network of identical shape.
        self._stem_multiplier = stem_multiplier
        self._primitives = PRIMITIVES
        self._method = method

        c_cur = stem_multiplier * c_in
        # The stem convolution is built for 3-channel input.
        self.stem = fluid.dygraph.Sequential(
            Conv2D(
                num_channels=3,
                num_filters=c_cur,
                filter_size=3,
                padding=1,
                param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
                bias_attr=False),
            BatchNorm(
                num_channels=c_cur,
                param_attr=fluid.ParamAttr(
                    initializer=ConstantInitializer(value=1)),
                bias_attr=fluid.ParamAttr(
                    initializer=ConstantInitializer(value=0))))

        c_prev_prev, c_prev, c_cur = c_cur, c_cur, c_in
        reduction_indices = (layers // 3, 2 * layers // 3)
        cells = []
        reduction_prev = False
        for i in range(layers):
            reduction = i in reduction_indices
            if reduction:
                c_cur *= 2
            cell = Cell(steps, multiplier, c_prev_prev, c_prev, c_cur,
                        reduction, reduction_prev, method)
            reduction_prev = reduction
            cells.append(cell)
            # A cell outputs ``multiplier`` concatenated states.
            c_prev_prev, c_prev = c_prev, multiplier * c_cur
        self.cells = fluid.dygraph.LayerList(cells)
        self.global_pooling = Pool2D(pool_type='avg', global_pooling=True)
        self.classifier = Linear(
            input_dim=c_prev,
            output_dim=num_classes,
            param_attr=ParamAttr(initializer=MSRAInitializer()),
            bias_attr=ParamAttr(initializer=MSRAInitializer()))

        self._initialize_alphas()

    def _edge_weights(self, betas):
        """Softmax ``betas`` separately over each node's incoming edges.

        Node ``n`` owns ``2 + n`` consecutive entries of ``betas``; each
        group is normalised on its own and the groups are concatenated back
        in order.
        """
        groups = []
        start = 0
        for node in range(self._steps):
            end = start + 2 + node
            groups.append(fluid.layers.softmax(betas[start:end]))
            start = end
        return fluid.layers.concat(groups)

    def forward(self, input):
        """Compute classification logits for ``input``."""
        s0 = s1 = self.stem(input)
        use_betas = self._method == "PC-DARTS"
        for cell in self.cells:
            if cell.reduction:
                weights = fluid.layers.softmax(self.alphas_reduce)
                weights2 = (self._edge_weights(self.betas_reduce)
                            if use_betas else None)
            else:
                weights = fluid.layers.softmax(self.alphas_normal)
                weights2 = (self._edge_weights(self.betas_normal)
                            if use_betas else None)
            s0, s1 = s1, cell(s0, s1, weights, weights2)
        out = self.global_pooling(s1)
        # Drop the two spatial axes left at size 1 by global pooling.
        out = fluid.layers.squeeze(out, axes=[2, 3])
        logits = self.classifier(out)
        return logits

    def _loss(self, input, target):
        """Return the mean softmax cross-entropy of the logits vs ``target``."""
        logits = self(input)
        loss = fluid.layers.reduce_mean(
            fluid.layers.softmax_with_cross_entropy(logits, target))
        return loss

    def new(self):
        """Build a freshly initialised network with this network's
        configuration.

        ``steps``, ``multiplier`` and ``stem_multiplier`` are forwarded too,
        so the new network matches this one even when non-default values
        were used. Parameter values are not copied.
        """
        model_new = Network(
            self._c_in,
            self._num_classes,
            self._layers,
            self._method,
            steps=self._steps,
            multiplier=self._multiplier,
            stem_multiplier=self._stem_multiplier)
        return model_new

    def _create_arch_parameter(self, shape):
        """Create one float32 architecture parameter drawn from N(0, 1e-3)."""
        return fluid.layers.create_parameter(
            shape=shape,
            dtype="float32",
            default_initializer=NormalInitializer(
                loc=0.0, scale=1e-3))

    def _initialize_alphas(self):
        """Create the architecture parameters and record them in
        ``self._arch_parameters``.

        ``alphas_*`` hold one row per edge and one column per primitive;
        ``betas_*`` (PC-DARTS only) hold one scalar per edge.
        """
        # Node i has 2 + i incoming edges.
        k = sum(2 + i for i in range(self._steps))
        num_ops = len(self._primitives)
        self.alphas_normal = self._create_arch_parameter([k, num_ops])
        self.alphas_reduce = self._create_arch_parameter([k, num_ops])
        self._arch_parameters = [
            self.alphas_normal,
            self.alphas_reduce,
        ]
        if self._method == "PC-DARTS":
            self.betas_normal = self._create_arch_parameter([k])
            self.betas_reduce = self._create_arch_parameter([k])
            self._arch_parameters += [self.betas_normal, self.betas_reduce]

    def arch_parameters(self):
        """Return the list of architecture parameters (alphas, then betas)."""
        return self._arch_parameters