import paddle.v2 as paddle
from config.pascal_voc_conf import cfg


def net_conf(mode):
    """Build the SSD detection network and return its output layers.

    The graph consists of a convolutional backbone (conv1_1 .. fc7), three
    extra feature blocks (conv6_2, conv7_2, conv8_2) and an average-pooled
    map (pool6). Six of these feature maps each get a location head, a
    confidence head and a prior-box layer; the results are fed to the
    detection output layer and, when training or evaluating, to the
    multibox loss and the detection mAP evaluator.

    Args:
        mode: One of 'train', 'eval' or 'infer'.

    Returns:
        For 'train' and 'eval', the tuple ``(loss, detection_out)``; the
        detection mAP evaluator is also registered as a side effect.
        For 'infer', only ``detection_out``.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
    """
    # Fail fast: previously an unknown mode built the whole graph and then
    # silently returned None.
    if mode not in ('train', 'eval', 'infer'):
        raise ValueError(
            "mode must be 'train', 'eval' or 'infer', got %r" % (mode, ))

    default_l2regularization = cfg.TRAIN.L2REGULARIZATION

    # Bias of trainable layers: no L2 decay, learning-rate multiplier of 2.
    default_bias_attr = paddle.attr.ParamAttr(
        l2_rate=0.0, learning_rate=2.0, momentum=cfg.TRAIN.MOMENTUM)
    # Bias of frozen layers (conv1_x / conv2_x below).
    default_static_bias_attr = paddle.attr.ParamAttr(is_static=True)

    def xavier(channels, filter_size, local_lr, regularization):
        """Return a weight ParamAttr initialised within [-w, w].

        ``w = sqrt(3 / (filter_size ** 2 * channels))``. A ``local_lr`` of
        zero marks the parameter as static (not updated).
        """
        # NOTE(review): every caller passes the layer's *output* filter
        # count as `channels`; confirm that this (rather than the input
        # channel count) is the intended fan term.
        init_w = (3.0 / (filter_size**2 * channels))**0.5
        return paddle.attr.ParamAttr(
            initial_min=(0.0 - init_w),
            initial_max=init_w,
            learning_rate=local_lr,
            l2_rate=regularization,
            momentum=cfg.TRAIN.MOMENTUM,
            is_static=(local_lr == 0.0))

    def num_priors(aspect_ratio, max_size):
        """Return the number of prior boxes per feature-map location.

        Counted as two per aspect ratio, one extra, plus one per entry of
        ``max_size``. Kept in one place so every head is sized the same way.
        """
        return len(aspect_ratio) * 2 + 1 + len(max_size)

    def vgg_block(idx_str, input, num_channels, num_filters, pool_size,
                  pool_stride, pool_pad):
        """Stack three 3x3 ReLU convolutions followed by a max pooling.

        Layers are named ``conv<idx_str>_1`` .. ``conv<idx_str>_3``.
        Returns ``(conv3, pool)``: the last convolution and the pooled map.
        """
        layer_name = "conv%s_" % idx_str
        conv1 = paddle.layer.img_conv(
            name=layer_name + "1",
            input=input,
            filter_size=3,
            num_channels=num_channels,
            num_filters=num_filters,
            stride=1,
            padding=1,
            bias_attr=default_bias_attr,
            param_attr=xavier(num_filters, 3, 1, default_l2regularization),
            act=paddle.activation.Relu())
        conv2 = paddle.layer.img_conv(
            name=layer_name + "2",
            input=conv1,
            filter_size=3,
            num_channels=num_filters,
            num_filters=num_filters,
            stride=1,
            padding=1,
            bias_attr=default_bias_attr,
            param_attr=xavier(num_filters, 3, 1, default_l2regularization),
            act=paddle.activation.Relu())
        conv3 = paddle.layer.img_conv(
            name=layer_name + "3",
            input=conv2,
            filter_size=3,
            num_channels=num_filters,
            num_filters=num_filters,
            stride=1,
            padding=1,
            bias_attr=default_bias_attr,
            param_attr=xavier(num_filters, 3, 1, default_l2regularization),
            act=paddle.activation.Relu())
        pool = paddle.layer.img_pool(
            input=conv3,
            pool_size=pool_size,
            num_channels=num_filters,
            pool_type=paddle.pooling.CudnnMax(),
            stride=pool_stride,
            padding=pool_pad)
        return conv3, pool

    def mbox_block(layer_idx, input, num_channels, filter_size, loc_filters,
                   conf_filters):
        """Attach the location and confidence heads to a feature map.

        Both heads are linear (identity activation) convolutions over
        ``input`` named ``<layer_idx>_mbox_loc`` and
        ``<layer_idx>_mbox_conf``. Returns ``(mbox_loc, mbox_conf)``.
        """
        mbox_loc_name = layer_idx + "_mbox_loc"
        mbox_loc = paddle.layer.img_conv(
            name=mbox_loc_name,
            input=input,
            filter_size=filter_size,
            num_channels=num_channels,
            num_filters=loc_filters,
            stride=1,
            padding=1,
            bias_attr=default_bias_attr,
            param_attr=xavier(loc_filters, filter_size, 1,
                              default_l2regularization),
            act=paddle.activation.Identity())

        mbox_conf_name = layer_idx + "_mbox_conf"
        mbox_conf = paddle.layer.img_conv(
            name=mbox_conf_name,
            input=input,
            filter_size=filter_size,
            num_channels=num_channels,
            num_filters=conf_filters,
            stride=1,
            padding=1,
            bias_attr=default_bias_attr,
            param_attr=xavier(conf_filters, filter_size, 1,
                              default_l2regularization),
            act=paddle.activation.Identity())

        return mbox_loc, mbox_conf

    def ssd_block(layer_idx, input, img_shape, num_channels, num_filters1,
                  num_filters2, aspect_ratio, variance, min_size, max_size):
        """Build one extra feature block together with its detection heads.

        A 1x1 convolution is followed by a stride-2 3x3 convolution; the
        result gets location/confidence heads and a prior-box layer.
        Returns ``(conv2, mbox_loc, mbox_conf, mbox_priorbox)``.
        """
        layer_name = "conv" + layer_idx + "_"
        conv1_name = layer_name + "1"
        conv1 = paddle.layer.img_conv(
            name=conv1_name,
            input=input,
            filter_size=1,
            num_channels=num_channels,
            num_filters=num_filters1,
            stride=1,
            padding=0,
            bias_attr=default_bias_attr,
            param_attr=xavier(num_filters1, 1, 1, default_l2regularization),
            act=paddle.activation.Relu())
        conv2_name = layer_name + "2"
        conv2 = paddle.layer.img_conv(
            name=conv2_name,
            input=conv1,
            filter_size=3,
            num_channels=num_filters1,
            num_filters=num_filters2,
            stride=2,
            padding=1,
            bias_attr=default_bias_attr,
            param_attr=xavier(num_filters2, 3, 1, default_l2regularization),
            act=paddle.activation.Relu())

        # 4 box coordinates and cfg.CLASS_NUM scores per prior box.
        priors = num_priors(aspect_ratio, max_size)
        loc_filters = priors * 4
        conf_filters = priors * cfg.CLASS_NUM
        mbox_loc, mbox_conf = mbox_block(conv2_name, conv2, num_filters2, 3,
                                         loc_filters, conf_filters)
        mbox_priorbox = paddle.layer.priorbox(
            input=conv2,
            image=img_shape,
            min_size=min_size,
            max_size=max_size,
            aspect_ratio=aspect_ratio,
            variance=variance)

        return conv2, mbox_loc, mbox_conf, mbox_priorbox

    img = paddle.layer.data(
        name='image',
        type=paddle.data_type.dense_vector(cfg.IMG_CHANNEL * cfg.IMG_HEIGHT *
                                           cfg.IMG_WIDTH),
        height=cfg.IMG_HEIGHT,
        width=cfg.IMG_WIDTH)

    # conv1_x and conv2_x are frozen: static bias, zero learning rate.
    conv1_1 = paddle.layer.img_conv(
        name="conv1_1",
        input=img,
        filter_size=3,
        num_channels=3,
        num_filters=64,
        stride=1,
        padding=1,
        bias_attr=default_static_bias_attr,
        param_attr=xavier(64, 3, 0, 0),
        act=paddle.activation.Relu())
    conv1_2 = paddle.layer.img_conv(
        name="conv1_2",
        input=conv1_1,
        filter_size=3,
        num_channels=64,
        num_filters=64,
        stride=1,
        padding=1,
        bias_attr=default_static_bias_attr,
        param_attr=xavier(64, 3, 0, 0),
        act=paddle.activation.Relu())
    pool1 = paddle.layer.img_pool(
        name="pool1",
        input=conv1_2,
        pool_type=paddle.pooling.CudnnMax(),
        pool_size=2,
        num_channels=64,
        stride=2)

    conv2_1 = paddle.layer.img_conv(
        name="conv2_1",
        input=pool1,
        filter_size=3,
        num_channels=64,
        num_filters=128,
        stride=1,
        padding=1,
        bias_attr=default_static_bias_attr,
        param_attr=xavier(128, 3, 0, 0),
        act=paddle.activation.Relu())
    conv2_2 = paddle.layer.img_conv(
        name="conv2_2",
        input=conv2_1,
        filter_size=3,
        num_channels=128,
        num_filters=128,
        stride=1,
        padding=1,
        bias_attr=default_static_bias_attr,
        param_attr=xavier(128, 3, 0, 0),
        act=paddle.activation.Relu())
    pool2 = paddle.layer.img_pool(
        name="pool2",
        input=conv2_2,
        pool_type=paddle.pooling.CudnnMax(),
        pool_size=2,
        num_channels=128,
        stride=2)

    conv3_3, pool3 = vgg_block("3", pool2, 128, 256, 2, 2, 0)

    conv4_3, pool4 = vgg_block("4", pool3, 256, 512, 2, 2, 0)
    conv4_3_mbox_priorbox = paddle.layer.priorbox(
        input=conv4_3,
        image=img,
        min_size=cfg.NET.CONV4.PB.MIN_SIZE,
        aspect_ratio=cfg.NET.CONV4.PB.ASPECT_RATIO,
        variance=cfg.NET.CONV4.PB.VARIANCE)
    # Channel-wise normalisation with a learnable scale initialised to 20.
    conv4_3_norm = paddle.layer.cross_channel_norm(
        name="conv4_3_norm",
        input=conv4_3,
        param_attr=paddle.attr.ParamAttr(
            initial_mean=20,
            initial_std=0,
            is_static=False,
            learning_rate=1,
            momentum=cfg.TRAIN.MOMENTUM))
    # Head sizes are derived from the prior-box settings and cfg.CLASS_NUM
    # instead of literals that were only valid for 21 classes. The conv4_3
    # prior-box layer is given no max_size, hence the empty list.
    conv4_priors = num_priors(cfg.NET.CONV4.PB.ASPECT_RATIO, [])
    conv4_3_norm_mbox_loc, conv4_3_norm_mbox_conf = mbox_block(
        "conv4_3_norm", conv4_3_norm, 512, 3, conv4_priors * 4,
        conv4_priors * cfg.CLASS_NUM)

    conv5_3, pool5 = vgg_block("5", pool4, 512, 512, 3, 1, 1)

    fc6 = paddle.layer.img_conv(
        name="fc6",
        input=pool5,
        filter_size=3,
        num_channels=512,
        num_filters=1024,
        stride=1,
        padding=1,
        bias_attr=default_bias_attr,
        param_attr=xavier(1024, 3, 1, default_l2regularization),
        act=paddle.activation.Relu())

    fc7 = paddle.layer.img_conv(
        name="fc7",
        input=fc6,
        filter_size=1,
        num_channels=1024,
        num_filters=1024,
        stride=1,
        padding=0,
        bias_attr=default_bias_attr,
        param_attr=xavier(1024, 1, 1, default_l2regularization),
        act=paddle.activation.Relu())
    fc7_priors = num_priors(cfg.NET.FC7.PB.ASPECT_RATIO,
                            cfg.NET.FC7.PB.MAX_SIZE)
    fc7_mbox_loc, fc7_mbox_conf = mbox_block(
        "fc7", fc7, 1024, 3, fc7_priors * 4, fc7_priors * cfg.CLASS_NUM)
    fc7_mbox_priorbox = paddle.layer.priorbox(
        input=fc7,
        image=img,
        min_size=cfg.NET.FC7.PB.MIN_SIZE,
        max_size=cfg.NET.FC7.PB.MAX_SIZE,
        aspect_ratio=cfg.NET.FC7.PB.ASPECT_RATIO,
        variance=cfg.NET.FC7.PB.VARIANCE)

    conv6_2, conv6_2_mbox_loc, conv6_2_mbox_conf, conv6_2_mbox_priorbox = \
            ssd_block("6", fc7, img, 1024, 256, 512,
                    cfg.NET.CONV6.PB.ASPECT_RATIO,
                    cfg.NET.CONV6.PB.VARIANCE,
                    cfg.NET.CONV6.PB.MIN_SIZE,
                    cfg.NET.CONV6.PB.MAX_SIZE)
    conv7_2, conv7_2_mbox_loc, conv7_2_mbox_conf, conv7_2_mbox_priorbox = \
            ssd_block("7", conv6_2, img, 512, 128, 256,
                    cfg.NET.CONV7.PB.ASPECT_RATIO,
                    cfg.NET.CONV7.PB.VARIANCE,
                    cfg.NET.CONV7.PB.MIN_SIZE,
                    cfg.NET.CONV7.PB.MAX_SIZE)
    conv8_2, conv8_2_mbox_loc, conv8_2_mbox_conf, conv8_2_mbox_priorbox = \
            ssd_block("8", conv7_2, img, 256, 128, 256,
                    cfg.NET.CONV8.PB.ASPECT_RATIO,
                    cfg.NET.CONV8.PB.VARIANCE,
                    cfg.NET.CONV8.PB.MIN_SIZE,
                    cfg.NET.CONV8.PB.MAX_SIZE)

    pool6 = paddle.layer.img_pool(
        name="pool6",
        input=conv8_2,
        pool_size=3,
        num_channels=256,
        stride=1,
        pool_type=paddle.pooling.Avg())
    pool6_priors = num_priors(cfg.NET.POOL6.PB.ASPECT_RATIO,
                              cfg.NET.POOL6.PB.MAX_SIZE)
    pool6_mbox_loc, pool6_mbox_conf = mbox_block(
        "pool6", pool6, 256, 3, pool6_priors * 4,
        pool6_priors * cfg.CLASS_NUM)
    pool6_mbox_priorbox = paddle.layer.priorbox(
        input=pool6,
        image=img,
        min_size=cfg.NET.POOL6.PB.MIN_SIZE,
        max_size=cfg.NET.POOL6.PB.MAX_SIZE,
        aspect_ratio=cfg.NET.POOL6.PB.ASPECT_RATIO,
        variance=cfg.NET.POOL6.PB.VARIANCE)

    # The three lists below list the six feature maps in the same order.
    mbox_priorbox = paddle.layer.concat(
        name="mbox_priorbox",
        input=[
            conv4_3_mbox_priorbox, fc7_mbox_priorbox, conv6_2_mbox_priorbox,
            conv7_2_mbox_priorbox, conv8_2_mbox_priorbox, pool6_mbox_priorbox
        ])

    loc_loss_input = [
        conv4_3_norm_mbox_loc, fc7_mbox_loc, conv6_2_mbox_loc, conv7_2_mbox_loc,
        conv8_2_mbox_loc, pool6_mbox_loc
    ]

    conf_loss_input = [
        conv4_3_norm_mbox_conf, fc7_mbox_conf, conv6_2_mbox_conf,
        conv7_2_mbox_conf, conv8_2_mbox_conf, pool6_mbox_conf
    ]

    detection_out = paddle.layer.detection_output(
        input_loc=loc_loss_input,
        input_conf=conf_loss_input,
        priorbox=mbox_priorbox,
        confidence_threshold=cfg.NET.DETOUT.CONFIDENCE_THRESHOLD,
        nms_threshold=cfg.NET.DETOUT.NMS_THRESHOLD,
        num_classes=cfg.CLASS_NUM,
        nms_top_k=cfg.NET.DETOUT.NMS_TOP_K,
        keep_top_k=cfg.NET.DETOUT.KEEP_TOP_K,
        background_id=cfg.BACKGROUND_ID,
        name="detection_output")

    if mode == 'infer':
        return detection_out

    # 'train' / 'eval': add the ground-truth input, the loss and the mAP
    # evaluator on top of the detection output.
    bbox = paddle.layer.data(
        name='bbox', type=paddle.data_type.dense_vector_sequence(6))
    loss = paddle.layer.multibox_loss(
        input_loc=loc_loss_input,
        input_conf=conf_loss_input,
        priorbox=mbox_priorbox,
        label=bbox,
        num_classes=cfg.CLASS_NUM,
        overlap_threshold=cfg.NET.MBLOSS.OVERLAP_THRESHOLD,
        neg_pos_ratio=cfg.NET.MBLOSS.NEG_POS_RATIO,
        neg_overlap=cfg.NET.MBLOSS.NEG_OVERLAP,
        background_id=cfg.BACKGROUND_ID,
        name="multibox_loss")
    paddle.evaluator.detection_map(
        input=detection_out,
        label=bbox,
        overlap_threshold=cfg.NET.DETMAP.OVERLAP_THRESHOLD,
        background_id=cfg.BACKGROUND_ID,
        evaluate_difficult=cfg.NET.DETMAP.EVAL_DIFFICULT,
        ap_type=cfg.NET.DETMAP.AP_TYPE,
        name="detection_evaluator")
    return loss, detection_out