deeplab.py

# coding: utf8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
import paddle
import paddle.fluid as fluid
from utils.config import cfg
from models.libs.model_libs import scope, name_scope
from models.libs.model_libs import bn, bn_relu, relu, qsigmoid
from models.libs.model_libs import conv
from models.libs.model_libs import separate_conv
from models.backbone.mobilenet_v2 import MobileNetV2 as mobilenet_v2_backbone
from models.backbone.mobilenet_v3 import MobileNetV3 as mobilenet_v3_backbone
from models.backbone.xception import Xception as xception_backbone
from models.backbone.resnet_vd import ResNet as resnet_vd_backbone


def encoder(input):
    # 编码器配置，采用ASPP架构，pooling + 1x1_conv + 三个不同尺度的空洞卷积并行, concat后1x1conv
    # ASPP_WITH_SEP_CONV：默认为真，使用depthwise可分离卷积，否则使用普通卷积
    # OUTPUT_STRIDE: 下采样倍数，8或16，决定aspp_ratios大小
    # aspp_ratios：ASPP模块空洞卷积的采样率

    if not cfg.MODEL.DEEPLAB.ENCODER.ASPP_RATIOS:
        if cfg.MODEL.DEEPLAB.OUTPUT_STRIDE == 16:
            aspp_ratios = [6, 12, 18]
        elif cfg.MODEL.DEEPLAB.OUTPUT_STRIDE == 8:
            aspp_ratios = [12, 24, 36]
        else:
            aspp_ratios = []
    else:
        aspp_ratios = cfg.MODEL.DEEPLAB.ENCODER.ASPP_RATIOS

    param_attr = fluid.ParamAttr(
        name=name_scope + 'weights',
        regularizer=None,
        initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=0.06))

    concat_logits = []
    with scope('encoder'):
        channel = cfg.MODEL.DEEPLAB.ENCODER.ASPP_CONVS_FILTERS
        with scope("image_pool"):
            if not cfg.MODEL.DEEPLAB.ENCODER.POOLING_CROP_SIZE:
                image_avg = fluid.layers.reduce_mean(
                    input, [2, 3], keep_dim=True)
            else:
                pool_w = int((cfg.MODEL.DEEPLAB.ENCODER.POOLING_CROP_SIZE[0] -
                              1.0) / cfg.MODEL.DEEPLAB.OUTPUT_STRIDE + 1.0)
                pool_h = int((cfg.MODEL.DEEPLAB.ENCODER.POOLING_CROP_SIZE[1] -
                              1.0) / cfg.MODEL.DEEPLAB.OUTPUT_STRIDE + 1.0)
                image_avg = fluid.layers.pool2d(
                    input,
                    pool_size=(pool_h, pool_w),
                    pool_stride=cfg.MODEL.DEEPLAB.ENCODER.POOLING_STRIDE,
                    pool_type='avg',
                    pool_padding='VALID')

            act = qsigmoid if cfg.MODEL.DEEPLAB.ENCODER.SE_USE_QSIGMOID else bn_relu
            image_avg = act(
                conv(
                    image_avg,
                    channel,
                    1,
                    1,
                    groups=1,
                    padding=0,
                    param_attr=param_attr))
            image_avg = fluid.layers.resize_bilinear(image_avg, input.shape[2:])
            if cfg.MODEL.DEEPLAB.ENCODER.ADD_IMAGE_LEVEL_FEATURE:
                concat_logits.append(image_avg)

        with scope("aspp0"):
            aspp0 = bn_relu(
                conv(
                    input,
                    channel,
                    1,
                    1,
                    groups=1,
                    padding=0,
                    param_attr=param_attr))
            concat_logits.append(aspp0)

        if aspp_ratios:
            with scope("aspp1"):
                if cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV:
                    aspp1 = separate_conv(
                        input, channel, 1, 3, dilation=aspp_ratios[0], act=relu)
                else:
                    aspp1 = bn_relu(
                        conv(
                            input,
                            channel,
                            stride=1,
                            filter_size=3,
                            dilation=aspp_ratios[0],
                            padding=aspp_ratios[0],
                            param_attr=param_attr))
                concat_logits.append(aspp1)
            with scope("aspp2"):
                if cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV:
                    aspp2 = separate_conv(
                        input, channel, 1, 3, dilation=aspp_ratios[1], act=relu)
                else:
                    aspp2 = bn_relu(
                        conv(
                            input,
                            channel,
                            stride=1,
                            filter_size=3,
                            dilation=aspp_ratios[1],
                            padding=aspp_ratios[1],
                            param_attr=param_attr))
                concat_logits.append(aspp2)
            with scope("aspp3"):
                if cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV:
                    aspp3 = separate_conv(
                        input, channel, 1, 3, dilation=aspp_ratios[2], act=relu)
                else:
                    aspp3 = bn_relu(
                        conv(
                            input,
                            channel,
                            stride=1,
                            filter_size=3,
                            dilation=aspp_ratios[2],
                            padding=aspp_ratios[2],
                            param_attr=param_attr))
                concat_logits.append(aspp3)

        with scope("concat"):
            data = fluid.layers.concat(concat_logits, axis=1)
            if cfg.MODEL.DEEPLAB.ENCODER.ASPP_WITH_CONCAT_PROJECTION:
                data = bn_relu(
                    conv(
                        data,
                        channel,
                        1,
                        1,
                        groups=1,
                        padding=0,
                        param_attr=param_attr))
                data = fluid.layers.dropout(data, 0.9)

        if cfg.MODEL.DEEPLAB.ENCODER.ASPP_WITH_SE:
            data = data * image_avg
        return data


def _decoder_with_sum_merge(encode_data, decode_shortcut, param_attr):
    encode_data = fluid.layers.resize_bilinear(encode_data,
                                               decode_shortcut.shape[2:])
    encode_data = conv(
        encode_data,
        cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS,
        1,
        1,
        groups=1,
        padding=0,
        param_attr=param_attr)

    with scope('merge'):
        decode_shortcut = conv(
            decode_shortcut,
            cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS,
            1,
            1,
            groups=1,
            padding=0,
            param_attr=param_attr)

        return encode_data + decode_shortcut


def _decoder_with_concat(encode_data, decode_shortcut, param_attr):
    with scope('concat'):
        decode_shortcut = bn_relu(
            conv(
                decode_shortcut,
                48,
                1,
                1,
                groups=1,
                padding=0,
                param_attr=param_attr))

        encode_data = fluid.layers.resize_bilinear(encode_data,
                                                   decode_shortcut.shape[2:])
        encode_data = fluid.layers.concat([encode_data, decode_shortcut],
                                          axis=1)
    if cfg.MODEL.DEEPLAB.DECODER_USE_SEP_CONV:
        with scope("separable_conv1"):
            encode_data = separate_conv(
                encode_data,
                cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS,
                1,
                3,
                dilation=1,
                act=relu)
        with scope("separable_conv2"):
            encode_data = separate_conv(
                encode_data,
                cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS,
                1,
                3,
                dilation=1,
                act=relu)
    else:
        with scope("decoder_conv1"):
            encode_data = bn_relu(
                conv(
                    encode_data,
                    cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS,
                    stride=1,
                    filter_size=3,
                    dilation=1,
                    padding=1,
                    param_attr=param_attr))
        with scope("decoder_conv2"):
            encode_data = bn_relu(
                conv(
                    encode_data,
                    cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS,
                    stride=1,
                    filter_size=3,
                    dilation=1,
                    padding=1,
                    param_attr=param_attr))
    return encode_data


def decoder(encode_data, decode_shortcut):
    # 解码器配置
    # encode_data：编码器输出
    # decode_shortcut: 从backbone引出的分支, resize后与encode_data concat
    # DECODER_USE_SEP_CONV: 默认为真，则concat后连接两个可分离卷积，否则为普通卷积
    param_attr = fluid.ParamAttr(
        name=name_scope + 'weights',
        regularizer=None,
        initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=0.06))
    with scope('decoder'):
        if cfg.MODEL.DEEPLAB.DECODER.USE_SUM_MERGE:
            return _decoder_with_sum_merge(encode_data, decode_shortcut,
                                           param_attr)

        return _decoder_with_concat(encode_data, decode_shortcut, param_attr)


def mobilenet(input):
    if 'v3' in cfg.MODEL.DEEPLAB.BACKBONE:
        model_name = 'large' if 'large' in cfg.MODEL.DEEPLAB.BACKBONE else 'small'
        return _mobilenetv3(input, model_name)
    return _mobilenetv2(input)


def _mobilenetv3(input, model_name='large'):
    # Backbone: mobilenetv3结构配置
    # DEPTH_MULTIPLIER: mobilenetv3的scale设置，默认1.0
    # OUTPUT_STRIDE：下采样倍数
    scale = cfg.MODEL.DEEPLAB.DEPTH_MULTIPLIER
    output_stride = cfg.MODEL.DEEPLAB.OUTPUT_STRIDE
    lr_mult_shortcut = cfg.MODEL.DEEPLAB.BACKBONE_LR_MULT_LIST
    model = mobilenet_v3_backbone(
        scale=scale,
        output_stride=output_stride,
        model_name=model_name,
        lr_mult_list=lr_mult_shortcut)
    data, decode_shortcut = model.net(input)
    return data, decode_shortcut


def _mobilenetv2(input):
    # Backbone: mobilenetv2结构配置
    # DEPTH_MULTIPLIER: mobilenetv2的scale设置，默认1.0
    # OUTPUT_STRIDE：下采样倍数
    # end_points: mobilenetv2的block数
    # decode_point: 从mobilenetv2中引出分支所在block数, 作为decoder输入
    if cfg.MODEL.DEEPLAB.BACKBONE_LR_MULT_LIST is not None:
        print(
            'mobilenetv2 backbone do not support BACKBONE_LR_MULT_LIST setting')

    scale = cfg.MODEL.DEEPLAB.DEPTH_MULTIPLIER
    output_stride = cfg.MODEL.DEEPLAB.OUTPUT_STRIDE
    model = mobilenet_v2_backbone(scale=scale, output_stride=output_stride)
    end_points = 18
    decode_point = 4
    data, decode_shortcuts = model.net(
        input, end_points=end_points, decode_points=decode_point)
    decode_shortcut = decode_shortcuts[decode_point]
    return data, decode_shortcut


def xception(input):
    # Backbone: Xception结构配置, xception_65, xception_41, xception_71三种可选
    # decode_point: 从Xception中引出分支所在block数，作为decoder输入
    # end_point：Xception的block数
    cfg.MODEL.DEFAULT_EPSILON = 1e-3
    model = xception_backbone(cfg.MODEL.DEEPLAB.BACKBONE)
    backbone = cfg.MODEL.DEEPLAB.BACKBONE
    output_stride = cfg.MODEL.DEEPLAB.OUTPUT_STRIDE
    if '65' in backbone:
        decode_point = 2
        end_points = 21
    if '41' in backbone:
        decode_point = 2
        end_points = 13
    if '71' in backbone:
        decode_point = 3
        end_points = 23
    data, decode_shortcuts = model.net(
        input,
        output_stride=output_stride,
        end_points=end_points,
        decode_points=decode_point)
    decode_shortcut = decode_shortcuts[decode_point]
    return data, decode_shortcut


def resnet_vd(input):
    # backbone: resnet_vd, 可选resnet50_vd, resnet101_vd
    # end_points: resnet终止层数
    # dilation_dict: resnet block数及对应的膨胀卷积尺度
    backbone = cfg.MODEL.DEEPLAB.BACKBONE
    if '50' in backbone:
        layers = 50
    elif '101' in backbone:
        layers = 101
    else:
        raise Exception("resnet_vd backbone only support layers 50 or 101")
    output_stride = cfg.MODEL.DEEPLAB.OUTPUT_STRIDE
    end_points = layers - 1
    decode_point = 10
    if output_stride == 8:
        dilation_dict = {2: 2, 3: 4}
    elif output_stride == 16:
        dilation_dict = {3: 2}
    else:
        raise Exception("deeplab only support stride 8 or 16")
    lr_mult_list = cfg.MODEL.DEEPLAB.BACKBONE_LR_MULT_LIST
    if lr_mult_list is None:
        lr_mult_list = [1.0, 1.0, 1.0, 1.0, 1.0]
    model = resnet_vd_backbone(
        layers, stem='deeplab', lr_mult_list=lr_mult_list)
    data, decode_shortcuts = model.net(
        input,
        end_points=end_points,
        decode_points=decode_point,
        dilation_dict=dilation_dict)
    decode_shortcut = decode_shortcuts[decode_point]

    return data, decode_shortcut


def deeplabv3p(img, num_classes):
    # Backbone设置：xception 或 mobilenetv2
    if 'xception' in cfg.MODEL.DEEPLAB.BACKBONE:
        data, decode_shortcut = xception(img)
        if cfg.MODEL.DEEPLAB.BACKBONE_LR_MULT_LIST is not None:
            print(
                'xception backbone do not support BACKBONE_LR_MULT_LIST setting'
            )
    elif 'mobilenet' in cfg.MODEL.DEEPLAB.BACKBONE:
        data, decode_shortcut = mobilenet(img)
    elif 'resnet' in cfg.MODEL.DEEPLAB.BACKBONE:
        data, decode_shortcut = resnet_vd(img)
    else:
        raise Exception(
            "deeplab only support xception, mobilenet, and resnet_vd backbone")

    # 编码器解码器设置
    cfg.MODEL.DEFAULT_EPSILON = 1e-5
    if cfg.MODEL.DEEPLAB.ENCODER_WITH_ASPP:
        data = encoder(data)
    if cfg.MODEL.DEEPLAB.ENABLE_DECODER:
        data = decoder(data, decode_shortcut)

    # 根据类别数设置最后一个卷积层输出，并resize到图片原始尺寸
    param_attr = fluid.ParamAttr(
        name=name_scope + 'weights',
        regularizer=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.0),
        initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=0.01))

    if not cfg.MODEL.DEEPLAB.DECODER.OUTPUT_IS_LOGITS:
        with scope('logit'):
            with fluid.name_scope('last_conv'):
                logit = conv(
                    data,
                    num_classes,
                    1,
                    stride=1,
                    padding=0,
                    bias_attr=True,
                    param_attr=param_attr)
    else:
        logit = data

    logit = fluid.layers.resize_bilinear(logit, img.shape[2:])
    return logit