From 24856d059c4a0518f38b1cc011690087f8cd243a Mon Sep 17 00:00:00 2001
From: wuzewu
Date: Wed, 1 Jul 2020 14:33:49 +0800
Subject: [PATCH] add mobilenetv3 backbone (#306)

* Add mobilenet v3

* Update config
---
 ...eplabv3p_mobilenetv3_large_cityscapes.yaml |  57 +++
 pdseg/models/backbone/mobilenet_v3.py         | 363 ++++++++++++++++++
 pdseg/models/libs/model_libs.py               |   4 +
 pdseg/models/modeling/deeplab.py              | 347 +++++++++++------
 pdseg/utils/config.py                         |  19 +-
 pretrained_model/download_model.py            |   4 +
 6 files changed, 668 insertions(+), 126 deletions(-)
 create mode 100644 configs/deeplabv3p_mobilenetv3_large_cityscapes.yaml
 create mode 100644 pdseg/models/backbone/mobilenet_v3.py

diff --git a/configs/deeplabv3p_mobilenetv3_large_cityscapes.yaml b/configs/deeplabv3p_mobilenetv3_large_cityscapes.yaml
new file mode 100644
index 00000000..c21da5d7
--- /dev/null
+++ b/configs/deeplabv3p_mobilenetv3_large_cityscapes.yaml
@@ -0,0 +1,57 @@
+EVAL_CROP_SIZE: (2049, 1025) # (width, height), for unpadding, rangescaling and stepscaling
+TRAIN_CROP_SIZE: (769, 769) # (width, height), for unpadding, rangescaling and stepscaling
+AUG:
+    AUG_METHOD: "stepscaling" # choose from unpadding, rangescaling and stepscaling
+    MAX_SCALE_FACTOR: 2.0  # for stepscaling
+    MIN_SCALE_FACTOR: 0.5  # for stepscaling
+    SCALE_STEP_SIZE: 0.25 # for stepscaling
+    MIRROR: True
+BATCH_SIZE: 32
+DATASET:
+    DATA_DIR: "./dataset/cityscapes/"
+    IMAGE_TYPE: "rgb"  # choose rgb or rgba
+    NUM_CLASSES: 19
+    TEST_FILE_LIST: "dataset/cityscapes/val.list"
+    TRAIN_FILE_LIST: "dataset/cityscapes/train.list"
+    VAL_FILE_LIST: "dataset/cityscapes/val.list"
+    IGNORE_INDEX: 255
+    SEPARATOR: " "
+FREEZE:
+    MODEL_FILENAME: "model"
+    PARAMS_FILENAME: "params"
+MODEL:
+    DEFAULT_NORM_TYPE: "bn"
+    MODEL_NAME: "deeplabv3p"
+    DEEPLAB:
+        BACKBONE: "mobilenetv3_large"
+        ASPP_WITH_SEP_CONV: True
+        DECODER_USE_SEP_CONV: True
+        ENCODER_WITH_ASPP: True
+        ENABLE_DECODER: True
+        OUTPUT_STRIDE: 32
+        BACKBONE_LR_MULT_LIST: [0.15, 0.35, 0.65, 0.85, 1]
+        ENCODER:
+            POOLING_STRIDE: (4, 5)
+            POOLING_CROP_SIZE: (769, 769)
+            ASPP_WITH_SE: True
+            SE_USE_QSIGMOID: True
+            ASPP_CONVS_FILTERS: 128
+            ASPP_WITH_CONCAT_PROJECTION: False
+            ADD_IMAGE_LEVEL_FEATURE: False
+        DECODER:
+            USE_SUM_MERGE: True
+            CONV_FILTERS: 19
+            OUTPUT_IS_LOGITS: True
+
+TRAIN:
+    PRETRAINED_MODEL_DIR: "pretrained_model/mobilenetv3-1-0_large_bn_imagenet"
+    MODEL_SAVE_DIR: "saved_model/deeplabv3p_mobilenetv3_large_cityscapes"
+    SNAPSHOT_EPOCH: 1
+    SYNC_BATCH_NORM: True
+TEST:
+    TEST_MODEL: "saved_model/deeplabv3p_mobilenetv3_large_cityscapes/final"
+SOLVER:
+    LR: 0.2
+    LR_POLICY: "poly"
+    OPTIMIZER: "sgd"
+    NUM_EPOCHS: 850
diff --git a/pdseg/models/backbone/mobilenet_v3.py b/pdseg/models/backbone/mobilenet_v3.py
new file mode 100644
index 00000000..e0a6a8df
--- /dev/null
+++ b/pdseg/models/backbone/mobilenet_v3.py
@@ -0,0 +1,363 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr + +__all__ = [ + 'MobileNetV3', 'MobileNetV3_small_x0_35', 'MobileNetV3_small_x0_5', + 'MobileNetV3_small_x0_75', 'MobileNetV3_small_x1_0', + 'MobileNetV3_small_x1_25', 'MobileNetV3_large_x0_35', + 'MobileNetV3_large_x0_5', 'MobileNetV3_large_x0_75', + 'MobileNetV3_large_x1_0', 'MobileNetV3_large_x1_25' +] + + +class MobileNetV3(): + def __init__(self, + scale=1.0, + model_name='small', + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + output_stride=None): + self.scale = scale + self.inplanes = 16 + + self.lr_mult_list = lr_mult_list + assert len(self.lr_mult_list) == 5, \ + "lr_mult_list length in MobileNetV3 must be 5 but got {}!!".format( + len(self.lr_mult_list)) + self.curr_stage = 0 + self.decode_point = None + self.end_point = None + + if model_name == "large": + self.cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, 'relu', 1], + [3, 64, 24, False, 'relu', 2], + [3, 72, 24, False, 'relu', 1], + [5, 72, 40, True, 'relu', 2], + [5, 120, 40, True, 'relu', 1], + [5, 120, 40, True, 'relu', 1], + [3, 240, 80, False, 'hard_swish', 2], + [3, 200, 80, False, 'hard_swish', 1], + [3, 184, 80, False, 'hard_swish', 1], + [3, 184, 80, False, 'hard_swish', 1], + [3, 480, 112, True, 'hard_swish', 1], + [3, 672, 112, True, 'hard_swish', 1], + # The number of channels in the last 4 stages is reduced by a + # factor of 2 compared to the standard implementation. + [5, 336, 80, True, 'hard_swish', 2], + [5, 480, 80, True, 'hard_swish', 1], + [5, 480, 80, True, 'hard_swish', 1], + ] + self.cls_ch_squeeze = 480 + self.cls_ch_expand = 1280 + self.lr_interval = 3 + elif model_name == "small": + self.cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, 'relu', 2], + [3, 72, 24, False, 'relu', 2], + [3, 88, 24, False, 'relu', 1], + [5, 96, 40, True, 'hard_swish', 2], + [5, 240, 40, True, 'hard_swish', 1], + [5, 240, 40, True, 'hard_swish', 1], + [5, 120, 48, True, 'hard_swish', 1], + [5, 144, 48, True, 'hard_swish', 1], + # The number of channels in the last 4 stages is reduced by a + # factor of 2 compared to the standard implementation. 
+ [5, 144, 48, True, 'hard_swish', 2], + [5, 288, 48, True, 'hard_swish', 1], + [5, 288, 48, True, 'hard_swish', 1], + ] + self.cls_ch_squeeze = 288 + self.cls_ch_expand = 1280 + self.lr_interval = 2 + else: + raise NotImplementedError( + "mode[{}_model] is not implemented!".format(model_name)) + + self.modify_bottle_params(output_stride) + + def modify_bottle_params(self, output_stride=None): + if output_stride is not None and output_stride % 2 != 0: + raise Exception("output stride must to be even number") + if output_stride is None: + return + else: + stride = 2 + for i, _cfg in enumerate(self.cfg): + stride = stride * _cfg[-1] + if stride > output_stride: + s = 1 + self.cfg[i][-1] = s + + def net(self, input, class_dim=1000, end_points=None, decode_points=None): + scale = self.scale + inplanes = self.inplanes + cfg = self.cfg + cls_ch_squeeze = self.cls_ch_squeeze + cls_ch_expand = self.cls_ch_expand + + # conv1 + conv = self.conv_bn_layer( + input, + filter_size=3, + num_filters=self.make_divisible(inplanes * scale), + stride=2, + padding=1, + num_groups=1, + if_act=True, + act='hard_swish', + name='conv1') + + i = 0 + inplanes = self.make_divisible(inplanes * scale) + for layer_cfg in cfg: + conv = self.residual_unit( + input=conv, + num_in_filter=inplanes, + num_mid_filter=self.make_divisible(scale * layer_cfg[1]), + num_out_filter=self.make_divisible(scale * layer_cfg[2]), + act=layer_cfg[4], + stride=layer_cfg[5], + filter_size=layer_cfg[0], + use_se=layer_cfg[3], + name='conv' + str(i + 2)) + inplanes = self.make_divisible(scale * layer_cfg[2]) + i += 1 + self.curr_stage = i + + conv = self.conv_bn_layer( + input=conv, + filter_size=1, + num_filters=self.make_divisible(scale * cls_ch_squeeze), + stride=1, + padding=0, + num_groups=1, + if_act=True, + act='hard_swish', + name='conv_last') + + return conv, self.decode_point + + conv = fluid.layers.pool2d( + input=conv, pool_type='avg', global_pooling=True, use_cudnn=False) + conv = fluid.layers.conv2d( + input=conv, + num_filters=cls_ch_expand, + filter_size=1, + stride=1, + padding=0, + act=None, + param_attr=ParamAttr(name='last_1x1_conv_weights'), + bias_attr=False) + conv = fluid.layers.hard_swish(conv) + drop = fluid.layers.dropout(x=conv, dropout_prob=0.2) + out = fluid.layers.fc( + input=drop, + size=class_dim, + param_attr=ParamAttr(name='fc_weights'), + bias_attr=ParamAttr(name='fc_offset')) + return out + + def conv_bn_layer(self, + input, + filter_size, + num_filters, + stride, + padding, + num_groups=1, + if_act=True, + act=None, + name=None, + use_cudnn=True, + res_last_bn_init=False): + lr_idx = self.curr_stage // self.lr_interval + lr_idx = min(lr_idx, len(self.lr_mult_list) - 1) + lr_mult = self.lr_mult_list[lr_idx] + + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + act=None, + use_cudnn=use_cudnn, + param_attr=ParamAttr(name=name + '_weights', learning_rate=lr_mult), + bias_attr=False) + bn_name = name + '_bn' + bn = fluid.layers.batch_norm( + input=conv, + param_attr=ParamAttr( + name=bn_name + "_scale", + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.0)), + bias_attr=ParamAttr( + name=bn_name + "_offset", + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.0)), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + if if_act: + if act == 'relu': + bn = fluid.layers.relu(bn) + elif act == 'hard_swish': + bn = 
fluid.layers.hard_swish(bn) + return bn + + def make_divisible(self, v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + def se_block(self, input, num_out_filter, ratio=4, name=None): + lr_idx = self.curr_stage // self.lr_interval + lr_idx = min(lr_idx, len(self.lr_mult_list) - 1) + lr_mult = self.lr_mult_list[lr_idx] + + num_mid_filter = num_out_filter // ratio + pool = fluid.layers.pool2d( + input=input, pool_type='avg', global_pooling=True, use_cudnn=False) + conv1 = fluid.layers.conv2d( + input=pool, + filter_size=1, + num_filters=num_mid_filter, + act='relu', + param_attr=ParamAttr( + name=name + '_1_weights', learning_rate=lr_mult), + bias_attr=ParamAttr(name=name + '_1_offset', learning_rate=lr_mult)) + conv2 = fluid.layers.conv2d( + input=conv1, + filter_size=1, + num_filters=num_out_filter, + act='hard_sigmoid', + param_attr=ParamAttr( + name=name + '_2_weights', learning_rate=lr_mult), + bias_attr=ParamAttr(name=name + '_2_offset', learning_rate=lr_mult)) + scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0) + return scale + + def residual_unit(self, + input, + num_in_filter, + num_mid_filter, + num_out_filter, + stride, + filter_size, + act=None, + use_se=False, + name=None): + + conv0 = self.conv_bn_layer( + input=input, + filter_size=1, + num_filters=num_mid_filter, + stride=1, + padding=0, + if_act=True, + act=act, + name=name + '_expand') + + conv1 = self.conv_bn_layer( + input=conv0, + filter_size=filter_size, + num_filters=num_mid_filter, + stride=stride, + padding=int((filter_size - 1) // 2), + if_act=True, + act=act, + num_groups=num_mid_filter, + use_cudnn=False, + name=name + '_depthwise') + + if self.curr_stage == 5: + self.decode_point = conv1 + if use_se: + conv1 = self.se_block( + input=conv1, num_out_filter=num_mid_filter, name=name + '_se') + + conv2 = self.conv_bn_layer( + input=conv1, + filter_size=1, + num_filters=num_out_filter, + stride=1, + padding=0, + if_act=False, + name=name + '_linear', + res_last_bn_init=True) + if num_in_filter != num_out_filter or stride != 1: + return conv2 + else: + return fluid.layers.elementwise_add(x=input, y=conv2, act=None) + + +def MobileNetV3_small_x0_35(): + model = MobileNetV3(model_name='small', scale=0.35) + return model + + +def MobileNetV3_small_x0_5(): + model = MobileNetV3(model_name='small', scale=0.5) + return model + + +def MobileNetV3_small_x0_75(): + model = MobileNetV3(model_name='small', scale=0.75) + return model + + +def MobileNetV3_small_x1_0(**args): + model = MobileNetV3(model_name='small', scale=1.0, **args) + return model + + +def MobileNetV3_small_x1_25(): + model = MobileNetV3(model_name='small', scale=1.25) + return model + + +def MobileNetV3_large_x0_35(): + model = MobileNetV3(model_name='large', scale=0.35) + return model + + +def MobileNetV3_large_x0_5(): + model = MobileNetV3(model_name='large', scale=0.5) + return model + + +def MobileNetV3_large_x0_75(): + model = MobileNetV3(model_name='large', scale=0.75) + return model + + +def MobileNetV3_large_x1_0(**args): + model = MobileNetV3(model_name='large', scale=1.0, **args) + return model + + +def MobileNetV3_large_x1_25(): + model = MobileNetV3(model_name='large', scale=1.25) + return model diff --git a/pdseg/models/libs/model_libs.py b/pdseg/models/libs/model_libs.py index dbd04dea..ade921d7 100644 --- a/pdseg/models/libs/model_libs.py +++ b/pdseg/models/libs/model_libs.py @@ 
-109,6 +109,10 @@ def bn_relu(data): return fluid.layers.relu(bn(data)) +def qsigmoid(data): + return fluid.layers.relu6(data + 3) * 0.16667 + + def relu(data): return fluid.layers.relu(data) diff --git a/pdseg/models/modeling/deeplab.py b/pdseg/models/modeling/deeplab.py index 454a5c47..73ab9a2a 100644 --- a/pdseg/models/modeling/deeplab.py +++ b/pdseg/models/modeling/deeplab.py @@ -21,10 +21,11 @@ import paddle import paddle.fluid as fluid from utils.config import cfg from models.libs.model_libs import scope, name_scope -from models.libs.model_libs import bn, bn_relu, relu +from models.libs.model_libs import bn, bn_relu, relu, qsigmoid from models.libs.model_libs import conv from models.libs.model_libs import separate_conv -from models.backbone.mobilenet_v2 import MobileNetV2 as mobilenet_backbone +from models.backbone.mobilenet_v2 import MobileNetV2 as mobilenet_v2_backbone +from models.backbone.mobilenet_v3 import MobileNetV3 as mobilenet_v3_backbone from models.backbone.xception import Xception as xception_backbone from models.backbone.resnet_vd import ResNet as resnet_vd_backbone @@ -35,22 +36,42 @@ def encoder(input): # OUTPUT_STRIDE: 下采样倍数,8或16,决定aspp_ratios大小 # aspp_ratios:ASPP模块空洞卷积的采样率 - if cfg.MODEL.DEEPLAB.OUTPUT_STRIDE == 16: - aspp_ratios = [6, 12, 18] - elif cfg.MODEL.DEEPLAB.OUTPUT_STRIDE == 8: - aspp_ratios = [12, 24, 36] + if not cfg.MODEL.DEEPLAB.ENCODER.ASPP_RATIOS: + if cfg.MODEL.DEEPLAB.OUTPUT_STRIDE == 16: + aspp_ratios = [6, 12, 18] + elif cfg.MODEL.DEEPLAB.OUTPUT_STRIDE == 8: + aspp_ratios = [12, 24, 36] + else: + aspp_ratios = [] else: - raise Exception("deeplab only support stride 8 or 16") + aspp_ratios = cfg.MODEL.DEEPLAB.ENCODER.ASPP_RATIOS param_attr = fluid.ParamAttr( name=name_scope + 'weights', regularizer=None, initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=0.06)) + + concat_logits = [] with scope('encoder'): - channel = 256 + channel = cfg.MODEL.DEEPLAB.ENCODER.ASPP_CONVS_FILTERS with scope("image_pool"): - image_avg = fluid.layers.reduce_mean(input, [2, 3], keep_dim=True) - image_avg = bn_relu( + if not cfg.MODEL.DEEPLAB.ENCODER.POOLING_CROP_SIZE: + image_avg = fluid.layers.reduce_mean( + input, [2, 3], keep_dim=True) + else: + pool_w = int((cfg.MODEL.DEEPLAB.ENCODER.POOLING_CROP_SIZE[0] - + 1.0) / cfg.MODEL.DEEPLAB.OUTPUT_STRIDE + 1.0) + pool_h = int((cfg.MODEL.DEEPLAB.ENCODER.POOLING_CROP_SIZE[1] - + 1.0) / cfg.MODEL.DEEPLAB.OUTPUT_STRIDE + 1.0) + image_avg = fluid.layers.pool2d( + input, + pool_size=(pool_h, pool_w), + pool_stride=cfg.MODEL.DEEPLAB.ENCODER.POOLING_STRIDE, + pool_type='avg', + pool_padding='VALID') + + act = qsigmoid if cfg.MODEL.DEEPLAB.ENCODER.SE_USE_QSIGMOID else bn_relu + image_avg = act( conv( image_avg, channel, @@ -60,6 +81,8 @@ def encoder(input): padding=0, param_attr=param_attr)) image_avg = fluid.layers.resize_bilinear(image_avg, input.shape[2:]) + if cfg.MODEL.DEEPLAB.ENCODER.ADD_IMAGE_LEVEL_FEATURE: + concat_logits.append(image_avg) with scope("aspp0"): aspp0 = bn_relu( @@ -71,62 +94,154 @@ def encoder(input): groups=1, padding=0, param_attr=param_attr)) - with scope("aspp1"): - if cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV: - aspp1 = separate_conv( - input, channel, 1, 3, dilation=aspp_ratios[0], act=relu) - else: - aspp1 = bn_relu( - conv( - input, - channel, - stride=1, - filter_size=3, - dilation=aspp_ratios[0], - padding=aspp_ratios[0], - param_attr=param_attr)) - with scope("aspp2"): - if cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV: - aspp2 = separate_conv( - input, channel, 1, 3, dilation=aspp_ratios[1], 
act=relu) - else: - aspp2 = bn_relu( - conv( - input, - channel, - stride=1, - filter_size=3, - dilation=aspp_ratios[1], - padding=aspp_ratios[1], - param_attr=param_attr)) - with scope("aspp3"): - if cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV: - aspp3 = separate_conv( - input, channel, 1, 3, dilation=aspp_ratios[2], act=relu) - else: - aspp3 = bn_relu( + concat_logits.append(aspp0) + + if aspp_ratios: + with scope("aspp1"): + if cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV: + aspp1 = separate_conv( + input, channel, 1, 3, dilation=aspp_ratios[0], act=relu) + else: + aspp1 = bn_relu( + conv( + input, + channel, + stride=1, + filter_size=3, + dilation=aspp_ratios[0], + padding=aspp_ratios[0], + param_attr=param_attr)) + concat_logits.append(aspp1) + with scope("aspp2"): + if cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV: + aspp2 = separate_conv( + input, channel, 1, 3, dilation=aspp_ratios[1], act=relu) + else: + aspp2 = bn_relu( + conv( + input, + channel, + stride=1, + filter_size=3, + dilation=aspp_ratios[1], + padding=aspp_ratios[1], + param_attr=param_attr)) + concat_logits.append(aspp2) + with scope("aspp3"): + if cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV: + aspp3 = separate_conv( + input, channel, 1, 3, dilation=aspp_ratios[2], act=relu) + else: + aspp3 = bn_relu( + conv( + input, + channel, + stride=1, + filter_size=3, + dilation=aspp_ratios[2], + padding=aspp_ratios[2], + param_attr=param_attr)) + concat_logits.append(aspp3) + + with scope("concat"): + data = fluid.layers.concat(concat_logits, axis=1) + if cfg.MODEL.DEEPLAB.ENCODER.ASPP_WITH_CONCAT_PROJECTION: + data = bn_relu( conv( - input, + data, channel, - stride=1, - filter_size=3, - dilation=aspp_ratios[2], - padding=aspp_ratios[2], + 1, + 1, + groups=1, + padding=0, param_attr=param_attr)) - with scope("concat"): - data = fluid.layers.concat([image_avg, aspp0, aspp1, aspp2, aspp3], - axis=1) - data = bn_relu( + data = fluid.layers.dropout(data, 0.9) + + if cfg.MODEL.DEEPLAB.ENCODER.ASPP_WITH_SE: + data = data * image_avg + return data + + +def _decoder_with_sum_merge(encode_data, decode_shortcut, param_attr): + encode_data = fluid.layers.resize_bilinear(encode_data, + decode_shortcut.shape[2:]) + encode_data = conv( + encode_data, + cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS, + 1, + 1, + groups=1, + padding=0, + param_attr=param_attr) + + with scope('merge'): + decode_shortcut = conv( + decode_shortcut, + cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS, + 1, + 1, + groups=1, + padding=0, + param_attr=param_attr) + + return encode_data + decode_shortcut + + +def _decoder_with_concat(encode_data, decode_shortcut, param_attr): + with scope('concat'): + decode_shortcut = bn_relu( + conv( + decode_shortcut, + 48, + 1, + 1, + groups=1, + padding=0, + param_attr=param_attr)) + + encode_data = fluid.layers.resize_bilinear(encode_data, + decode_shortcut.shape[2:]) + encode_data = fluid.layers.concat([encode_data, decode_shortcut], + axis=1) + if cfg.MODEL.DEEPLAB.DECODER_USE_SEP_CONV: + with scope("separable_conv1"): + encode_data = separate_conv( + encode_data, + cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS, + 1, + 3, + dilation=1, + act=relu) + with scope("separable_conv2"): + encode_data = separate_conv( + encode_data, + cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS, + 1, + 3, + dilation=1, + act=relu) + else: + with scope("decoder_conv1"): + encode_data = bn_relu( conv( - data, - channel, - 1, - 1, - groups=1, - padding=0, + encode_data, + cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS, + stride=1, + filter_size=3, + dilation=1, + padding=1, param_attr=param_attr)) - data = 
fluid.layers.dropout(data, 0.9)
-    return data
+        with scope("decoder_conv2"):
+            encode_data = bn_relu(
+                conv(
+                    encode_data,
+                    cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS,
+                    stride=1,
+                    filter_size=3,
+                    dilation=1,
+                    padding=1,
+                    param_attr=param_attr))
+    return encode_data
 
 
 def decoder(encode_data, decode_shortcut):
@@ -139,61 +254,49 @@ def decoder(encode_data, decode_shortcut):
         regularizer=None,
         initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=0.06))
     with scope('decoder'):
-        with scope('concat'):
-            decode_shortcut = bn_relu(
-                conv(
-                    decode_shortcut,
-                    48,
-                    1,
-                    1,
-                    groups=1,
-                    padding=0,
-                    param_attr=param_attr))
+        if cfg.MODEL.DEEPLAB.DECODER.USE_SUM_MERGE:
+            return _decoder_with_sum_merge(encode_data, decode_shortcut,
+                                           param_attr)
 
-        encode_data = fluid.layers.resize_bilinear(
-            encode_data, decode_shortcut.shape[2:])
-        encode_data = fluid.layers.concat([encode_data, decode_shortcut],
-                                          axis=1)
-        if cfg.MODEL.DEEPLAB.DECODER_USE_SEP_CONV:
-            with scope("separable_conv1"):
-                encode_data = separate_conv(
-                    encode_data, 256, 1, 3, dilation=1, act=relu)
-            with scope("separable_conv2"):
-                encode_data = separate_conv(
-                    encode_data, 256, 1, 3, dilation=1, act=relu)
-        else:
-            with scope("decoder_conv1"):
-                encode_data = bn_relu(
-                    conv(
-                        encode_data,
-                        256,
-                        stride=1,
-                        filter_size=3,
-                        dilation=1,
-                        padding=1,
-                        param_attr=param_attr))
-            with scope("decoder_conv2"):
-                encode_data = bn_relu(
-                    conv(
-                        encode_data,
-                        256,
-                        stride=1,
-                        filter_size=3,
-                        dilation=1,
-                        padding=1,
-                        param_attr=param_attr))
-        return encode_data
+        return _decoder_with_concat(encode_data, decode_shortcut, param_attr)
+
+
+def mobilenet(input):
+    if 'v3' in cfg.MODEL.DEEPLAB.BACKBONE:
+        model_name = 'large' if 'large' in cfg.MODEL.DEEPLAB.BACKBONE else 'small'
+        return _mobilenetv3(input, model_name)
+    return _mobilenetv2(input)
 
 
-def mobilenetv2(input):
+def _mobilenetv3(input, model_name='large'):
+    # Backbone: MobileNetV3 configuration
+    # DEPTH_MULTIPLIER: channel scale of MobileNetV3, 1.0 by default
+    # OUTPUT_STRIDE: overall downsampling factor of the backbone
+    scale = cfg.MODEL.DEEPLAB.DEPTH_MULTIPLIER
+    output_stride = cfg.MODEL.DEEPLAB.OUTPUT_STRIDE
+    lr_mult_shortcut = cfg.MODEL.DEEPLAB.BACKBONE_LR_MULT_LIST
+    model = mobilenet_v3_backbone(
+        scale=scale,
+        output_stride=output_stride,
+        model_name=model_name,
+        lr_mult_list=lr_mult_shortcut)
+    data, decode_shortcut = model.net(input)
+    return data, decode_shortcut
+
+
+def _mobilenetv2(input):
     # Backbone: mobilenetv2结构配置
     # DEPTH_MULTIPLIER: mobilenetv2的scale设置,默认1.0
     # OUTPUT_STRIDE:下采样倍数
     # end_points: mobilenetv2的block数
     # decode_point: 从mobilenetv2中引出分支所在block数, 作为decoder输入
+    if cfg.MODEL.DEEPLAB.BACKBONE_LR_MULT_LIST is not None:
+        print(
+            'mobilenetv2 backbone does not support the BACKBONE_LR_MULT_LIST setting')
+
     scale = cfg.MODEL.DEEPLAB.DEPTH_MULTIPLIER
     output_stride = cfg.MODEL.DEEPLAB.OUTPUT_STRIDE
-    model = mobilenet_backbone(scale=scale, output_stride=output_stride)
+    model = mobilenet_v2_backbone(scale=scale, output_stride=output_stride)
     end_points = 18
     decode_point = 4
     data, decode_shortcuts = model.net(
@@ -270,11 +373,7 @@ def deeplabv3p(img, num_classes):
             'xception backbone do not support BACKBONE_LR_MULT_LIST setting'
         )
     elif 'mobilenet' in cfg.MODEL.DEEPLAB.BACKBONE:
-        data, decode_shortcut = mobilenetv2(img)
-        if cfg.MODEL.DEEPLAB.BACKBONE_LR_MULT_LIST is not None:
-            print(
-                'mobilenetv2 backbone do not support BACKBONE_LR_MULT_LIST setting'
-            )
+        data, decode_shortcut = mobilenet(img)
     elif 'resnet' in cfg.MODEL.DEEPLAB.BACKBONE:
         data, decode_shortcut = resnet_vd(img)
     else:
@@ -294,16 +393,20 @@ def 
deeplabv3p(img, num_classes):
             regularizer=fluid.regularizer.L2DecayRegularizer(
                 regularization_coeff=0.0),
             initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=0.01))
-    with scope('logit'):
-        with fluid.name_scope('last_conv'):
-            logit = conv(
-                data,
-                num_classes,
-                1,
-                stride=1,
-                padding=0,
-                bias_attr=True,
-                param_attr=param_attr)
-        logit = fluid.layers.resize_bilinear(logit, img.shape[2:])
+    if not cfg.MODEL.DEEPLAB.DECODER.OUTPUT_IS_LOGITS:
+        with scope('logit'):
+            with fluid.name_scope('last_conv'):
+                logit = conv(
+                    data,
+                    num_classes,
+                    1,
+                    stride=1,
+                    padding=0,
+                    bias_attr=True,
+                    param_attr=param_attr)
+    else:
+        logit = data
+
+    logit = fluid.layers.resize_bilinear(logit, img.shape[2:])
     return logit
 
diff --git a/pdseg/utils/config.py b/pdseg/utils/config.py
index 9989630a..8c1bccc1 100644
--- a/pdseg/utils/config.py
+++ b/pdseg/utils/config.py
@@ -198,17 +198,28 @@ cfg.MODEL.SCALE_LOSS = "DYNAMIC"
 cfg.MODEL.DEEPLAB.BACKBONE = "xception_65"
 # DeepLab output stride
 cfg.MODEL.DEEPLAB.OUTPUT_STRIDE = 16
-# MobileNet v2 backbone scale 设置
+# MobileNet v2/v3 backbone scale setting
 cfg.MODEL.DEEPLAB.DEPTH_MULTIPLIER = 1.0
-# MobileNet v2 backbone scale 设置
+# DeepLab encoder settings
 cfg.MODEL.DEEPLAB.ENCODER_WITH_ASPP = True
-# MobileNet v2 backbone scale 设置
+cfg.MODEL.DEEPLAB.ENCODER.POOLING_STRIDE = [1, 1]
+cfg.MODEL.DEEPLAB.ENCODER.POOLING_CROP_SIZE = None
+cfg.MODEL.DEEPLAB.ENCODER.ASPP_WITH_SE = False
+cfg.MODEL.DEEPLAB.ENCODER.SE_USE_QSIGMOID = False
+cfg.MODEL.DEEPLAB.ENCODER.ASPP_CONVS_FILTERS = 256
+cfg.MODEL.DEEPLAB.ENCODER.ASPP_WITH_CONCAT_PROJECTION = True
+cfg.MODEL.DEEPLAB.ENCODER.ADD_IMAGE_LEVEL_FEATURE = True
+cfg.MODEL.DEEPLAB.ENCODER.ASPP_RATIOS = None
+# DeepLab decoder settings
 cfg.MODEL.DEEPLAB.ENABLE_DECODER = True
+cfg.MODEL.DEEPLAB.DECODER.USE_SUM_MERGE = False
+cfg.MODEL.DEEPLAB.DECODER.CONV_FILTERS = 256
+cfg.MODEL.DEEPLAB.DECODER.OUTPUT_IS_LOGITS = False
 # ASPP是否使用可分离卷积
 cfg.MODEL.DEEPLAB.ASPP_WITH_SEP_CONV = True
 # 解码器是否使用可分离卷积
 cfg.MODEL.DEEPLAB.DECODER_USE_SEP_CONV = True
-# resnet_vd分阶段学习率
+# Backbone per-stage learning rate multipliers
 cfg.MODEL.DEEPLAB.BACKBONE_LR_MULT_LIST = None
 
 ########################## UNET模型配置 #######################################
diff --git a/pretrained_model/download_model.py b/pretrained_model/download_model.py
index 6ac0346c..17bbc7bd 100644
--- a/pretrained_model/download_model.py
+++ b/pretrained_model/download_model.py
@@ -34,6 +34,10 @@ model_urls = {
         "https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_x0_5_pretrained.tar",
     "mobilenetv2-0-25_bn_imagenet":
         "https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_x0_25_pretrained.tar",
+    "mobilenetv3-1-0_large_bn_imagenet":
+        "https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV3_large_x1_0_ssld_pretrained.tar",
+    "mobilenetv3-1-0_small_bn_imagenet":
+        "https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV3_small_x1_0_ssld_pretrained.tar",
     "xception41_imagenet":
         "https://paddleseg.bj.bcebos.com/models/Xception41_pretrained.tgz",
     "xception65_imagenet":
-- 
GitLab
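
For reference, with the values in the new deeplabv3p_mobilenetv3_large_cityscapes.yaml (POOLING_CROP_SIZE: (769, 769), POOLING_STRIDE: (4, 5), OUTPUT_STRIDE: 32, SE_USE_QSIGMOID: True), the patched encoder() swaps global average pooling for a fixed 25x25 average-pooling window and gates the ASPP output with the qsigmoid activation added to model_libs.py. The following standalone sketch (plain Python + NumPy, not part of the patch; the function names here are illustrative only) reproduces just those two computations so the numbers can be checked without running PaddlePaddle:

import numpy as np


def pooling_window(crop_size, output_stride):
    # Mirrors the pool_w/pool_h arithmetic in encoder():
    # int((POOLING_CROP_SIZE - 1.0) / OUTPUT_STRIDE + 1.0)
    w, h = crop_size
    pool_w = int((w - 1.0) / output_stride + 1.0)
    pool_h = int((h - 1.0) / output_stride + 1.0)
    return pool_h, pool_w


def qsigmoid(x):
    # Same formula as the qsigmoid added to model_libs.py:
    # relu6(x + 3) * 0.16667, a piecewise-linear ("hard") sigmoid.
    return np.clip(x + 3.0, 0.0, 6.0) * 0.16667


print(pooling_window((769, 769), 32))        # (25, 25)
print(qsigmoid(np.array([-4.0, 0.0, 4.0])))  # [0.      0.50001 1.00002]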
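
The backbone itself relies on two small rules worth spelling out: make_divisible() rounds every scaled channel count to a multiple of 8, and conv_bn_layer()/se_block() choose a per-parameter learning-rate multiplier by mapping the current block index to one of the five BACKBONE_LR_MULT_LIST entries (curr_stage // lr_interval, clamped to the last entry). A minimal sketch of both rules, again outside the patch and assuming the large variant's lr_interval of 3:

def make_divisible(v, divisor=8, min_value=None):
    # Round a scaled channel count to a multiple of `divisor`,
    # never dropping more than 10% of the original value.
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


def lr_mult_for_block(curr_stage, lr_mult_list, lr_interval=3):
    # Same indexing as conv_bn_layer()/se_block(): later blocks get
    # later (larger) multipliers, clamped to the final list entry.
    lr_idx = min(curr_stage // lr_interval, len(lr_mult_list) - 1)
    return lr_mult_list[lr_idx]


print(make_divisible(16 * 0.35))  # 8
print(make_divisible(112 * 1.0))  # 112
print([lr_mult_for_block(i, [0.15, 0.35, 0.65, 0.85, 1]) for i in (0, 5, 14)])
# [0.15, 0.35, 1]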