Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSeg into dygraph

f7e5320e · chenguowei01 · 70c0212a · 53f1d1e4 · f7e5320e · f7e5320e
7 changed file
--- a/dygraph/configs/_base_/cityscapes.yml
+++ b/dygraph/configs/_base_/cityscapes.yml
 batch_size: 4
 iters: 100000
-learning_rate: 0.01
 train_dataset:
  type: Cityscapes

--- a/dygraph/configs/_base_/optic_disc_seg.yml
+++ b/dygraph/configs/_base_/optic_disc_seg.yml
 batch_size: 4
 iters: 10000
-learning_rate: 0.01
 train_dataset:
  type: OpticDiscSeg

--- a/dygraph/paddleseg/models/ann.py
+++ b/dygraph/paddleseg/models/ann.py
@@ -19,7 +19,7 @@ import paddle.nn.functional as F
 from paddle import nn
 from paddleseg.cvlibs import manager
-from paddleseg.models.common import layer_libs
+from paddleseg.models.common.layer_libs import ConvBNReLU, ConvBN, AuxLayer
 from paddleseg.utils import utils
@@ -32,11 +32,62 @@ class ANN(nn.Layer):
        Zhen, Zhu, et al. "Asymmetric Non-local Neural Networks for Semantic Segmentation."
        (https://arxiv.org/pdf/1908.07678.pdf)
+    Args:
+        num_classes (int): the unique number of target classes.
+        backbone (Paddle.nn.Layer): backbone network, currently support Resnet50/101.
+        model_pretrained (str): the path of pretrained model. Default to None.
+        backbone_indices (tuple): two values in the tuple indicate the indices of output of backbone.
+        key_value_channels (int): the key and value channels of self-attention map in both AFNB and APNB modules.
+            Default to 256.
+        inter_channels (int): both input and output channels of APNB modules.
+        psp_size (tuple): the out size of pooled feature maps. Default to (1, 3, 6, 8).
+        enable_auxiliary_loss (bool): a bool values indicates whether adding auxiliary loss. Default to True.
+        pretrained (str): the path of pretrained model. Default to None.
+    """
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 key_value_channels=256,
+                 inter_channels=512,
+                 psp_size=(1, 3, 6, 8),
+                 enable_auxiliary_loss=True,
+                 pretrained=None,):
+        super(ANN, self).__init__()
+        self.backbone = backbone
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+        self.head = ANNHead(
+            num_classes, 
+            backbone_indices,
+            backbone_channels,
+            key_value_channels,
+            inter_channels,
+            psp_size,
+            enable_auxiliary_loss)
+        utils.load_entire_model(self, pretrained)
+    def forward(self, input):
+        feat_list = self.backbone(input)
+        logit_list = self.head(feat_list)
+        return [
+            F.resize_bilinear(logit, input.shape[2:]) for logit in logit_list
+        ]
+class ANNHead(nn.Layer):
+    """
+    The ANNHead implementation.
    It mainly consists of AFNB and APNB modules.
    Args:
        num_classes (int): the unique number of target classes.
-        backbone (Paddle.nn.Layer): backbone network, currently support Resnet50/101.
        model_pretrained (str): the path of pretrained model. Default to None.
        backbone_indices (tuple): two values in the tuple indicate the indices of output of backbone.
            the first index will be taken as low-level features; the second one will be 
@@ -53,17 +104,13 @@ class ANN(nn.Layer):
    def __init__(self,
                 num_classes,
-                 backbone,
-                 model_pretrained=None,
                 backbone_indices=(2, 3),
                 backbone_channels=(1024, 2048),
                 key_value_channels=256,
                 inter_channels=512,
                 psp_size=(1, 3, 6, 8),
                 enable_auxiliary_loss=True):
-        super(ANN, self).__init__()
+        super(ANNHead, self).__init__()
-        self.backbone = backbone
        low_in_channels = backbone_channels[0]
        high_in_channels = backbone_channels[1]
@@ -79,7 +126,7 @@ class ANN(nn.Layer):
            psp_size=psp_size)
        self.context = nn.Sequential(
-            layer_libs.ConvBNReLU(
+            ConvBNReLU(
                in_channels=high_in_channels,
                out_channels=inter_channels,
                kernel_size=3,
@@ -95,7 +142,7 @@ class ANN(nn.Layer):
        self.cls = nn.Conv2d(
            in_channels=inter_channels, out_channels=num_classes, kernel_size=1)
-        self.auxlayer = layer_libs.AuxLayer(
+        self.auxlayer = AuxLayer(
            in_channels=low_in_channels,
            inter_channels=low_in_channels // 2,
            out_channels=num_classes,
@@ -104,41 +151,31 @@ class ANN(nn.Layer):
        self.backbone_indices = backbone_indices
        self.enable_auxiliary_loss = enable_auxiliary_loss
-        self.init_weight(model_pretrained)
+        self.init_weight()
-    def forward(self, input, label=None):
+    def forward(self, feat_list):
        logit_list = []
-        _, feat_list = self.backbone(input)
        low_level_x = feat_list[self.backbone_indices[0]]
        high_level_x = feat_list[self.backbone_indices[1]]
        x = self.fusion(low_level_x, high_level_x)
        x = self.context(x)
        logit = self.cls(x)
-        logit = F.resize_bilinear(logit, input.shape[2:])
        logit_list.append(logit)
        if self.enable_auxiliary_loss:
            auxiliary_logit = self.auxlayer(low_level_x)
-            auxiliary_logit = F.resize_bilinear(auxiliary_logit,
-                                                input.shape[2:])
            logit_list.append(auxiliary_logit)
        return logit_list
-    def init_weight(self, pretrained_model=None):
+    def init_weight(self):
        """
        Initialize the parameters of model parts.
-        Args:
-            pretrained_model ([str], optional): the pretrained_model path of backbone. Defaults to None.
        """
+        pass
-        if pretrained_model is not None:
-            if os.path.exists(pretrained_model):
-                utils.load_pretrained_model(self.backbone, pretrained_model)
 class AFNB(nn.Layer):
    """
    Asymmetric Fusion Non-local Block
@@ -171,7 +208,7 @@ class AFNB(nn.Layer):
                                    key_channels, value_channels, out_channels,
                                    size) for size in sizes
        ])
-        self.conv_bn = layer_libs.ConvBn(
+        self.conv_bn = ConvBN(
            in_channels=out_channels + high_in_channels,
            out_channels=out_channels,
            kernel_size=1)
@@ -218,7 +255,7 @@ class APNB(nn.Layer):
            SelfAttentionBlock_APNB(in_channels, out_channels, key_channels,
                                    value_channels, size) for size in sizes
        ])
-        self.conv_bn = layer_libs.ConvBNReLU(
+        self.conv_bn = ConvBNReLU(
            in_channels=in_channels * 2,
            out_channels=out_channels,
            kernel_size=1)
@@ -279,11 +316,11 @@ class SelfAttentionBlock_AFNB(nn.Layer):
        if out_channels == None:
            self.out_channels = high_in_channels
        self.pool = nn.Pool2D(pool_size=(scale, scale), pool_type="max")
-        self.f_key = layer_libs.ConvBNReLU(
+        self.f_key = ConvBNReLU(
            in_channels=low_in_channels,
            out_channels=key_channels,
            kernel_size=1)
-        self.f_query = layer_libs.ConvBNReLU(
+        self.f_query = ConvBNReLU(
            in_channels=high_in_channels,
            out_channels=key_channels,
            kernel_size=1)
@@ -357,7 +394,7 @@ class SelfAttentionBlock_APNB(nn.Layer):
        self.value_channels = value_channels
        self.pool = nn.Pool2D(pool_size=(scale, scale), pool_type="max")
-        self.f_key = layer_libs.ConvBNReLU(
+        self.f_key = ConvBNReLU(
            in_channels=self.in_channels,
            out_channels=self.key_channels,
            kernel_size=1)

--- a/dygraph/paddleseg/models/deeplab.py
+++ b/dygraph/paddleseg/models/deeplab.py
@@ -18,7 +18,8 @@ import paddle
 import paddle.nn.functional as F
 from paddle import nn
 from paddleseg.cvlibs import manager
-from paddleseg.models.common import pyramid_pool, layer_libs
+from paddleseg.models.common import pyramid_pool
+from paddleseg.models.common.layer_libs import ConvBNReLU, DepthwiseConvBNReLU, AuxLayer
 from paddleseg.utils import utils
 __all__ = ['DeepLabV3P', 'DeepLabV3']
@@ -47,8 +48,7 @@ class DeepLabV3P(nn.Layer):
            if output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
            if output_stride=8, aspp_ratios is (1, 12, 24, 36).
        aspp_out_channels (int): the output channels of ASPP module.
-        pretrained (str): the path of pretrained model for fine tuning.
+        pretrained (str): the path of pretrained model. Default to None.
    """
    def __init__(self,
@@ -94,7 +94,7 @@ class DeepLabV3PHead(nn.Layer):
            each stage, so we set default (0, 3), which means taking feature map of the first
            stage in backbone as low-level feature used in Decoder, and feature map of the fourth
            stage as input of ASPP.
-        backbone_channels (tuple): returned channels of backbone
+        backbone_channels (tuple): the same length with "backbone_indices". It indicates the channels of corresponding index.
        aspp_ratios (tuple): the dilation rate using in ASSP module.
            if output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
            if output_stride=8, aspp_ratios is (1, 12, 24, 36).
@@ -231,12 +231,12 @@ class Decoder(nn.Layer):
    def __init__(self, num_classes, in_channels):
        super(Decoder, self).__init__()
-        self.conv_bn_relu1 = layer_libs.ConvBNReLU(
+        self.conv_bn_relu1 = ConvBNReLU(
            in_channels=in_channels, out_channels=48, kernel_size=1)
-        self.conv_bn_relu2 = layer_libs.DepthwiseConvBNReLU(
+        self.conv_bn_relu2 = DepthwiseConvBNReLU(
            in_channels=304, out_channels=256, kernel_size=3, padding=1)
-        self.conv_bn_relu3 = layer_libs.DepthwiseConvBNReLU(
+        self.conv_bn_relu3 = DepthwiseConvBNReLU(
            in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv = nn.Conv2d(
            in_channels=256, out_channels=num_classes, kernel_size=1)

--- a/dygraph/paddleseg/models/fast_scnn.py
+++ b/dygraph/paddleseg/models/fast_scnn.py
@@ -14,9 +14,11 @@
 import paddle.nn.functional as F
 from paddle import nn
-from paddleseg.cvlibs import manager
-from paddleseg.models.common import layer_libs, pyramid_pool
+from paddleseg.cvlibs import manager
+from paddleseg.models.common import pyramid_pool
+from paddleseg.models.common.layer_libs import ConvBNReLU, DepthwiseConvBNReLU, AuxLayer
+from paddleseg.utils import utils
 @manager.MODELS.add_component
 class FastSCNN(nn.Layer):
@@ -33,15 +35,15 @@ class FastSCNN(nn.Layer):
    Args:
        num_classes (int): the unique number of target classes. Default to 2.
-        model_pretrained (str): the path of pretrained model. Default to None.
        enable_auxiliary_loss (bool): a bool values indicates whether adding auxiliary loss.
            if true, auxiliary loss will be added after LearningToDownsample module, where the weight is 0.4. Default to False.
+        pretrained (str): the path of pretrained model. Default to None.
    """
    def __init__(self,
                 num_classes,
-                 model_pretrained=None,
+                 enable_auxiliary_loss=True,
-                 enable_auxiliary_loss=True):
+                 pretrained=None):
        super(FastSCNN, self).__init__()
@@ -52,11 +54,12 @@ class FastSCNN(nn.Layer):
        self.classifier = Classifier(128, num_classes)
        if enable_auxiliary_loss:
-            self.auxlayer = layer_libs.AuxLayer(64, 32, num_classes)
+            self.auxlayer = AuxLayer(64, 32, num_classes)
        self.enable_auxiliary_loss = enable_auxiliary_loss
-        self.init_weight(model_pretrained)
+        self.init_weight()
+        utils.load_entire_model(self, pretrained)
    def forward(self, input, label=None):
@@ -76,18 +79,11 @@ class FastSCNN(nn.Layer):
        return logit_list
-    def init_weight(self, pretrained_model=None):
+    def init_weight(self):
        """
        Initialize the parameters of model parts.
-        Args:
-            pretrained_model ([str], optional): the path of pretrained model. Defaults to None.
        """
-        if pretrained_model is not None:
+        pass
-            if os.path.exists(pretrained_model):
-                utils.load_pretrained_model(self, pretrained_model)
-            else:
-                raise Exception('Pretrained model is not found: {}'.format(
-                    pretrained_model))
 class LearningToDownsample(nn.Layer):
@@ -105,15 +101,15 @@ class LearningToDownsample(nn.Layer):
    def __init__(self, dw_channels1=32, dw_channels2=48, out_channels=64):
        super(LearningToDownsample, self).__init__()
-        self.conv_bn_relu = layer_libs.ConvBNReLU(
+        self.conv_bn_relu = ConvBNReLU(
            in_channels=3, out_channels=dw_channels1, kernel_size=3, stride=2)
-        self.dsconv_bn_relu1 = layer_libs.DepthwiseConvBNReLU(
+        self.dsconv_bn_relu1 = DepthwiseConvBNReLU(
            in_channels=dw_channels1,
            out_channels=dw_channels2,
            kernel_size=3,
            stride=2,
            padding=1)
-        self.dsconv_bn_relu2 = layer_libs.DepthwiseConvBNReLU(
+        self.dsconv_bn_relu2 = DepthwiseConvBNReLU(
            in_channels=dw_channels2,
            out_channels=out_channels,
            kernel_size=3,
@@ -208,13 +204,13 @@ class LinearBottleneck(nn.Layer):
        expand_channels = in_channels * expansion
        self.block = nn.Sequential(
            # pw
-            layer_libs.ConvBNReLU(
+            ConvBNReLU(
                in_channels=in_channels,
                out_channels=expand_channels,
                kernel_size=1,
                bias_attr=False),
            # dw
-            layer_libs.ConvBNReLU(
+            ConvBNReLU(
                in_channels=expand_channels,
                out_channels=expand_channels,
                kernel_size=3,
@@ -253,7 +249,7 @@ class FeatureFusionModule(nn.Layer):
        super(FeatureFusionModule, self).__init__()
        # There only depth-wise conv is used WITHOUT point-wise conv
-        self.dwconv = layer_libs.ConvBNReLU(
+        self.dwconv = ConvBNReLU(
            in_channels=low_in_channels,
            out_channels=out_channels,
            kernel_size=3,
@@ -289,9 +285,9 @@ class FeatureFusionModule(nn.Layer):
 class Classifier(nn.Layer):
    """
-    The Classifier module implemetation.
+    The Classifier module implementation.
-    This module consists of two depth-wsie conv and one conv.
+    This module consists of two depth-wise conv and one conv.
    Args:
        input_channels (int): the input channels to this module.
@@ -301,13 +297,13 @@ class Classifier(nn.Layer):
    def __init__(self, input_channels, num_classes):
        super(Classifier, self).__init__()
-        self.dsconv1 = layer_libs.DepthwiseConvBNReLU(
+        self.dsconv1 = DepthwiseConvBNReLU(
            in_channels=input_channels,
            out_channels=input_channels,
            kernel_size=3,
            padding=1)
-        self.dsconv2 = layer_libs.DepthwiseConvBNReLU(
+        self.dsconv2 = DepthwiseConvBNReLU(
            in_channels=input_channels,
            out_channels=input_channels,
            kernel_size=3,

--- a/dygraph/paddleseg/models/gcnet.py
+++ b/dygraph/paddleseg/models/gcnet.py
@@ -18,10 +18,12 @@ import paddle
 import paddle.nn.functional as F
 from paddle import nn
 from paddleseg.cvlibs import manager
-from paddleseg.models.common import layer_libs
+from paddleseg.models.common.layer_libs import ConvBNReLU, AuxLayer
 from paddleseg.utils import utils
 @manager.MODELS.add_component
 class GCNet(nn.Layer):
    """
@@ -34,7 +36,54 @@ class GCNet(nn.Layer):
    Args:
        num_classes (int): the unique number of target classes.
        backbone (Paddle.nn.Layer): backbone network, currently support Resnet50/101.
-        model_pretrained (str): the path of pretrained model. Default to None.
+        backbone_indices (tuple): two values in the tuple indicate the indices of output of backbone.
+        gc_channels (int): input channels to Global Context Block. Default to 512.
+        ratio (float): it indicates the ratio of attention channels and gc_channels. Default to 1/4.
+        enable_auxiliary_loss (bool): a bool values indicates whether adding auxiliary loss. Default to True.
+        pretrained (str): the path of pretrained model. Default to None.
+    """
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 gc_channels=512,
+                 ratio=1 / 4,
+                 enable_auxiliary_loss=True,
+                 pretrained=None):
+        super(GCNet, self).__init__()
+        self.backbone = backbone
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+        self.head = GCNetHead(
+            num_classes, 
+            backbone_indices,
+            backbone_channels,
+            gc_channels,
+            ratio,
+            enable_auxiliary_loss)
+        utils.load_entire_model(self, pretrained)
+    def forward(self, input):
+        feat_list = self.backbone(input)
+        logit_list = self.head(feat_list)
+        return [
+            F.resize_bilinear(logit, input.shape[2:]) for logit in logit_list
+        ]
+class GCNetHead(nn.Layer):
+    """
+    The GCNetHead implementation.
+    Args:
+        num_classes (int): the unique number of target classes.
        backbone_indices (tuple): two values in the tuple indicate the indices of output of backbone.
            the first index will be taken as a deep-supervision feature in auxiliary layer;
            the second one will be taken as input of GlobalContextBlock. Usually backbone 
@@ -49,21 +98,16 @@ class GCNet(nn.Layer):
    def __init__(self,
                 num_classes,
-                 backbone,
-                 model_pretrained=None,
                 backbone_indices=(2, 3),
                 backbone_channels=(1024, 2048),
                 gc_channels=512,
                 ratio=1 / 4,
-                 enable_auxiliary_loss=True,
+                 enable_auxiliary_loss=True):
-                 pretrained_model=None):
-        super(GCNet, self).__init__()
-        self.backbone = backbone
+        super(GCNetHead, self).__init__()
        in_channels = backbone_channels[1]
-        self.conv_bn_relu1 = layer_libs.ConvBNReLU(
+        self.conv_bn_relu1 = ConvBNReLU(
            in_channels=in_channels,
            out_channels=gc_channels,
            kernel_size=3,
@@ -71,13 +115,13 @@ class GCNet(nn.Layer):
        self.gc_block = GlobalContextBlock(in_channels=gc_channels, ratio=ratio)
-        self.conv_bn_relu2 = layer_libs.ConvBNReLU(
+        self.conv_bn_relu2 = ConvBNReLU(
            in_channels=gc_channels,
            out_channels=gc_channels,
            kernel_size=3,
            padding=1)
-        self.conv_bn_relu3 = layer_libs.ConvBNReLU(
+        self.conv_bn_relu3 = ConvBNReLU(
            in_channels=in_channels + gc_channels,
            out_channels=gc_channels,
            kernel_size=3,
@@ -87,7 +131,7 @@ class GCNet(nn.Layer):
            in_channels=gc_channels, out_channels=num_classes, kernel_size=1)
        if enable_auxiliary_loss:
-            self.auxlayer = layer_libs.AuxLayer(
+            self.auxlayer = AuxLayer(
                in_channels=backbone_channels[0],
                inter_channels=backbone_channels[0] // 4,
                out_channels=num_classes)
@@ -95,12 +139,11 @@ class GCNet(nn.Layer):
        self.backbone_indices = backbone_indices
        self.enable_auxiliary_loss = enable_auxiliary_loss
-        self.init_weight(model_pretrained)
+        self.init_weight()
-    def forward(self, input, label=None):
+    def forward(self, feat_list):
        logit_list = []
-        _, feat_list = self.backbone(input)
        x = feat_list[self.backbone_indices[1]]
        output = self.conv_bn_relu1(x)
@@ -112,14 +155,11 @@ class GCNet(nn.Layer):
        output = F.dropout(output, p=0.1)  # dropout_prob
        logit = self.conv(output)
-        logit = F.resize_bilinear(logit, input.shape[2:])
        logit_list.append(logit)
        if self.enable_auxiliary_loss:
            low_level_feat = feat_list[self.backbone_indices[0]]
            auxiliary_logit = self.auxlayer(low_level_feat)
-            auxiliary_logit = F.resize_bilinear(auxiliary_logit,
-                                                input.shape[2:])
            logit_list.append(auxiliary_logit)
        return logit_list
@@ -127,15 +167,8 @@ class GCNet(nn.Layer):
    def init_weight(self, pretrained_model=None):
        """
        Initialize the parameters of model parts.
-        Args:
-            pretrained_model ([str], optional): the path of pretrained model. Defaults to None.
        """
-        if pretrained_model is not None:
+        pass
-            if os.path.exists(pretrained_model):
-                utils.load_pretrained_model(self, pretrained_model)
-            else:
-                raise Exception('Pretrained model is not found: {}'.format(
-                    pretrained_model))
 class GlobalContextBlock(nn.Layer):

--- a/dygraph/paddleseg/models/pspnet.py
+++ b/dygraph/paddleseg/models/pspnet.py
@@ -17,7 +17,8 @@ import os
 import paddle.nn.functional as F
 from paddle import nn
 from paddleseg.cvlibs import manager
-from paddleseg.models.common import layer_libs, pyramid_pool
+from paddleseg.models.common import pyramid_pool
+from paddleseg.models.common.layer_libs import ConvBNReLU, AuxLayer
 from paddleseg.utils import utils
@@ -36,30 +37,75 @@ class PSPNet(nn.Layer):
        backbone (Paddle.nn.Layer): backbone network, currently support Resnet50/101.
        model_pretrained (str): the path of pretrained model. Default to None.
        backbone_indices (tuple): two values in the tuple indicate the indices of output of backbone.
-                        the first index will be taken as a deep-supervision feature in auxiliary layer;
-                        the second one will be taken as input of Pyramid Pooling Module (PPModule).
-                        Usually backbone consists of four downsampling stage, and return an output of
-                        each stage, so we set default (2, 3), which means taking feature map of the third
-                        stage (res4b22) in backbone, and feature map of the fourth stage (res5c) as input of PPModule.
-        backbone_channels (tuple): the same length with "backbone_indices". It indicates the channels of corresponding index.
        pp_out_channels (int): output channels after Pyramid Pooling Module. Default to 1024.
        bin_sizes (tuple): the out size of pooled feature maps. Default to (1,2,3,6).
        enable_auxiliary_loss (bool): a bool values indicates whether adding auxiliary loss. Default to True.
+        pretrained (str): the path of pretrained model. Default to None.
    """
    def __init__(self,
                 num_classes,
                 backbone,
-                 model_pretrained=None,
                 backbone_indices=(2, 3),
-                 backbone_channels=(1024, 2048),
                 pp_out_channels=1024,
                 bin_sizes=(1, 2, 3, 6),
-                 enable_auxiliary_loss=True):
+                 enable_auxiliary_loss=True,
+                 pretrained=None):
        super(PSPNet, self).__init__()
        self.backbone = backbone
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+        self.head = PSPNetHead(
+            num_classes, 
+            backbone_indices,
+            backbone_channels,
+            pp_out_channels,
+            bin_sizes,
+            enable_auxiliary_loss)
+        utils.load_entire_model(self, pretrained)
+    def forward(self, input):
+        feat_list = self.backbone(input)
+        logit_list = self.head(feat_list)
+        return [
+            F.resize_bilinear(logit, input.shape[2:]) for logit in logit_list
+        ]
+class PSPNetHead(nn.Layer):
+    """
+    The PSPNetHead implementation.
+    Args:
+        num_classes (int): the unique number of target classes.
+        backbone_indices (tuple): two values in the tuple indicate the indices of output of backbone.
+            the first index will be taken as a deep-supervision feature in auxiliary layer;
+            the second one will be taken as input of Pyramid Pooling Module (PPModule).
+            Usually backbone consists of four downsampling stage, and return an output of
+            each stage, so we set default (2, 3), which means taking feature map of the third
+            stage (res4b22) in backbone, and feature map of the fourth stage (res5c) as input of PPModule.
+        backbone_channels (tuple): the same length with "backbone_indices". It indicates the channels of corresponding index.
+        pp_out_channels (int): output channels after Pyramid Pooling Module. Default to 1024.
+        bin_sizes (tuple): the out size of pooled feature maps. Default to (1,2,3,6).
+        enable_auxiliary_loss (bool): a bool values indicates whether adding auxiliary loss. Default to True.
+    """
+    def __init__(self,
+                 num_classes,
+                 backbone_indices=(2, 3),
+                 backbone_channels=(1024, 2048),
+                 pp_out_channels=1024,
+                 bin_sizes=(1, 2, 3, 6),
+                 enable_auxiliary_loss=True):
+        super(PSPNetHead, self).__init__()
        self.backbone_indices = backbone_indices
        self.psp_module = pyramid_pool.PPModule(
@@ -73,33 +119,29 @@ class PSPNet(nn.Layer):
            kernel_size=1)
        if enable_auxiliary_loss:
-            self.auxlayer = layer_libs.AuxLayer(
+            self.auxlayer = AuxLayer(
-                in_channels=backbone_channels[0], 
+                in_channels=backbone_channels[0],
                inter_channels=backbone_channels[0] // 4,
                out_channels=num_classes)
        self.enable_auxiliary_loss = enable_auxiliary_loss
-        self.init_weight(model_pretrained)
+        self.init_weight()
-    def forward(self, input, label=None):
+    def forward(self, feat_list):
        logit_list = []
-        _, feat_list = self.backbone(input)
        x = feat_list[self.backbone_indices[1]]
        x = self.psp_module(x)
        x = F.dropout(x, p=0.1)  # dropout_prob
        logit = self.conv(x)
-        logit = F.resize_bilinear(logit, input.shape[2:])
        logit_list.append(logit)
        if self.enable_auxiliary_loss:
            auxiliary_feat = feat_list[self.backbone_indices[0]]
            auxiliary_logit = self.auxlayer(auxiliary_feat)
-            auxiliary_logit = F.resize_bilinear(auxiliary_logit,
-                                                input.shape[2:])
            logit_list.append(auxiliary_logit)
        return logit_list
@@ -107,13 +149,6 @@ class PSPNet(nn.Layer):
    def init_weight(self, pretrained_model=None):
        """
        Initialize the parameters of model parts.
-        Args:
-            pretrained_model ([str], optional): the path of pretrained model. Defaults to None.
        """
-        if pretrained_model is not None:
+        pass
-            if os.path.exists(pretrained_model):
-                utils.load_pretrained_model(self, pretrained_model)
-            else:
-                raise Exception('Pretrained model is not found: {}'.format(
-                    pretrained_model))