Unverified commit 40ac92b2, authored by Nyakku Shigure, committed by GitHub

[cherry-pick] refactor vision models (#42252)

* reuse ConvNormActivation in some vision models (#40431)

* reuse ConvNormActivation in some vision models

* reimplement ResNeXt based on ResNet (#40588)

* refactor resnext
Parent 5eba3847
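The common thread of this refactor: each vision model previously carried its own private Conv + BatchNorm + activation helper (ConvBNLayer, ConvBNReLU), and the diff below deletes those in favor of the shared ConvNormActivation from paddle.vision.ops. As a rough orientation before reading the hunks, here is a minimal sketch of what such a fused block amounts to; the class name and defaults here are illustrative, not the exact implementation:

.. code-block:: python

    import paddle.nn as nn

    # Illustrative only: roughly what ConvNormActivation fuses together.
    # The real helper lives in paddle.vision.ops and takes norm_layer /
    # activation_layer arguments, as the call sites in this diff show.
    class ConvNormActivationSketch(nn.Sequential):
        def __init__(self, in_channels, out_channels, kernel_size=3,
                     stride=1, padding=None, groups=1,
                     norm_layer=nn.BatchNorm2D, activation_layer=nn.ReLU):
            # assumed "same"-style default padding; the call sites below
            # pass padding=0 explicitly wherever they need to override it
            if padding is None:
                padding = (kernel_size - 1) // 2
            layers = [
                nn.Conv2D(in_channels, out_channels, kernel_size, stride,
                          padding, groups=groups, bias_attr=False)
            ]
            if norm_layer is not None:
                layers.append(norm_layer(out_channels))
            if activation_layer is not None:
                layers.append(activation_layer())
            super().__init__(*layers)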
@@ -34,6 +34,12 @@ from .models import resnet34  # noqa: F401
from .models import resnet50  # noqa: F401
from .models import resnet101  # noqa: F401
from .models import resnet152  # noqa: F401
+from .models import resnext50_32x4d  # noqa: F401
+from .models import resnext50_64x4d  # noqa: F401
+from .models import resnext101_32x4d  # noqa: F401
+from .models import resnext101_64x4d  # noqa: F401
+from .models import resnext152_32x4d  # noqa: F401
+from .models import resnext152_64x4d  # noqa: F401
from .models import wide_resnet50_2  # noqa: F401
from .models import wide_resnet101_2  # noqa: F401
from .models import MobileNetV1  # noqa: F401

@@ -61,13 +67,6 @@ from .models import densenet201  # noqa: F401
from .models import densenet264  # noqa: F401
from .models import AlexNet  # noqa: F401
from .models import alexnet  # noqa: F401
-from .models import ResNeXt  # noqa: F401
-from .models import resnext50_32x4d  # noqa: F401
-from .models import resnext50_64x4d  # noqa: F401
-from .models import resnext101_32x4d  # noqa: F401
-from .models import resnext101_64x4d  # noqa: F401
-from .models import resnext152_32x4d  # noqa: F401
-from .models import resnext152_64x4d  # noqa: F401
from .models import InceptionV3  # noqa: F401
from .models import inception_v3  # noqa: F401
from .models import GoogLeNet  # noqa: F401
...
@@ -18,6 +18,12 @@ from .resnet import resnet34  # noqa: F401
from .resnet import resnet50  # noqa: F401
from .resnet import resnet101  # noqa: F401
from .resnet import resnet152  # noqa: F401
+from .resnet import resnext50_32x4d  # noqa: F401
+from .resnet import resnext50_64x4d  # noqa: F401
+from .resnet import resnext101_32x4d  # noqa: F401
+from .resnet import resnext101_64x4d  # noqa: F401
+from .resnet import resnext152_32x4d  # noqa: F401
+from .resnet import resnext152_64x4d  # noqa: F401
from .resnet import wide_resnet50_2  # noqa: F401
from .resnet import wide_resnet101_2  # noqa: F401
from .mobilenetv1 import MobileNetV1  # noqa: F401

@@ -42,13 +48,6 @@ from .densenet import densenet201  # noqa: F401
from .densenet import densenet264  # noqa: F401
from .alexnet import AlexNet  # noqa: F401
from .alexnet import alexnet  # noqa: F401
-from .resnext import ResNeXt  # noqa: F401
-from .resnext import resnext50_32x4d  # noqa: F401
-from .resnext import resnext50_64x4d  # noqa: F401
-from .resnext import resnext101_32x4d  # noqa: F401
-from .resnext import resnext101_64x4d  # noqa: F401
-from .resnext import resnext152_32x4d  # noqa: F401
-from .resnext import resnext152_64x4d  # noqa: F401
from .inceptionv3 import InceptionV3  # noqa: F401
from .inceptionv3 import inception_v3  # noqa: F401
from .squeezenet import SqueezeNet  # noqa: F401

@@ -72,6 +71,12 @@ __all__ = [ #noqa
    'resnet50',
    'resnet101',
    'resnet152',
+    'resnext50_32x4d',
+    'resnext50_64x4d',
+    'resnext101_32x4d',
+    'resnext101_64x4d',
+    'resnext152_32x4d',
+    'resnext152_64x4d',
    'wide_resnet50_2',
    'wide_resnet101_2',
    'VGG',

@@ -96,13 +101,6 @@ __all__ = [ #noqa
    'densenet264',
    'AlexNet',
    'alexnet',
-    'ResNeXt',
-    'resnext50_32x4d',
-    'resnext50_64x4d',
-    'resnext101_32x4d',
-    'resnext101_64x4d',
-    'resnext152_32x4d',
-    'resnext152_64x4d',
    'InceptionV3',
    'inception_v3',
    'SqueezeNet',
...
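After this reshuffle the six ResNeXt builders are re-exported from the resnet module rather than a standalone resnext module, so their paddle.vision.models import path is unchanged; only the standalone ResNeXt class leaves the public __all__ list. For example, mirroring the docstring examples later in this diff:

.. code-block:: python

    import paddle
    from paddle.vision.models import resnext50_32x4d

    model = resnext50_32x4d()
    x = paddle.rand([1, 3, 224, 224])
    out = model(x)
    print(out.shape)  # [1, 1000]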
@@ -19,75 +19,60 @@ from __future__ import print_function

import math
import paddle
import paddle.nn as nn
-from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
+from paddle.nn import Linear, Dropout
from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
from paddle.nn.initializer import Uniform
from paddle.fluid.param_attr import ParamAttr
from paddle.utils.download import get_weights_path_from_url
+from ..ops import ConvNormActivation

__all__ = []

model_urls = {
    "inception_v3":
-    ("https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/InceptionV3_pretrained.pdparams",
-     "e4d0905a818f6bb7946e881777a8a935")
+    ("https://paddle-hapi.bj.bcebos.com/models/inception_v3.pdparams",
+     "649a4547c3243e8b59c656f41fe330b8")
}

-class ConvBNLayer(nn.Layer):
-    def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 stride=1,
-                 padding=0,
-                 groups=1,
-                 act="relu"):
-        super().__init__()
-        self.act = act
-        self.conv = Conv2D(
-            in_channels=num_channels,
-            out_channels=num_filters,
-            kernel_size=filter_size,
-            stride=stride,
-            padding=padding,
-            groups=groups,
-            bias_attr=False)
-        self.bn = BatchNorm(num_filters)
-        self.relu = nn.ReLU()
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = self.bn(x)
-        if self.act:
-            x = self.relu(x)
-        return x

class InceptionStem(nn.Layer):
    def __init__(self):
        super().__init__()
-        self.conv_1a_3x3 = ConvBNLayer(
-            num_channels=3, num_filters=32, filter_size=3, stride=2, act="relu")
-        self.conv_2a_3x3 = ConvBNLayer(
-            num_channels=32,
-            num_filters=32,
-            filter_size=3,
-            stride=1,
-            act="relu")
-        self.conv_2b_3x3 = ConvBNLayer(
-            num_channels=32,
-            num_filters=64,
-            filter_size=3,
-            padding=1,
-            act="relu")
+        self.conv_1a_3x3 = ConvNormActivation(
+            in_channels=3,
+            out_channels=32,
+            kernel_size=3,
+            stride=2,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.conv_2a_3x3 = ConvNormActivation(
+            in_channels=32,
+            out_channels=32,
+            kernel_size=3,
+            stride=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.conv_2b_3x3 = ConvNormActivation(
+            in_channels=32,
+            out_channels=64,
+            kernel_size=3,
+            padding=1,
+            activation_layer=nn.ReLU)
        self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
-        self.conv_3b_1x1 = ConvBNLayer(
-            num_channels=64, num_filters=80, filter_size=1, act="relu")
-        self.conv_4a_3x3 = ConvBNLayer(
-            num_channels=80, num_filters=192, filter_size=3, act="relu")
+        self.conv_3b_1x1 = ConvNormActivation(
+            in_channels=64,
+            out_channels=80,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.conv_4a_3x3 = ConvNormActivation(
+            in_channels=80,
+            out_channels=192,
+            kernel_size=3,
+            padding=0,
+            activation_layer=nn.ReLU)

    def forward(self, x):
        x = self.conv_1a_3x3(x)
@@ -103,47 +88,53 @@ class InceptionStem(nn.Layer):

class InceptionA(nn.Layer):
    def __init__(self, num_channels, pool_features):
        super().__init__()
-        self.branch1x1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=64,
-            filter_size=1,
-            act="relu")
-        self.branch5x5_1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=48,
-            filter_size=1,
-            act="relu")
-        self.branch5x5_2 = ConvBNLayer(
-            num_channels=48,
-            num_filters=64,
-            filter_size=5,
-            padding=2,
-            act="relu")
-        self.branch3x3dbl_1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=64,
-            filter_size=1,
-            act="relu")
-        self.branch3x3dbl_2 = ConvBNLayer(
-            num_channels=64,
-            num_filters=96,
-            filter_size=3,
-            padding=1,
-            act="relu")
-        self.branch3x3dbl_3 = ConvBNLayer(
-            num_channels=96,
-            num_filters=96,
-            filter_size=3,
-            padding=1,
-            act="relu")
+        self.branch1x1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=64,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch5x5_1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=48,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch5x5_2 = ConvNormActivation(
+            in_channels=48,
+            out_channels=64,
+            kernel_size=5,
+            padding=2,
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=64,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_2 = ConvNormActivation(
+            in_channels=64,
+            out_channels=96,
+            kernel_size=3,
+            padding=1,
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_3 = ConvNormActivation(
+            in_channels=96,
+            out_channels=96,
+            kernel_size=3,
+            padding=1,
+            activation_layer=nn.ReLU)
        self.branch_pool = AvgPool2D(
            kernel_size=3, stride=1, padding=1, exclusive=False)
-        self.branch_pool_conv = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=pool_features,
-            filter_size=1,
-            act="relu")
+        self.branch_pool_conv = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=pool_features,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)
@@ -164,29 +155,34 @@ class InceptionA(nn.Layer):

class InceptionB(nn.Layer):
    def __init__(self, num_channels):
        super().__init__()
-        self.branch3x3 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=384,
-            filter_size=3,
-            stride=2,
-            act="relu")
-        self.branch3x3dbl_1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=64,
-            filter_size=1,
-            act="relu")
-        self.branch3x3dbl_2 = ConvBNLayer(
-            num_channels=64,
-            num_filters=96,
-            filter_size=3,
-            padding=1,
-            act="relu")
-        self.branch3x3dbl_3 = ConvBNLayer(
-            num_channels=96,
-            num_filters=96,
-            filter_size=3,
-            stride=2,
-            act="relu")
+        self.branch3x3 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=384,
+            kernel_size=3,
+            stride=2,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=64,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_2 = ConvNormActivation(
+            in_channels=64,
+            out_channels=96,
+            kernel_size=3,
+            padding=1,
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_3 = ConvNormActivation(
+            in_channels=96,
+            out_channels=96,
+            kernel_size=3,
+            stride=2,
+            padding=0,
+            activation_layer=nn.ReLU)
        self.branch_pool = MaxPool2D(kernel_size=3, stride=2)

    def forward(self, x):
@@ -206,70 +202,74 @@ class InceptionB(nn.Layer):

class InceptionC(nn.Layer):
    def __init__(self, num_channels, channels_7x7):
        super().__init__()
-        self.branch1x1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=192,
-            filter_size=1,
-            act="relu")
-        self.branch7x7_1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=channels_7x7,
-            filter_size=1,
-            stride=1,
-            act="relu")
-        self.branch7x7_2 = ConvBNLayer(
-            num_channels=channels_7x7,
-            num_filters=channels_7x7,
-            filter_size=(1, 7),
-            stride=1,
-            padding=(0, 3),
-            act="relu")
-        self.branch7x7_3 = ConvBNLayer(
-            num_channels=channels_7x7,
-            num_filters=192,
-            filter_size=(7, 1),
-            stride=1,
-            padding=(3, 0),
-            act="relu")
-        self.branch7x7dbl_1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=channels_7x7,
-            filter_size=1,
-            act="relu")
-        self.branch7x7dbl_2 = ConvBNLayer(
-            num_channels=channels_7x7,
-            num_filters=channels_7x7,
-            filter_size=(7, 1),
-            padding=(3, 0),
-            act="relu")
-        self.branch7x7dbl_3 = ConvBNLayer(
-            num_channels=channels_7x7,
-            num_filters=channels_7x7,
-            filter_size=(1, 7),
-            padding=(0, 3),
-            act="relu")
-        self.branch7x7dbl_4 = ConvBNLayer(
-            num_channels=channels_7x7,
-            num_filters=channels_7x7,
-            filter_size=(7, 1),
-            padding=(3, 0),
-            act="relu")
-        self.branch7x7dbl_5 = ConvBNLayer(
-            num_channels=channels_7x7,
-            num_filters=192,
-            filter_size=(1, 7),
-            padding=(0, 3),
-            act="relu")
+        self.branch1x1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=192,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch7x7_1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=channels_7x7,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch7x7_2 = ConvNormActivation(
+            in_channels=channels_7x7,
+            out_channels=channels_7x7,
+            kernel_size=(1, 7),
+            stride=1,
+            padding=(0, 3),
+            activation_layer=nn.ReLU)
+        self.branch7x7_3 = ConvNormActivation(
+            in_channels=channels_7x7,
+            out_channels=192,
+            kernel_size=(7, 1),
+            stride=1,
+            padding=(3, 0),
+            activation_layer=nn.ReLU)
+        self.branch7x7dbl_1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=channels_7x7,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch7x7dbl_2 = ConvNormActivation(
+            in_channels=channels_7x7,
+            out_channels=channels_7x7,
+            kernel_size=(7, 1),
+            padding=(3, 0),
+            activation_layer=nn.ReLU)
+        self.branch7x7dbl_3 = ConvNormActivation(
+            in_channels=channels_7x7,
+            out_channels=channels_7x7,
+            kernel_size=(1, 7),
+            padding=(0, 3),
+            activation_layer=nn.ReLU)
+        self.branch7x7dbl_4 = ConvNormActivation(
+            in_channels=channels_7x7,
+            out_channels=channels_7x7,
+            kernel_size=(7, 1),
+            padding=(3, 0),
+            activation_layer=nn.ReLU)
+        self.branch7x7dbl_5 = ConvNormActivation(
+            in_channels=channels_7x7,
+            out_channels=192,
+            kernel_size=(1, 7),
+            padding=(0, 3),
+            activation_layer=nn.ReLU)
        self.branch_pool = AvgPool2D(
            kernel_size=3, stride=1, padding=1, exclusive=False)
-        self.branch_pool_conv = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=192,
-            filter_size=1,
-            act="relu")
+        self.branch_pool_conv = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=192,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)
@@ -296,40 +296,46 @@ class InceptionC(nn.Layer):

class InceptionD(nn.Layer):
    def __init__(self, num_channels):
        super().__init__()
-        self.branch3x3_1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=192,
-            filter_size=1,
-            act="relu")
-        self.branch3x3_2 = ConvBNLayer(
-            num_channels=192,
-            num_filters=320,
-            filter_size=3,
-            stride=2,
-            act="relu")
-        self.branch7x7x3_1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=192,
-            filter_size=1,
-            act="relu")
-        self.branch7x7x3_2 = ConvBNLayer(
-            num_channels=192,
-            num_filters=192,
-            filter_size=(1, 7),
-            padding=(0, 3),
-            act="relu")
-        self.branch7x7x3_3 = ConvBNLayer(
-            num_channels=192,
-            num_filters=192,
-            filter_size=(7, 1),
-            padding=(3, 0),
-            act="relu")
-        self.branch7x7x3_4 = ConvBNLayer(
-            num_channels=192,
-            num_filters=192,
-            filter_size=3,
-            stride=2,
-            act="relu")
+        self.branch3x3_1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=192,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch3x3_2 = ConvNormActivation(
+            in_channels=192,
+            out_channels=320,
+            kernel_size=3,
+            stride=2,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch7x7x3_1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=192,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch7x7x3_2 = ConvNormActivation(
+            in_channels=192,
+            out_channels=192,
+            kernel_size=(1, 7),
+            padding=(0, 3),
+            activation_layer=nn.ReLU)
+        self.branch7x7x3_3 = ConvNormActivation(
+            in_channels=192,
+            out_channels=192,
+            kernel_size=(7, 1),
+            padding=(3, 0),
+            activation_layer=nn.ReLU)
+        self.branch7x7x3_4 = ConvNormActivation(
+            in_channels=192,
+            out_channels=192,
+            kernel_size=3,
+            stride=2,
+            padding=0,
+            activation_layer=nn.ReLU)
        self.branch_pool = MaxPool2D(kernel_size=3, stride=2)

    def forward(self, x):
@@ -350,59 +356,64 @@ class InceptionD(nn.Layer):

class InceptionE(nn.Layer):
    def __init__(self, num_channels):
        super().__init__()
-        self.branch1x1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=320,
-            filter_size=1,
-            act="relu")
-        self.branch3x3_1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=384,
-            filter_size=1,
-            act="relu")
-        self.branch3x3_2a = ConvBNLayer(
-            num_channels=384,
-            num_filters=384,
-            filter_size=(1, 3),
-            padding=(0, 1),
-            act="relu")
-        self.branch3x3_2b = ConvBNLayer(
-            num_channels=384,
-            num_filters=384,
-            filter_size=(3, 1),
-            padding=(1, 0),
-            act="relu")
-        self.branch3x3dbl_1 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=448,
-            filter_size=1,
-            act="relu")
-        self.branch3x3dbl_2 = ConvBNLayer(
-            num_channels=448,
-            num_filters=384,
-            filter_size=3,
-            padding=1,
-            act="relu")
-        self.branch3x3dbl_3a = ConvBNLayer(
-            num_channels=384,
-            num_filters=384,
-            filter_size=(1, 3),
-            padding=(0, 1),
-            act="relu")
-        self.branch3x3dbl_3b = ConvBNLayer(
-            num_channels=384,
-            num_filters=384,
-            filter_size=(3, 1),
-            padding=(1, 0),
-            act="relu")
+        self.branch1x1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=320,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch3x3_1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=384,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch3x3_2a = ConvNormActivation(
+            in_channels=384,
+            out_channels=384,
+            kernel_size=(1, 3),
+            padding=(0, 1),
+            activation_layer=nn.ReLU)
+        self.branch3x3_2b = ConvNormActivation(
+            in_channels=384,
+            out_channels=384,
+            kernel_size=(3, 1),
+            padding=(1, 0),
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_1 = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=448,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_2 = ConvNormActivation(
+            in_channels=448,
+            out_channels=384,
+            kernel_size=3,
+            padding=1,
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_3a = ConvNormActivation(
+            in_channels=384,
+            out_channels=384,
+            kernel_size=(1, 3),
+            padding=(0, 1),
+            activation_layer=nn.ReLU)
+        self.branch3x3dbl_3b = ConvNormActivation(
+            in_channels=384,
+            out_channels=384,
+            kernel_size=(3, 1),
+            padding=(1, 0),
+            activation_layer=nn.ReLU)
        self.branch_pool = AvgPool2D(
            kernel_size=3, stride=1, padding=1, exclusive=False)
-        self.branch_pool_conv = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=192,
-            filter_size=1,
-            act="relu")
+        self.branch_pool_conv = ConvNormActivation(
+            in_channels=num_channels,
+            out_channels=192,
+            kernel_size=1,
+            padding=0,
+            activation_layer=nn.ReLU)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)
...
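Every conversion in this file follows the same mechanical mapping: num_channels becomes in_channels, num_filters becomes out_channels, filter_size becomes kernel_size, and act="relu" becomes activation_layer=nn.ReLU, with padding now written out explicitly (presumably because the shared helper otherwise derives a "same"-style padding from the kernel size). One instance, assuming ConvNormActivation is importable from paddle.vision.ops as the relative import above suggests:

.. code-block:: python

    import paddle.nn as nn
    from paddle.vision.ops import ConvNormActivation

    # before: ConvBNLayer(num_channels=64, num_filters=80,
    #                     filter_size=1, act="relu")
    # after, with padding=0 spelled out to keep the old behaviour:
    conv_3b_1x1 = ConvNormActivation(
        in_channels=64,
        out_channels=80,
        kernel_size=1,
        padding=0,
        activation_layer=nn.ReLU)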
@@ -16,59 +16,31 @@ import paddle

import paddle.nn as nn
from paddle.utils.download import get_weights_path_from_url
+from ..ops import ConvNormActivation

__all__ = []

model_urls = {
    'mobilenetv1_1.0':
-    ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams',
-     '42a154c2f26f86e7457d6daded114e8c')
+    ('https://paddle-hapi.bj.bcebos.com/models/mobilenetv1_1.0.pdparams',
+     '3033ab1975b1670bef51545feb65fc45')
}

-class ConvBNLayer(nn.Layer):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride,
-                 padding,
-                 num_groups=1):
-        super(ConvBNLayer, self).__init__()
-
-        self._conv = nn.Conv2D(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=stride,
-            padding=padding,
-            groups=num_groups,
-            bias_attr=False)
-
-        self._norm_layer = nn.BatchNorm2D(out_channels)
-        self._act = nn.ReLU()
-
-    def forward(self, x):
-        x = self._conv(x)
-        x = self._norm_layer(x)
-        x = self._act(x)
-        return x

class DepthwiseSeparable(nn.Layer):
    def __init__(self, in_channels, out_channels1, out_channels2, num_groups,
                 stride, scale):
        super(DepthwiseSeparable, self).__init__()
-        self._depthwise_conv = ConvBNLayer(
+        self._depthwise_conv = ConvNormActivation(
            in_channels,
            int(out_channels1 * scale),
            kernel_size=3,
            stride=stride,
            padding=1,
-            num_groups=int(num_groups * scale))
+            groups=int(num_groups * scale))

-        self._pointwise_conv = ConvBNLayer(
+        self._pointwise_conv = ConvNormActivation(
            int(out_channels1 * scale),
            int(out_channels2 * scale),
            kernel_size=1,

@@ -94,9 +66,15 @@ class MobileNetV1(nn.Layer):
    Examples:
        .. code-block:: python

+            import paddle
            from paddle.vision.models import MobileNetV1

            model = MobileNetV1()
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
    """

    def __init__(self, scale=1.0, num_classes=1000, with_pool=True):

@@ -106,7 +84,7 @@ class MobileNetV1(nn.Layer):
        self.num_classes = num_classes
        self.with_pool = with_pool

-        self.conv1 = ConvBNLayer(
+        self.conv1 = ConvNormActivation(
            in_channels=3,
            out_channels=int(32 * scale),
            kernel_size=3,

@@ -257,6 +235,7 @@ def mobilenet_v1(pretrained=False, scale=1.0, **kwargs):
    Examples:
        .. code-block:: python

+            import paddle
            from paddle.vision.models import mobilenet_v1

            # build model

@@ -266,7 +245,12 @@ def mobilenet_v1(pretrained=False, scale=1.0, **kwargs):
            # model = mobilenet_v1(pretrained=True)

            # build mobilenet v1 with scale=0.5
-            model = mobilenet_v1(scale=0.5)
+            model_scale = mobilenet_v1(scale=0.5)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
    """
    model = _mobilenet(
        'mobilenetv1_' + str(scale), pretrained, scale=scale, **kwargs)
...
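The DepthwiseSeparable block above is the whole trick of MobileNetV1: a 3x3 depthwise convolution (groups equal to the channel count, so each channel is filtered independently) followed by a 1x1 pointwise convolution that mixes channels back together. A sketch of one such pair, with the import path and channel sizes assumed for illustration:

.. code-block:: python

    import paddle
    from paddle.vision.ops import ConvNormActivation

    # depthwise: groups == in_channels, so 32 independent 3x3 filters
    depthwise = ConvNormActivation(
        32, 32, kernel_size=3, stride=1, padding=1, groups=32)
    # pointwise: a 1x1 convolution recombines the 32 channels into 64
    pointwise = ConvNormActivation(32, 64, kernel_size=1, padding=0)

    x = paddle.rand([1, 32, 56, 56])
    y = pointwise(depthwise(x))
    print(y.shape)  # [1, 64, 56, 56]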
@@ -17,6 +17,7 @@ import paddle.nn as nn
from paddle.utils.download import get_weights_path_from_url

from .utils import _make_divisible
+from ..ops import ConvNormActivation

__all__ = []

@@ -27,29 +28,6 @@ model_urls = {
}

-class ConvBNReLU(nn.Sequential):
-    def __init__(self,
-                 in_planes,
-                 out_planes,
-                 kernel_size=3,
-                 stride=1,
-                 groups=1,
-                 norm_layer=nn.BatchNorm2D):
-        padding = (kernel_size - 1) // 2
-
-        super(ConvBNReLU, self).__init__(
-            nn.Conv2D(
-                in_planes,
-                out_planes,
-                kernel_size,
-                stride,
-                padding,
-                groups=groups,
-                bias_attr=False),
-            norm_layer(out_planes),
-            nn.ReLU6())

class InvertedResidual(nn.Layer):
    def __init__(self,
                 inp,

@@ -67,15 +45,20 @@ class InvertedResidual(nn.Layer):
        layers = []
        if expand_ratio != 1:
            layers.append(
-                ConvBNReLU(
-                    inp, hidden_dim, kernel_size=1, norm_layer=norm_layer))
+                ConvNormActivation(
+                    inp,
+                    hidden_dim,
+                    kernel_size=1,
+                    norm_layer=norm_layer,
+                    activation_layer=nn.ReLU6))
        layers.extend([
-            ConvBNReLU(
+            ConvNormActivation(
                hidden_dim,
                hidden_dim,
                stride=stride,
                groups=hidden_dim,
-                norm_layer=norm_layer),
+                norm_layer=norm_layer,
+                activation_layer=nn.ReLU6),
            nn.Conv2D(
                hidden_dim, oup, 1, 1, 0, bias_attr=False),
            norm_layer(oup),

@@ -90,7 +73,6 @@ class InvertedResidual(nn.Layer):

class MobileNetV2(nn.Layer):
-    def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
    """MobileNetV2 model from
    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.

@@ -103,10 +85,18 @@ class MobileNetV2(nn.Layer):
    Examples:
        .. code-block:: python

+            import paddle
            from paddle.vision.models import MobileNetV2

            model = MobileNetV2()
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
    """

+    def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
        super(MobileNetV2, self).__init__()
        self.num_classes = num_classes
        self.with_pool = with_pool

@@ -130,8 +120,12 @@ class MobileNetV2(nn.Layer):
        self.last_channel = _make_divisible(last_channel * max(1.0, scale),
                                            round_nearest)
        features = [
-            ConvBNReLU(
-                3, input_channel, stride=2, norm_layer=norm_layer)
+            ConvNormActivation(
+                3,
+                input_channel,
+                stride=2,
+                norm_layer=norm_layer,
+                activation_layer=nn.ReLU6)
        ]

        for t, c, n, s in inverted_residual_setting:

@@ -148,11 +142,12 @@ class MobileNetV2(nn.Layer):
            input_channel = output_channel

        features.append(
-            ConvBNReLU(
+            ConvNormActivation(
                input_channel,
                self.last_channel,
                kernel_size=1,
-                norm_layer=norm_layer))
+                norm_layer=norm_layer,
+                activation_layer=nn.ReLU6))

        self.features = nn.Sequential(*features)

@@ -199,6 +194,7 @@ def mobilenet_v2(pretrained=False, scale=1.0, **kwargs):
    Examples:
        .. code-block:: python

+            import paddle
            from paddle.vision.models import mobilenet_v2

            # build model

@@ -209,6 +205,11 @@ def mobilenet_v2(pretrained=False, scale=1.0, **kwargs):
            # build mobilenet v2 with scale=0.5
            model = mobilenet_v2(scale=0.5)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
    """
    model = _mobilenet(
        'mobilenetv2_' + str(scale), pretrained, scale=scale, **kwargs)
...
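Since ConvNormActivation defaults to nn.ReLU, every former ConvBNReLU call site now passes activation_layer=nn.ReLU6 explicitly to preserve MobileNetV2's clamped activation. For one concrete setting (inp=16, oup=24, stride=2, expand_ratio=6, chosen here purely for illustration), the InvertedResidual above unrolls to expand, filter depthwise, then project linearly; only the first two stages carry ReLU6:

.. code-block:: python

    import paddle.nn as nn
    from paddle.vision.ops import ConvNormActivation

    hidden_dim = 16 * 6  # expand_ratio * inp
    block = nn.Sequential(
        # 1x1 expansion with ReLU6
        ConvNormActivation(16, hidden_dim, kernel_size=1,
                           activation_layer=nn.ReLU6),
        # 3x3 depthwise with ReLU6 (groups == hidden_dim,
        # kernel_size left at the helper's default of 3)
        ConvNormActivation(hidden_dim, hidden_dim, stride=2,
                           groups=hidden_dim, activation_layer=nn.ReLU6),
        # linear 1x1 projection: bare conv + batch norm, no activation
        nn.Conv2D(hidden_dim, 24, 1, 1, 0, bias_attr=False),
        nn.BatchNorm2D(24))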
@@ -33,11 +33,29 @@ model_urls = {
                  '02f35f034ca3858e1e54d4036443c92d'),
    'resnet152': ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams',
                  '7ad16a2f1e7333859ff986138630fd7a'),
-    'wide_resnet50_2':
-    ('https://paddle-hapi.bj.bcebos.com/models/wide_resnet50_2.pdparams',
-     '0282f804d73debdab289bd9fea3fa6dc'),
-    'wide_resnet101_2':
-    ('https://paddle-hapi.bj.bcebos.com/models/wide_resnet101_2.pdparams',
-     'd4360a2d23657f059216f5d5a1a9ac93'),
+    'resnext50_32x4d':
+    ('https://paddle-hapi.bj.bcebos.com/models/resnext50_32x4d.pdparams',
+     'dc47483169be7d6f018fcbb7baf8775d'),
+    "resnext50_64x4d":
+    ('https://paddle-hapi.bj.bcebos.com/models/resnext50_64x4d.pdparams',
+     '063d4b483e12b06388529450ad7576db'),
+    'resnext101_32x4d': (
+        'https://paddle-hapi.bj.bcebos.com/models/resnext101_32x4d.pdparams',
+        '967b090039f9de2c8d06fe994fb9095f'),
+    'resnext101_64x4d': (
+        'https://paddle-hapi.bj.bcebos.com/models/resnext101_64x4d.pdparams',
+        '98e04e7ca616a066699230d769d03008'),
+    'resnext152_32x4d': (
+        'https://paddle-hapi.bj.bcebos.com/models/resnext152_32x4d.pdparams',
+        '18ff0beee21f2efc99c4b31786107121'),
+    'resnext152_64x4d': (
+        'https://paddle-hapi.bj.bcebos.com/models/resnext152_64x4d.pdparams',
+        '77c4af00ca42c405fa7f841841959379'),
+    'wide_resnet50_2': (
+        'https://paddle-hapi.bj.bcebos.com/models/wide_resnet50_2.pdparams',
+        '0282f804d73debdab289bd9fea3fa6dc'),
+    'wide_resnet101_2': (
+        'https://paddle-hapi.bj.bcebos.com/models/wide_resnet101_2.pdparams',
+        'd4360a2d23657f059216f5d5a1a9ac93'),
}

@@ -158,11 +176,12 @@ class ResNet(nn.Layer):
    Args:
        Block (BasicBlock|BottleneckBlock): block module of model.
-        depth (int): layers of resnet, default: 50.
-        width (int): base width of resnet, default: 64.
-        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer
+        depth (int, optional): layers of resnet, Default: 50.
+        width (int, optional): base width per convolution group for each convolution block, Default: 64.
+        num_classes (int, optional): output dim of last fc layer. If num_classes <=0, last fc layer
            will not be defined. Default: 1000.
-        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        with_pool (bool, optional): use pool before the last fc layer or not. Default: True.
+        groups (int, optional): number of groups for each convolution block, Default: 1.

    Examples:
        .. code-block:: python

@@ -171,16 +190,23 @@ class ResNet(nn.Layer):
            from paddle.vision.models import ResNet
            from paddle.vision.models.resnet import BottleneckBlock, BasicBlock

+            # build ResNet with 18 layers
+            resnet18 = ResNet(BasicBlock, 18)
+
+            # build ResNet with 50 layers
            resnet50 = ResNet(BottleneckBlock, 50)

+            # build Wide ResNet model
            wide_resnet50_2 = ResNet(BottleneckBlock, 50, width=64*2)

-            resnet18 = ResNet(BasicBlock, 18)
+            # build ResNeXt model
+            resnext50_32x4d = ResNet(BottleneckBlock, 50, width=4, groups=32)

            x = paddle.rand([1, 3, 224, 224])
            out = resnet18(x)

            print(out.shape)
+            # [1, 1000]

    """

@@ -189,7 +215,8 @@
                 depth=50,
                 width=64,
                 num_classes=1000,
-                 with_pool=True):
+                 with_pool=True,
+                 groups=1):
        super(ResNet, self).__init__()
        layer_cfg = {
            18: [2, 2, 2, 2],

@@ -199,7 +226,7 @@
            152: [3, 8, 36, 3]
        }
        layers = layer_cfg[depth]
-        self.groups = 1
+        self.groups = groups
        self.base_width = width
        self.num_classes = num_classes
        self.with_pool = with_pool

@@ -300,7 +327,7 @@ def resnet18(pretrained=False, **kwargs):
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.

    Examples:
        .. code-block:: python

@@ -318,6 +345,7 @@ def resnet18(pretrained=False, **kwargs):
            out = model(x)

            print(out.shape)
+            # [1, 1000]
    """
    return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs)

@@ -327,7 +355,7 @@ def resnet34(pretrained=False, **kwargs):
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.

    Examples:
        .. code-block:: python

@@ -345,6 +373,7 @@ def resnet34(pretrained=False, **kwargs):
            out = model(x)

            print(out.shape)
+            # [1, 1000]
    """
    return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs)

@@ -354,7 +383,7 @@ def resnet50(pretrained=False, **kwargs):
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.

    Examples:
        .. code-block:: python

@@ -372,6 +401,7 @@ def resnet50(pretrained=False, **kwargs):
            out = model(x)

            print(out.shape)
+            # [1, 1000]
    """
    return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs)

@@ -381,7 +411,7 @@ def resnet101(pretrained=False, **kwargs):
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.

    Examples:
        .. code-block:: python

@@ -399,6 +429,7 @@ def resnet101(pretrained=False, **kwargs):
            out = model(x)

            print(out.shape)
+            # [1, 1000]
    """
    return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs)

@@ -408,7 +439,7 @@ def resnet152(pretrained=False, **kwargs):
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.

    Examples:
        .. code-block:: python

@@ -426,16 +457,201 @@ def resnet152(pretrained=False, **kwargs):
            out = model(x)

            print(out.shape)
+            # [1, 1000]
    """
    return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs)
def resnext50_32x4d(pretrained=False, **kwargs):
"""ResNeXt-50 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext50_32x4d
# build model
model = resnext50_32x4d()
# build model and load imagenet pretrained weight
# model = resnext50_32x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print(out.shape)
# [1, 1000]
"""
kwargs['groups'] = 32
kwargs['width'] = 4
return _resnet('resnext50_32x4d', BottleneckBlock, 50, pretrained, **kwargs)
def resnext50_64x4d(pretrained=False, **kwargs):
"""ResNeXt-50 64x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext50_64x4d
# build model
model = resnext50_64x4d()
# build model and load imagenet pretrained weight
# model = resnext50_64x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print(out.shape)
# [1, 1000]
"""
kwargs['groups'] = 64
kwargs['width'] = 4
return _resnet('resnext50_64x4d', BottleneckBlock, 50, pretrained, **kwargs)
def resnext101_32x4d(pretrained=False, **kwargs):
"""ResNeXt-101 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext101_32x4d
# build model
model = resnext101_32x4d()
# build model and load imagenet pretrained weight
# model = resnext101_32x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print(out.shape)
# [1, 1000]
"""
kwargs['groups'] = 32
kwargs['width'] = 4
return _resnet('resnext101_32x4d', BottleneckBlock, 101, pretrained,
**kwargs)
def resnext101_64x4d(pretrained=False, **kwargs):
"""ResNeXt-101 64x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext101_64x4d
# build model
model = resnext101_64x4d()
# build model and load imagenet pretrained weight
# model = resnext101_64x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print(out.shape)
# [1, 1000]
"""
kwargs['groups'] = 64
kwargs['width'] = 4
return _resnet('resnext101_64x4d', BottleneckBlock, 101, pretrained,
**kwargs)
def resnext152_32x4d(pretrained=False, **kwargs):
"""ResNeXt-152 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext152_32x4d
# build model
model = resnext152_32x4d()
# build model and load imagenet pretrained weight
# model = resnext152_32x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print(out.shape)
# [1, 1000]
"""
kwargs['groups'] = 32
kwargs['width'] = 4
return _resnet('resnext152_32x4d', BottleneckBlock, 152, pretrained,
**kwargs)
def resnext152_64x4d(pretrained=False, **kwargs):
"""ResNeXt-152 64x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext152_64x4d
# build model
model = resnext152_64x4d()
# build model and load imagenet pretrained weight
# model = resnext152_64x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print(out.shape)
# [1, 1000]
"""
kwargs['groups'] = 64
kwargs['width'] = 4
return _resnet('resnext152_64x4d', BottleneckBlock, 152, pretrained,
**kwargs)

def wide_resnet50_2(pretrained=False, **kwargs):
    """Wide ResNet-50-2 model from
    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_.

    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.

    Examples:
        .. code-block:: python

@@ -453,6 +669,7 @@ def wide_resnet50_2(pretrained=False, **kwargs):
            out = model(x)

            print(out.shape)
+            # [1, 1000]
    """
    kwargs['width'] = 64 * 2
    return _resnet('wide_resnet50_2', BottleneckBlock, 50, pretrained, **kwargs)

@@ -463,7 +680,7 @@ def wide_resnet101_2(pretrained=False, **kwargs):
    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_.

    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.

    Examples:
        .. code-block:: python

@@ -481,6 +698,7 @@ def wide_resnet101_2(pretrained=False, **kwargs):
            out = model(x)

            print(out.shape)
+            # [1, 1000]
    """
    kwargs['width'] = 64 * 2
    return _resnet('wide_resnet101_2', BottleneckBlock, 101, pretrained,
...
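The payoff of the groups/width plumbing above is that ResNeXt is no longer a separate architecture in the codebase, just a parameterization of ResNet: "resnext50_32x4d" means 32 groups with a base width of 4 channels per group inside each bottleneck. The new builders boil down to:

.. code-block:: python

    from paddle.vision.models import ResNet
    from paddle.vision.models.resnet import BottleneckBlock

    # what resnext50_32x4d() does internally: kwargs['groups'] = 32 and
    # kwargs['width'] = 4 forwarded into the ResNet constructor
    model = ResNet(BottleneckBlock, depth=50, groups=32, width=4)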
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.fluid.param_attr import ParamAttr
from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Linear, MaxPool2D
from paddle.nn.initializer import Uniform
from paddle.utils.download import get_weights_path_from_url
__all__ = []
model_urls = {
'resnext50_32x4d':
('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_32x4d_pretrained.pdparams',
'bf04add2f7fd22efcbe91511bcd1eebe'),
"resnext50_64x4d":
('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_64x4d_pretrained.pdparams',
'46307df0e2d6d41d3b1c1d22b00abc69'),
'resnext101_32x4d':
('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x4d_pretrained.pdparams',
'078ca145b3bea964ba0544303a43c36d'),
'resnext101_64x4d':
('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_64x4d_pretrained.pdparams',
'4edc0eb32d3cc5d80eff7cab32cd5c64'),
'resnext152_32x4d':
('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_32x4d_pretrained.pdparams',
'7971cc994d459af167c502366f866378'),
'resnext152_64x4d':
('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_64x4d_pretrained.pdparams',
'836943f03709efec364d486c57d132de'),
}
class ConvBNLayer(nn.Layer):
def __init__(self,
num_channels,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
bias_attr=False)
self._batch_norm = BatchNorm(num_filters, act=act)
def forward(self, inputs):
x = self._conv(inputs)
x = self._batch_norm(x)
return x
class BottleneckBlock(nn.Layer):
def __init__(self,
num_channels,
num_filters,
stride,
cardinality,
shortcut=True):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act='relu')
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
groups=cardinality,
stride=stride,
act='relu')
self.conv2 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 2 if cardinality == 32 else num_filters,
filter_size=1,
act=None)
if not shortcut:
self.short = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters * 2
if cardinality == 32 else num_filters,
filter_size=1,
stride=stride)
self.shortcut = shortcut
def forward(self, inputs):
x = self.conv0(inputs)
conv1 = self.conv1(x)
conv2 = self.conv2(conv1)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
x = paddle.add(x=short, y=conv2)
x = F.relu(x)
return x
class ResNeXt(nn.Layer):
"""ResNeXt model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
depth (int, optional): depth of resnext. Default: 50.
cardinality (int, optional): cardinality of resnext. Default: 32.
num_classes (int, optional): output dim of last fc layer. If num_classes <=0, last fc layer
will not be defined. Default: 1000.
with_pool (bool, optional): use pool before the last fc layer or not. Default: True.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import ResNeXt
resnext50_32x4d = ResNeXt(depth=50, cardinality=32)
"""
def __init__(self,
depth=50,
cardinality=32,
num_classes=1000,
with_pool=True):
super(ResNeXt, self).__init__()
self.depth = depth
self.cardinality = cardinality
self.num_classes = num_classes
self.with_pool = with_pool
supported_depth = [50, 101, 152]
assert depth in supported_depth, \
"supported layers are {} but input layer is {}".format(
supported_depth, depth)
supported_cardinality = [32, 64]
assert cardinality in supported_cardinality, \
"supported cardinality is {} but input cardinality is {}" \
.format(supported_cardinality, cardinality)
layer_cfg = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}
layers = layer_cfg[depth]
num_channels = [64, 256, 512, 1024]
num_filters = [128, 256, 512,
1024] if cardinality == 32 else [256, 512, 1024, 2048]
self.conv = ConvBNLayer(
num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1)
self.block_list = []
for block in range(len(layers)):
shortcut = False
for i in range(layers[block]):
bottleneck_block = self.add_sublayer(
'bb_%d_%d' % (block, i),
BottleneckBlock(
num_channels=num_channels[block] if i == 0 else
num_filters[block] * int(64 // self.cardinality),
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=self.cardinality,
shortcut=shortcut))
self.block_list.append(bottleneck_block)
shortcut = True
if with_pool:
self.pool2d_avg = AdaptiveAvgPool2D(1)
if num_classes > 0:
self.pool2d_avg_channels = num_channels[-1] * 2
stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0)
self.out = Linear(
self.pool2d_avg_channels,
num_classes,
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
def forward(self, inputs):
with paddle.static.amp.fp16_guard():
x = self.conv(inputs)
x = self.pool2d_max(x)
for block in self.block_list:
x = block(x)
if self.with_pool:
x = self.pool2d_avg(x)
if self.num_classes > 0:
x = paddle.reshape(x, shape=[-1, self.pool2d_avg_channels])
x = self.out(x)
return x
def _resnext(arch, depth, cardinality, pretrained, **kwargs):
model = ResNeXt(depth=depth, cardinality=cardinality, **kwargs)
if pretrained:
assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
arch)
weight_path = get_weights_path_from_url(model_urls[arch][0],
model_urls[arch][1])
param = paddle.load(weight_path)
model.set_dict(param)
return model
def resnext50_32x4d(pretrained=False, **kwargs):
"""ResNeXt-50 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext50_32x4d
# build model
model = resnext50_32x4d()
# build model and load imagenet pretrained weight
# model = resnext50_32x4d(pretrained=True)
"""
return _resnext('resnext50_32x4d', 50, 32, pretrained, **kwargs)
def resnext50_64x4d(pretrained=False, **kwargs):
"""ResNeXt-50 64x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext50_64x4d
# build model
model = resnext50_64x4d()
# build model and load imagenet pretrained weight
# model = resnext50_64x4d(pretrained=True)
"""
return _resnext('resnext50_64x4d', 50, 64, pretrained, **kwargs)
def resnext101_32x4d(pretrained=False, **kwargs):
"""ResNeXt-101 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext101_32x4d
# build model
model = resnext101_32x4d()
# build model and load imagenet pretrained weight
# model = resnext101_32x4d(pretrained=True)
"""
return _resnext('resnext101_32x4d', 101, 32, pretrained, **kwargs)
def resnext101_64x4d(pretrained=False, **kwargs):
"""ResNeXt-101 64x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext101_64x4d
# build model
model = resnext101_64x4d()
# build model and load imagenet pretrained weight
# model = resnext101_64x4d(pretrained=True)
"""
return _resnext('resnext101_64x4d', 101, 64, pretrained, **kwargs)
def resnext152_32x4d(pretrained=False, **kwargs):
"""ResNeXt-152 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext152_32x4d
# build model
model = resnext152_32x4d()
# build model and load imagenet pretrained weight
# model = resnext152_32x4d(pretrained=True)
"""
return _resnext('resnext152_32x4d', 152, 32, pretrained, **kwargs)
def resnext152_64x4d(pretrained=False, **kwargs):
"""ResNeXt-152 64x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext152_64x4d
# build model
model = resnext152_64x4d()
# build model and load imagenet pretrained weight
# model = resnext152_64x4d(pretrained=True)
"""
return _resnext('resnext152_64x4d', 152, 64, pretrained, **kwargs)
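For anyone who imported the deleted module directly, the migration is one line; the standalone class and its cardinality argument disappear in favor of the named builders (or the ResNet constructor shown earlier):

.. code-block:: python

    # before this commit:
    #   from paddle.vision.models import ResNeXt
    #   model = ResNeXt(depth=50, cardinality=32)

    # after this commit:
    from paddle.vision.models import resnext50_32x4d
    model = resnext50_32x4d()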
@@ -18,37 +18,50 @@ from __future__ import print_function

import paddle
import paddle.nn as nn
-from paddle.fluid.param_attr import ParamAttr
-from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Linear, MaxPool2D
+from paddle.nn import AdaptiveAvgPool2D, Linear, MaxPool2D
from paddle.utils.download import get_weights_path_from_url
+from ..ops import ConvNormActivation

__all__ = []

model_urls = {
    "shufflenet_v2_x0_25": (
-        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_25_pretrained.pdparams",
-        "e753404cbd95027759c5f56ecd6c9c4b", ),
+        "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x0_25.pdparams",
+        "1e509b4c140eeb096bb16e214796d03b", ),
    "shufflenet_v2_x0_33": (
-        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_33_pretrained.pdparams",
-        "776e3cf9a4923abdfce789c45b8fe1f2", ),
+        "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x0_33.pdparams",
+        "3d7b3ab0eaa5c0927ff1026d31b729bd", ),
    "shufflenet_v2_x0_5": (
-        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_5_pretrained.pdparams",
-        "e3649cf531566917e2969487d2bc6b60", ),
+        "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x0_5.pdparams",
+        "5e5cee182a7793c4e4c73949b1a71bd4", ),
    "shufflenet_v2_x1_0": (
-        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_0_pretrained.pdparams",
-        "7821c348ea34e58847c43a08a4ac0bdf", ),
+        "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x1_0.pdparams",
+        "122d42478b9e81eb49f8a9ede327b1a4", ),
    "shufflenet_v2_x1_5": (
-        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_5_pretrained.pdparams",
-        "93a07fa557ab2d8803550f39e5b6c391", ),
+        "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x1_5.pdparams",
+        "faced5827380d73531d0ee027c67826d", ),
    "shufflenet_v2_x2_0": (
-        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x2_0_pretrained.pdparams",
-        "4ab1f622fd0d341e0f84b4e057797563", ),
+        "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x2_0.pdparams",
+        "cd3dddcd8305e7bcd8ad14d1c69a5784", ),
    "shufflenet_v2_swish": (
-        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_swish_pretrained.pdparams",
-        "daff38b3df1b3748fccbb13cfdf02519", ),
+        "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_swish.pdparams",
+        "adde0aa3b023e5b0c94a68be1c394b84", ),
}

+def create_activation_layer(act):
+    if act == "swish":
+        return nn.Swish
+    elif act == "relu":
+        return nn.ReLU
+    elif act is None:
+        return None
+    else:
+        raise RuntimeError(
+            "The activation function is not supported: {}".format(act))
def channel_shuffle(x, groups): def channel_shuffle(x, groups):
batch_size, num_channels, height, width = x.shape[0:4] batch_size, num_channels, height, width = x.shape[0:4]
channels_per_group = num_channels // groups channels_per_group = num_channels // groups
...@@ -65,61 +78,37 @@ def channel_shuffle(x, groups): ...@@ -65,61 +78,37 @@ def channel_shuffle(x, groups):
return x return x
-class ConvBNLayer(nn.Layer):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride,
-                 padding,
-                 groups=1,
-                 act=None):
-        super(ConvBNLayer, self).__init__()
-        self._conv = Conv2D(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            groups=groups,
-            weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
-            bias_attr=False, )
-        self._batch_norm = BatchNorm(out_channels, act=act)
-
-    def forward(self, inputs):
-        x = self._conv(inputs)
-        x = self._batch_norm(x)
-        return x
-
-
 class InvertedResidual(nn.Layer):
-    def __init__(self, in_channels, out_channels, stride, act="relu"):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 activation_layer=nn.ReLU):
         super(InvertedResidual, self).__init__()
-        self._conv_pw = ConvBNLayer(
+        self._conv_pw = ConvNormActivation(
             in_channels=in_channels // 2,
             out_channels=out_channels // 2,
             kernel_size=1,
             stride=1,
             padding=0,
             groups=1,
-            act=act)
+            activation_layer=activation_layer)
-        self._conv_dw = ConvBNLayer(
+        self._conv_dw = ConvNormActivation(
             in_channels=out_channels // 2,
             out_channels=out_channels // 2,
             kernel_size=3,
             stride=stride,
             padding=1,
             groups=out_channels // 2,
-            act=None)
+            activation_layer=None)
-        self._conv_linear = ConvBNLayer(
+        self._conv_linear = ConvNormActivation(
             in_channels=out_channels // 2,
             out_channels=out_channels // 2,
             kernel_size=1,
             stride=1,
             padding=0,
             groups=1,
-            act=act)
+            activation_layer=activation_layer)
     def forward(self, inputs):
         x1, x2 = paddle.split(
@@ -134,51 +123,55 @@ class InvertedResidual(nn.Layer):
 class InvertedResidualDS(nn.Layer):
-    def __init__(self, in_channels, out_channels, stride, act="relu"):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 activation_layer=nn.ReLU):
         super(InvertedResidualDS, self).__init__()

         # branch1
-        self._conv_dw_1 = ConvBNLayer(
+        self._conv_dw_1 = ConvNormActivation(
             in_channels=in_channels,
             out_channels=in_channels,
             kernel_size=3,
             stride=stride,
             padding=1,
             groups=in_channels,
-            act=None)
+            activation_layer=None)
-        self._conv_linear_1 = ConvBNLayer(
+        self._conv_linear_1 = ConvNormActivation(
             in_channels=in_channels,
             out_channels=out_channels // 2,
             kernel_size=1,
             stride=1,
             padding=0,
             groups=1,
-            act=act)
+            activation_layer=activation_layer)
         # branch2
-        self._conv_pw_2 = ConvBNLayer(
+        self._conv_pw_2 = ConvNormActivation(
             in_channels=in_channels,
             out_channels=out_channels // 2,
             kernel_size=1,
             stride=1,
             padding=0,
             groups=1,
-            act=act)
+            activation_layer=activation_layer)
-        self._conv_dw_2 = ConvBNLayer(
+        self._conv_dw_2 = ConvNormActivation(
             in_channels=out_channels // 2,
             out_channels=out_channels // 2,
             kernel_size=3,
             stride=stride,
             padding=1,
             groups=out_channels // 2,
-            act=None)
+            activation_layer=None)
-        self._conv_linear_2 = ConvBNLayer(
+        self._conv_linear_2 = ConvNormActivation(
             in_channels=out_channels // 2,
             out_channels=out_channels // 2,
             kernel_size=1,
             stride=1,
             padding=0,
             groups=1,
-            act=act)
+            activation_layer=activation_layer)
     def forward(self, inputs):
         x1 = self._conv_dw_1(inputs)
@@ -221,6 +214,7 @@ class ShuffleNetV2(nn.Layer):
         self.num_classes = num_classes
         self.with_pool = with_pool
         stage_repeats = [4, 8, 4]
+        activation_layer = create_activation_layer(act)

         if scale == 0.25:
             stage_out_channels = [-1, 24, 24, 48, 96, 512]
@@ -238,13 +232,13 @@ class ShuffleNetV2(nn.Layer):
             raise NotImplementedError("This scale size:[" + str(scale) +
                                       "] is not implemented!")
         # 1. conv1
-        self._conv1 = ConvBNLayer(
+        self._conv1 = ConvNormActivation(
             in_channels=3,
             out_channels=stage_out_channels[1],
             kernel_size=3,
             stride=2,
             padding=1,
-            act=act)
+            activation_layer=activation_layer)
         self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
         # 2. bottleneck sequences
@@ -257,7 +251,7 @@ class ShuffleNetV2(nn.Layer):
                         in_channels=stage_out_channels[stage_id + 1],
                         out_channels=stage_out_channels[stage_id + 2],
                         stride=2,
-                        act=act),
+                        activation_layer=activation_layer),
                     name=str(stage_id + 2) + "_" + str(i + 1))
                 else:
                     block = self.add_sublayer(
@@ -265,17 +259,17 @@ class ShuffleNetV2(nn.Layer):
                         in_channels=stage_out_channels[stage_id + 2],
                         out_channels=stage_out_channels[stage_id + 2],
                         stride=1,
-                        act=act),
+                        activation_layer=activation_layer),
                     name=str(stage_id + 2) + "_" + str(i + 1))
                 self._block_list.append(block)
         # 3. last_conv
-        self._last_conv = ConvBNLayer(
+        self._last_conv = ConvNormActivation(
             in_channels=stage_out_channels[-2],
             out_channels=stage_out_channels[-1],
             kernel_size=1,
             stride=1,
             padding=0,
-            act=act)
+            activation_layer=activation_layer)
         # 4. pool
         if with_pool:
             self._pool2d_avg = AdaptiveAvgPool2D(1)
......
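For reference, a minimal sketch of exercising the refactored network (assuming the constructor signature `ShuffleNetV2(scale, act, num_classes, with_pool)` visible above, and that the class is importable from `paddle.vision.models`):

    import paddle
    from paddle.vision.models import ShuffleNetV2

    model = ShuffleNetV2(scale=1.0, act="swish", num_classes=10)
    x = paddle.rand([1, 3, 224, 224])
    out = model(x)
    print(out.shape)  # expected: [1, 10]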
@@ -1335,13 +1335,13 @@ class ConvNormActivation(Sequential):
     Args:
         in_channels (int): Number of channels in the input image
         out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
-        kernel_size (int, optional): Size of the convolving kernel. Default: 3
-        stride (int, optional): Stride of the convolution. Default: 1
-        padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None,
+        kernel_size (int|list|tuple, optional): Size of the convolving kernel. Default: 3
+        stride (int|list|tuple, optional): Stride of the convolution. Default: 1
+        padding (int|str|tuple|list, optional): Padding added to all four sides of the input. Default: None,
             in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
         groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
         norm_layer (Callable[..., paddle.nn.Layer], optional): Norm layer that will be stacked on top of the convolution layer.
-            If ``None`` this layer won't be used. Default: ``paddle.nn.BatchNorm2d``
+            If ``None`` this layer won't be used. Default: ``paddle.nn.BatchNorm2D``
         activation_layer (Callable[..., paddle.nn.Layer], optional): Activation function which will be stacked on top of the normalization
             layer (if not ``None``), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``paddle.nn.ReLU``
         dilation (int): Spacing between kernel elements. Default: 1
......
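The refactor replaces each model's private ConvBNLayer with this shared block. A minimal usage sketch matching the documented defaults (Conv2D + BatchNorm2D + activation), mirroring the conv1 stem built above:

    import paddle.nn as nn
    from paddle.vision.ops import ConvNormActivation

    # 3x3 stride-2 conv -> BatchNorm2D -> ReLU, e.g. a 3->24 channel stem
    stem = ConvNormActivation(
        in_channels=3,
        out_channels=24,
        kernel_size=3,
        stride=2,
        padding=1,
        activation_layer=nn.ReLU)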