Unverified commit b38e4f28, authored by LielinJiang, committed by GitHub

Refine vision models (#27476)

* refine vision models

Parent 0b4bb023
@@ -301,10 +301,11 @@ class ProgBarLogger(Callback):
     train_dataset = paddle.vision.datasets.MNIST(mode='train')

-    model = paddle.Model(paddle.vision.LeNet(classifier_activation=None),
+    lenet = paddle.vision.LeNet()
+    model = paddle.Model(lenet,
         inputs, labels)

-    optim = paddle.optimizer.Adam(0.001)
+    optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters())
     model.prepare(optimizer=optim,
                   loss=paddle.nn.CrossEntropyLoss(),
                   metrics=paddle.metric.Accuracy())

@@ -436,10 +437,11 @@ class ModelCheckpoint(Callback):
     train_dataset = paddle.vision.datasets.MNIST(mode='train')

-    model = paddle.Model(paddle.vision.LeNet(classifier_activation=None),
+    lenet = paddle.vision.LeNet()
+    model = paddle.Model(lenet,
         inputs, labels)

-    optim = paddle.optimizer.Adam(0.001)
+    optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters())
     model.prepare(optimizer=optim,
                   loss=paddle.nn.CrossEntropyLoss(),
                   metrics=paddle.metric.Accuracy())
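Note: the `inputs` and `labels` these docstring examples reference are defined outside the hunks shown here. A runnable assembly of the updated example, with the spec shapes borrowed from other hunks in this diff (an assumption, not text from the commit):

    import paddle
    from paddle.static import InputSpec

    inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')]  # assumed shapes,
    labels = [InputSpec([None, 1], 'int64', 'label')]          # copied from later hunks

    train_dataset = paddle.vision.datasets.MNIST(mode='train')
    lenet = paddle.vision.LeNet()
    model = paddle.Model(lenet, inputs, labels)
    # in the 2.0 dygraph API the optimizer is bound to concrete parameters
    optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters())
    model.prepare(optimizer=optim,
                  loss=paddle.nn.CrossEntropyLoss(),
                  metrics=paddle.metric.Accuracy())
...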
@@ -814,10 +814,9 @@ class Model(object):
     from paddle.static import InputSpec

     device = paddle.set_device('cpu') # or 'gpu'
-    # if use static graph, do not set
-    paddle.disable_static(device)

     net = nn.Sequential(
+        nn.Flatten(1),
         nn.Linear(784, 200),
         nn.Tanh(),
         nn.Linear(200, 10))

@@ -833,7 +832,7 @@ class Model(object):
         paddle.nn.CrossEntropyLoss(),
         paddle.metric.Accuracy())

-    data = paddle.vision.datasets.MNIST(mode='train', chw_format=False)
+    data = paddle.vision.datasets.MNIST(mode='train')
     model.fit(data, epochs=2, batch_size=32, verbose=1)
     """

@@ -850,7 +849,8 @@ class Model(object):
         if not isinstance(inputs, (list, dict, Input)):
             raise TypeError(
-                "'inputs' must be list or dict, and couldn't be None.")
+                "'inputs' must be list or dict in static graph mode")
         self._inputs = self._verify_spec(inputs, True)
         self._labels = self._verify_spec(labels)

@@ -885,7 +885,6 @@ class Model(object):
     from paddle.static import InputSpec

     device = paddle.set_device('cpu') # or 'gpu'
-    paddle.disable_static(device)

     net = nn.Sequential(
         nn.Linear(784, 200),

@@ -930,7 +929,6 @@ class Model(object):
     from paddle.static import InputSpec

     device = paddle.set_device('cpu') # or 'gpu'
-    paddle.disable_static(device)

     net = nn.Sequential(
         nn.Linear(784, 200),

@@ -970,9 +968,12 @@ class Model(object):
     import numpy as np
     import paddle
     import paddle.nn as nn
+    from paddle.static import InputSpec

     device = paddle.set_device('cpu') # or 'gpu'
-    paddle.disable_static(device)
+    input = InputSpec([None, 784], 'float32', 'x')
+    label = InputSpec([None, 1], 'int64', 'label')

     net = nn.Sequential(
         nn.Linear(784, 200),

@@ -980,7 +981,7 @@ class Model(object):
         nn.Linear(200, 10),
         nn.Softmax())

-    model = paddle.Model(net)
+    model = paddle.Model(net, input, label)
     model.prepare()
     data = np.random.random(size=(4,784)).astype(np.float32)
     out = model.test_batch([data])

@@ -1026,6 +1027,7 @@ class Model(object):
         def __init__(self):
             super(Mnist, self).__init__()
             self.net = nn.Sequential(
+                nn.Flatten(1),
                 nn.Linear(784, 200),
                 nn.Tanh(),
                 nn.Linear(200, 10),

@@ -1045,7 +1047,7 @@ class Model(object):
     optim = paddle.optimizer.SGD(learning_rate=1e-3,
         parameters=model.parameters())
     model.prepare(optim, paddle.nn.CrossEntropyLoss())
-    data = paddle.vision.datasets.MNIST(mode='train', chw_format=False)
+    data = paddle.vision.datasets.MNIST(mode='train')
     model.fit(data, epochs=1, batch_size=32, verbose=0)
     model.save('checkpoint/test')  # save for training
     model.save('inference_model', False)  # save for inference

@@ -1092,15 +1094,18 @@ class Model(object):
     import paddle
     import paddle.nn as nn
+    from paddle.static import InputSpec

     device = paddle.set_device('cpu')
-    paddle.disable_static(device)
+    input = InputSpec([None, 784], 'float32', 'x')

     model = paddle.Model(nn.Sequential(
         nn.Linear(784, 200),
         nn.Tanh(),
         nn.Linear(200, 10),
-        nn.Softmax()))
+        nn.Softmax()), input)

     model.save('checkpoint/test')
     model.load('checkpoint/test')
     """

@@ -1165,13 +1170,15 @@ class Model(object):
     import paddle
     import paddle.nn as nn
+    from paddle.static import InputSpec

-    paddle.disable_static()
+    input = InputSpec([None, 784], 'float32', 'x')

     model = paddle.Model(nn.Sequential(
         nn.Linear(784, 200),
         nn.Tanh(),
-        nn.Linear(200, 10)))
+        nn.Linear(200, 10)), input)

     params = model.parameters()
     """
     return self._adapter.parameters()

@@ -1313,7 +1320,7 @@ class Model(object):
     label = InputSpec([None, 1], 'int64', 'label')
     model = paddle.Model(
-        paddle.vision.models.LeNet(classifier_activation=None),
+        paddle.vision.models.LeNet(),
         input, label)
     optim = paddle.optimizer.Adam(
         learning_rate=0.001, parameters=model.parameters())

@@ -1350,7 +1357,7 @@ class Model(object):
     label = InputSpec([None, 1], 'int64', 'label')
     model = paddle.Model(
-        paddle.vision.models.LeNet(classifier_activation=None), input, label)
+        paddle.vision.models.LeNet(), input, label)
     optim = paddle.optimizer.Adam(
         learning_rate=0.001, parameters=model.parameters())
     model.prepare(

@@ -1483,7 +1490,7 @@ class Model(object):
     # imperative mode
     paddle.disable_static()
-    model = paddle.Model(paddle.vision.models.LeNet())
+    model = paddle.Model(paddle.vision.models.LeNet(), input, label)
     model.prepare(metrics=paddle.metric.Accuracy())
     result = model.evaluate(val_dataset, batch_size=64)
     print(result)

@@ -1580,19 +1587,20 @@ class Model(object):
     test_dataset = MnistDataset(mode='test', return_label=False)

-    # declarative mode
+    # imperative mode
     input = InputSpec([-1, 1, 28, 28], 'float32', 'image')
     model = paddle.Model(paddle.vision.models.LeNet(), input)
     model.prepare()
     result = model.predict(test_dataset, batch_size=64)
     print(len(result[0]), result[0][0].shape)

-    # imperative mode
+    # declarative mode
     device = paddle.set_device('cpu')
-    paddle.disable_static(device)
-    model = paddle.Model(paddle.vision.models.LeNet())
+    paddle.enable_static()
+    input = InputSpec([-1, 1, 28, 28], 'float32', 'image')
+    model = paddle.Model(paddle.vision.models.LeNet(), input)
     model.prepare()
     result = model.predict(test_dataset, batch_size=64)
     print(len(result[0]), result[0][0].shape)
     """

@@ -1832,15 +1840,11 @@ class Model(object):
     import paddle
     from paddle.static import InputSpec

-    dynamic = True
-    device = paddle.set_device('cpu')
-    paddle.disable_static(device) if dynamic else None

     input = InputSpec([None, 1, 28, 28], 'float32', 'image')
     label = InputSpec([None, 1], 'int64', 'label')

-    model = paddle.Model(paddle.vision.LeNet(classifier_activation=None),
+    model = paddle.Model(paddle.vision.LeNet(),
         input, label)
     optim = paddle.optimizer.Adam(
         learning_rate=0.001, parameters=model.parameters())
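The docstring changes above all converge on one pattern: describe inputs and labels once with InputSpec, hand them to paddle.Model, and the same code works in both imperative and declarative mode. A minimal end-to-end sketch assembled from the examples in this diff:

    import paddle
    import paddle.nn as nn
    from paddle.static import InputSpec

    input = InputSpec([None, 784], 'float32', 'x')
    label = InputSpec([None, 1], 'int64', 'label')

    net = nn.Sequential(
        nn.Flatten(1),          # MNIST images arrive as [N, 1, 28, 28]
        nn.Linear(784, 200),
        nn.Tanh(),
        nn.Linear(200, 10))

    model = paddle.Model(net, input, label)
    optim = paddle.optimizer.Adam(
        learning_rate=0.001, parameters=model.parameters())
    model.prepare(optim, paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy())

    data = paddle.vision.datasets.MNIST(mode='train')
    model.fit(data, epochs=2, batch_size=32, verbose=1)
...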
@@ -182,7 +182,6 @@ class Accuracy(Metric):
     import numpy as np
     import paddle

-    paddle.disable_static()
     x = paddle.to_tensor(np.array([
         [0.1, 0.2, 0.3, 0.4],
         [0.1, 0.4, 0.3, 0.2],

@@ -202,11 +201,13 @@ class Accuracy(Metric):
     .. code-block:: python

     import paddle
+    from paddle.static import InputSpec

-    paddle.disable_static()
+    input = InputSpec([None, 1, 28, 28], 'float32', 'image')
+    label = InputSpec([None, 1], 'int64', 'label')
     train_dataset = paddle.vision.datasets.MNIST(mode='train')

-    model = paddle.Model(paddle.vision.LeNet(classifier_activation=None))
+    model = paddle.Model(paddle.vision.LeNet(), input, label)
     optim = paddle.optimizer.Adam(
         learning_rate=0.001, parameters=model.parameters())
     model.prepare(
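For context, the standalone Accuracy example above continues past the elided lines with a compute/update/accumulate cycle. A sketch of that flow (my reading of the 2.0 metric API, not shown in this diff):

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.array([
        [0.1, 0.2, 0.3, 0.4],
        [0.1, 0.4, 0.3, 0.2]]))
    y = paddle.to_tensor(np.array([[3], [1]]))  # argmax of each row

    m = paddle.metric.Accuracy()
    correct = m.compute(x, y)   # per-sample correctness
    m.update(correct)
    print(m.accumulate())       # 1.0
...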
@@ -8,10 +8,6 @@ foreach(TEST_OP ${DIST_TEST_OPS})
     list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()

-# disable test_pretrained_model and test_vision_models
-list(REMOVE_ITEM TEST_OPS test_pretrained_model)
-list(REMOVE_ITEM TEST_OPS test_vision_models)
-
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
...
@@ -68,7 +68,7 @@ class TestDistTraning(unittest.TestCase):
         inputs = [Input(im_shape, 'float32', 'image')]
         labels = [Input([None, 1], 'int64', 'label')]

-        model = Model(LeNet(classifier_activation=None), inputs, labels)
+        model = Model(LeNet(), inputs, labels)
         optim = fluid.optimizer.Momentum(
             learning_rate=0.001, momentum=.9, parameter_list=model.parameters())
         model.prepare(optim, CrossEntropyLoss(), Accuracy())
...

@@ -67,7 +67,7 @@ class TestDistTraning(unittest.TestCase):
         inputs = [Input(im_shape, 'float32', 'image')]
         labels = [Input([None, 1], 'int64', 'label')]

-        model = Model(LeNet(classifier_activation=None), inputs, labels)
+        model = Model(LeNet(), inputs, labels)
         optim = fluid.optimizer.Momentum(
             learning_rate=0.001, momentum=.9, parameter_list=model.parameters())
         model.prepare(optim, CrossEntropyLoss(), Accuracy())
...
@@ -40,7 +40,7 @@ from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTra
 class LeNetDygraph(paddle.nn.Layer):
-    def __init__(self, num_classes=10, classifier_activation=None):
+    def __init__(self, num_classes=10):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
         self.features = Sequential(

@@ -55,8 +55,7 @@ class LeNetDygraph(paddle.nn.Layer):
         if num_classes > 0:
             self.fc = Sequential(
-                Linear(400, 120), Linear(120, 84), Linear(84, 10),
-                Softmax())  #Todo: accept any activation
+                Linear(400, 120), Linear(120, 84), Linear(84, 10))

     def forward(self, inputs):
         x = self.features(inputs)

@@ -67,6 +66,34 @@ class LeNetDygraph(paddle.nn.Layer):
         return x

+class LeNetDeclarative(fluid.dygraph.Layer):
+    def __init__(self, num_classes=10):
+        super(LeNetDeclarative, self).__init__()
+        self.num_classes = num_classes
+        self.features = Sequential(
+            Conv2d(
+                1, 6, 3, stride=1, padding=1),
+            ReLU(),
+            Pool2D(2, 'max', 2),
+            Conv2d(
+                6, 16, 5, stride=1, padding=0),
+            ReLU(),
+            Pool2D(2, 'max', 2))
+        if num_classes > 0:
+            self.fc = Sequential(
+                Linear(400, 120), Linear(120, 84), Linear(84, 10))
+
+    @declarative
+    def forward(self, inputs):
+        x = self.features(inputs)
+        if self.num_classes > 0:
+            x = fluid.layers.flatten(x, 1)
+            x = self.fc(x)
+        return x
+
 class MnistDataset(MNIST):
     def __init__(self, mode, return_label=True, sample_num=None):
         super(MnistDataset, self).__init__(mode=mode)

@@ -198,7 +225,7 @@ class TestModel(unittest.TestCase):
         paddle.manual_seed(seed)
         paddle.framework.random._manual_program_seed(seed)

-        net = LeNet(classifier_activation=None)
+        net = LeNet()
         optim_new = fluid.optimizer.Adam(
             learning_rate=0.001, parameter_list=net.parameters())
         model = Model(net, inputs=self.inputs, labels=self.labels)

@@ -287,14 +314,12 @@ class TestModel(unittest.TestCase):
 class MyModel(paddle.nn.Layer):
-    def __init__(self, classifier_activation='softmax'):
+    def __init__(self):
         super(MyModel, self).__init__()
         self._fc = Linear(20, 10)
-        self._act = Softmax()  #Todo: accept any activation

     def forward(self, x):
         y = self._fc(x)
-        y = self._act(y)
         return y

@@ -311,7 +336,7 @@ class TestModelFunction(unittest.TestCase):
         def get_expect():
             fluid.enable_dygraph(fluid.CPUPlace())
             self.set_seed()
-            m = MyModel(classifier_activation=None)
+            m = MyModel()
             optim = fluid.optimizer.SGD(learning_rate=0.001,
                                         parameter_list=m.parameters())
             m.train()

@@ -330,7 +355,7 @@ class TestModelFunction(unittest.TestCase):
             fluid.enable_dygraph(device) if dynamic else None
             self.set_seed()
-            net = MyModel(classifier_activation=None)
+            net = MyModel()
             optim2 = fluid.optimizer.SGD(learning_rate=0.001,
                                          parameter_list=net.parameters())

@@ -374,7 +399,7 @@ class TestModelFunction(unittest.TestCase):
         for dynamic in [True, False]:
             device = paddle.set_device('cpu')
             fluid.enable_dygraph(device) if dynamic else None
-            net = MyModel(classifier_activation=None)
+            net = MyModel()
             inputs = [InputSpec([None, 20], 'float32', 'x')]
             labels = [InputSpec([None, 1], 'int64', 'label')]
             optim = fluid.optimizer.SGD(learning_rate=0.001,

@@ -417,7 +442,7 @@ class TestModelFunction(unittest.TestCase):
             fluid.enable_dygraph(device)
             inputs = [InputSpec([None, 20], 'float32', 'x')]
             labels = [InputSpec([None, 1], 'int64', 'label')]
-            model = Model(MyModel(classifier_activation=None), inputs, labels)
+            model = Model(MyModel(), inputs, labels)
             optim = fluid.optimizer.SGD(learning_rate=0.001,
                                         parameter_list=model.parameters())
             model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))

@@ -426,7 +451,7 @@ class TestModelFunction(unittest.TestCase):
             inputs = [InputSpec([None, 20], 'float32', 'x')]
             labels = [InputSpec([None, 1], 'int64', 'label')]
-            model = Model(MyModel(classifier_activation=None), inputs, labels)
+            model = Model(MyModel(), inputs, labels)
             optim = fluid.optimizer.SGD(learning_rate=0.001,
                                         parameter_list=model.parameters())
             model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))

@@ -436,7 +461,7 @@ class TestModelFunction(unittest.TestCase):
     def test_static_save_dynamic_load(self):
         path = tempfile.mkdtemp()

-        net = MyModel(classifier_activation=None)
+        net = MyModel()
         inputs = [InputSpec([None, 20], 'float32', 'x')]
         labels = [InputSpec([None, 1], 'int64', 'label')]
         optim = fluid.optimizer.SGD(learning_rate=0.001,

@@ -448,7 +473,7 @@ class TestModelFunction(unittest.TestCase):
         device = paddle.set_device('cpu')
         fluid.enable_dygraph(device)  #if dynamic else None

-        net = MyModel(classifier_activation=None)
+        net = MyModel()
         inputs = [InputSpec([None, 20], 'float32', 'x')]
         labels = [InputSpec([None, 1], 'int64', 'label')]
         optim = fluid.optimizer.SGD(learning_rate=0.001,

@@ -557,7 +582,7 @@ class TestModelFunction(unittest.TestCase):
 class TestRaiseError(unittest.TestCase):
     def test_input_without_name(self):
-        net = MyModel(classifier_activation=None)
+        net = MyModel()
         inputs = [InputSpec([None, 10], 'float32')]
         labels = [InputSpec([None, 1], 'int64', 'label')]

@@ -567,7 +592,7 @@ class TestRaiseError(unittest.TestCase):
     def test_input_without_input_spec(self):
         for dynamic in [True, False]:
             paddle.disable_static() if dynamic else None
-            net = MyModel(classifier_activation=None)
+            net = MyModel()
             with self.assertRaises(TypeError):
                 model = Model(net)
             paddle.enable_static()
...
@@ -13,6 +13,8 @@
 # limitations under the License.

 import unittest
+import tempfile
+import shutil

 import numpy as np
 import paddle

@@ -23,27 +25,36 @@ import paddle.vision.models as models
 # test the predicted resutls of static graph and dynamic graph are equal
 # when used pretrained model
 class TestPretrainedModel(unittest.TestCase):
-    def infer(self, x, arch, dygraph=True):
-        if dygraph:
-            paddle.disable_static()
-
-        net = models.__dict__[arch](pretrained=True, classifier_activation=None)
-        inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')]
-        model = paddle.Model(network=net, inputs=inputs)
-        model.prepare()
-        res = model.test_batch(x)
-
-        if dygraph:
-            paddle.enable_static()
-        return res
+    def infer(self, arch):
+        path = tempfile.mkdtemp()
+        x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32)
+        res = {}
+        for dygraph in [True, False]:
+            if not dygraph:
+                paddle.enable_static()
+
+            net = models.__dict__[arch]()
+            inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')]
+            model = paddle.Model(network=net, inputs=inputs)
+            model.prepare()
+
+            if dygraph:
+                model.save(path)
+                res['dygraph'] = model.test_batch(x)
+            else:
+                model.load(path)
+                res['static'] = model.test_batch(x)
+
+            if not dygraph:
+                paddle.disable_static()
+
+        shutil.rmtree(path)
+        np.testing.assert_allclose(res['dygraph'], res['static'])

     def test_models(self):
         arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18']
         for arch in arches:
-            x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32)
-            y_dygraph = self.infer(x, arch)
-            y_static = self.infer(x, arch, dygraph=False)
-            np.testing.assert_allclose(y_dygraph, y_static)
+            self.infer(arch)

 if __name__ == '__main__':
...
@@ -36,7 +36,7 @@ class TestVisonModels(unittest.TestCase):
         model.test_batch(x)

     def test_mobilenetv2_pretrained(self):
-        self.models_infer('mobilenet_v2', pretrained=True)
+        self.models_infer('mobilenet_v2', pretrained=False)

     def test_mobilenetv1(self):
         self.models_infer('mobilenet_v1')
...
@@ -12,20 +12,19 @@
 #See the License for the specific language governing permissions and
 #limitations under the License.

-import paddle.fluid as fluid
-from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax
+import paddle
+import paddle.nn as nn

 __all__ = ['LeNet']

-class LeNet(fluid.dygraph.Layer):
+class LeNet(nn.Layer):
     """LeNet model from
     `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_

     Args:
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer
             will not be defined. Default: 10.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.

     Examples:
         .. code-block:: python

@@ -35,28 +34,27 @@ class LeNet(fluid.dygraph.Layer):
             model = LeNet()
     """

-    def __init__(self, num_classes=10, classifier_activation='softmax'):
+    def __init__(self, num_classes=10):
         super(LeNet, self).__init__()
         self.num_classes = num_classes
-        self.features = Sequential(
-            Conv2d(
+        self.features = nn.Sequential(
+            nn.Conv2d(
                 1, 6, 3, stride=1, padding=1),
-            ReLU(),
-            Pool2D(2, 'max', 2),
-            Conv2d(
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2),
+            nn.Conv2d(
                 6, 16, 5, stride=1, padding=0),
-            ReLU(),
-            Pool2D(2, 'max', 2))
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2))
         if num_classes > 0:
-            self.fc = Sequential(
-                Linear(400, 120), Linear(120, 84), Linear(84, 10),
-                Softmax())  #Todo: accept any activation
+            self.fc = nn.Sequential(
+                nn.Linear(400, 120), nn.Linear(120, 84), nn.Linear(84, 10))

     def forward(self, inputs):
         x = self.features(inputs)
         if self.num_classes > 0:
-            x = fluid.layers.flatten(x, 1)
+            x = paddle.flatten(x, 1)
             x = self.fc(x)
         return x
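A quick smoke test of the rewritten LeNet (my own sketch, using only idioms that appear elsewhere in this diff):

    import numpy as np
    import paddle
    from paddle.vision.models import LeNet

    model = LeNet()
    x = paddle.to_tensor(np.random.random((1, 1, 28, 28)).astype('float32'))
    out = model(x)   # shape [1, 10]: raw logits now, since the Softmax head was removed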
@@ -12,10 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import paddle.fluid as fluid
-from paddle.fluid.initializer import MSRA
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+import paddle
+import paddle.nn as nn

 from paddle.utils.download import get_weights_path_from_url

@@ -24,85 +22,66 @@ __all__ = ['MobileNetV1', 'mobilenet_v1']
 model_urls = {
     'mobilenetv1_1.0':
     ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams',
-     'bf0d25cb0bed1114d9dac9384ce2b4a6')
+     '42a154c2f26f86e7457d6daded114e8c')
 }

-class ConvBNLayer(fluid.dygraph.Layer):
+class ConvBNLayer(nn.Layer):
     def __init__(self,
-                 num_channels,
-                 filter_size,
-                 num_filters,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  stride,
                  padding,
-                 channels=None,
-                 num_groups=1,
-                 act='relu',
-                 use_cudnn=True,
-                 name=None):
+                 num_groups=1):
         super(ConvBNLayer, self).__init__()

-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
+        self._conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
             stride=stride,
             padding=padding,
             groups=num_groups,
-            act=None,
-            use_cudnn=use_cudnn,
-            param_attr=ParamAttr(
-                initializer=MSRA(), name=self.full_name() + "_weights"),
             bias_attr=False)

-        self._batch_norm = BatchNorm(
-            num_filters,
-            act=act,
-            param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
-            bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
-            moving_mean_name=self.full_name() + "_bn" + '_mean',
-            moving_variance_name=self.full_name() + "_bn" + '_variance')
+        self._norm_layer = nn.BatchNorm2d(out_channels)
+        self._act = nn.ReLU()

-    def forward(self, inputs):
-        y = self._conv(inputs)
-        y = self._batch_norm(y)
-        return y
+    def forward(self, x):
+        x = self._conv(x)
+        x = self._norm_layer(x)
+        x = self._act(x)
+        return x

-class DepthwiseSeparable(fluid.dygraph.Layer):
-    def __init__(self,
-                 num_channels,
-                 num_filters1,
-                 num_filters2,
-                 num_groups,
-                 stride,
-                 scale,
-                 name=None):
+class DepthwiseSeparable(nn.Layer):
+    def __init__(self, in_channels, out_channels1, out_channels2, num_groups,
+                 stride, scale):
         super(DepthwiseSeparable, self).__init__()

         self._depthwise_conv = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=int(num_filters1 * scale),
-            filter_size=3,
+            in_channels,
+            int(out_channels1 * scale),
+            kernel_size=3,
             stride=stride,
             padding=1,
-            num_groups=int(num_groups * scale),
-            use_cudnn=False)
+            num_groups=int(num_groups * scale))

         self._pointwise_conv = ConvBNLayer(
-            num_channels=int(num_filters1 * scale),
-            filter_size=1,
-            num_filters=int(num_filters2 * scale),
+            int(out_channels1 * scale),
+            int(out_channels2 * scale),
+            kernel_size=1,
             stride=1,
             padding=0)

-    def forward(self, inputs):
-        y = self._depthwise_conv(inputs)
-        y = self._pointwise_conv(y)
-        return y
+    def forward(self, x):
+        x = self._depthwise_conv(x)
+        x = self._pointwise_conv(x)
+        return x

-class MobileNetV1(fluid.dygraph.Layer):
+class MobileNetV1(nn.Layer):
     """MobileNetV1 model from
     `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" <https://arxiv.org/abs/1704.04861>`_.

@@ -111,7 +90,6 @@ class MobileNetV1(fluid.dygraph.Layer):
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer
             will not be defined. Default: 1000.
         with_pool (bool): use pool before the last fc layer or not. Default: True.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.

     Examples:
         .. code-block:: python

@@ -121,11 +99,7 @@ class MobileNetV1(fluid.dygraph.Layer):
             model = MobileNetV1()
     """

-    def __init__(self,
-                 scale=1.0,
-                 num_classes=1000,
-                 with_pool=True,
-                 classifier_activation='softmax'):
+    def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
         super(MobileNetV1, self).__init__()
         self.scale = scale
         self.dwsl = []

@@ -133,18 +107,17 @@ class MobileNetV1(fluid.dygraph.Layer):
         self.with_pool = with_pool

         self.conv1 = ConvBNLayer(
-            num_channels=3,
-            filter_size=3,
-            channels=3,
-            num_filters=int(32 * scale),
+            in_channels=3,
+            out_channels=int(32 * scale),
+            kernel_size=3,
             stride=2,
             padding=1)

         dws21 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(32 * scale),
-                num_filters1=32,
-                num_filters2=64,
+                in_channels=int(32 * scale),
+                out_channels1=32,
+                out_channels2=64,
                 num_groups=32,
                 stride=1,
                 scale=scale),

@@ -153,9 +126,9 @@ class MobileNetV1(fluid.dygraph.Layer):
         dws22 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(64 * scale),
-                num_filters1=64,
-                num_filters2=128,
+                in_channels=int(64 * scale),
+                out_channels1=64,
+                out_channels2=128,
                 num_groups=64,
                 stride=2,
                 scale=scale),

@@ -164,9 +137,9 @@ class MobileNetV1(fluid.dygraph.Layer):
         dws31 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(128 * scale),
-                num_filters1=128,
-                num_filters2=128,
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=128,
                 num_groups=128,
                 stride=1,
                 scale=scale),

@@ -175,9 +148,9 @@ class MobileNetV1(fluid.dygraph.Layer):
         dws32 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(128 * scale),
-                num_filters1=128,
-                num_filters2=256,
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=256,
                 num_groups=128,
                 stride=2,
                 scale=scale),

@@ -186,9 +159,9 @@ class MobileNetV1(fluid.dygraph.Layer):
         dws41 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(256 * scale),
-                num_filters1=256,
-                num_filters2=256,
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=256,
                 num_groups=256,
                 stride=1,
                 scale=scale),

@@ -197,9 +170,9 @@ class MobileNetV1(fluid.dygraph.Layer):
         dws42 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(256 * scale),
-                num_filters1=256,
-                num_filters2=512,
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=512,
                 num_groups=256,
                 stride=2,
                 scale=scale),

@@ -209,9 +182,9 @@ class MobileNetV1(fluid.dygraph.Layer):
         for i in range(5):
             tmp = self.add_sublayer(
                 sublayer=DepthwiseSeparable(
-                    num_channels=int(512 * scale),
-                    num_filters1=512,
-                    num_filters2=512,
+                    in_channels=int(512 * scale),
+                    out_channels1=512,
+                    out_channels2=512,
                     num_groups=512,
                     stride=1,
                     scale=scale),

@@ -220,9 +193,9 @@ class MobileNetV1(fluid.dygraph.Layer):
         dws56 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(512 * scale),
-                num_filters1=512,
-                num_filters2=1024,
+                in_channels=int(512 * scale),
+                out_channels1=512,
+                out_channels2=1024,
                 num_groups=512,
                 stride=2,
                 scale=scale),

@@ -231,9 +204,9 @@ class MobileNetV1(fluid.dygraph.Layer):
         dws6 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(1024 * scale),
-                num_filters1=1024,
-                num_filters2=1024,
+                in_channels=int(1024 * scale),
+                out_channels1=1024,
+                out_channels2=1024,
                 num_groups=1024,
                 stride=1,
                 scale=scale),

@@ -241,29 +214,23 @@ class MobileNetV1(fluid.dygraph.Layer):
         self.dwsl.append(dws6)

         if with_pool:
-            self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+            self.pool2d_avg = nn.AdaptiveAvgPool2d(1)

-        if num_classes > -1:
-            self.out = Linear(
-                int(1024 * scale),
-                num_classes,
-                act=classifier_activation,
-                param_attr=ParamAttr(
-                    initializer=MSRA(), name=self.full_name() + "fc7_weights"),
-                bias_attr=ParamAttr(name="fc7_offset"))
+        if num_classes > 0:
+            self.fc = nn.Linear(int(1024 * scale), num_classes)

-    def forward(self, inputs):
-        y = self.conv1(inputs)
+    def forward(self, x):
+        x = self.conv1(x)
         for dws in self.dwsl:
-            y = dws(y)
+            x = dws(x)

         if self.with_pool:
-            y = self.pool2d_avg(y)
+            x = self.pool2d_avg(x)

         if self.num_classes > 0:
-            y = fluid.layers.reshape(y, shape=[-1, 1024])
-            y = self.out(y)
-        return y
+            x = paddle.flatten(x, 1)
+            x = self.fc(x)
+        return x

 def _mobilenet(arch, pretrained=False, **kwargs):

@@ -275,7 +242,7 @@ def _mobilenet(arch, pretrained=False, **kwargs):
             model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.load_dict(param)

     return model
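The DepthwiseSeparable block above is the standard MobileNet factorization of a dense convolution. A standalone sketch of its two pieces, using only nn.Conv2d arguments that appear in this diff:

    import paddle.nn as nn

    # depthwise 3x3: one filter per input channel (groups == channels),
    # then a pointwise 1x1 conv to mix channels and change their count
    depthwise = nn.Conv2d(32, 32, 3, padding=1, groups=32, bias_attr=False)
    pointwise = nn.Conv2d(32, 64, 1, bias_attr=False)
...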
@@ -14,9 +14,9 @@
 import numpy as np

 import paddle
-import paddle.fluid as fluid
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+import paddle.nn as nn
+import paddle.nn.functional as F

 from paddle.utils.download import get_weights_path_from_url

@@ -25,221 +25,166 @@ __all__ = ['MobileNetV2', 'mobilenet_v2']
 model_urls = {
     'mobilenetv2_1.0':
     ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams',
-     '8ff74f291f72533f2a7956a4efff9d88')
+     '0340af0a901346c8d46f4529882fb63d')
 }

-class ConvBNLayer(fluid.dygraph.Layer):
-    def __init__(self,
-                 num_channels,
-                 filter_size,
-                 num_filters,
-                 stride,
-                 padding,
-                 channels=None,
-                 num_groups=1,
-                 use_cudnn=True):
-        super(ConvBNLayer, self).__init__()
-
-        tmp_param = ParamAttr(name=self.full_name() + "_weights")
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            groups=num_groups,
-            act=None,
-            use_cudnn=use_cudnn,
-            param_attr=tmp_param,
-            bias_attr=False)
-
-        self._batch_norm = BatchNorm(
-            num_filters,
-            param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
-            bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
-            moving_mean_name=self.full_name() + "_bn" + '_mean',
-            moving_variance_name=self.full_name() + "_bn" + '_variance')
-
-    def forward(self, inputs, if_act=True):
-        y = self._conv(inputs)
-        y = self._batch_norm(y)
-        if if_act:
-            y = fluid.layers.relu6(y)
-        return y
-
-class InvertedResidualUnit(fluid.dygraph.Layer):
-    def __init__(
-            self,
-            num_channels,
-            num_in_filter,
-            num_filters,
-            stride,
-            filter_size,
-            padding,
-            expansion_factor, ):
-        super(InvertedResidualUnit, self).__init__()
-        num_expfilter = int(round(num_in_filter * expansion_factor))
-        self._expand_conv = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_expfilter,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            num_groups=1)
-
-        self._bottleneck_conv = ConvBNLayer(
-            num_channels=num_expfilter,
-            num_filters=num_expfilter,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            num_groups=num_expfilter,
-            use_cudnn=False)
-
-        self._linear_conv = ConvBNLayer(
-            num_channels=num_expfilter,
-            num_filters=num_filters,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            num_groups=1)
-
-    def forward(self, inputs, ifshortcut):
-        y = self._expand_conv(inputs, if_act=True)
-        y = self._bottleneck_conv(y, if_act=True)
-        y = self._linear_conv(y, if_act=False)
-        if ifshortcut:
-            y = fluid.layers.elementwise_add(inputs, y)
-        return y
-
-class InvresiBlocks(fluid.dygraph.Layer):
-    def __init__(self, in_c, t, c, n, s):
-        super(InvresiBlocks, self).__init__()
-
-        self._first_block = InvertedResidualUnit(
-            num_channels=in_c,
-            num_in_filter=in_c,
-            num_filters=c,
-            stride=s,
-            filter_size=3,
-            padding=1,
-            expansion_factor=t)
-
-        self._inv_blocks = []
-        for i in range(1, n):
-            tmp = self.add_sublayer(
-                sublayer=InvertedResidualUnit(
-                    num_channels=c,
-                    num_in_filter=c,
-                    num_filters=c,
-                    stride=1,
-                    filter_size=3,
-                    padding=1,
-                    expansion_factor=t),
-                name=self.full_name() + "_" + str(i + 1))
-            self._inv_blocks.append(tmp)
-
-    def forward(self, inputs):
-        y = self._first_block(inputs, ifshortcut=False)
-        for inv_block in self._inv_blocks:
-            y = inv_block(y, ifshortcut=True)
-        return y
-
-class MobileNetV2(fluid.dygraph.Layer):
-    """MobileNetV2 model from
-    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
-
-    Args:
-        scale (float): scale of channels in each layer. Default: 1.0.
-        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer
-            will not be defined. Default: 1000.
-        with_pool (bool): use pool before the last fc layer or not. Default: True.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
-
-    Examples:
-        .. code-block:: python
-
-            from paddle.vision.models import MobileNetV2
-
-            model = MobileNetV2()
-    """
-
-    def __init__(self,
-                 scale=1.0,
-                 num_classes=1000,
-                 with_pool=True,
-                 classifier_activation='softmax'):
+def _make_divisible(v, divisor, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+class ConvBNReLU(nn.Sequential):
+    def __init__(self,
+                 in_planes,
+                 out_planes,
+                 kernel_size=3,
+                 stride=1,
+                 groups=1,
+                 norm_layer=nn.BatchNorm2d):
+        padding = (kernel_size - 1) // 2
+        super(ConvBNReLU, self).__init__(
+            nn.Conv2d(
+                in_planes,
+                out_planes,
+                kernel_size,
+                stride,
+                padding,
+                groups=groups,
+                bias_attr=False),
+            norm_layer(out_planes),
+            nn.ReLU6())
+
+class InvertedResidual(nn.Layer):
+    def __init__(self,
+                 inp,
+                 oup,
+                 stride,
+                 expand_ratio,
+                 norm_layer=nn.BatchNorm2d):
+        super(InvertedResidual, self).__init__()
+        self.stride = stride
+        assert stride in [1, 2]
+        hidden_dim = int(round(inp * expand_ratio))
+        self.use_res_connect = self.stride == 1 and inp == oup
+
+        layers = []
+        if expand_ratio != 1:
+            layers.append(
+                ConvBNReLU(
+                    inp, hidden_dim, kernel_size=1, norm_layer=norm_layer))
+        layers.extend([
+            ConvBNReLU(
+                hidden_dim,
+                hidden_dim,
+                stride=stride,
+                groups=hidden_dim,
+                norm_layer=norm_layer),
+            nn.Conv2d(
+                hidden_dim, oup, 1, 1, 0, bias_attr=False),
+            norm_layer(oup),
+        ])
+        self.conv = nn.Sequential(*layers)
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+class MobileNetV2(nn.Layer):
+    def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
+        """MobileNetV2 model from
+        `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
+
+        Args:
+            scale (float): scale of channels in each layer. Default: 1.0.
+            num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer
+                will not be defined. Default: 1000.
+            with_pool (bool): use pool before the last fc layer or not. Default: True.
+
+        Examples:
+            .. code-block:: python

+                from paddle.vision.models import MobileNetV2
+
+                model = MobileNetV2()
+        """
         super(MobileNetV2, self).__init__()
-        self.scale = scale
         self.num_classes = num_classes
         self.with_pool = with_pool

-        bottleneck_params_list = [
-            (1, 16, 1, 1),
-            (6, 24, 2, 2),
-            (6, 32, 3, 2),
-            (6, 64, 4, 2),
-            (6, 96, 3, 1),
-            (6, 160, 3, 2),
-            (6, 320, 1, 1),
-        ]
-
-        self._conv1 = ConvBNLayer(
-            num_channels=3,
-            num_filters=int(32 * scale),
-            filter_size=3,
-            stride=2,
-            padding=1)
-
-        self._invl = []
-        i = 1
-        in_c = int(32 * scale)
-        for layer_setting in bottleneck_params_list:
-            t, c, n, s = layer_setting
-            i += 1
-            tmp = self.add_sublayer(
-                sublayer=InvresiBlocks(
-                    in_c=in_c, t=t, c=int(c * scale), n=n, s=s),
-                name='conv' + str(i))
-            self._invl.append(tmp)
-            in_c = int(c * scale)
-
-        self._out_c = int(1280 * scale) if scale > 1.0 else 1280
-        self._conv9 = ConvBNLayer(
-            num_channels=in_c,
-            num_filters=self._out_c,
-            filter_size=1,
-            stride=1,
-            padding=0)
+        input_channel = 32
+        last_channel = 1280
+        block = InvertedResidual
+        round_nearest = 8
+        norm_layer = nn.BatchNorm2d
+        inverted_residual_setting = [
+            [1, 16, 1, 1],
+            [6, 24, 2, 2],
+            [6, 32, 3, 2],
+            [6, 64, 4, 2],
+            [6, 96, 3, 1],
+            [6, 160, 3, 2],
+            [6, 320, 1, 1],
+        ]
+
+        input_channel = _make_divisible(input_channel * scale, round_nearest)
+        self.last_channel = _make_divisible(last_channel * max(1.0, scale),
+                                            round_nearest)
+        features = [
+            ConvBNReLU(
+                3, input_channel, stride=2, norm_layer=norm_layer)
+        ]
+
+        for t, c, n, s in inverted_residual_setting:
+            output_channel = _make_divisible(c * scale, round_nearest)
+            for i in range(n):
+                stride = s if i == 0 else 1
+                features.append(
+                    block(
+                        input_channel,
+                        output_channel,
+                        stride,
+                        expand_ratio=t,
+                        norm_layer=norm_layer))
+                input_channel = output_channel
+
+        features.append(
+            ConvBNReLU(
+                input_channel,
+                self.last_channel,
+                kernel_size=1,
+                norm_layer=norm_layer))
+        self.features = nn.Sequential(*features)

         if with_pool:
-            self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+            self.pool2d_avg = nn.AdaptiveAvgPool2d(1)

-        if num_classes > 0:
-            tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
-            self._fc = Linear(
-                self._out_c,
-                num_classes,
-                act=classifier_activation,
-                param_attr=tmp_param,
-                bias_attr=ParamAttr(name="fc10_offset"))
+        if self.num_classes > 0:
+            self.classifier = nn.Sequential(
+                nn.Dropout(0.2), nn.Linear(self.last_channel, num_classes))

-    def forward(self, inputs):
-        y = self._conv1(inputs, if_act=True)
-        for inv in self._invl:
-            y = inv(y)
-        y = self._conv9(y, if_act=True)
+    def forward(self, x):
+        x = self.features(x)

         if self.with_pool:
-            y = self._pool2d_avg(y)
+            x = self.pool2d_avg(x)

         if self.num_classes > 0:
-            y = fluid.layers.reshape(y, shape=[-1, self._out_c])
-            y = self._fc(y)
-        return y
+            x = paddle.flatten(x, 1)
+            x = self.classifier(x)
+        return x

 def _mobilenet(arch, pretrained=False, **kwargs):

@@ -251,7 +196,7 @@ def _mobilenet(arch, pretrained=False, **kwargs):
             model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.load_dict(param)

     return model
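_make_divisible, introduced above, rounds scaled channel counts to a hardware-friendly multiple of `divisor` while never dropping below 90% of the requested width. Worked through by hand from the definition in this diff:

    _make_divisible(32 * 1.0, 8)    # int(32 + 4) // 8 * 8 = 32        -> 32
    _make_divisible(32 * 0.35, 8)   # int(11.2 + 4) // 8 * 8 = 8,
                                    # but 8 < 0.9 * 11.2, so add 8     -> 16
...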
@@ -15,11 +15,8 @@
 from __future__ import division
 from __future__ import print_function

-import math
-import paddle.fluid as fluid
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
-from paddle.fluid.dygraph.container import Sequential
+import paddle
+import paddle.nn as nn

 from paddle.utils.download import get_weights_path_from_url

@@ -29,143 +26,129 @@ __all__ = [
 model_urls = {
     'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams',
-                 '0ba53eea9bc970962d0ef96f7b94057e'),
+                 'cf548f46534aa3560945be4b95cd11c4'),
     'resnet34': ('https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams',
-                 '46bc9f7c3dd2e55b7866285bee91eff3'),
+                 '8d2275cf8706028345f78ac0e1d31969'),
     'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams',
-                 '5ce890a9ad386df17cf7fe2313dca0a1'),
+                 'ca6f485ee1ab0492d38f323885b0ad80'),
     'resnet101': ('https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams',
-                  'fb07a451df331e4b0bb861ed97c3a9b9'),
+                  '02f35f034ca3858e1e54d4036443c92d'),
     'resnet152': ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams',
-                  'f9c700f26d3644bb76ad2226ed5f5713'),
+                  '7ad16a2f1e7333859ff986138630fd7a'),
 }

-class ConvBNLayer(fluid.dygraph.Layer):
-    def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 stride=1,
-                 groups=1,
-                 act=None):
-        super(ConvBNLayer, self).__init__()
-
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=False)
-        self._batch_norm = BatchNorm(num_filters, act=act)
-
-    def forward(self, inputs):
-        x = self._conv(inputs)
-        x = self._batch_norm(x)
-        return x
-
-class BasicBlock(fluid.dygraph.Layer):
-    """residual block of resnet18 and resnet34
-    """
+class BasicBlock(nn.Layer):
     expansion = 1

-    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
         super(BasicBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d

-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=3,
-            act='relu')
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            act='relu')
+        if dilation > 1:
+            raise NotImplementedError(
+                "Dilation > 1 not supported in BasicBlock")

-        if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters,
-                filter_size=1,
-                stride=stride)
+        self.conv1 = nn.Conv2d(
+            inplanes, planes, 3, padding=1, stride=stride, bias_attr=False)
+        self.bn1 = norm_layer(planes)
+        self.relu = nn.ReLU()
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias_attr=False)
+        self.bn2 = norm_layer(planes)
+        self.downsample = downsample
+        self.stride = stride

-        self.shortcut = shortcut
+    def forward(self, x):
+        identity = x

-    def forward(self, inputs):
-        y = self.conv0(inputs)
-        conv1 = self.conv1(y)
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
-        y = short + conv1
-        return fluid.layers.relu(y)
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out

-class BottleneckBlock(fluid.dygraph.Layer):
-    """residual block of resnet50, resnet101 amd resnet152
-    """
+class BottleneckBlock(nn.Layer):

     expansion = 4

-    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
         super(BottleneckBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.)) * groups

-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1,
-            act='relu')
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            act='relu')
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * self.expansion,
-            filter_size=1,
-            act=None)
+        self.conv1 = nn.Conv2d(inplanes, width, 1, bias_attr=False)
+        self.bn1 = norm_layer(width)
+
+        self.conv2 = nn.Conv2d(
+            width,
+            width,
+            3,
+            padding=dilation,
+            stride=stride,
+            groups=groups,
+            dilation=dilation,
+            bias_attr=False)
+        self.bn2 = norm_layer(width)

-        if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * self.expansion,
-                filter_size=1,
-                stride=stride)
+        self.conv3 = nn.Conv2d(
+            width, planes * self.expansion, 1, bias_attr=False)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU()
+        self.downsample = downsample
+        self.stride = stride

-        self.shortcut = shortcut
-
-        self._num_channels_out = num_filters * self.expansion
-
-    def forward(self, inputs):
-        x = self.conv0(inputs)
-        conv1 = self.conv1(x)
-        conv2 = self.conv2(conv1)
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
-
-        x = fluid.layers.elementwise_add(x=short, y=conv2)
-
-        return fluid.layers.relu(x)
+    def forward(self, x):
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out

-class ResNet(fluid.dygraph.Layer):
+class ResNet(nn.Layer):
     """ResNet model from
     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

@@ -175,7 +158,6 @@ class ResNet(fluid.dygraph.Layer):
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer
             will not be defined. Default: 1000.
         with_pool (bool): use pool before the last fc layer or not. Default: True.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.

     Examples:
         .. code-block:: python

@@ -189,82 +171,87 @@ class ResNet(fluid.dygraph.Layer):
     """

-    def __init__(self,
-                 Block,
-                 depth=50,
-                 num_classes=1000,
-                 with_pool=True,
-                 classifier_activation='softmax'):
+    def __init__(self, block, depth, num_classes=1000, with_pool=True):
         super(ResNet, self).__init__()
-        self.num_classes = num_classes
-        self.with_pool = with_pool
-        layer_config = {
+        layer_cfg = {
             18: [2, 2, 2, 2],
             34: [3, 4, 6, 3],
             50: [3, 4, 6, 3],
             101: [3, 4, 23, 3],
-            152: [3, 8, 36, 3],
+            152: [3, 8, 36, 3]
         }
-        assert depth in layer_config.keys(), \
-            "supported depth are {} but input layer is {}".format(
-                layer_config.keys(), depth)
-
-        layers = layer_config[depth]
-
-        in_channels = 64
-        out_channels = [64, 128, 256, 512]
-
-        self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
-        self.pool = Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
-
-        self.layers = []
-        for idx, num_blocks in enumerate(layers):
-            blocks = []
-            shortcut = False
-            for b in range(num_blocks):
-                if b == 1:
-                    in_channels = out_channels[idx] * Block.expansion
-                block = Block(
-                    num_channels=in_channels,
-                    num_filters=out_channels[idx],
-                    stride=2 if b == 0 and idx != 0 else 1,
-                    shortcut=shortcut)
-                blocks.append(block)
-                shortcut = True
-            layer = self.add_sublayer("layer_{}".format(idx),
-                                      Sequential(*blocks))
-            self.layers.append(layer)
+        layers = layer_cfg[depth]
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+        self._norm_layer = nn.BatchNorm2d
+
+        self.inplanes = 64
+        self.dilation = 1
+
+        self.conv1 = nn.Conv2d(
+            3,
+            self.inplanes,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias_attr=False)
+        self.bn1 = self._norm_layer(self.inplanes)
+        self.relu = nn.ReLU()
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

         if with_pool:
-            self.global_pool = Pool2D(
-                pool_size=7, pool_type='avg', global_pooling=True)
+            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

         if num_classes > 0:
-            stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0)
-            self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1
-            self.fc = Linear(
-                self.fc_input_dim,
-                num_classes,
-                act=classifier_activation,
-                param_attr=fluid.param_attr.ParamAttr(
-                    initializer=fluid.initializer.Uniform(-stdv, stdv)))
-
-    def forward(self, inputs):
-        x = self.conv(inputs)
-        x = self.pool(x)
-        for layer in self.layers:
-            x = layer(x)
-
-        if self.with_pool:
-            x = self.global_pool(x)
-
-        if self.num_classes > -1:
-            x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim])
+            self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.inplanes,
+                    planes * block.expansion,
+                    1,
+                    stride=stride,
+                    bias_attr=False),
+                norm_layer(planes * block.expansion), )
+
+        layers = []
+        layers.append(
+            block(self.inplanes, planes, stride, downsample, 1, 64,
+                  previous_dilation, norm_layer))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes, norm_layer=norm_layer))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        if self.with_pool > 0:
+            x = self.avgpool(x)
+
+        if self.num_classes > 0:
+            x = paddle.flatten(x, 1)
             x = self.fc(x)

         return x

@@ -277,7 +264,7 @@ def _resnet(arch, Block, depth, pretrained, **kwargs):
             model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.set_dict(param)

     return model
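For orientation: the rewritten constructor takes the block class and depth directly, with layer_cfg mapping depth to blocks per stage. A hedged sketch based only on the signatures above:

    # resnet18/34 stack BasicBlock; resnet50/101/152 stack BottleneckBlock
    model18 = ResNet(BasicBlock, 18)
    model50 = ResNet(BottleneckBlock, 50, num_classes=100)
...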
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import paddle.fluid as fluid
-from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax
-from paddle.fluid.dygraph.container import Sequential
+import paddle
+import paddle.nn as nn

 from paddle.utils.download import get_weights_path_from_url

@@ -28,39 +27,18 @@ __all__ = [
 model_urls = {
     'vgg16': ('https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams',
-              'c788f453a3b999063e8da043456281ee')
+              '89bbffc0f87d260be9b8cdc169c991c4')
 }

-class Classifier(fluid.dygraph.Layer):
-    def __init__(self, num_classes, classifier_activation='softmax'):
-        super(Classifier, self).__init__()
-        self.linear1 = Linear(512 * 7 * 7, 4096)
-        self.linear2 = Linear(4096, 4096)
-        self.linear3 = Linear(4096, num_classes)
-        self.act = Softmax()  #Todo: accept any activation
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = fluid.layers.relu(x)
-        x = fluid.layers.dropout(x, 0.5)
-        x = self.linear2(x)
-        x = fluid.layers.relu(x)
-        x = fluid.layers.dropout(x, 0.5)
-        x = self.linear3(x)
-        out = self.act(x)
-        return out
-
-class VGG(fluid.dygraph.Layer):
+class VGG(nn.Layer):
     """VGG model from
     `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_

     Args:
-        features (fluid.dygraph.Layer): vgg features create by function make_layers.
+        features (nn.Layer): vgg features create by function make_layers.
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer
             will not be defined. Default: 1000.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.

     Examples:
         .. code-block:: python

@@ -76,44 +54,41 @@ class VGG(fluid.dygraph.Layer):
     """

-    def __init__(self,
-                 features,
-                 num_classes=1000,
-                 classifier_activation='softmax'):
+    def __init__(self, features, num_classes=1000):
         super(VGG, self).__init__()
         self.features = features
-        self.num_classes = num_classes
-
-        if num_classes > 0:
-            classifier = Classifier(num_classes, classifier_activation)
-            self.classifier = self.add_sublayer("classifier",
-                                                Sequential(classifier))
+        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
+        self.classifier = nn.Sequential(
+            nn.Linear(512 * 7 * 7, 4096),
+            nn.ReLU(),
+            nn.Dropout(),
+            nn.Linear(4096, 4096),
+            nn.ReLU(),
+            nn.Dropout(),
+            nn.Linear(4096, num_classes), )

     def forward(self, x):
         x = self.features(x)
-
-        if self.num_classes > 0:
-            x = fluid.layers.flatten(x, 1)
-            x = self.classifier(x)
+        x = self.avgpool(x)
+        x = paddle.flatten(x, 1)
+        x = self.classifier(x)
         return x

 def make_layers(cfg, batch_norm=False):
     layers = []
     in_channels = 3

     for v in cfg:
         if v == 'M':
-            layers += [Pool2D(pool_size=2, pool_stride=2)]
+            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
         else:
+            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
             if batch_norm:
-                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
-                layers += [conv2d, BatchNorm(v), ReLU()]
+                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU()]
             else:
-                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
-                layers += [conv2d, ReLU()]
+                layers += [conv2d, nn.ReLU()]
             in_channels = v
-    return Sequential(*layers)
+    return nn.Sequential(*layers)

 cfgs = {

@@ -144,7 +119,7 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs):
             model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.load_dict(param)

     return model
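make_layers turns a flat cfg list into the feature extractor: each integer becomes Conv2d(in_channels, v, 3, padding=1) (optionally + BatchNorm2d) + ReLU, and 'M' becomes MaxPool2d(2, 2). The cfgs dict itself is elided in this diff; the layout below is the standard VGG-11 configuration, stated here as an assumption:

    # standard VGG-11 layout (assumed; not shown in this diff)
    vgg11_cfg = [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']
    features = make_layers(vgg11_cfg, batch_norm=True)
    model = VGG(features, num_classes=10)
...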