From b38e4f2840ebc4ee0195ab8de789bf5b8d54ef37 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Fri, 25 Sep 2020 22:08:40 +0800 Subject: [PATCH] Refine vision models (#27476) * refine vision models --- python/paddle/hapi/callbacks.py | 10 +- python/paddle/hapi/model.py | 60 +-- python/paddle/metric/metrics.py | 9 +- python/paddle/tests/CMakeLists.txt | 4 - .../paddle/tests/dist_hapi_mnist_dynamic.py | 2 +- python/paddle/tests/dist_hapi_mnist_static.py | 2 +- python/paddle/tests/test_model.py | 57 ++- python/paddle/tests/test_pretrained_model.py | 45 ++- python/paddle/tests/test_vision_models.py | 2 +- python/paddle/vision/models/lenet.py | 30 +- python/paddle/vision/models/mobilenetv1.py | 189 ++++------ python/paddle/vision/models/mobilenetv2.py | 347 ++++++++---------- python/paddle/vision/models/resnet.py | 337 ++++++++--------- python/paddle/vision/models/vgg.py | 73 ++-- 14 files changed, 539 insertions(+), 628 deletions(-) diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 7ed571fa9c6..69b7fedd72e 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -301,10 +301,11 @@ class ProgBarLogger(Callback): train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + lenet = paddle.vision.LeNet() + model = paddle.Model(lenet, inputs, labels) - optim = paddle.optimizer.Adam(0.001) + optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters()) model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) @@ -436,10 +437,11 @@ class ModelCheckpoint(Callback): train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + lenet = paddle.vision.LeNet() + model = paddle.Model(lenet, inputs, labels) - optim = paddle.optimizer.Adam(0.001) + optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters()) model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 53928ebed1b..1bfe8f07a2f 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -814,10 +814,9 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - # if use static graph, do not set - paddle.disable_static(device) net = nn.Sequential( + nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10)) @@ -833,7 +832,7 @@ class Model(object): paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy()) - data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + data = paddle.vision.datasets.MNIST(mode='train') model.fit(data, epochs=2, batch_size=32, verbose=1) """ @@ -850,7 +849,8 @@ class Model(object): if not isinstance(inputs, (list, dict, Input)): raise TypeError( - "'inputs' must be list or dict, and couldn't be None.") + "'inputs' must be list or dict in static graph mode") + self._inputs = self._verify_spec(inputs, True) self._labels = self._verify_spec(labels) @@ -885,7 +885,6 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) net = nn.Sequential( nn.Linear(784, 200), @@ -930,7 +929,6 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) net = nn.Sequential( 
nn.Linear(784, 200), @@ -970,9 +968,12 @@ class Model(object): import numpy as np import paddle import paddle.nn as nn + from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') net = nn.Sequential( nn.Linear(784, 200), @@ -980,7 +981,7 @@ class Model(object): nn.Linear(200, 10), nn.Softmax()) - model = paddle.Model(net) + model = paddle.Model(net, input, label) model.prepare() data = np.random.random(size=(4,784)).astype(np.float32) out = model.test_batch([data]) @@ -1026,6 +1027,7 @@ class Model(object): def __init__(self): super(Mnist, self).__init__() self.net = nn.Sequential( + nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10), @@ -1045,7 +1047,7 @@ class Model(object): optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) - data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + data = paddle.vision.datasets.MNIST(mode='train') model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference @@ -1092,15 +1094,18 @@ class Model(object): import paddle import paddle.nn as nn - + from paddle.static import InputSpec + device = paddle.set_device('cpu') - paddle.disable_static(device) + + input = InputSpec([None, 784], 'float32', 'x') model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10), - nn.Softmax())) + nn.Softmax()), input) + model.save('checkpoint/test') model.load('checkpoint/test') """ @@ -1165,13 +1170,15 @@ class Model(object): import paddle import paddle.nn as nn + from paddle.static import InputSpec - paddle.disable_static() - + input = InputSpec([None, 784], 'float32', 'x') + model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), - nn.Linear(200, 10))) + nn.Linear(200, 10)), input) + params = model.parameters() """ return self._adapter.parameters() @@ -1313,7 +1320,7 @@ class Model(object): label = InputSpec([None, 1], 'int64', 'label') model = paddle.Model( - paddle.vision.models.LeNet(classifier_activation=None), + paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) @@ -1350,7 +1357,7 @@ class Model(object): label = InputSpec([None, 1], 'int64', 'label') model = paddle.Model( - paddle.vision.models.LeNet(classifier_activation=None), input, label) + paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( @@ -1483,7 +1490,7 @@ class Model(object): # imperative mode paddle.disable_static() - model = paddle.Model(paddle.vision.models.LeNet()) + model = paddle.Model(paddle.vision.models.LeNet(), input, label) model.prepare(metrics=paddle.metric.Accuracy()) result = model.evaluate(val_dataset, batch_size=64) print(result) @@ -1580,19 +1587,20 @@ class Model(object): test_dataset = MnistDataset(mode='test', return_label=False) - # declarative mode + # imperative mode input = InputSpec([-1, 1, 28, 28], 'float32', 'image') model = paddle.Model(paddle.vision.models.LeNet(), input) model.prepare() - result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) - # imperative mode + # declarative mode device = paddle.set_device('cpu') - paddle.disable_static(device) - model = 
paddle.Model(paddle.vision.models.LeNet()) + paddle.enable_static() + input = InputSpec([-1, 1, 28, 28], 'float32', 'image') + model = paddle.Model(paddle.vision.models.LeNet(), input) model.prepare() + result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) """ @@ -1832,15 +1840,11 @@ class Model(object): import paddle from paddle.static import InputSpec - - dynamic = True - device = paddle.set_device('cpu') - paddle.disable_static(device) if dynamic else None input = InputSpec([None, 1, 28, 28], 'float32', 'image') label = InputSpec([None, 1], 'int64', 'label') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + model = paddle.Model(paddle.vision.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 1cd65171ff0..f4a9b8c01d0 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -182,7 +182,6 @@ class Accuracy(Metric): import numpy as np import paddle - paddle.disable_static() x = paddle.to_tensor(np.array([ [0.1, 0.2, 0.3, 0.4], [0.1, 0.4, 0.3, 0.2], @@ -202,11 +201,13 @@ class Accuracy(Metric): .. code-block:: python import paddle - - paddle.disable_static() + from paddle.static import InputSpec + + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None)) + model = paddle.Model(paddle.vision.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index 6fb73b08c11..e1bc65a5d15 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -8,10 +8,6 @@ foreach(TEST_OP ${DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() -# disable test_pretrained_model and test_vision_models -list(REMOVE_ITEM TEST_OPS test_pretrained_model) -list(REMOVE_ITEM TEST_OPS test_vision_models) - foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py index 13d966bf38f..46d02789402 100644 --- a/python/paddle/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py @@ -68,7 +68,7 @@ class TestDistTraning(unittest.TestCase): inputs = [Input(im_shape, 'float32', 'image')] labels = [Input([None, 1], 'int64', 'label')] - model = Model(LeNet(classifier_activation=None), inputs, labels) + model = Model(LeNet(), inputs, labels) optim = fluid.optimizer.Momentum( learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/python/paddle/tests/dist_hapi_mnist_static.py b/python/paddle/tests/dist_hapi_mnist_static.py index 9d8e5f3652c..eab34a6dafb 100644 --- a/python/paddle/tests/dist_hapi_mnist_static.py +++ b/python/paddle/tests/dist_hapi_mnist_static.py @@ -67,7 +67,7 @@ class TestDistTraning(unittest.TestCase): inputs = [Input(im_shape, 'float32', 'image')] labels = [Input([None, 1], 'int64', 'label')] - model = Model(LeNet(classifier_activation=None), inputs, labels) + model = Model(LeNet(), inputs, labels) optim = fluid.optimizer.Momentum( learning_rate=0.001, momentum=.9, 
parameter_list=model.parameters()) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index c89cbbbfbda..5a3d837407b 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -40,7 +40,7 @@ from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTra class LeNetDygraph(paddle.nn.Layer): - def __init__(self, num_classes=10, classifier_activation=None): + def __init__(self, num_classes=10): super(LeNetDygraph, self).__init__() self.num_classes = num_classes self.features = Sequential( @@ -55,8 +55,7 @@ class LeNetDygraph(paddle.nn.Layer): if num_classes > 0: self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation + Linear(400, 120), Linear(120, 84), Linear(84, 10)) def forward(self, inputs): x = self.features(inputs) @@ -67,6 +66,34 @@ class LeNetDygraph(paddle.nn.Layer): return x +class LeNetDeclarative(fluid.dygraph.Layer): + def __init__(self, num_classes=10): + super(LeNetDeclarative, self).__init__() + self.num_classes = num_classes + self.features = Sequential( + Conv2d( + 1, 6, 3, stride=1, padding=1), + ReLU(), + Pool2D(2, 'max', 2), + Conv2d( + 6, 16, 5, stride=1, padding=0), + ReLU(), + Pool2D(2, 'max', 2)) + + if num_classes > 0: + self.fc = Sequential( + Linear(400, 120), Linear(120, 84), Linear(84, 10)) + + @declarative + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = fluid.layers.flatten(x, 1) + x = self.fc(x) + return x + + class MnistDataset(MNIST): def __init__(self, mode, return_label=True, sample_num=None): super(MnistDataset, self).__init__(mode=mode) @@ -198,7 +225,7 @@ class TestModel(unittest.TestCase): paddle.manual_seed(seed) paddle.framework.random._manual_program_seed(seed) - net = LeNet(classifier_activation=None) + net = LeNet() optim_new = fluid.optimizer.Adam( learning_rate=0.001, parameter_list=net.parameters()) model = Model(net, inputs=self.inputs, labels=self.labels) @@ -287,14 +314,12 @@ class TestModel(unittest.TestCase): class MyModel(paddle.nn.Layer): - def __init__(self, classifier_activation='softmax'): + def __init__(self): super(MyModel, self).__init__() self._fc = Linear(20, 10) - self._act = Softmax() #Todo: accept any activation def forward(self, x): y = self._fc(x) - y = self._act(y) return y @@ -311,7 +336,7 @@ class TestModelFunction(unittest.TestCase): def get_expect(): fluid.enable_dygraph(fluid.CPUPlace()) self.set_seed() - m = MyModel(classifier_activation=None) + m = MyModel() optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=m.parameters()) m.train() @@ -330,7 +355,7 @@ class TestModelFunction(unittest.TestCase): fluid.enable_dygraph(device) if dynamic else None self.set_seed() - net = MyModel(classifier_activation=None) + net = MyModel() optim2 = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) @@ -374,7 +399,7 @@ class TestModelFunction(unittest.TestCase): for dynamic in [True, False]: device = paddle.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -417,7 +442,7 @@ class TestModelFunction(unittest.TestCase): fluid.enable_dygraph(device) inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 
'label')] - model = Model(MyModel(classifier_activation=None), inputs, labels) + model = Model(MyModel(), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -426,7 +451,7 @@ class TestModelFunction(unittest.TestCase): inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] - model = Model(MyModel(classifier_activation=None), inputs, labels) + model = Model(MyModel(), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -436,7 +461,7 @@ class TestModelFunction(unittest.TestCase): def test_static_save_dynamic_load(self): path = tempfile.mkdtemp() - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -448,7 +473,7 @@ class TestModelFunction(unittest.TestCase): device = paddle.set_device('cpu') fluid.enable_dygraph(device) #if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -557,7 +582,7 @@ class TestModelFunction(unittest.TestCase): class TestRaiseError(unittest.TestCase): def test_input_without_name(self): - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 10], 'float32')] labels = [InputSpec([None, 1], 'int64', 'label')] @@ -567,7 +592,7 @@ class TestRaiseError(unittest.TestCase): def test_input_without_input_spec(self): for dynamic in [True, False]: paddle.disable_static() if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() with self.assertRaises(TypeError): model = Model(net) paddle.enable_static() diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py index 641147d39e9..bf9c2a2ae06 100644 --- a/python/paddle/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -13,6 +13,8 @@ # limitations under the License. 
import unittest +import tempfile +import shutil import numpy as np import paddle @@ -23,27 +25,36 @@ import paddle.vision.models as models # test the predicted resutls of static graph and dynamic graph are equal # when used pretrained model class TestPretrainedModel(unittest.TestCase): - def infer(self, x, arch, dygraph=True): - if dygraph: - paddle.disable_static() - - net = models.__dict__[arch](pretrained=True, classifier_activation=None) - inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] - model = paddle.Model(network=net, inputs=inputs) - model.prepare() - res = model.test_batch(x) - - if dygraph: - paddle.enable_static() - return res + def infer(self, arch): + path = tempfile.mkdtemp() + x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) + res = {} + for dygraph in [True, False]: + if not dygraph: + paddle.enable_static() + + net = models.__dict__[arch]() + inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] + model = paddle.Model(network=net, inputs=inputs) + model.prepare() + + if dygraph: + model.save(path) + res['dygraph'] = model.test_batch(x) + else: + model.load(path) + res['static'] = model.test_batch(x) + + if not dygraph: + paddle.disable_static() + + shutil.rmtree(path) + np.testing.assert_allclose(res['dygraph'], res['static']) def test_models(self): arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18'] for arch in arches: - x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) - y_dygraph = self.infer(x, arch) - y_static = self.infer(x, arch, dygraph=False) - np.testing.assert_allclose(y_dygraph, y_static) + self.infer(arch) if __name__ == '__main__': diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py index 44f9ab53901..6489b02615b 100644 --- a/python/paddle/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -36,7 +36,7 @@ class TestVisonModels(unittest.TestCase): model.test_batch(x) def test_mobilenetv2_pretrained(self): - self.models_infer('mobilenet_v2', pretrained=True) + self.models_infer('mobilenet_v2', pretrained=False) def test_mobilenetv1(self): self.models_infer('mobilenet_v1') diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py index c2d4be7cda1..b30d5992f9a 100644 --- a/python/paddle/vision/models/lenet.py +++ b/python/paddle/vision/models/lenet.py @@ -12,20 +12,19 @@ #See the License for the specific language governing permissions and #limitations under the License. -import paddle.fluid as fluid -from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax +import paddle +import paddle.nn as nn __all__ = ['LeNet'] -class LeNet(fluid.dygraph.Layer): +class LeNet(nn.Layer): """LeNet model from `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_ Args: num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 10. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -35,28 +34,27 @@ class LeNet(fluid.dygraph.Layer): model = LeNet() """ - def __init__(self, num_classes=10, classifier_activation='softmax'): + def __init__(self, num_classes=10): super(LeNet, self).__init__() self.num_classes = num_classes - self.features = Sequential( - Conv2d( + self.features = nn.Sequential( + nn.Conv2d( 1, 6, 3, stride=1, padding=1), - ReLU(), - Pool2D(2, 'max', 2), - Conv2d( + nn.ReLU(), + nn.MaxPool2d(2, 2), + nn.Conv2d( 6, 16, 5, stride=1, padding=0), - ReLU(), - Pool2D(2, 'max', 2)) + nn.ReLU(), + nn.MaxPool2d(2, 2)) if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation + self.fc = nn.Sequential( + nn.Linear(400, 120), nn.Linear(120, 84), nn.Linear(84, 10)) def forward(self, inputs): x = self.features(inputs) if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) + x = paddle.flatten(x, 1) x = self.fc(x) return x diff --git a/python/paddle/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py index 10defbf593d..39654122e3b 100644 --- a/python/paddle/vision/models/mobilenetv1.py +++ b/python/paddle/vision/models/mobilenetv1.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid as fluid -from paddle.fluid.initializer import MSRA -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -24,85 +22,66 @@ __all__ = ['MobileNetV1', 'mobilenet_v1'] model_urls = { 'mobilenetv1_1.0': ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams', - 'bf0d25cb0bed1114d9dac9384ce2b4a6') + '42a154c2f26f86e7457d6daded114e8c') } -class ConvBNLayer(fluid.dygraph.Layer): +class ConvBNLayer(nn.Layer): def __init__(self, - num_channels, - filter_size, - num_filters, + in_channels, + out_channels, + kernel_size, stride, padding, - channels=None, - num_groups=1, - act='relu', - use_cudnn=True, - name=None): + num_groups=1): super(ConvBNLayer, self).__init__() - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, + self._conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, stride=stride, padding=padding, groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "_weights"), bias_attr=False) - self._batch_norm = BatchNorm( - num_filters, - act=act, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') + self._norm_layer = nn.BatchNorm2d(out_channels) + self._act = nn.ReLU() - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - return y + def forward(self, x): + x = self._conv(x) + x = self._norm_layer(x) + x = self._act(x) + return x -class DepthwiseSeparable(fluid.dygraph.Layer): - def __init__(self, - num_channels, - num_filters1, - num_filters2, - num_groups, - stride, - scale, - name=None): +class DepthwiseSeparable(nn.Layer): + def __init__(self, in_channels, out_channels1, out_channels2, num_groups, + stride, scale): super(DepthwiseSeparable, self).__init__() self._depthwise_conv = ConvBNLayer( - num_channels=num_channels, - 
num_filters=int(num_filters1 * scale), - filter_size=3, + in_channels, + int(out_channels1 * scale), + kernel_size=3, stride=stride, padding=1, - num_groups=int(num_groups * scale), - use_cudnn=False) + num_groups=int(num_groups * scale)) self._pointwise_conv = ConvBNLayer( - num_channels=int(num_filters1 * scale), - filter_size=1, - num_filters=int(num_filters2 * scale), + int(out_channels1 * scale), + int(out_channels2 * scale), + kernel_size=1, stride=1, padding=0) - def forward(self, inputs): - y = self._depthwise_conv(inputs) - y = self._pointwise_conv(y) - return y + def forward(self, x): + x = self._depthwise_conv(x) + x = self._pointwise_conv(x) + return x -class MobileNetV1(fluid.dygraph.Layer): +class MobileNetV1(nn.Layer): """MobileNetV1 model from `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" `_. @@ -111,7 +90,6 @@ class MobileNetV1(fluid.dygraph.Layer): num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. code-block:: python @@ -121,11 +99,7 @@ class MobileNetV1(fluid.dygraph.Layer): model = MobileNetV1() """ - def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): super(MobileNetV1, self).__init__() self.scale = scale self.dwsl = [] @@ -133,18 +107,17 @@ class MobileNetV1(fluid.dygraph.Layer): self.with_pool = with_pool self.conv1 = ConvBNLayer( - num_channels=3, - filter_size=3, - channels=3, - num_filters=int(32 * scale), + in_channels=3, + out_channels=int(32 * scale), + kernel_size=3, stride=2, padding=1) dws21 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(32 * scale), - num_filters1=32, - num_filters2=64, + in_channels=int(32 * scale), + out_channels1=32, + out_channels2=64, num_groups=32, stride=1, scale=scale), @@ -153,9 +126,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws22 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(64 * scale), - num_filters1=64, - num_filters2=128, + in_channels=int(64 * scale), + out_channels1=64, + out_channels2=128, num_groups=64, stride=2, scale=scale), @@ -164,9 +137,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws31 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=128, + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=128, num_groups=128, stride=1, scale=scale), @@ -175,9 +148,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws32 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=256, + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=256, num_groups=128, stride=2, scale=scale), @@ -186,9 +159,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws41 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=256, + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=256, num_groups=256, stride=1, scale=scale), @@ -197,9 +170,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws42 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=512, + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=512, 
num_groups=256, stride=2, scale=scale), @@ -209,9 +182,9 @@ class MobileNetV1(fluid.dygraph.Layer): for i in range(5): tmp = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=512, + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=512, num_groups=512, stride=1, scale=scale), @@ -220,9 +193,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws56 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=1024, + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=1024, num_groups=512, stride=2, scale=scale), @@ -231,9 +204,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws6 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(1024 * scale), - num_filters1=1024, - num_filters2=1024, + in_channels=int(1024 * scale), + out_channels1=1024, + out_channels2=1024, num_groups=1024, stride=1, scale=scale), @@ -241,29 +214,23 @@ class MobileNetV1(fluid.dygraph.Layer): self.dwsl.append(dws6) if with_pool: - self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > -1: - self.out = Linear( - int(1024 * scale), - num_classes, - act=classifier_activation, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "fc7_weights"), - bias_attr=ParamAttr(name="fc7_offset")) - - def forward(self, inputs): - y = self.conv1(inputs) + self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + + if num_classes > 0: + self.fc = nn.Linear(int(1024 * scale), num_classes) + + def forward(self, x): + x = self.conv1(x) for dws in self.dwsl: - y = dws(y) + x = dws(x) if self.with_pool: - y = self.pool2d_avg(y) + x = self.pool2d_avg(x) if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, 1024]) - y = self.out(y) - return y + x = paddle.flatten(x, 1) + x = self.fc(x) + return x def _mobilenet(arch, pretrained=False, **kwargs): @@ -275,7 +242,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py index c08fb88f8bd..bab8b7b2b1b 100644 --- a/python/paddle/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -14,9 +14,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear + +import paddle.nn as nn +import paddle.nn.functional as F from paddle.utils.download import get_weights_path_from_url @@ -25,221 +25,166 @@ __all__ = ['MobileNetV2', 'mobilenet_v2'] model_urls = { 'mobilenetv2_1.0': ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams', - '8ff74f291f72533f2a7956a4efff9d88') + '0340af0a901346c8d46f4529882fb63d') } -class ConvBNLayer(fluid.dygraph.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - padding, - channels=None, - num_groups=1, - use_cudnn=True): - super(ConvBNLayer, self).__init__() - - tmp_param = ParamAttr(name=self.full_name() + "_weights") - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=tmp_param, - bias_attr=False) - - 
self._batch_norm = BatchNorm( - num_filters, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') - - def forward(self, inputs, if_act=True): - y = self._conv(inputs) - y = self._batch_norm(y) - if if_act: - y = fluid.layers.relu6(y) - return y - - -class InvertedResidualUnit(fluid.dygraph.Layer): - def __init__( - self, - num_channels, - num_in_filter, - num_filters, - stride, - filter_size, - padding, - expansion_factor, ): - super(InvertedResidualUnit, self).__init__() - num_expfilter = int(round(num_in_filter * expansion_factor)) - self._expand_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=num_expfilter, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - self._bottleneck_conv = ConvBNLayer( - num_channels=num_expfilter, - num_filters=num_expfilter, - filter_size=filter_size, - stride=stride, - padding=padding, - num_groups=num_expfilter, - use_cudnn=False) - - self._linear_conv = ConvBNLayer( - num_channels=num_expfilter, - num_filters=num_filters, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - def forward(self, inputs, ifshortcut): - y = self._expand_conv(inputs, if_act=True) - y = self._bottleneck_conv(y, if_act=True) - y = self._linear_conv(y, if_act=False) - if ifshortcut: - y = fluid.layers.elementwise_add(inputs, y) - return y - - -class InvresiBlocks(fluid.dygraph.Layer): - def __init__(self, in_c, t, c, n, s): - super(InvresiBlocks, self).__init__() - - self._first_block = InvertedResidualUnit( - num_channels=in_c, - num_in_filter=in_c, - num_filters=c, - stride=s, - filter_size=3, - padding=1, - expansion_factor=t) - - self._inv_blocks = [] - for i in range(1, n): - tmp = self.add_sublayer( - sublayer=InvertedResidualUnit( - num_channels=c, - num_in_filter=c, - num_filters=c, - stride=1, - filter_size=3, - padding=1, - expansion_factor=t), - name=self.full_name() + "_" + str(i + 1)) - self._inv_blocks.append(tmp) - - def forward(self, inputs): - y = self._first_block(inputs, ifshortcut=False) - for inv_block in self._inv_blocks: - y = inv_block(y, ifshortcut=True) - return y - - -class MobileNetV2(fluid.dygraph.Layer): - """MobileNetV2 model from - `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. - - Args: - scale (float): scale of channels in each layer. Default: 1.0. - num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer - will not be defined. Default: 1000. - with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. - - Examples: - .. 
code-block:: python +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - from paddle.vision.models import MobileNetV2 + if new_v < 0.9 * v: + new_v += divisor + return new_v - model = MobileNetV2() - """ +class ConvBNReLU(nn.Sequential): + def __init__(self, + in_planes, + out_planes, + kernel_size=3, + stride=1, + groups=1, + norm_layer=nn.BatchNorm2d): + padding = (kernel_size - 1) // 2 + + super(ConvBNReLU, self).__init__( + nn.Conv2d( + in_planes, + out_planes, + kernel_size, + stride, + padding, + groups=groups, + bias_attr=False), + norm_layer(out_planes), + nn.ReLU6()) + + +class InvertedResidual(nn.Layer): def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + inp, + oup, + stride, + expand_ratio, + norm_layer=nn.BatchNorm2d): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expand_ratio != 1: + layers.append( + ConvBNReLU( + inp, hidden_dim, kernel_size=1, norm_layer=norm_layer)) + layers.extend([ + ConvBNReLU( + hidden_dim, + hidden_dim, + stride=stride, + groups=hidden_dim, + norm_layer=norm_layer), + nn.Conv2d( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + norm_layer(oup), + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Layer): + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): + """MobileNetV2 model from + `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. + + Args: + scale (float): scale of channels in each layer. Default: 1.0. + num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool): use pool before the last fc layer or not. Default: True. + + Examples: + .. 
code-block:: python + + from paddle.vision.models import MobileNetV2 + + model = MobileNetV2() + """ super(MobileNetV2, self).__init__() - self.scale = scale self.num_classes = num_classes self.with_pool = with_pool + input_channel = 32 + last_channel = 1280 + + block = InvertedResidual + round_nearest = 8 + norm_layer = nn.BatchNorm2d + inverted_residual_setting = [ + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] - bottleneck_params_list = [ - (1, 16, 1, 1), - (6, 24, 2, 2), - (6, 32, 3, 2), - (6, 64, 4, 2), - (6, 96, 3, 1), - (6, 160, 3, 2), - (6, 320, 1, 1), + input_channel = _make_divisible(input_channel * scale, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, scale), + round_nearest) + features = [ + ConvBNReLU( + 3, input_channel, stride=2, norm_layer=norm_layer) ] - self._conv1 = ConvBNLayer( - num_channels=3, - num_filters=int(32 * scale), - filter_size=3, - stride=2, - padding=1) - - self._invl = [] - i = 1 - in_c = int(32 * scale) - for layer_setting in bottleneck_params_list: - t, c, n, s = layer_setting - i += 1 - tmp = self.add_sublayer( - sublayer=InvresiBlocks( - in_c=in_c, t=t, c=int(c * scale), n=n, s=s), - name='conv' + str(i)) - self._invl.append(tmp) - in_c = int(c * scale) - - self._out_c = int(1280 * scale) if scale > 1.0 else 1280 - self._conv9 = ConvBNLayer( - num_channels=in_c, - num_filters=self._out_c, - filter_size=1, - stride=1, - padding=0) + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * scale, round_nearest) + for i in range(n): + stride = s if i == 0 else 1 + features.append( + block( + input_channel, + output_channel, + stride, + expand_ratio=t, + norm_layer=norm_layer)) + input_channel = output_channel + + features.append( + ConvBNReLU( + input_channel, + self.last_channel, + kernel_size=1, + norm_layer=norm_layer)) + + self.features = nn.Sequential(*features) if with_pool: - self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > 0: - tmp_param = ParamAttr(name=self.full_name() + "fc10_weights") - self._fc = Linear( - self._out_c, - num_classes, - act=classifier_activation, - param_attr=tmp_param, - bias_attr=ParamAttr(name="fc10_offset")) - - def forward(self, inputs): - y = self._conv1(inputs, if_act=True) - for inv in self._invl: - y = inv(y) - y = self._conv9(y, if_act=True) + self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(0.2), nn.Linear(self.last_channel, num_classes)) + + def forward(self, x): + x = self.features(x) if self.with_pool: - y = self._pool2d_avg(y) + x = self.pool2d_avg(x) + if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, self._out_c]) - y = self._fc(y) - return y + x = paddle.flatten(x, 1) + x = self.classifier(x) + return x def _mobilenet(arch, pretrained=False, **kwargs): @@ -251,7 +196,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py index da0c3e9eb3f..f9e00aefd6b 100644 --- a/python/paddle/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -15,11 +15,8 @@ from __future__ import division from __future__ import print_function -import math 
-import paddle.fluid as fluid - -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from paddle.fluid.dygraph.container import Sequential +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -29,143 +26,129 @@ __all__ = [ model_urls = { 'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams', - '0ba53eea9bc970962d0ef96f7b94057e'), + 'cf548f46534aa3560945be4b95cd11c4'), 'resnet34': ('https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams', - '46bc9f7c3dd2e55b7866285bee91eff3'), + '8d2275cf8706028345f78ac0e1d31969'), 'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams', - '5ce890a9ad386df17cf7fe2313dca0a1'), + 'ca6f485ee1ab0492d38f323885b0ad80'), 'resnet101': ('https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams', - 'fb07a451df331e4b0bb861ed97c3a9b9'), + '02f35f034ca3858e1e54d4036443c92d'), 'resnet152': ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams', - 'f9c700f26d3644bb76ad2226ed5f5713'), + '7ad16a2f1e7333859ff986138630fd7a'), } -class ConvBNLayer(fluid.dygraph.Layer): +class BasicBlock(nn.Layer): + expansion = 1 + def __init__(self, - num_channels, - num_filters, - filter_size, + inplanes, + planes, stride=1, + downsample=None, groups=1, - act=None): - super(ConvBNLayer, self).__init__() - - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - act=None, - bias_attr=False) - - self._batch_norm = BatchNorm(num_filters, act=act) - - def forward(self, inputs): - x = self._conv(inputs) - x = self._batch_norm(x) - - return x - - -class BasicBlock(fluid.dygraph.Layer): - """residual block of resnet18 and resnet34 - """ - expansion = 1 - - def __init__(self, num_channels, num_filters, stride, shortcut=True): + base_width=64, + dilation=1, + norm_layer=None): super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=3, - act='relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - stride=stride, - act='relu') + if dilation > 1: + raise NotImplementedError( + "Dilation > 1 not supported in BasicBlock") - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - stride=stride) + self.conv1 = nn.Conv2d( + inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias_attr=False) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride - self.shortcut = shortcut + def forward(self, x): + identity = x - def forward(self, inputs): - y = self.conv0(inputs) - conv1 = self.conv1(y) + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) + out = self.conv2(out) + out = self.bn2(out) - y = short + conv1 + if self.downsample is not None: + identity = self.downsample(x) - return fluid.layers.relu(y) + out += identity + out = self.relu(out) + return out -class BottleneckBlock(fluid.dygraph.Layer): - """residual block of resnet50, resnet101 amd resnet152 - """ + +class BottleneckBlock(nn.Layer): expansion = 4 - def __init__(self, num_channels, num_filters, stride, shortcut=True): + def 
__init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): super(BottleneckBlock, self).__init__() - - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - act='relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + + self.conv1 = nn.Conv2d(inplanes, width, 1, bias_attr=False) + self.bn1 = norm_layer(width) + + self.conv2 = nn.Conv2d( + width, + width, + 3, + padding=dilation, stride=stride, - act='relu') - self.conv2 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters * self.expansion, - filter_size=1, - act=None) + groups=groups, + dilation=dilation, + bias_attr=False) + self.bn2 = norm_layer(width) - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters * self.expansion, - filter_size=1, - stride=stride) + self.conv3 = nn.Conv2d( + width, planes * self.expansion, 1, bias_attr=False) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride - self.shortcut = shortcut + def forward(self, x): + identity = x - self._num_channels_out = num_filters * self.expansion + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) - def forward(self, inputs): - x = self.conv0(inputs) - conv1 = self.conv1(x) - conv2 = self.conv2(conv1) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) + out = self.conv3(out) + out = self.bn3(out) - x = fluid.layers.elementwise_add(x=short, y=conv2) + if self.downsample is not None: + identity = self.downsample(x) - return fluid.layers.relu(x) + out += identity + out = self.relu(out) + return out -class ResNet(fluid.dygraph.Layer): + +class ResNet(nn.Layer): """ResNet model from `"Deep Residual Learning for Image Recognition" `_ @@ -175,7 +158,6 @@ class ResNet(fluid.dygraph.Layer): num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -189,82 +171,87 @@ class ResNet(fluid.dygraph.Layer): """ - def __init__(self, - Block, - depth=50, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + def __init__(self, block, depth, num_classes=1000, with_pool=True): super(ResNet, self).__init__() - - self.num_classes = num_classes - self.with_pool = with_pool - - layer_config = { + layer_cfg = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3], + 152: [3, 8, 36, 3] } - assert depth in layer_config.keys(), \ - "supported depth are {} but input layer is {}".format( - layer_config.keys(), depth) - - layers = layer_config[depth] - - in_channels = 64 - out_channels = [64, 128, 256, 512] - - self.conv = ConvBNLayer( - num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') - self.pool = Pool2D( - pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') - - self.layers = [] - for idx, num_blocks in enumerate(layers): - blocks = [] - shortcut = False - for b in range(num_blocks): - if b == 1: - in_channels = out_channels[idx] * Block.expansion - block = Block( - num_channels=in_channels, - num_filters=out_channels[idx], - stride=2 if b == 0 and idx != 0 else 1, - shortcut=shortcut) - blocks.append(block) - shortcut = True - layer = self.add_sublayer("layer_{}".format(idx), - Sequential(*blocks)) - self.layers.append(layer) + layers = layer_cfg[depth] + self.num_classes = num_classes + self.with_pool = with_pool + self._norm_layer = nn.BatchNorm2d + + self.inplanes = 64 + self.dilation = 1 + self.conv1 = nn.Conv2d( + 3, + self.inplanes, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = self._norm_layer(self.inplanes) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if with_pool: - self.global_pool = Pool2D( - pool_size=7, pool_type='avg', global_pooling=True) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) if num_classes > 0: - stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0) - self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1 - self.fc = Linear( - self.fc_input_dim, - num_classes, - act=classifier_activation, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) - - def forward(self, inputs): - x = self.conv(inputs) - x = self.pool(x) - for layer in self.layers: - x = layer(x) - - if self.with_pool: - x = self.global_pool(x) - - if self.num_classes > -1: - x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim]) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + 1, + stride=stride, + bias_attr=False), + norm_layer(planes * block.expansion), ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, 1, 64, + previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, 
norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + if self.with_pool > 0: + x = self.avgpool(x) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) x = self.fc(x) + return x @@ -277,7 +264,7 @@ def _resnet(arch, Block, depth, pretrained, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.set_dict(param) return model diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py index 8bfacda2476..d11845b6616 100644 --- a/python/paddle/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid as fluid -from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax -from paddle.fluid.dygraph.container import Sequential +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -28,39 +27,18 @@ __all__ = [ model_urls = { 'vgg16': ('https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams', - 'c788f453a3b999063e8da043456281ee') + '89bbffc0f87d260be9b8cdc169c991c4') } -class Classifier(fluid.dygraph.Layer): - def __init__(self, num_classes, classifier_activation='softmax'): - super(Classifier, self).__init__() - self.linear1 = Linear(512 * 7 * 7, 4096) - self.linear2 = Linear(4096, 4096) - self.linear3 = Linear(4096, num_classes) - self.act = Softmax() #Todo: accept any activation - - def forward(self, x): - x = self.linear1(x) - x = fluid.layers.relu(x) - x = fluid.layers.dropout(x, 0.5) - x = self.linear2(x) - x = fluid.layers.relu(x) - x = fluid.layers.dropout(x, 0.5) - x = self.linear3(x) - out = self.act(x) - return out - - -class VGG(fluid.dygraph.Layer): +class VGG(nn.Layer): """VGG model from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ Args: - features (fluid.dygraph.Layer): vgg features create by function make_layers. + features (nn.Layer): vgg features create by function make_layers. num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -76,44 +54,41 @@ class VGG(fluid.dygraph.Layer): """ - def __init__(self, - features, - num_classes=1000, - classifier_activation='softmax'): + def __init__(self, features, num_classes=1000): super(VGG, self).__init__() self.features = features - self.num_classes = num_classes - - if num_classes > 0: - classifier = Classifier(num_classes, classifier_activation) - self.classifier = self.add_sublayer("classifier", - Sequential(classifier)) + self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(), + nn.Dropout(), + nn.Linear(4096, num_classes), ) def forward(self, x): x = self.features(x) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.classifier(x) + x = self.avgpool(x) + x = paddle.flatten(x, 1) + x = self.classifier(x) return x def make_layers(cfg, batch_norm=False): layers = [] in_channels = 3 - for v in cfg: if v == 'M': - layers += [Pool2D(pool_size=2, pool_stride=2)] + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] else: + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) if batch_norm: - conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1) - layers += [conv2d, BatchNorm(v), ReLU()] + layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU()] else: - conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1) - layers += [conv2d, ReLU()] + layers += [conv2d, nn.ReLU()] in_channels = v - return Sequential(*layers) + return nn.Sequential(*layers) cfgs = { @@ -144,7 +119,7 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model -- GitLab
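Taken together, this patch drops the `classifier_activation` argument everywhere: the vision models now emit raw logits, training code pairs them with `paddle.nn.CrossEntropyLoss` (which applies softmax internally), and the network's parameters are passed to the optimizer explicitly. A minimal end-to-end sketch of the post-patch workflow, mirroring the docstring examples above (assumes a Paddle build containing this change; epochs, batch size, and learning rate are illustrative only):

.. code-block:: python

    import paddle
    from paddle.static import InputSpec

    # Static-friendly input/label specs, as required by paddle.Model here
    input = InputSpec([None, 1, 28, 28], 'float32', 'image')
    label = InputSpec([None, 1], 'int64', 'label')

    # No classifier_activation argument anymore; LeNet returns logits
    lenet = paddle.vision.LeNet()
    model = paddle.Model(lenet, input, label)

    # Parameters are now handed to the optimizer explicitly
    optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters())
    model.prepare(optim,
                  paddle.nn.CrossEntropyLoss(),  # softmax applied inside the loss
                  paddle.metric.Accuracy())

    train_dataset = paddle.vision.datasets.MNIST(mode='train')
    model.fit(train_dataset, epochs=2, batch_size=32, verbose=1)

Pretrained weights are requested the same way as before (e.g. `mobilenet_v2(pretrained=True)`), but the `_mobilenet`/`_resnet`/`_vgg` helpers above now load them with `paddle.load` instead of `fluid.load_dygraph`, against the updated checkpoint checksums in `model_urls`.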