diff --git a/dygraph/mobilenet/RADEME.md b/dygraph/mobilenet/RADEME.md
deleted file mode 100644
index beee2f1b00b32e19d1f9f25caf1695c465ee2984..0000000000000000000000000000000000000000
--- a/dygraph/mobilenet/RADEME.md
+++ /dev/null
@@ -1,44 +0,0 @@
-**模型简介**
-
-图像分类是计算机视觉的重要领域，它的目标是将图像分类到预定义的标签。CNN模型在图像分类领域取得了突破的成果，同时模型复杂度也在不断增加。MobileNet是一种小巧而高效CNN模型，本文介绍如何使PaddlePaddle的动态图MobileNet进行图像分类。
-
-**代码结构**
-
-    ├── run_mul_v1.sh      # 多卡训练启动脚本_v1
-    ├── run_mul_v2.sh      # 多卡训练启动脚本_v2
-    ├── run_sing_v1.sh     # 单卡训练启动脚本_v1
-    ├── run_sing_v2.sh     # 单卡训练启动脚本_v2
-    ├── train.py           # 训练入口
-    ├── mobilenet_v1.py    # 网络结构v1
-    ├── mobilenet_v2.py    # 网络结构v2
-    ├── reader.py          # 数据reader
-    ├── utils              # 基础工具目录
-
-**数据准备**
-
-请参考：https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification
-
-**模型训练**
-
-若使用4卡训练，启动方式如下:
-
-    bash run_mul_v1.sh
-    bash run_mul_v2.sh
-若使用单卡训练，启动方式如下:
-
-    bash run_sing_v1.sh
-    bash run_sing_v2.sh
-
-**模型精度**
-
-    Model         Top-1      Top-5
-    
-    MobileNetV1    0.707     0.895
-    
-    MobileNetV2    0.626     0.845
-
-**参考论文**
-
-MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications, Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam
-
-MobileNetV2: Inverted Residuals and Linear Bottlenecks, Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
diff --git a/dygraph/mobilenet/README.md b/dygraph/mobilenet/README.md
index 5d3a3e64698727ec4d6eb16b7b2b378e71b3ad16..c2e0477e753d5abe9f90ed17682ad0e58f8e52e3 100644
--- a/dygraph/mobilenet/README.md
+++ b/dygraph/mobilenet/README.md
@@ -4,17 +4,21 @@
 
 **代码结构**
 
-    ├── run_mul_v1.sh      # 多卡训练启动脚本_v1
-    ├── run_mul_v2.sh      # 多卡训练启动脚本_v2
-    ├── run_sing_v1.sh     # 单卡训练启动脚本_v1
-    ├── run_sing_v2.sh     # 单卡训练启动脚本_v2
-    ├── run_cpu_v1.sh      # CPU训练启动脚本_v1
-    ├── run_cpu_v2.sh      # CPU训练启动脚本_v2
-    ├── train.py           # 训练入口
-    ├── mobilenet_v1.py    # 网络结构v1
-    ├── mobilenet_v2.py    # 网络结构v2
-    ├── reader.py          # 数据reader
-    ├── utils              # 基础工具目录
+    ├── run_mul_v1.sh                 # 多卡训练启动脚本_v1
+    ├── run_mul_v1_checkpoint.sh      # 加载checkpoint多卡训练启动脚本_v1
+    ├── run_mul_v2.sh                 # 多卡训练启动脚本_v2
+    ├── run_mul_v2_checkpoint.sh      # 加载checkpoint多卡训练启动脚本_v2
+    ├── run_sing_v1.sh                # 单卡训练启动脚本_v1
+    ├── run_sing_v1_checkpoint.sh     # 加载checkpoint单卡训练启动脚本_v1
+    ├── run_sing_v2.sh                # 单卡训练启动脚本_v2
+    ├── run_sing_v2_checkpoint.sh     # 加载checkpoint单卡训练启动脚本_v2
+    ├── run_cpu_v1.sh                 # CPU训练启动脚本_v1
+    ├── run_cpu_v2.sh                 # CPU训练启动脚本_v2
+    ├── train.py                      # 训练入口
+    ├── mobilenet_v1.py               # 网络结构v1
+    ├── mobilenet_v2.py               # 网络结构v2
+    ├── reader.py                     # 数据reader
+    ├── utils                         # 基础工具目录
 
 **数据准备**
 
@@ -26,6 +30,7 @@
 
     bash run_mul_v1.sh
     bash run_mul_v2.sh
+
 若使用单卡训练，启动方式如下:
 
     bash run_sing_v1.sh
@@ -36,6 +41,16 @@
     bash run_cpu_v1.sh
     bash run_cpu_v2.sh
 
+训练过程中，checkpoint会保存在参数model_save_dir指定的文件夹中，我们支持加载checkpoint继续训练。
+加载checkpoint使用4卡训练，启动方式如下:
+
+    bash run_mul_v1_checkpoint.sh
+    bash run_mul_v2_checkpoint.sh
+
+加载checkpoint使用单卡训练，启动方式如下:
+
+    bash run_sing_v1_checkpoint.sh
+    bash run_sing_v2_checkpoint.sh
 
 **模型性能**
 
diff --git a/dygraph/mobilenet/mobilenet_v1.py b/dygraph/mobilenet/mobilenet_v1.py
index 56c12b9a4d96d292bce2a68633ceff6f40e732cc..e3a5a94eab46477a8fb9676f5a5bf67000783018 100644
--- a/dygraph/mobilenet/mobilenet_v1.py
+++ b/dygraph/mobilenet/mobilenet_v1.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Import order: standard library, third party, local library
 import os
 import time
 import sys
+import math
 import numpy as np
 import argparse
-import ast
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
@@ -26,8 +27,6 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid import framework
-import math
-import sys
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
diff --git a/dygraph/mobilenet/mobilenet_v2.py b/dygraph/mobilenet/mobilenet_v2.py
index 2466d4307dd8482f8e3c5070f9fdf6a1053e085f..6da031f298c1e76c21d6415da4b4fe0dd9715731 100644
--- a/dygraph/mobilenet/mobilenet_v2.py
+++ b/dygraph/mobilenet/mobilenet_v2.py
@@ -12,14 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Import order: standard library, third party, local library
 import os
-import numpy as np
 import time
-import sys
+import math
 import sys
 import numpy as np
 import argparse
-import ast
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
@@ -27,11 +26,8 @@ from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
-
 from paddle.fluid import framework
 
-import math
-import sys
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
diff --git a/dygraph/mobilenet/run_mul_v1.sh b/dygraph/mobilenet/run_mul_v1.sh
index d84e4d1d7fff1a02045383667c4caa6dc5a0d548..fa48ef5fe46ebfcf86c84a21bc1ecb7ad8a492df 100644
--- a/dygraph/mobilenet/run_mul_v1.sh
+++ b/dygraph/mobilenet/run_mul_v1.sh
@@ -1,2 +1,2 @@
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256     --reader_thread=8    --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV1
+python3 -m paddle.distributed.launch --log_dir ./mylog.v1 train.py --use_data_parallel 1 --batch_size=256     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV1  --model_save_dir=output.v1.mul/ --num_epochs=120 
diff --git a/dygraph/mobilenet/run_mul_v2.sh b/dygraph/mobilenet/run_mul_v2.sh
index a3f9991e330e25edd7440cf681397d0fd4f78d77..485cad365c3727710678f7426e3238b94c20f6e9 100644
--- a/dygraph/mobilenet/run_mul_v2.sh
+++ b/dygraph/mobilenet/run_mul_v2.sh
@@ -1,2 +1,2 @@
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256     --reader_thread=8    --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV2
+python3 -m paddle.distributed.launch --log_dir ./mylog.v2 train.py --use_data_parallel 1 --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2
diff --git a/dygraph/mobilenet/run_sing_v1.sh b/dygraph/mobilenet/run_sing_v1.sh
index 3e480faadfba596a139b7709d81b9351ff97a85a..c4fef2984b06aa98b04e9ab0a481530ec3c22034 100644
--- a/dygraph/mobilenet/run_sing_v1.sh
+++ b/dygraph/mobilenet/run_sing_v1.sh
@@ -1,2 +1,2 @@
 export CUDA_VISIBLE_DEVICES=0
-python3 train.py      --batch_size=256        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1 
+python3 train.py      --batch_size=256        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1 
diff --git a/dygraph/mobilenet/run_sing_v2.sh b/dygraph/mobilenet/run_sing_v2.sh
index 9db7a20f169d8019d3b0c8c90b9104321fd1263b..f747ee5e01ba7d8d5c5eb35fb6e732a381a305b9 100644
--- a/dygraph/mobilenet/run_sing_v2.sh
+++ b/dygraph/mobilenet/run_sing_v2.sh
@@ -1,2 +1,2 @@
 export CUDA_VISIBLE_DEVICES=0
-python3 train.py      --batch_size=128        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012  --model=MobileNetV2
+python3 train.py  --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2
diff --git a/dygraph/mobilenet/train.py b/dygraph/mobilenet/train.py
index 42648b3e73305828fde2f0e4223fb3f27c29cf75..254279baedf3879ada6bc5c92ab3f733e5f3d524 100644
--- a/dygraph/mobilenet/train.py
+++ b/dygraph/mobilenet/train.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,35 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from mobilenet_v1 import *
-from mobilenet_v2 import *
+# Import order: standard library, third party, local library
 import os
-import numpy as np
 import time
 import sys
-import sys
-import numpy as np
+import math
 import argparse
-import ast
+import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.layer_helper import LayerHelper
-#from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.dygraph.base import to_variable
-
 from paddle.fluid import framework
-
-import math
-import sys
 import reader
 from utils import *
-
-IMAGENET1000 = 1281167
-base_lr = 0.1
-momentum_rate = 0.9
-l2_decay = 1e-4
+from mobilenet_v1 import *
+from mobilenet_v2 import *
 
 args = parse_args()
 if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
@@ -56,7 +45,7 @@ def eval(net, test_data_loader, eop):
     for img, label in test_data_loader():
         t1 = time.time()
         label = to_variable(label.numpy().astype('int64').reshape(
-            int(args.batch_size / paddle.fluid.core.get_cuda_device_count()),
+            int(args.batch_size // paddle.fluid.core.get_cuda_device_count()),
             1))
         out = net(img)
         softmax_out = fluid.layers.softmax(out, use_cudnn=False)
@@ -80,10 +69,14 @@ def eval(net, test_data_loader, eop):
 
 
 def train_mobilenet():
-    epoch = args.num_epochs
-    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
-        if args.use_data_parallel else fluid.CUDAPlace(0)
+    if not args.use_gpu:
+        place = fluid.CPUPlace()
+    elif not args.use_data_parallel:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
     with fluid.dygraph.guard(place):
+        # 1. init net and optimizer
         if args.ce:
             print("ce mode")
             seed = 33
@@ -93,13 +86,12 @@ def train_mobilenet():
         if args.use_data_parallel:
             strategy = fluid.dygraph.parallel.prepare_context()
 
-        net = None
         if args.model == "MobileNetV1":
-            net = MobileNetV1(class_dim=args.class_dim)
-            para_name = 'mobilenet_v1_params'
+            net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
+            model_path_pre = 'mobilenet_v1'
         elif args.model == "MobileNetV2":
             net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
-            para_name = 'mobilenet_v2_params'
+            model_path_pre = 'mobilenet_v2'
         else:
             print(
                 "wrong model name, please try model = MobileNetV1 or MobileNetV2"
@@ -109,6 +101,18 @@ def train_mobilenet():
         optimizer = create_optimizer(args=args, parameter_list=net.parameters())
         if args.use_data_parallel:
             net = fluid.dygraph.parallel.DataParallel(net, strategy)
+
+        # 2. load checkpoint
+        if args.checkpoint:
+            assert os.path.exists(args.checkpoint + ".pdparams"), \
+                "Given dir {}.pdparams not exist.".format(args.checkpoint)
+            assert os.path.exists(args.checkpoint + ".pdopt"), \
+                "Given dir {}.pdopt not exist.".format(args.checkpoint)
+            para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
+            net.set_dict(para_dict)
+            optimizer.set_dict(opti_dict)
+
+        # 3. reader
         train_data_loader, train_data = utility.create_data_loader(
             is_train=True, args=args)
         test_data_loader, test_data = utility.create_data_loader(
@@ -119,7 +123,9 @@ def train_mobilenet():
         test_reader = imagenet_reader.val(settings=args)
         train_data_loader.set_sample_list_generator(train_reader, place)
         test_data_loader.set_sample_list_generator(test_reader, place)
-        for eop in range(epoch):
+
+        # 4. train loop
+        for eop in range(args.num_epochs):
             if num_trainers > 1:
                 imagenet_reader.set_shuffle_seed(eop + (
                     args.random_seed if args.random_seed else 0))
@@ -130,13 +136,17 @@ def train_mobilenet():
             total_sample = 0
             batch_id = 0
             t_last = 0
+            # 4.1 for each batch, call net(), backward(), and minimize()
             for img, label in train_data_loader():
                 t1 = time.time()
                 label = to_variable(label.numpy().astype('int64').reshape(
-                    int(args.batch_size /
+                    int(args.batch_size //
                         paddle.fluid.core.get_cuda_device_count()), 1))
                 t_start = time.time()
+
+                # 4.1.1 call net()
                 out = net(img)
+
                 t_end = time.time()
                 softmax_out = fluid.layers.softmax(out, use_cudnn=False)
                 loss = fluid.layers.cross_entropy(
@@ -145,14 +155,20 @@ def train_mobilenet():
                 acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                 acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
                 t_start_back = time.time()
+
+                # 4.1.2 call backward()
                 if args.use_data_parallel:
                     avg_loss = net.scale_loss(avg_loss)
                     avg_loss.backward()
                     net.apply_collective_grads()
                 else:
                     avg_loss.backward()
+
                 t_end_back = time.time()
+
+                # 4.1.3 call minimize()
                 optimizer.minimize(avg_loss)
+
                 net.clear_gradients()
                 t2 = time.time()
                 train_batch_elapse = t2 - t1
@@ -174,13 +190,31 @@ def train_mobilenet():
             print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
                   (eop, batch_id, total_loss / total_sample, \
                    total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
-            net.eval()
-            eval(net, test_data_loader, eop)
+   
+            # 4.2 save checkpoint
             save_parameters = (not args.use_data_parallel) or (
                 args.use_data_parallel and
                 fluid.dygraph.parallel.Env().local_rank == 0)
             if save_parameters:
-                fluid.save_dygraph(net.state_dict(), para_name)
+                if not os.path.isdir(args.model_save_dir):
+                    os.makedirs(args.model_save_dir)
+                model_path = os.path.join(
+                    args.model_save_dir, "_" + model_path_pre + "_epoch{}".format(eop))
+                fluid.dygraph.save_dygraph(net.state_dict(), model_path)
+                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
+
+            # 4.3 validation
+            net.eval()
+            eval(net, test_data_loader, eop)
+
+        # 5. save final results
+        save_parameters = (not args.use_data_parallel) or (
+            args.use_data_parallel and
+            fluid.dygraph.parallel.Env().local_rank == 0)
+        if save_parameters:
+            model_path = os.path.join(
+                args.model_save_dir, "_" + model_path_pre + "_final")
+            fluid.dygraph.save_dygraph(net.state_dict(), model_path)
 
 
 if __name__ == '__main__':