diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4cd4bd7fa92f0c5d4de590838220d4cd6973c2bf..15d8e516abaffd444f20057c1099658c5aa215d5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,6 +33,3 @@
     - id: trailing-whitespace
       files: \.(md|yml)$
     - id: check-case-conflict
-    - id: flake8
-      args: ['--ignore=E265']
-
diff --git a/README.md b/README.md
index 227c86c258aed4aeaec61af807216c0a67497b99..c78e28f865fb30c218861ddbd2f98af858f37645 100644
--- a/README.md
+++ b/README.md
@@ -97,10 +97,6 @@ PaddleClas的安装说明、模型训练、预测、评估以及模型微调（f
 
 近年来，学术界和工业界广泛关注图像中目标检测任务，而图像分类的网络结构以及预训练模型效果直接影响目标检测的效果。PaddleDetection使用PaddleClas的82.39%的ResNet50_vd的预训练模型，结合自身丰富的检测算子，提供了一种面向服务器端应用的目标检测方案，PSS-DET (Practical Server Side Detection)。该方案融合了多种只增加少许计算量，但是可以有效提升两阶段Faster RCNN目标检测效果的策略，包括检测模型剪裁、使用分类效果更优的预训练模型、DCNv2、Cascade RCNN、AutoAugment、Libra sampling以及多尺度训练。其中基于82.39%的R50_vd_ssld预训练模型，与79.12%的R50_vd的预训练模型相比，检测效果可以提升1.5%。在COCO目标检测数据集上测试PSS-DET，当V100单卡预测速度为61FPS时，mAP是41.6%，预测速度为20FPS时，mAP是47.8%。详情请参考[**通用目标检测章节**](https://paddleclas.readthedocs.io/zh_CN/latest/application/object_detection.html)。
 
-<div align="center">
-<img
-src="./docs/images/det/pssdet.png" width="500">
-</div>
 
 - TODO
 - [ ] PaddleClas在OCR任务中的应用
diff --git a/configs/EfficientNet/EfficientNetB0.yaml b/configs/EfficientNet/EfficientNetB0.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..01932d43a591f300d0aa7e21784c41ef94479572
--- /dev/null
+++ b/configs/EfficientNet/EfficientNetB0.yaml
@@ -0,0 +1,89 @@
+mode: 'train'
+ARCHITECTURE:
+    name: "EfficientNetB0"
+    params:
+        is_test: False
+        padding_type : "SAME"
+        override_params:
+            drop_connect_rate: 0.1
+
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 360
+topk: 5
+image_shape: [3, 224, 224]
+use_ema: True
+ema_decay: 0.9999
+use_aa: True
+ls_epsilon: 0.1
+
+LEARNING_RATE:
+    function: 'ExponentialWarmup'
+    params:
+        lr: 0.032
+
+OPTIMIZER:
+    function: 'RMSProp'
+    params:
+        momentum: 0.9
+        rho: 0.9
+        epsilon: 0.001
+    regularizer:
+        function: 'L2'
+        factor: 0.00001
+
+TRAIN:
+    batch_size: 512
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: 2
+        - RandFlipImage:
+            flip_code: 1
+        - AutoAugment:
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+
+
+
+VALID:
+    batch_size: 128
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            interpolation: 2
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+
+
diff --git a/docs/en/extension/index.rst b/docs/en/extension/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cc17c18ab03788f44b97bf17037a9a69acf186ae
--- /dev/null
+++ b/docs/en/extension/index.rst
@@ -0,0 +1,12 @@
+extension
+================================
+
+.. toctree::
+   :maxdepth: 1
+   
+   paddle_inference.md
+   paddle_mobile_inference.md
+   paddle_quantization.md
+   multi_machine_training.md
+   paddle_hub.md
+   paddle_serving.md
diff --git a/docs/zh_CN/faq.md b/docs/zh_CN/faq.md
index 36aff3ec2458c80802a9da2c2c9244efa3387dd9..ca79b1e61fab47f726ead529f746d4ed26354f99 100644
--- a/docs/zh_CN/faq.md
+++ b/docs/zh_CN/faq.md
@@ -17,7 +17,7 @@
 
 >>
 * Q: 在评测`EfficientNetB0_small`模型时，为什么最终的精度始终比官网的低0.3%左右？
-* A: `EfficientNet`系列的网络在进行resize的时候，是使用`cubic插值方式`(resize参数的interpolation值设置为2)，而其他模型默认情况下为None，因此在训练和评估的时候需要显式地指定resiz的interpolation值。具体地，可以参考以下配置中预处理过程中ResizeImage的参数。
+* A: `EfficientNet`系列的网络在进行resize的时候，是使用`cubic插值方式`(resize参数的interpolation值设置为2)，而其他模型默认情况下为None，因此在训练和评估的时候需要显式地指定resize的interpolation值。具体地，可以参考以下配置中预处理过程中ResizeImage的参数。
 ```
 VALID:
     batch_size: 16
diff --git a/docs/zh_CN/models/ResNet_and_vd.md b/docs/zh_CN/models/ResNet_and_vd.md
index ea045f12ca545ca0cea2229fcfc1993fd50ec77b..f8c9aa3e60fb55453148a6c790cd2217151823a3 100644
--- a/docs/zh_CN/models/ResNet_and_vd.md
+++ b/docs/zh_CN/models/ResNet_and_vd.md
@@ -42,8 +42,11 @@ ResNet系列模型是在2015年提出的，一举在ILSVRC2015比赛中取得冠
 | ResNet152_vd     | 0.806           | 0.953           |                          |                          | 23.530    | 60.210    |
 | ResNet200_vd     | 0.809           | 0.953           |                          |                          | 30.530    | 74.740    |
 | ResNet50_vd_ssld | 0.824           | 0.961           |                          |                          | 8.670     | 25.580    |
+| ResNet50_vd_ssld_v2 | 0.830           | 0.964           |                          |                          | 8.670     | 25.580    |
+| Fix_ResNet50_vd_ssld_v2 | 0.840           | 0.970           |                          |                          | 17.696     | 25.580    |
 | ResNet101_vd_ssld | 0.837           | 0.967           |                          |                          | 16.100    | 44.570     |
 
+* 注：`ResNet50_vd_ssld_v2`是在`ResNet50_vd_ssld`训练策略的基础上加上AutoAugment训练得到，`Fix_ResNet50_vd_ssld_v2`是固定`ResNet50_vd_ssld_v2`除FC层外所有的网络参数，在320x320的图像输入分辨率下，基于ImageNet1k数据集微调得到。
 
 
 
@@ -86,4 +89,6 @@ ResNet系列模型是在2015年提出的，一举在ILSVRC2015比赛中取得冠
 | ResNet152_vd      | 224       | 256               | 7.29127                      | 10.86137                     | 15.32444                     | 8.54376                      | 19.52157                     | 36.64445                     |
 | ResNet200_vd      | 224       | 256               | 9.36026                      | 13.5474                      | 19.0725                      | 10.80619                     | 25.01731                     | 48.81399                     |
 | ResNet50_vd_ssld  | 224       | 256               | 2.65164                      | 4.84109                      | 7.46225                      | 3.53131                      | 8.09057                      | 14.45965                     |
+| ResNet50_vd_ssld_v2  | 224       | 256               | 2.65164                      | 4.84109                      | 7.46225                      | 3.53131                      | 8.09057                      | 14.45965                     |
+| Fix_ResNet50_vd_ssld_v2  | 320       | 320               | 3.42818                      | 7.51534                      | 13.19370                      | 5.07696                      | 14.64218                      | 27.01453                     |
 | ResNet101_vd_ssld | 224       | 256               | 5.05972                      | 7.83685                      | 11.34235                     | 6.11704                      | 13.76222                     | 25.11071                     |
diff --git a/docs/zh_CN/models/models_intro.md b/docs/zh_CN/models/models_intro.md
index ff309b3c599777528f495d4fe3df8c05b12ed9be..3ed9a4b1aa804d06d4bf65ae0a81cb2e365915c2 100644
--- a/docs/zh_CN/models/models_intro.md
+++ b/docs/zh_CN/models/models_intro.md
@@ -51,6 +51,8 @@ python tools/infer/predict.py \
     - [ResNet152_vd](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet152_vd_pretrained.tar)
     - [ResNet200_vd](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet200_vd_pretrained.tar)
     - [ResNet50_vd_ssld](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_ssld_pretrained.tar)
+    - [ResNet50_vd_ssld_v2](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_ssld_v2_pretrained.tar)
+    - [Fix_ResNet50_vd_ssld_v2](https://paddle-imagenet-models-name.bj.bcebos.com/Fix_ResNet50_vd_ssld_v2_pretrained.tar)
     - [ResNet101_vd_ssld](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_vd_ssld_pretrained.tar)
 
 
diff --git a/docs/zh_CN/tutorials/getting_started.md b/docs/zh_CN/tutorials/getting_started.md
index 11bea8411362ac0b4122d9e6da8d9f9c7d24b54e..8790faf9c037d9f5b7902eff96eabc922759bba8 100644
--- a/docs/zh_CN/tutorials/getting_started.md
+++ b/docs/zh_CN/tutorials/getting_started.md
@@ -41,7 +41,8 @@ python -m paddle.distributed.launch \
     --selected_gpus="0,1,2,3" \
     tools/train.py \
         -c ./configs/ResNet/ResNet50_vd.yaml \
-        -o use_mix=1
+        -o use_mix=1 \
+        --vdl_dir=./scalar/
 
 ```
 
@@ -53,6 +54,13 @@ epoch:0    train    step:522    loss:1.6330    lr:0.100000    elapse:0.210
 
 也可以直接修改模型对应的配置文件更新配置。具体配置参数参考[配置文档](config.md)。
 
+训练期间可以通过VisualDL实时观察loss变化，启动命令如下：
+
+```bash
+visualdl --logdir ./scalar --host <host_IP> --port <port_num>
+
+```
+
 
 ### 2.2 模型微调
 
diff --git a/docs/zh_CN/tutorials/install.md b/docs/zh_CN/tutorials/install.md
index 2982d25c8954fa4045864ae2f3adcbbbb7608a17..861c40041083181239922a3e876f9d302f5618dc 100644
--- a/docs/zh_CN/tutorials/install.md
+++ b/docs/zh_CN/tutorials/install.md
@@ -38,7 +38,7 @@ python -c "import paddle; print(paddle.__version__)"
 
 **运行环境需求:**
 
-- Python2（官方已不提供更新维护）或Python3 (当前只支持Linux系统)
+- Python3 (当前只支持Linux系统)
 - CUDA >= 9.0
 - cuDNN >= 5.0
 - nccl >= 2.1.2
@@ -60,3 +60,10 @@ Python依赖库在[requirements.txt](https://github.com/PaddlePaddle/PaddleClas/
 ```
 pip install --upgrade -r requirements.txt
 ```
+
+visualdl可能出现安装失败，请尝试
+
+```
+pip3 install --upgrade visualdl==2.0.0b3 -i https://mirror.baidu.com/pypi/simple
+
+```
diff --git a/ppcls/data/imaug/operators.py b/ppcls/data/imaug/operators.py
index a8454740f4f6047f16104b32c5a06b6f5cc1e327..49c4d95ced597f6fc53e639bae36d488632ccf83 100644
--- a/ppcls/data/imaug/operators.py
+++ b/ppcls/data/imaug/operators.py
@@ -25,6 +25,8 @@ import random
 import cv2
 import numpy as np
 
+from .autoaugment import ImageNetPolicy
+
 
 class OperatorParamError(ValueError):
     """ OperatorParamError
@@ -115,7 +117,9 @@ class CropImage(object):
 class RandCropImage(object):
     """ random crop image """
 
-    def __init__(self, size, scale=None, ratio=None):
+    def __init__(self, size, scale=None, ratio=None, interpolation=-1):
+
+        self.interpolation = interpolation if interpolation >= 0 else None
         if type(size) is int:
             self.size = (size, size)  # (h, w)
         else:
@@ -149,7 +153,10 @@ class RandCropImage(object):
         j = random.randint(0, img_h - h)
 
         img = img[j:j + h, i:i + w, :]
-        return cv2.resize(img, size)
+        if self.interpolation is None:
+            return cv2.resize(img, size)
+        else:
+            return cv2.resize(img, size, interpolation=self.interpolation)
 
 
 class RandFlipImage(object):
@@ -172,6 +179,18 @@ class RandFlipImage(object):
             return img
 
 
+class AutoAugment(object):
+    """ run ImageNetPolicy on an image array, return the result as ndarray """
+
+    def __init__(self):
+        self.policy = ImageNetPolicy()
+
+    def __call__(self, img):
+        from PIL import Image
+        img = Image.fromarray(np.ascontiguousarray(img))
+        return np.asarray(self.policy(img))
+
+
 class NormalizeImage(object):
     """ normalize image such as substract mean, divide std
     """
diff --git a/ppcls/modeling/architectures/efficientnet.py b/ppcls/modeling/architectures/efficientnet.py
index d6bac79bd8674b6bcb315d512fc095577fc6c97a..8da13a753247d067490ba142fdfa01943c7b058a 100644
--- a/ppcls/modeling/architectures/efficientnet.py
+++ b/ppcls/modeling/architectures/efficientnet.py
@@ -383,7 +383,9 @@ class EfficientNet():
             use_bias=True,
             padding_type=self.padding_type,
             name=name + '_se_expand')
-        se_out = inputs * fluid.layers.sigmoid(x_squeezed)
+        #se_out = inputs * fluid.layers.sigmoid(x_squeezed)
+        se_out = fluid.layers.elementwise_mul(
+            inputs, fluid.layers.sigmoid(x_squeezed), axis=-1)
         return se_out
 
     def extract_features(self, inputs, is_test):
@@ -467,8 +469,8 @@ class BlockDecoder(object):
 
         # Check stride
         cond_1 = ('s' in options and len(options['s']) == 1)
-        cond_2 = ((len(options['s']) == 2)
-                  and (options['s'][0] == options['s'][1]))
+        cond_2 = ((len(options['s']) == 2) and
+                  (options['s'][0] == options['s'][1]))
         assert (cond_1 or cond_2)
 
         return BlockArgs(
diff --git a/ppcls/optimizer/learning_rate.py b/ppcls/optimizer/learning_rate.py
index 197f8af14cf0fc75b86183da193bcf342eba8d95..0227ae43e0f5c99894dcaeb1b51425fbdc9a8c82 100644
--- a/ppcls/optimizer/learning_rate.py
+++ b/ppcls/optimizer/learning_rate.py
@@ -1,16 +1,16 @@
-#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 from __future__ import absolute_import
 from __future__ import division
@@ -130,7 +130,7 @@ class CosineWarmup(object):
         with fluid.layers.control_flow.Switch() as switch:
             with switch.case(epoch < self.warmup_epoch):
                 decayed_lr = self.lr * \
-                        (global_step / (self.step_each_epoch * self.warmup_epoch))
+                    (global_step / (self.step_each_epoch * self.warmup_epoch))
                 fluid.layers.tensor.assign(
                     input=decayed_lr, output=learning_rate)
             with switch.default():
@@ -145,6 +145,65 @@ class CosineWarmup(object):
         return learning_rate
 
 
+class ExponentialWarmup(object):
+    """
+    Exponential learning rate decay with warmup
+    [0, warmup_epoch): linear warmup from 0 up to lr
+    [warmup_epoch, epochs): lr * decay_rate ** floor(steps after warmup / decay steps)
+
+    Args:
+        lr(float): initial learning rate
+        step_each_epoch(int): steps each epoch
+        decay_epochs(float): epochs per decay stage (kept in steps)
+        decay_rate(float): factor applied to lr at every decay stage
+        warmup_epoch(int): epoch num of warmup
+    """
+
+    def __init__(self,
+                 lr,
+                 step_each_epoch,
+                 decay_epochs=2.4,
+                 decay_rate=0.97,
+                 warmup_epoch=5,
+                 **kwargs):
+        super(ExponentialWarmup, self).__init__()
+        self.lr = lr
+        self.step_each_epoch = step_each_epoch
+        self.decay_epochs = decay_epochs * self.step_each_epoch
+        self.decay_rate = decay_rate
+        self.warmup_epoch = fluid.layers.fill_constant(
+            shape=[1],
+            value=float(warmup_epoch),
+            dtype='float32',
+            force_cpu=True)
+
+    def __call__(self):
+        global_step = _decay_step_counter()
+        learning_rate = fluid.layers.tensor.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate")
+        # number of completed epochs at the current step
+        epoch = ops.floor(global_step / self.step_each_epoch)
+        with fluid.layers.control_flow.Switch() as switch:
+            with switch.case(epoch < self.warmup_epoch):
+                decayed_lr = self.lr * \
+                    (global_step / (self.step_each_epoch * self.warmup_epoch))
+                fluid.layers.tensor.assign(
+                    input=decayed_lr, output=learning_rate)
+            with switch.default():
+                rest_step = global_step - self.warmup_epoch * self.step_each_epoch
+                div_res = ops.floor(rest_step / self.decay_epochs)
+                # floor() keeps lr constant within each stage of decay_epochs steps
+                decayed_lr = self.lr * (self.decay_rate**div_res)
+                fluid.layers.tensor.assign(
+                    input=decayed_lr, output=learning_rate)
+
+        return learning_rate
+
+
 class LearningRateBuilder():
     """
     Build learning rate variable
diff --git a/ppcls/utils/logger.py b/ppcls/utils/logger.py
index 5b192c61b46d82cc99e88269c5118c3e9e182b66..12789c7c893a9d48a189b43dfd251c1a88e45f76 100644
--- a/ppcls/utils/logger.py
+++ b/ppcls/utils/logger.py
@@ -19,9 +19,10 @@ import datetime
 from imp import reload
 reload(logging)
 
-logging.basicConfig(level=logging.INFO, 
-                    format="%(asctime)s %(levelname)s: %(message)s",
-                    datefmt = "%Y-%m-%d %H:%M:%S")
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s: %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S")
 
 
 def time_zone(sec, fmt):
@@ -32,22 +33,22 @@ def time_zone(sec, fmt):
 logging.Formatter.converter = time_zone
 _logger = logging.getLogger(__name__)
 
-
-Color= {
-        'RED' : '\033[31m' ,
-        'HEADER' : '\033[35m' , # deep purple
-        'PURPLE' : '\033[95m' ,# purple
-        'OKBLUE' : '\033[94m' ,
-        'OKGREEN' : '\033[92m' ,
-        'WARNING' : '\033[93m' ,
-        'FAIL' : '\033[91m' ,
-        'ENDC' : '\033[0m' }
+Color = {
+    'RED': '\033[31m',
+    'HEADER': '\033[35m',  # deep purple
+    'PURPLE': '\033[95m',  # purple
+    'OKBLUE': '\033[94m',
+    'OKGREEN': '\033[92m',
+    'WARNING': '\033[93m',
+    'FAIL': '\033[91m',
+    'ENDC': '\033[0m'
+}
 
 
 def coloring(message, color="OKGREEN"):
     assert color in Color.keys()
     if os.environ.get('PADDLECLAS_COLORING', False):
-        return Color[color]+str(message)+Color["ENDC"]
+        return Color[color] + str(message) + Color["ENDC"]
     else:
         return message
 
@@ -80,6 +81,17 @@ def error(fmt, *args):
     _logger.error(coloring(fmt, "FAIL"), *args)
 
 
+def scaler(name, value, step, writer):
+    """
+    Record one scalar point (name, value, step) through writer.add_scalar.
+    Usage: Install visualdl: pip3 install visualdl==2.0.0b4
+           and then:
+           visualdl --logdir ./scalar --host 0.0.0.0 --port 8830
+           to preview loss curve in real time.
+    """
+    writer.add_scalar(name, value, step)
+
+
 def advertise():
     """
     Show the advertising message like the following:
@@ -99,12 +111,13 @@ def advertise():
     website = "https://github.com/PaddlePaddle/PaddleClas"
     AD_LEN = 6 + len(max([copyright, ad, website], key=len))
 
-    info(coloring("\n{0}\n{1}\n{2}\n{3}\n{4}\n{5}\n{6}\n{7}\n".format(
-        "=" * (AD_LEN + 4),
-        "=={}==".format(copyright.center(AD_LEN)),
-        "=" * (AD_LEN + 4),
-        "=={}==".format(' ' * AD_LEN),
-        "=={}==".format(ad.center(AD_LEN)),
-        "=={}==".format(' ' * AD_LEN),
-        "=={}==".format(website.center(AD_LEN)),
-        "=" * (AD_LEN + 4), ),"RED"))
+    info(
+        coloring("\n{0}\n{1}\n{2}\n{3}\n{4}\n{5}\n{6}\n{7}\n".format(
+            "=" * (AD_LEN + 4),
+            "=={}==".format(copyright.center(AD_LEN)),
+            "=" * (AD_LEN + 4),
+            "=={}==".format(' ' * AD_LEN),
+            "=={}==".format(ad.center(AD_LEN)),
+            "=={}==".format(' ' * AD_LEN),
+            "=={}==".format(website.center(AD_LEN)),
+            "=" * (AD_LEN + 4), ), "RED"))
diff --git a/ppcls/utils/pretrained.list b/ppcls/utils/pretrained.list
index 91ae4409f9289b0634b4c6fa95ae3e1d75cc42aa..36d70f5a24624dad507d51e3bc7c77eeb5444e9c 100644
--- a/ppcls/utils/pretrained.list
+++ b/ppcls/utils/pretrained.list
@@ -12,6 +12,8 @@ ResNet101_vd
 ResNet152_vd
 ResNet200_vd
 ResNet50_vd_ssld
+ResNet50_vd_ssld_v2
+Fix_ResNet50_vd_ssld_v2
 ResNet101_vd_ssld
 MobileNetV3_large_x0_35
 MobileNetV3_large_x0_5
diff --git a/requirements.txt b/requirements.txt
index 8ba583cbd2ca0cfae5b4527b65d3fa6d6f634ca0..4f89d7567adeef94d28a2db6bc2d0ec8134f211c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ opencv-python
 pillow
 tqdm
 PyYAML
+visualdl >= 2.0.0b
diff --git a/tools/ema.py b/tools/ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbd5af2ed4af566406a562a0759a2d845f86f6b8
--- /dev/null
+++ b/tools/ema.py
@@ -0,0 +1,165 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
+from paddle.fluid.framework import Program, program_guard, name_scope, default_main_program
+from paddle.fluid import unique_name, layers
+
+
+class ExponentialMovingAverage(object):
+    """EMA of model parameters, swapped in for evaluation via apply()."""
+    def __init__(self,
+                 decay=0.999,
+                 thres_steps=None,
+                 zero_debias=False,
+                 name=None):
+        self._decay = decay
+        self._thres_steps = thres_steps
+        self._name = name if name is not None else ''
+        self._decay_var = self._get_ema_decay()
+        # pair each parameter that takes part in averaging with a temp backup var
+        self._params_tmps = []
+        for param in default_main_program().global_block().all_parameters():
+            if param.do_model_average != False:
+                tmp = param.block.create_var(
+                    name=unique_name.generate(".".join(
+                        [self._name + param.name, 'ema_tmp'])),
+                    dtype=param.dtype,
+                    persistable=False,
+                    stop_gradient=True)
+                self._params_tmps.append((param, tmp))
+        # one persistable, zero-initialized EMA accumulator per paired parameter
+        self._ema_vars = {}
+        for param, tmp in self._params_tmps:
+            with param.block.program._optimized_guard(
+                [param, tmp]), name_scope('moving_average'):
+                self._ema_vars[param.name] = self._create_ema_vars(param)
+        # apply_program: back up each parameter into tmp, then overwrite it with its EMA
+        self.apply_program = Program()
+        block = self.apply_program.global_block()
+        with program_guard(main_program=self.apply_program):
+            decay_pow = self._get_decay_pow(block)
+            for param, tmp in self._params_tmps:
+                param = block._clone_variable(param)
+                tmp = block._clone_variable(tmp)
+                ema = block._clone_variable(self._ema_vars[param.name])
+                layers.assign(input=param, output=tmp)
+                # bias correction: divide by (1 - decay ** (global_step + 1))
+                if zero_debias:
+                    ema = ema / (1.0 - decay_pow)
+                layers.assign(input=ema, output=param)
+        # restore_program: copy the tmp backups back into the parameters
+        self.restore_program = Program()
+        block = self.restore_program.global_block()
+        with program_guard(main_program=self.restore_program):
+            for param, tmp in self._params_tmps:
+                tmp = block._clone_variable(tmp)
+                param = block._clone_variable(param)
+                layers.assign(input=tmp, output=param)
+
+    def _get_ema_decay(self):
+        with default_main_program()._lr_schedule_guard():
+            decay_var = layers.tensor.create_global_var(
+                shape=[1],
+                value=self._decay,
+                dtype='float32',
+                persistable=True,
+                name="scheduled_ema_decay_rate")
+            # optional schedule: use min(decay, (1 + thres_steps) / (10 + thres_steps))
+            if self._thres_steps is not None:
+                decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0)
+                with layers.control_flow.Switch() as switch:
+                    with switch.case(decay_t < self._decay):
+                        layers.tensor.assign(decay_t, decay_var)
+                    with switch.default():
+                        layers.tensor.assign(
+                            np.array(
+                                [self._decay], dtype=np.float32),
+                            decay_var)
+        return decay_var
+
+    def _get_decay_pow(self, block):
+        global_steps = layers.learning_rate_scheduler._decay_step_counter()
+        decay_var = block._clone_variable(self._decay_var)
+        decay_pow_acc = layers.elementwise_pow(decay_var, global_steps + 1)
+        return decay_pow_acc
+
+    def _create_ema_vars(self, param):
+        param_ema = layers.create_global_var(
+            name=unique_name.generate(self._name + param.name + '_ema'),
+            shape=param.shape,
+            value=0.0,
+            dtype=param.dtype,
+            persistable=True)
+
+        return param_ema
+
+    def update(self):
+        """
+        Append the EMA update ops. Call this only in the train program.
+        """
+        param_master_emas = []
+        for param, tmp in self._params_tmps:
+            with param.block.program._optimized_guard(
+                [param, tmp]), name_scope('moving_average'):
+                param_ema = self._ema_vars[param.name]
+                if param.name + '.master' in self._ema_vars:
+                    master_ema = self._ema_vars[param.name + '.master']
+                    param_master_emas.append([param_ema, master_ema])
+                else:
+                    ema_t = param_ema * self._decay_var + param * (
+                        1 - self._decay_var)
+                    layers.assign(input=ema_t, output=param_ema)
+        # parameters with a '<name>.master' entry in _ema_vars get their EMA by
+        # casting the master EMA instead of averaging directly (for fp16 params)
+        for param_ema, master_ema in param_master_emas:
+            default_main_program().global_block().append_op(
+                type="cast",
+                inputs={"X": master_ema},
+                outputs={"Out": param_ema},
+                attrs={
+                    "in_dtype": master_ema.dtype,
+                    "out_dtype": param_ema.dtype
+                })
+
+    @signature_safe_contextmanager
+    def apply(self, executor, need_restore=True):
+        """
+        Apply moving average to parameters for evaluation.
+        Args:
+            executor (Executor): The Executor to execute applying.
+            need_restore (bool): Whether to restore parameters after applying.
+        """
+        executor.run(self.apply_program)
+        try:
+            yield
+        finally:
+            if need_restore:
+                self.restore(executor)
+
+    def restore(self, executor):
+        """Restore parameters.
+        Args:
+            executor (Executor): The Executor to execute restoring.
+        """
+        executor.run(self.restore_program)
diff --git a/tools/ema_clean.py b/tools/ema_clean.py
new file mode 100644
index 0000000000000000000000000000000000000000..39570dfa476baefccb571467ec1abfd2758d91ea
--- /dev/null
+++ b/tools/ema_clean.py
@@ -0,0 +1,48 @@
+#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import argparse
+import functools
+import shutil
+import sys
+
+def main():
+    """
+    Copy the EMA weights saved when training with flag use_ema into a clean
+    model dir, removing '_ema_0' from the file names so they can be evaluated.
+
+    Usage: python ema_clean.py ema_model_dir cleaned_model_dir
+    """
+    if len(sys.argv) < 3:
+        sys.exit("Usage: python ema_clean.py ema_model_dir cleaned_model_dir")
+    # argument order follows the usage line: source dir first, target second
+    ema_model_dir, cleaned_model_dir = sys.argv[1], sys.argv[2]
+    if not os.path.exists(cleaned_model_dir):
+        os.makedirs(cleaned_model_dir)
+    for item in os.listdir(ema_model_dir):
+        if 'ema' in item:
+            item_clean = item.replace('_ema_0', '')
+            shutil.copyfile(os.path.join(ema_model_dir, item),
+                            os.path.join(cleaned_model_dir, item_clean))
+        elif 'mean' in item or 'variance' in item:
+            shutil.copyfile(os.path.join(ema_model_dir, item),
+                            os.path.join(cleaned_model_dir, item))
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/program.py b/tools/program.py
index fc44d2b1c6a48662994c8b4772805687649cbf48..0a28dbe6bf29587e73918d0d494504b3683366ad 100644
--- a/tools/program.py
+++ b/tools/program.py
@@ -378,4 +378,4 @@ def run(dataloader, config, net, optimizer=None, epoch=0, mode='train'):
 
     # return top1_acc in order to save the best model
     if mode == 'valid':
-        return metric_list['top1'].avg
+        return metric_list['top1'].avg
\ No newline at end of file
diff --git a/tools/train.py b/tools/train.py
index 5a570ff6e7a50472524aee6b7f11811154a65407..c244dd490297afa1132a794f4a0f3b85578c7408 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -108,4 +108,4 @@ def main(args):
 
 if __name__ == '__main__':
     args = parse_args()
-    main(args)
+    main(args)
\ No newline at end of file