Commit 90fce5a7 authored by: Y yudongxu (Xu Yudong)

Merge branch 'develop' of https://github.com/PaddlePaddle/models into pr_5291452

@@ -12,7 +12,7 @@
## Installation
Running the sample code in this directory requires the develop (or later) version of PaddlePaddle Fluid. If the PaddlePaddle in your environment is older than this, please update it following the [installation guide](http://www.paddlepaddle.org/).
Running the sample code in this directory requires PaddlePaddle Fluid 1.8.0 or later. If the PaddlePaddle in your environment is older than this, please update it following the [installation guide](http://www.paddlepaddle.org/).
## Introduction
@@ -27,16 +27,23 @@ RRPN is a two-stage object detector extended from Faster RCNN and can be used for
### Compiling the custom OPs
**Note:** PaddlePaddle installed via pip is built with GCC 4.8. Since the C++11 ABI of GCC 4.8 is incompatible with that of GCC 5 and later, custom OPs you write must also be compiled with GCC 4.8. To use custom OPs in a GCC 5+ environment, we recommend installing PaddlePaddle with Docker, so that Paddle and the custom OPs are compiled with the same GCC version.
Compile the custom OPs as follows:
Enter the `models/ext_op/src` directory and run the build script:
```
cd models/ext_op/src
sh make.sh ${cuda_path} ${cudnn_path} ${nccl_path}
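# for example, with hypothetical install locations:
# sh make.sh /usr/local/cuda /usr/local/cudnn /usr/local/nccl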
```
where ${cuda_path}, ${cudnn_path}, and ${nccl_path} are the install paths of CUDA, cuDNN, and NCCL, and must be specified on the command line.
After a successful build, `rrpn_lib.so` is generated in the `ext_op/src` directory
After a successful build, `rrpn_lib.so` is generated in the `ext_op/src` directory.
Add the directory containing `rrpn_lib.so`, together with the directory containing libpaddle_framework.so (i.e. the path returned by paddle.sysconfig.get_lib()), to the LD_LIBRARY_PATH environment variable:
```
# if rrpn_lib.so is located at rrpn/models/ext_op/src/, set on Linux:
export LD_LIBRARY_PATH=rrpn/models/ext_op/src/:$( python -c 'import paddle; print(paddle.sysconfig.get_lib())'):$LD_LIBRARY_PATH
```
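To verify the build, you can try loading the library from Python before training (a minimal sketch; adjust the path to wherever rrpn_lib.so was generated):
```
import paddle.fluid as fluid

# Loading the custom OP library registers the RRPN operators with Paddle.
fluid.load_op_library('models/ext_op/src/rrpn_lib.so')
print('rrpn_lib.so loaded')
```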
## Data preparation
### Public dataset
Training is performed on the [ICDAR2015 dataset](https://rrc.cvc.uab.es/?ch=4&com=downloads); you must register on the official site before downloading it.
@@ -58,8 +65,8 @@ dataset/icdar2015/
│ ├── img_112.jpg
| ...
├── ch4_test_localization_transcription_gt
│ ├── img_111.jpg
│ ├── img_112.jpg
│ ├── img_111.txt
│ ├── img_112.txt
| ...
```
### Custom data
@@ -88,7 +95,7 @@ x1, y1, x2, y2, x3, y3, x4, y4, class_name
python train.py \
--model_save_dir=output/ \
--pretrained_model=${path_to_pretrain_model} \
--data_dir=${path_to_data} \
--data_dir=${path_to_icdar2015} \
```
@@ -126,7 +133,7 @@ x1, y1, x2, y2, x3, y3, x4, y4, class_name
```
python eval.py \
--dataset=icdar2015 \
--data_dir=${path_to_icdar2015} \
--pretrained_model=${path_to_trained_model}
```
@@ -143,10 +150,6 @@ RRPN
| [RRPN](https://paddleseg.bj.bcebos.com/deploy/temp/model_final.tar) |8 | 17500 | 0.8048 |
## Model inference and visualization
Inference returns the objects in an image together with their classes. `infer.py` is the main entry point and is invoked as follows:
......
@@ -41,6 +41,13 @@ def _load_state(path):
return state
def _strip_postfix(path):
path, ext = os.path.splitext(path)
assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \
"Unknown postfix {} from weights".format(ext)
return path
def load_params(exe, prog, path):
"""
Load model from the given path.
@@ -50,20 +57,33 @@ def load_params(exe, prog, path):
path (string): URL string or local model path.
"""
if not os.path.exists(path):
path = _strip_postfix(path)
if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')):
raise ValueError("Model pretrain path {} does not "
                 "exist.".format(path))
logger.info('Loading parameters from {}...'.format(path))
def _if_exist(var):
param_exist = os.path.exists(os.path.join(path, var.name))
do_load = param_exist
if do_load:
logger.debug('load weight {}'.format(var.name))
return do_load
ignore_set = set()
state = _load_state(path)
fluid.io.load_vars(exe, path, prog, predicate=_if_exist)
# ignore parameters whose shape mismatches between
# the model and the pretrained weights.
all_var_shape = {}
for block in prog.blocks:
for param in block.all_parameters():
all_var_shape[param.name] = param.shape
ignore_set.update([
name for name, shape in all_var_shape.items()
if name in state and shape != state[name].shape
])
if len(ignore_set) > 0:
for k in ignore_set:
if k in state:
logger.warning('variable {} is not loaded (shape mismatch)'.format(k))
del state[k]
fluid.io.set_program_state(prog, state)
def save(exe, prog, path):
@@ -83,6 +103,7 @@ def save(exe, prog, path):
def load_and_fusebn(exe, prog, path):
"""
Fuse batch norm parameters into scale and bias.
Args:
exe (fluid.Executor): The fluid.Executor object.
prog (fluid.Program): the Program object whose weights will be loaded and fused.
@@ -104,19 +125,12 @@ def load_and_fusebn(exe, prog, path):
# x is any prefix
mean_variances = set()
bn_vars = []
state = None
if os.path.exists(path + '.pdparams'):
state = _load_state(path)
state = _load_state(path)
def check_mean_and_bias(prefix):
m = prefix + 'mean'
v = prefix + 'variance'
if state:
return v in state and m in state
else:
return (os.path.exists(os.path.join(path, m)) and
os.path.exists(os.path.join(path, v)))
return v in state and m in state
has_mean_bias = True
@@ -156,16 +170,14 @@ def load_and_fusebn(exe, prog, path):
bn_vars.append(
[scale_name, bias_name, mean_name, variance_name])
if state:
fluid.io.set_program_state(prog, state)
else:
load_params(exe, prog, path)
if not has_mean_bias:
fluid.io.set_program_state(prog, state)
logger.warning(
"There is no paramters of batch norm in model {}. "
"Skip to fuse batch norm. And load paramters done.".format(path))
return
fluid.load(prog, path, exe)
eps = 1e-5
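# A sketch of the standard BN-fusion identity applied below: for
#   y = scale * (x - mean) / sqrt(var + eps) + bias,
# the statistics fold into an affine transform y = new_scale * x + new_bias with
#   new_scale = scale / sqrt(var + eps)
#   new_bias  = bias - scale * mean / sqrt(var + eps)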
for names in bn_vars:
scale_name, bias_name, mean_name, var_name = names
......
@@ -15,7 +15,7 @@
import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.framework import Variable
fluid.load_op_library('models/ext_op/src/rrpn_lib.so')
fluid.load_op_library('rrpn_lib.so')
def rrpn_target_assign(bbox_pred,
......
@@ -27,7 +27,7 @@ git clone https://github.com/NVlabs/cub.git
nvcc rrpn_generate_proposals_op.cu -c -o rrpn_generate_proposals_op.cu.o -ccbin cc -DPADDLE_WITH_MKLDNN -DPADDLE_WITH_CUDA -DEIGEN_USE_GPU -DPADDLE_USE_DSO -Xcompiler -fPIC -std=c++11 -Xcompiler -fPIC -w --expt-relaxed-constexpr -O3 -DNVCC \
-I ${include_dir} \
-I ${include_dir}/third_party \
-I ${include_dir}/third_party \
-I ${CUDA}/include \
-I ${CUDNN}/include \
-I ${NCCL}/include \
......
@@ -165,8 +165,7 @@ public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
std::unique_ptr<T> Apply() const override {
std::unique_ptr<T> op(new T);
void Apply(GradOpPtr<T> op) const override {
op->SetType("rrpn_rotated_roi_align_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("ROIs", this->Input("ROIs"));
@@ -175,12 +174,11 @@ protected:
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
return op;
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
RRPNRotatedRoiAlignGradNoNeedBufVarsInferer, "X");
DECLARE_NO_NEED_BUFFER_VARS_INFERER(RRPNRotatedRoiAlignGradNoNeedBufVarsInferer,
"X");
} // namespace operators
} // namespace paddle
......
@@ -96,7 +96,6 @@ def RRPNData(mode,
continue
batch_out.append(datas)
end = time.time()
#print('reader time:', end - start)
if len(batch_out) == batch_size:
yield batch_out
count += 1
......
@@ -101,7 +101,6 @@ class ICDAR2015Dataset(object):
elif edge2 >= edge1:
width = edge2
height = edge1
# print pt2[0], pt3[0]
if pt2[0] - pt3[0] != 0:
angle = -np.arctan(
float(pt2[1] - pt3[1]) /
@@ -160,7 +159,6 @@ class ICDAR2015Dataset(object):
else:
hard_boxes.append([x_ctr, y_ctr, width, height, angle])
#print(easy_boxes)
if self.mode == 'train':
boxes.extend(easy_boxes)
# hard box only get 1/3 for train
@@ -173,8 +171,6 @@ class ICDAR2015Dataset(object):
is_difficult = [0] * len(easy_boxes)
is_difficult.extend([1] * int(len(hard_boxes)))
len_of_bboxes = len(boxes)
#is_difficult = [0] * len(easy_boxes)
#is_difficult.extend([1] * int(len(hard_boxes)))
is_difficult = np.array(is_difficult).reshape(
1, len_of_bboxes).astype(np.int32)
if self.mode == 'train':
@@ -221,11 +217,9 @@ class ICDAR2017Dataset(object):
def __init__(self, mode):
print('Creating: {}'.format(cfg.dataset))
self.name = cfg.data_dir
#print('**************', self.name)
self.mode = mode
data_path = DatasetPath(mode, self.name)
data_dir = data_path.get_data_dir()
#print("&**************", data_dir)
file_list = data_path.get_file_list()
self.image_dir = data_dir
self.gt_dir = file_list
@@ -245,15 +239,12 @@ class ICDAR2017Dataset(object):
labels_map = get_labels_maps()
for image in image_list:
prefix = image[:-4]
#print(image)
if image.split('.')[-1] not in post_fix:
continue
img_name = os.path.join(self.image_dir, image)
gt_name = os.path.join(self.gt_dir, 'gt_' + prefix + '.txt')
gt_classes = []
#boxes = []
#hard_boxes = []
boxes = []
gt_obj = open(gt_name, 'r', encoding='UTF-8-sig')
gt_txt = gt_obj.read()
@@ -293,7 +284,6 @@ class ICDAR2017Dataset(object):
elif edge2 >= edge1:
width = edge2
height = edge1
# print pt2[0], pt3[0]
if pt2[0] - pt3[0] != 0:
angle = -np.arctan(
float(pt2[1] - pt3[1]) /
@@ -312,7 +302,6 @@ class ICDAR2017Dataset(object):
else:
boxes.append([x_ctr, y_ctr, width, height, angle])
len_of_bboxes = len(boxes)
#print(len_of_bboxes)
is_difficult = np.zeros((len_of_bboxes, 1), dtype=np.int32)
if self.mode == 'train':
gt_boxes = np.zeros((len_of_bboxes, 5), dtype=np.int32)
@@ -332,7 +321,6 @@ class ICDAR2017Dataset(object):
boxes[idx][3], boxes[idx][4], boxes[idx][5],
boxes[idx][6], boxes[idx][7]
]
#gt_classes[idx] = 1
if gt_boxes.shape[0] <= 0:
continue
gt_boxes = gt_boxes.astype(np.float64)
......
@@ -154,7 +154,7 @@ def parse_args():
add_arg('pixel_means', float, [0.485, 0.456, 0.406], "pixel mean")
add_arg('nms_thresh', float, 0.3, "NMS threshold.")
add_arg('score_thresh', float, 0.01, "score threshold for NMS.")
add_arg('snapshot_stride', int, 1000, "save model every snapshot stride.")
add_arg('snapshot_iter', int, 1000, "save model every snapshot iter.")
# SINGLE EVAL AND DRAW
add_arg('draw_threshold', float, 0.8, "Confidence threshold to draw bbox.")
add_arg('image_path', str, 'ICDAR2015/tmp/', "The image path used to inference and visualize.")
......
# VideoTag: PaddlePaddle's Large-Scale Video Classification Model
---
## Contents
- [Model overview](#model-overview)
- [Installation](#installation)
- [Data preparation](#data-preparation)
- [Model inference](#model-inference)
- [Model fine-tuning](#model-fine-tuning)
- [Reference papers](#reference-papers)
## Model overview
VideoTag, PaddlePaddle's large-scale video classification model, is trained on tens of millions of samples from Baidu's short-video business. It supports 3000 practical tags drawn from industrial practice, generalizes well, and suits very large-scale (tens of millions to billions of videos) short-video classification. VideoTag uses two-stage modeling: image modeling and sequence learning. In the first stage, a small set of video samples (on the order of 100K) trains a large-scale video feature extractor (Extractor); in the second stage, tens of millions of samples train a predictor (Predictor), enabling industrial application on very large short-video collections. The pipeline is illustrated below.
<p align="center">
<img src="video_tag.png" height=220 width=800 hspace='10'/> <br />
VideoTag two-stage pipeline
</p>
- Data processing: a video is a set of images (frames) arranged in a particular order. The classification task first decodes a short video and then feeds the resulting frame sequence into VideoTag for training and prediction.
- Image modeling: a small number of samples per class is drawn uniformly from the training data to form a training set of roughly 100K videos. A TSN network is trained on it, and features are extracted for all video frames from the layer just before TSN's classification layer. Each frame thus becomes a feature vector, and a video becomes a feature sequence.
- Sequence learning: Attention clusters, LSTM, and NeXtVLAD model the feature sequences and learn how the features combine, further improving accuracy. Since sequence learning is much cheaper than image modeling, several complementary sequence models can be fused. The sample code uses only the Attention\_LSTM network for sequence prediction.
- Prediction: the results of multiple models are fused to classify the video, further improving accuracy.
## Installation
Running the sample code requires PaddlePaddle >= 1.7.0; see the [installation guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.7/install/index_cn.html) to install or update PaddlePaddle.
- Environment dependencies:
```
CUDA >= 9.0
cudnn >= 7.5
OpenCV >= 4.1.0 : pip install opencv-python
```
## Data preparation
- Pretrained weights: we provide pretrained [TSN](https://videotag.bj.bcebos.com/video_tag_tsn.tar) and [AttentionLSTM](https://videotag.bj.bcebos.com/video_tag_lstm.tar) weights. Download and extract them, and place the parameter files under the weights directory, structured as follows:
```
video_tag
├──weights
├── attention_lstm.pdmodel
├── attention_lstm.pdopt
├── attention_lstm.pdparams
├── tsn.pdmodel
├── tsn.pdopt
└── tsn.pdparams
```
- Sample videos: we provide [sample videos](https://videotag.bj.bcebos.com/mp4.tar) for testing. Download and extract them, and place the video files under the video\_tag/data/mp4 directory, structured as follows:
```
video_tag
├──data
├── mp4
├── 1.mp4
└── 2.mp4
```
- Supported input video formats: mp4, mkv, and webm;
- The model uniformly samples 300 frames from the input video for prediction. For long videos, we recommend trimming to the relevant segment first to speed up prediction (see the sampling sketch below).
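For reference, uniform sampling can be sketched with OpenCV as follows (a hypothetical helper for illustration, not part of the VideoTag code; the function name and defaults are assumptions):
```
import cv2
import numpy as np

def sample_frames_uniform(video_path, num_frames=300):
    # Decode the video and keep num_frames frames at evenly spaced indices.
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    wanted = set(np.linspace(0, max(total - 1, 0), num_frames).astype(int))
    frames, pos = [], 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if pos in wanted:
            frames.append(frame)
        pos += 1
    cap.release()
    return frames
```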
## Model inference
Start inference by running `bash run_TSN_LSTM.sh`.
- You can edit video\_tag/data/tsn.list to specify the list of files to run inference on;
- --filelist specifies the input list file, default video\_tag/data/tsn.list;
- --extractor\_weights specifies the path of the feature-extractor parameters, default video\_tag/weights/tsn;
- --predictor\_weights specifies the path of the predictor parameters, default video\_tag/weights/attention\_lstm;
- --use\_gpu specifies whether to run inference on GPU, which is the default. For a short video of about 10s, GPU inference takes roughly 4s;
- --save\_dir specifies where prediction results are stored, default video\_tag/data/results; results are saved as JSON in the following format (see the parsing sketch after this list):
```
[file_path,
{"class_name": class_name1, "probability": probability1, "class_id": class_id1},
{"class_name": class_name2, "probability": probability2, "class_id": class_id2},
...
]
```
- --label\_file specifies the label file path, default video\_tag/label\_3396.txt;
- Model configuration lives in the yaml files under the video\_tag/configs directory (see the loading sketch after this list).
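For illustration, a saved result file can be read back like this (a minimal sketch; the file name follows the result<index>.json naming used by the metrics code, under the default save\_dir):
```
import io
import json

# Each result file holds [file_path, {...}, {...}, ...] as described above.
with io.open('data/results/result0.json', encoding='utf-8') as f:
    result = json.load(f)

video_path, predictions = result[0], result[1:]
for pred in predictions:
    print(pred['class_name'], pred['probability'])
```
Similarly, a config such as the TSN yaml shown later on this page can be loaded into a plain dict (a sketch assuming PyYAML is installed and a hypothetical file path; the repository wraps configs in AttrDict-style objects, see the AttrDict helper further down):
```
import yaml

with open('configs/tsn.yaml') as f:
    cfg = yaml.safe_load(f)
print(cfg['MODEL']['name'], cfg['INFER']['seg_num'])  # TSN 300
```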
## Model fine-tuning
- The TSN model in VideoTag only outputs video features and does not produce final classification results. To fine-tune it, refer to the [TSN video classification model](../../models/tsn/README.md) in the PaddleCV video library and modify the model file accordingly.
- The attention\_lstm model in VideoTag takes only video features as input and needs no audio features. To fine-tune it, refer to the [AttentionLSTM video classification model](../../models/attention_lstm/README.md) in the PaddleCV video library and modify the model file accordingly.
## Reference papers
- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859), Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, Luc Van Gool
- [Beyond Short Snippets: Deep Networks for Video Classification](https://arxiv.org/abs/1503.08909), Joe Yue-Hei Ng, Matthew Hausknecht, Sudheendra Vijayanarasimhan, Oriol Vinyals, Rajat Monga, George Toderici
MODEL:
name: "AttentionLSTM"
dataset: None
bone_nework: None
drop_rate: 0.5
feature_num: 2
feature_names: ['rgb']
feature_dims: [2048]
embedding_size: 1024
lstm_size: 512
num_classes: 3396
topk: 20
INFER:
batch_size: 1
MODEL:
name: "TSN"
format: "mp4"
num_classes: 400
seglen: 1
image_mean: [0.485, 0.456, 0.406]
image_std: [0.229, 0.224, 0.225]
num_layers: 50
topk: 5
INFER:
seg_num: 300
short_size: 256
target_size: 224
num_reader_threads: 1
buf_size: 1024
batch_size: 1
kinetics_labels: None
video_path: ""
filelist: "./data/tsn.list"
This diff is collapsed.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
import os
import io
import logging
import numpy as np
import json
from metrics.youtube8m import eval_util as youtube8m_metrics
logger = logging.getLogger(__name__)
class Metrics(object):
def __init__(self, name, mode, metrics_args):
"""Not implemented"""
pass
def calculate_and_log_out(self, fetch_list, info=''):
"""Not implemented"""
pass
def accumulate(self, fetch_list, info=''):
"""Not implemented"""
pass
def finalize_and_log_out(self, info='', savedir='./'):
"""Not implemented"""
pass
def reset(self):
"""Not implemented"""
pass
class Youtube8mMetrics(Metrics):
def __init__(self, name, mode, metrics_args):
self.name = name
self.mode = mode
self.num_classes = metrics_args['MODEL']['num_classes']
self.topk = metrics_args['MODEL']['topk']
self.calculator = youtube8m_metrics.EvaluationMetrics(self.num_classes,
self.topk)
if self.mode == 'infer':
self.infer_results = []
def calculate_and_log_out(self, fetch_list, info=''):
loss = np.mean(np.array(fetch_list[0]))
pred = np.array(fetch_list[1])
label = np.array(fetch_list[2])
hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label)
perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate(pred,
label)
gap = youtube8m_metrics.calculate_gap(pred, label)
logger.info(info + ' , loss = {0}, Hit@1 = {1}, PERR = {2}, GAP = {3}'.format(\
'%.6f' % loss, '%.2f' % hit_at_one, '%.2f' % perr, '%.2f' % gap))
def accumulate(self, fetch_list, info=''):
if self.mode == 'infer':
predictions = np.array(fetch_list[0])
video_id = fetch_list[1]
for i in range(len(predictions)):
topk_inds = predictions[i].argsort()[-self.topk:]
topk_inds = topk_inds[::-1]
preds = predictions[i][topk_inds]
self.infer_results.append(
(video_id[i], topk_inds.tolist(), preds.tolist()))
else:
loss = np.array(fetch_list[0])
pred = np.array(fetch_list[1])
label = np.array(fetch_list[2])
self.calculator.accumulate(loss, pred, label)
def finalize_and_log_out(self,
info='',
savedir='./data/results',
label_file='./label_3396.txt'):
if self.mode == 'infer':
for index, item in enumerate(self.infer_results):
video_id = item[0]
logger.info(
'========video_id [ {} ] , topk({}) preds: ========\n'.
format(video_id, self.topk))
f = io.open(label_file, "r", encoding="utf-8")
fl = f.readlines()
f.close()
res_list = []
res_list.append(video_id)
for i in range(len(item[1])):
class_id = item[1][i]
class_prob = item[2][i]
class_name = fl[class_id].split('\n')[0]
print('class_id: {},'.format(class_id), 'class_name:',
class_name,
', probability: {} \n'.format(class_prob))
save_dict = {
"'class_id": class_id,
"class_name": class_name,
"probability": class_prob
}
res_list.append(save_dict)
# save infer result into output dir
with io.open(
os.path.join(savedir, 'result' + str(index) + '.json'),
'w',
encoding='utf-8') as f:
f.write(json.dumps(res_list, ensure_ascii=False))
else:
epoch_info_dict = self.calculator.get()
logger.info(info + '\tavg_hit_at_one: {0},\tavg_perr: {1},\tavg_loss :{2},\taps: {3},\tgap:{4}'\
.format(epoch_info_dict['avg_hit_at_one'], epoch_info_dict['avg_perr'], \
epoch_info_dict['avg_loss'], epoch_info_dict['aps'], epoch_info_dict['gap']))
def reset(self):
self.calculator.clear()
if self.mode == 'infer':
self.infer_results = []
class MetricsZoo(object):
def __init__(self):
self.metrics_zoo = {}
def regist(self, name, metrics):
assert metrics.__base__ == Metrics, "Unknown metrics type {}".format(
    type(metrics))
self.metrics_zoo[name] = metrics
def get(self, name, mode, cfg):
for k, v in self.metrics_zoo.items():
if k == name:
return v(name, mode, cfg)
raise MetricsNotFoundError(name, self.metrics_zoo.keys())
# singleton metrics_zoo
metrics_zoo = MetricsZoo()
def regist_metrics(name, metrics):
metrics_zoo.regist(name, metrics)
def get_metrics(name, mode, cfg):
return metrics_zoo.get(name, mode, cfg)
# sort by alphabet
regist_metrics("ATTENTIONCLUSTER", Youtube8mMetrics)
regist_metrics("ATTENTIONLSTM", Youtube8mMetrics)
regist_metrics("NEXTVLAD", Youtube8mMetrics)
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate or keep track of the interpolated average precision.
It provides an interface for calculating interpolated average precision for an
entire list or the top-n ranked items. For the definition of the
(non-)interpolated average precision:
http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf
Example usages:
1) Use it as a static function call to directly calculate average precision for
a short ranked list in the memory.
```
import random
p = np.array([random.random() for _ in xrange(10)])
a = np.array([random.choice([0, 1]) for _ in xrange(10)])
ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)
```
2) Use it as an object for long ranked list that cannot be stored in memory or
the case where partial predictions can be observed at a time (Tensorflow
predictions). In this case, we first call the function accumulate many times
to process parts of the ranked list. After processing all the parts, we call
peek_interpolated_ap_at_n.
```
p1 = np.array([random.random() for _ in xrange(5)])
a1 = np.array([random.choice([0, 1]) for _ in xrange(5)])
p2 = np.array([random.random() for _ in xrange(5)])
a2 = np.array([random.choice([0, 1]) for _ in xrange(5)])
# interpolated average precision at 10 using 1000 break points
calculator = average_precision_calculator.AveragePrecisionCalculator(10)
calculator.accumulate(p1, a1)
calculator.accumulate(p2, a2)
ap3 = calculator.peek_ap_at_n()
```
"""
import heapq
import random
import numbers
import numpy
class AveragePrecisionCalculator(object):
"""Calculate the average precision and average precision at n."""
def __init__(self, top_n=None):
"""Construct an AveragePrecisionCalculator to calculate average precision.
This class is used to calculate the average precision for a single label.
Args:
top_n: A positive Integer specifying the average precision at n, or
None to use all provided data points.
Raises:
ValueError: An error occurred when the top_n is not a positive integer.
"""
if not ((isinstance(top_n, int) and top_n > 0) or top_n is None):
raise ValueError("top_n must be a positive integer or None.")
self._top_n = top_n # average precision at n
self._total_positives = 0  # total number of positives seen so far
self._heap = [] # max heap of (prediction, actual)
@property
def heap_size(self):
"""Gets the heap size maintained in the class."""
return len(self._heap)
@property
def num_accumulated_positives(self):
"""Gets the number of positive samples that have been accumulated."""
return self._total_positives
def accumulate(self, predictions, actuals, num_positives=None):
"""Accumulate the predictions and their ground truth labels.
After the function call, we may call peek_ap_at_n to actually calculate
the average precision.
Note predictions and actuals must have the same shape.
Args:
predictions: a list storing the prediction scores.
actuals: a list storing the ground truth labels. Any value
larger than 0 will be treated as positives, otherwise as negatives.
num_positives: If the 'predictions' and 'actuals' inputs aren't complete,
then it's possible some true positives were missed in them. In that case,
you can provide 'num_positives' in order to accurately track recall.
Raises:
ValueError: An error occurred when the format of the input is not the
numpy 1-D array or the shape of predictions and actuals does not match.
"""
if len(predictions) != len(actuals):
raise ValueError(
"the shape of predictions and actuals does not match.")
if num_positives is not None:
if not isinstance(num_positives,
numbers.Number) or num_positives < 0:
raise ValueError(
"'num_positives' was provided but it wan't a nonzero number."
)
if num_positives is not None:
self._total_positives += num_positives
else:
self._total_positives += numpy.size(numpy.where(actuals > 0))
topk = self._top_n
heap = self._heap
for i in range(numpy.size(predictions)):
if topk is None or len(heap) < topk:
heapq.heappush(heap, (predictions[i], actuals[i]))
else:
if predictions[i] > heap[0][0]: # heap[0] is the smallest
heapq.heappop(heap)
heapq.heappush(heap, (predictions[i], actuals[i]))
def clear(self):
"""Clear the accumulated predictions."""
self._heap = []
self._total_positives = 0
def peek_ap_at_n(self):
"""Peek the non-interpolated average precision at n.
Returns:
The non-interpolated average precision at n (default 0).
If n is larger than the length of the ranked list,
the average precision will be returned.
"""
if self.heap_size <= 0:
return 0
predlists = numpy.array(list(zip(*self._heap)))
ap = self.ap_at_n(
predlists[0],
predlists[1],
n=self._top_n,
total_num_positives=self._total_positives)
return ap
@staticmethod
def ap(predictions, actuals):
"""Calculate the non-interpolated average precision.
Args:
predictions: a numpy 1-D array storing the sparse prediction scores.
actuals: a numpy 1-D array storing the ground truth labels. Any value
larger than 0 will be treated as positives, otherwise as negatives.
Returns:
The non-interpolated average precision at n.
If n is larger than the length of the ranked list,
the average precision will be returned.
Raises:
ValueError: An error occurred when the format of the input is not the
numpy 1-D array or the shape of predictions and actuals does not match.
"""
return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None)
@staticmethod
def ap_at_n(predictions, actuals, n=20, total_num_positives=None):
"""Calculate the non-interpolated average precision.
Args:
predictions: a numpy 1-D array storing the sparse prediction scores.
actuals: a numpy 1-D array storing the ground truth labels. Any value
larger than 0 will be treated as positives, otherwise as negatives.
n: the top n items to be considered in ap@n.
total_num_positives : (optionally) you can specify the number of total
positive
in the list. If specified, it will be used in calculation.
Returns:
The non-interpolated average precision at n.
If n is larger than the length of the ranked list,
the average precision will be returned.
Raises:
ValueError: An error occurred when
1) the format of the input is not the numpy 1-D array;
2) the shape of predictions and actuals does not match;
3) the input n is not a positive integer.
"""
if len(predictions) != len(actuals):
raise ValueError(
"the shape of predictions and actuals does not match.")
if n is not None:
if not isinstance(n, int) or n <= 0:
raise ValueError("n must be 'None' or a positive integer."
" It was '%s'." % n)
ap = 0.0
predictions = numpy.array(predictions)
actuals = numpy.array(actuals)
# add a shuffler to avoid overestimating the ap
predictions, actuals = AveragePrecisionCalculator._shuffle(predictions,
actuals)
sortidx = sorted(
range(len(predictions)), key=lambda k: predictions[k], reverse=True)
if total_num_positives is None:
numpos = numpy.size(numpy.where(actuals > 0))
else:
numpos = total_num_positives
if numpos == 0:
return 0
if n is not None:
numpos = min(numpos, n)
delta_recall = 1.0 / numpos
poscount = 0.0
# calculate the ap
r = len(sortidx)
if n is not None:
r = min(r, n)
for i in range(r):
if actuals[sortidx[i]] > 0:
poscount += 1
ap += poscount / (i + 1) * delta_recall
return ap
@staticmethod
def _shuffle(predictions, actuals):
random.seed(0)
suffidx = random.sample(range(len(predictions)), len(predictions))
predictions = predictions[suffidx]
actuals = actuals[suffidx]
return predictions, actuals
@staticmethod
def _zero_one_normalize(predictions, epsilon=1e-7):
"""Normalize the predictions to the range between 0.0 and 1.0.
For some predictions like SVM predictions, we need to normalize them before
calculate the interpolated average precision. The normalization will not
change the rank in the original list and thus won't change the average
precision.
Args:
predictions: a numpy 1-D array storing the sparse prediction scores.
epsilon: a small constant to avoid denominator being zero.
Returns:
The normalized prediction.
"""
denominator = numpy.max(predictions) - numpy.min(predictions)
ret = (predictions - numpy.min(predictions)) / numpy.maximum(denominator,
                                                             epsilon)
return ret
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Provides functions to help with evaluating models."""
import datetime
import numpy
from . import mean_average_precision_calculator as map_calculator
from . import average_precision_calculator as ap_calculator
def flatten(l):
""" Merges a list of lists into a single list. """
return [item for sublist in l for item in sublist]
def calculate_hit_at_one(predictions, actuals):
"""Performs a local (numpy) calculation of the hit at one.
Args:
predictions: Matrix containing the outputs of the model.
Dimensions are 'batch' x 'num_classes'.
actuals: Matrix containing the ground truth labels.
Dimensions are 'batch' x 'num_classes'.
Returns:
float: The average hit at one across the entire batch.
"""
top_prediction = numpy.argmax(predictions, 1)
hits = actuals[numpy.arange(actuals.shape[0]), top_prediction]
return numpy.average(hits)
def calculate_precision_at_equal_recall_rate(predictions, actuals):
"""Performs a local (numpy) calculation of the PERR.
Args:
predictions: Matrix containing the outputs of the model.
Dimensions are 'batch' x 'num_classes'.
actuals: Matrix containing the ground truth labels.
Dimensions are 'batch' x 'num_classes'.
Returns:
float: The average precision at equal recall rate across the entire batch.
"""
aggregated_precision = 0.0
num_videos = actuals.shape[0]
for row in numpy.arange(num_videos):
num_labels = int(numpy.sum(actuals[row]))
top_indices = numpy.argpartition(predictions[row],
-num_labels)[-num_labels:]
item_precision = 0.0
for label_index in top_indices:
if predictions[row][label_index] > 0:
item_precision += actuals[row][label_index]
item_precision /= top_indices.size
aggregated_precision += item_precision
aggregated_precision /= num_videos
return aggregated_precision
def calculate_gap(predictions, actuals, top_k=20):
"""Performs a local (numpy) calculation of the global average precision.
Only the top_k predictions are taken for each of the videos.
Args:
predictions: Matrix containing the outputs of the model.
Dimensions are 'batch' x 'num_classes'.
actuals: Matrix containing the ground truth labels.
Dimensions are 'batch' x 'num_classes'.
top_k: How many predictions to use per video.
Returns:
float: The global average precision.
"""
gap_calculator = ap_calculator.AveragePrecisionCalculator()
sparse_predictions, sparse_labels, num_positives = top_k_by_class(
predictions, actuals, top_k)
gap_calculator.accumulate(
flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives))
return gap_calculator.peek_ap_at_n()
def top_k_by_class(predictions, labels, k=20):
"""Extracts the top k predictions for each video, sorted by class.
Args:
predictions: A numpy matrix containing the outputs of the model.
Dimensions are 'batch' x 'num_classes'.
k: the top k non-zero entries to preserve in each prediction.
Returns:
A tuple (predictions,labels, true_positives). 'predictions' and 'labels'
are lists of lists of floats. 'true_positives' is a list of scalars. The
lengths of the lists are equal to the number of classes. The entries in the
predictions variable are probability predictions, and
the corresponding entries in the labels variable are the ground truth for
those predictions. The entries in 'true_positives' are the number of true
positives for each class in the ground truth.
Raises:
ValueError: An error occurred when k is not a positive integer.
"""
if k <= 0:
raise ValueError("k must be a positive integer.")
k = min(k, predictions.shape[1])
num_classes = predictions.shape[1]
prediction_triplets = []
for video_index in range(predictions.shape[0]):
prediction_triplets.extend(
top_k_triplets(predictions[video_index], labels[video_index], k))
out_predictions = [[] for v in range(num_classes)]
out_labels = [[] for v in range(num_classes)]
for triplet in prediction_triplets:
out_predictions[triplet[0]].append(triplet[1])
out_labels[triplet[0]].append(triplet[2])
out_true_positives = [numpy.sum(labels[:, i]) for i in range(num_classes)]
return out_predictions, out_labels, out_true_positives
def top_k_triplets(predictions, labels, k=20):
"""Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in
(prediction, class) format"""
m = len(predictions)
k = min(k, m)
indices = numpy.argpartition(predictions, -k)[-k:]
return [(index, predictions[index], labels[index]) for index in indices]
class EvaluationMetrics(object):
"""A class to store the evaluation metrics."""
def __init__(self, num_class, top_k):
"""Construct an EvaluationMetrics object to store the evaluation metrics.
Args:
num_class: A positive integer specifying the number of classes.
top_k: A positive integer specifying how many predictions are considered per video.
Raises:
ValueError: An error occurred when MeanAveragePrecisionCalculator cannot
be constructed.
"""
self.sum_hit_at_one = 0.0
self.sum_perr = 0.0
self.sum_loss = 0.0
self.map_calculator = map_calculator.MeanAveragePrecisionCalculator(
num_class)
self.global_ap_calculator = ap_calculator.AveragePrecisionCalculator()
self.top_k = top_k
self.num_examples = 0
def accumulate(self, loss, predictions, labels):
"""Accumulate the metrics calculated locally for this mini-batch.
Args:
predictions: A numpy matrix containing the outputs of the model.
Dimensions are 'batch' x 'num_classes'.
labels: A numpy matrix containing the ground truth labels.
Dimensions are 'batch' x 'num_classes'.
loss: A numpy array containing the loss for each sample.
Returns:
dictionary: A dictionary storing the metrics for the mini-batch.
Raises:
ValueError: An error occurred when the shape of predictions and actuals
does not match.
"""
batch_size = labels.shape[0]
mean_hit_at_one = calculate_hit_at_one(predictions, labels)
mean_perr = calculate_precision_at_equal_recall_rate(predictions,
labels)
mean_loss = numpy.mean(loss)
# Take the top 20 predictions.
sparse_predictions, sparse_labels, num_positives = top_k_by_class(
predictions, labels, self.top_k)
self.map_calculator.accumulate(sparse_predictions, sparse_labels,
num_positives)
self.global_ap_calculator.accumulate(
flatten(sparse_predictions),
flatten(sparse_labels), sum(num_positives))
self.num_examples += batch_size
self.sum_hit_at_one += mean_hit_at_one * batch_size
self.sum_perr += mean_perr * batch_size
self.sum_loss += mean_loss * batch_size
return {
"hit_at_one": mean_hit_at_one,
"perr": mean_perr,
"loss": mean_loss
}
def get(self):
"""Calculate the evaluation metrics for the whole epoch.
Raises:
ValueError: If no examples were accumulated.
Returns:
dictionary: a dictionary storing the evaluation metrics for the epoch. The
dictionary has the fields: avg_hit_at_one, avg_perr, avg_loss, and
aps (default nan).
"""
if self.num_examples <= 0:
raise ValueError("total_sample must be positive.")
avg_hit_at_one = self.sum_hit_at_one / self.num_examples
avg_perr = self.sum_perr / self.num_examples
avg_loss = self.sum_loss / self.num_examples
aps = self.map_calculator.peek_map_at_n()
gap = self.global_ap_calculator.peek_ap_at_n()
return {
"avg_hit_at_one": avg_hit_at_one,
"avg_perr": avg_perr,
"avg_loss": avg_loss,
"aps": aps,
"gap": gap
}
def clear(self):
"""Clear the evaluation metrics and reset the EvaluationMetrics object."""
self.sum_hit_at_one = 0.0
self.sum_perr = 0.0
self.sum_loss = 0.0
self.map_calculator.clear()
self.global_ap_calculator.clear()
self.num_examples = 0
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate the mean average precision.
It provides an interface for calculating mean average precision
for an entire list or the top-n ranked items.
Example usages:
We first call the function accumulate many times to process parts of the ranked
list. After processing all the parts, we call peek_map_at_n
to calculate the mean average precision.
```
import random
p = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)])
a = np.array([[random.choice([0, 1]) for _ in xrange(50)]
for _ in xrange(1000)])
# mean average precision for 50 classes.
calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(
num_class=50)
calculator.accumulate(p, a)
aps = calculator.peek_map_at_n()
```
"""
import numpy
from . import average_precision_calculator
class MeanAveragePrecisionCalculator(object):
"""This class is to calculate mean average precision.
"""
def __init__(self, num_class):
"""Construct a calculator to calculate the (macro) average precision.
Args:
num_class: A positive Integer specifying the number of classes.
top_n_array: A list of positive integers specifying the top n for each
class. The top n in each class will be used to calculate its average
precision at n.
The size of the array must be num_class.
Raises:
ValueError: An error occurred when num_class is not a positive integer;
or the top_n_array is not a list of positive integers.
"""
if not isinstance(num_class, int) or num_class <= 1:
raise ValueError("num_class must be an integer larger than 1.")
self._ap_calculators = [] # member of AveragePrecisionCalculator
self._num_class = num_class # total number of classes
for i in range(num_class):
self._ap_calculators.append(
average_precision_calculator.AveragePrecisionCalculator())
def accumulate(self, predictions, actuals, num_positives=None):
"""Accumulate the predictions and their ground truth labels.
Args:
predictions: A list of lists storing the prediction scores. The outer
dimension corresponds to classes.
actuals: A list of lists storing the ground truth labels. The dimensions
should correspond to the predictions input. Any value
larger than 0 will be treated as positives, otherwise as negatives.
num_positives: If provided, it is a list of numbers representing the
number of true positives for each class. If not provided, the number of
true positives will be inferred from the 'actuals' array.
Raises:
ValueError: An error occurred when the shape of predictions and actuals
does not match.
"""
if not num_positives:
num_positives = [None] * len(predictions)
calculators = self._ap_calculators
for i in range(len(predictions)):
calculators[i].accumulate(predictions[i], actuals[i],
num_positives[i])
def clear(self):
for calculator in self._ap_calculators:
calculator.clear()
def is_empty(self):
return ([calculator.heap_size for calculator in self._ap_calculators] ==
[0 for _ in range(self._num_class)])
def peek_map_at_n(self):
"""Peek the non-interpolated mean average precision at n.
Returns:
An array of non-interpolated average precision at n (default 0) for each
class.
"""
aps = [
self._ap_calculators[i].peek_ap_at_n()
for i in range(self._num_class)
]
return aps
from .model import regist_model, get_model
from .attention_lstm import AttentionLSTM
from .tsn import TSN
# regist models, sort by alphabet
regist_model("AttentionLSTM", AttentionLSTM)
regist_model("TSN", TSN)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
from ..model import ModelBase
from .lstm_attention import LSTMAttentionModel
import logging
logger = logging.getLogger(__name__)
__all__ = ["AttentionLSTM"]
class AttentionLSTM(ModelBase):
def __init__(self, name, cfg, mode='train'):
super(AttentionLSTM, self).__init__(name, cfg, mode)
self.get_config()
def get_config(self):
# get model configs
self.feature_num = self.cfg.MODEL.feature_num
self.feature_names = self.cfg.MODEL.feature_names
self.feature_dims = self.cfg.MODEL.feature_dims
self.num_classes = self.cfg.MODEL.num_classes
self.embedding_size = self.cfg.MODEL.embedding_size
self.lstm_size = self.cfg.MODEL.lstm_size
self.drop_rate = self.cfg.MODEL.drop_rate
# get mode configs
self.batch_size = self.get_config_from_sec(self.mode, 'batch_size', 1)
self.num_gpus = self.get_config_from_sec(self.mode, 'num_gpus', 1)
def build_input(self, use_dataloader):
self.feature_input = []
for name, dim in zip(self.feature_names, self.feature_dims):
self.feature_input.append(
fluid.data(
shape=[None, dim], lod_level=1, dtype='float32', name=name))
#video_tag without label_input
if use_dataloader:
assert self.mode != 'infer', \
'dataloader is not recommended in infer mode, please set use_dataloader to False.'
self.dataloader = fluid.io.DataLoader.from_generator(
feed_list=self.feature_input, #video_tag
capacity=8,
iterable=True)
def build_model(self):
att_outs = []
for i, (input_dim, feature
) in enumerate(zip(self.feature_dims, self.feature_input)):
att = LSTMAttentionModel(input_dim, self.embedding_size,
self.lstm_size, self.drop_rate)
att_out = att.forward(feature, is_training=(self.mode == 'train'))
att_outs.append(att_out)
if len(att_outs) > 1:
out = fluid.layers.concat(att_outs, axis=1)
else:
out = att_outs[0]
fc1 = fluid.layers.fc(
input=out,
size=8192,
act='relu',
bias_attr=ParamAttr(
regularizer=fluid.regularizer.L2Decay(0.0),
initializer=fluid.initializer.NormalInitializer(scale=0.0)),
name='fc1')
fc2 = fluid.layers.fc(
input=fc1,
size=4096,
act='tanh',
bias_attr=ParamAttr(
regularizer=fluid.regularizer.L2Decay(0.0),
initializer=fluid.initializer.NormalInitializer(scale=0.0)),
name='fc2')
self.logit = fluid.layers.fc(input=fc2, size=self.num_classes, act=None, \
bias_attr=ParamAttr(regularizer=fluid.regularizer.L2Decay(0.0),
initializer=fluid.initializer.NormalInitializer(scale=0.0)),
name = 'output')
self.output = fluid.layers.sigmoid(self.logit)
def optimizer(self):
assert self.mode == 'train', "the optimizer can only be built in train mode"
values = [
self.learning_rate * (self.decay_gamma**i)
for i in range(len(self.decay_epochs) + 1)
]
iter_per_epoch = self.num_samples / self.batch_size
boundaries = [e * iter_per_epoch for e in self.decay_epochs]
return fluid.optimizer.RMSProp(
learning_rate=fluid.layers.piecewise_decay(
values=values, boundaries=boundaries),
centered=True,
regularization=fluid.regularizer.L2Decay(self.weight_decay))
def loss(self):
assert self.mode != 'infer', "loss cannot be calculated in infer mode"
cost = fluid.layers.sigmoid_cross_entropy_with_logits(
x=self.logit, label=self.label_input)
cost = fluid.layers.reduce_sum(cost, dim=-1)
sum_cost = fluid.layers.reduce_sum(cost)
self.loss_ = fluid.layers.scale(
sum_cost, scale=self.num_gpus, bias_after_scale=False)
return self.loss_
def outputs(self):
return [self.output, self.logit]
def feeds(self):
return self.feature_input
def fetches(self):
fetch_list = [self.output]
return fetch_list
def weights_info(self):
return None
def load_pretrain_params(self, exe, pretrain, prog, place):
logger.info("Load pretrain weights from {}, exclude fc layer.".format(
pretrain))
state_dict = fluid.load_program_state(pretrain)
dict_keys = list(state_dict.keys())
for name in dict_keys:
if "fc_0" in name:
del state_dict[name]
logger.info(
'Delete {} from pretrained parameters. Do not load it'.
format(name))
fluid.set_program_state(prog, state_dict)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
import numpy as np
class LSTMAttentionModel(object):
"""LSTM Attention Model"""
def __init__(self,
             input_dim,  # kept for interface compatibility; not used below
             embedding_size=512,
             lstm_size=1024,
             drop_rate=0.5):
self.lstm_size = lstm_size
self.embedding_size = embedding_size
self.drop_rate = drop_rate
def forward(self, input, is_training):
input_fc = fluid.layers.fc(
input=input,
size=self.embedding_size,
act='tanh',
bias_attr=ParamAttr(
regularizer=fluid.regularizer.L2Decay(0.0),
initializer=fluid.initializer.NormalInitializer(scale=0.0)),
name='rgb_fc')
lstm_forward_fc = fluid.layers.fc(
input=input_fc,
size=self.lstm_size * 4,
act=None,
bias_attr=False, # video_tag
name='rgb_fc_forward')
lstm_forward, _ = fluid.layers.dynamic_lstm(
input=lstm_forward_fc,
size=self.lstm_size * 4,
is_reverse=False,
name='rgb_lstm_forward')
lstm_backward_fc = fluid.layers.fc(
input=input_fc,
size=self.lstm_size * 4,
act=None,
bias_attr=False, #video_tag
name='rgb_fc_backward')
lstm_backward, _ = fluid.layers.dynamic_lstm(
input=lstm_backward_fc,
size=self.lstm_size * 4,
is_reverse=True,
name='rgb_lstm_backward')
lstm_concat = fluid.layers.concat(
input=[lstm_forward, lstm_backward], axis=1)
lstm_dropout = fluid.layers.dropout(
x=lstm_concat,
dropout_prob=self.drop_rate,
is_test=(not is_training))
lstm_weight = fluid.layers.fc(
input=lstm_dropout,
size=1,
act='sequence_softmax',
bias_attr=False, #video_tag
name='rgb_weight')
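# Attention pooling (the two ops below): the per-timestep scalar weights
# from the sequence_softmax fc scale each LSTM output, and
# sequence_pool('sum') then forms the weighted sum over time.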
scaled = fluid.layers.elementwise_mul(
x=lstm_dropout, y=lstm_weight, axis=0)
lstm_pool = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
return lstm_pool
This diff is collapsed.
This diff is collapsed.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import wget
import tarfile
__all__ = ['decompress', 'download', 'AttrDict']
def decompress(path):
t = tarfile.open(path)
t.extractall(path=os.path.split(path)[0])
t.close()
os.remove(path)
def download(url, path):
weight_dir = os.path.split(path)[0]
if not os.path.exists(weight_dir):
os.makedirs(weight_dir)
path = path + ".tar.gz"
wget.download(url, path)
decompress(path)
class AttrDict(dict):
def __getattr__(self, key):
return self[key]
def __setattr__(self, key, value):
if key in self.__dict__:
self.__dict__[key] = value
else:
self[key] = value
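For illustration, AttrDict permits attribute-style access to config entries (a small usage sketch):
```
cfg = AttrDict()
cfg.batch_size = 1        # key not in __dict__, so it is stored as a dict entry
print(cfg['batch_size'])  # 1
print(cfg.batch_size)     # attribute access reads the same entry
```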
from .reader_utils import regist_reader, get_reader
from .kinetics_reader import KineticsReader
# regist reader, sort by alphabet
regist_reader("TSN", KineticsReader)
export CUDA_VISIBLE_DEVICES=0
# TSN + AttentionLSTM
python videotag_main.py
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
@@ -239,7 +239,7 @@ def process_image(sample, settings, mode, color_jitter, rotate):
img /= img_std
if mode == 'train' or mode == 'val':
return (img, sample[1])
return (img, [sample[1]])
elif mode == 'test':
return (img, )
......
@@ -116,10 +116,8 @@ def train_mobilenet():
optimizer.set_dict(opti_dict)
# 3. reader
train_data_loader, train_data = utility.create_data_loader(
is_train=True, args=args)
test_data_loader, test_data = utility.create_data_loader(
is_train=False, args=args)
train_data_loader = utility.create_data_loader(is_train=True, args=args)
test_data_loader = utility.create_data_loader(is_train=False, args=args)
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
imagenet_reader = reader.ImageNetReader(seed=0, place_num=place_num)
train_reader = imagenet_reader.train(settings=args)
@@ -145,8 +143,6 @@ def train_mobilenet():
t1 = time.time()
if args.max_iter and total_batch_num == args.max_iter:
return
label = to_variable(label.numpy().astype('int64').reshape(
int(args.batch_size // place_num), 1))
t_start = time.time()
# 4.1.1 call net()
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.