vision Models to vision.models and example

3f4149cd · dengkaipeng · 1c9f502c · 3f4149cd · 3f4149cd · 3f4149cd
60 changed file
--- a/bmn/BMN.png
+++ b/bmn/BMN.png
--- a/bmn/README.md
+++ b/bmn/README.md
--- a/bmn/bmn.yaml
+++ b/bmn/bmn.yaml
--- a/bmn/bmn_metric.py
+++ b/bmn/bmn_metric.py
@@ -20,7 +20,7 @@ import json
 sys.path.append('../')
-from metrics import Metric
+from hapi.metrics import Metric
 from bmn_utils import boundary_choose, bmn_post_processing

--- a/bmn/bmn_utils.py
+++ b/bmn/bmn_utils.py
@@ -162,56 +162,3 @@ def bmn_post_processing(video_dict, subset, output_path, result_path):
    outfile.close()
-def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
-                           num_sample_perbin):
-    """ generate sample mask for a boundary-matching pair """
-    plen = float(seg_xmax - seg_xmin)
-    plen_sample = plen / (num_sample * num_sample_perbin - 1.0)
-    total_samples = [
-        seg_xmin + plen_sample * ii
-        for ii in range(num_sample * num_sample_perbin)
-    ]
-    p_mask = []
-    for idx in range(num_sample):
-        bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *
-                                    num_sample_perbin]
-        bin_vector = np.zeros([tscale])
-        for sample in bin_samples:
-            sample_upper = math.ceil(sample)
-            sample_decimal, sample_down = math.modf(sample)
-            if int(sample_down) <= (tscale - 1) and int(sample_down) >= 0:
-                bin_vector[int(sample_down)] += 1 - sample_decimal
-            if int(sample_upper) <= (tscale - 1) and int(sample_upper) >= 0:
-                bin_vector[int(sample_upper)] += sample_decimal
-        bin_vector = 1.0 / num_sample_perbin * bin_vector
-        p_mask.append(bin_vector)
-    p_mask = np.stack(p_mask, axis=1)
-    return p_mask
-def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,
-                      num_sample_perbin):
-    """ generate sample mask for each point in Boundary-Matching Map """
-    mask_mat = []
-    for start_index in range(tscale):
-        mask_mat_vector = []
-        for duration_index in range(dscale):
-            if start_index + duration_index < tscale:
-                p_xmin = start_index
-                p_xmax = start_index + duration_index
-                center_len = float(p_xmax - p_xmin) + 1
-                sample_xmin = p_xmin - center_len * prop_boundary_ratio
-                sample_xmax = p_xmax + center_len * prop_boundary_ratio
-                p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,
-                                                tscale, num_sample,
-                                                num_sample_perbin)
-            else:
-                p_mask = np.zeros([tscale, num_sample])
-            mask_mat_vector.append(p_mask)
-        mask_mat_vector = np.stack(mask_mat_vector, axis=2)
-        mask_mat.append(mask_mat_vector)
-    mask_mat = np.stack(mask_mat, axis=3)
-    mask_mat = mask_mat.astype(np.float32)
-    sample_mask = np.reshape(mask_mat, [tscale, -1])
-    return sample_mask
--- a/bmn/config_utils.py
+++ b/bmn/config_utils.py
--- a/bmn/eval.py
+++ b/bmn/eval.py
@@ -18,11 +18,9 @@ import sys
 import logging
 import paddle.fluid as fluid
-sys.path.append('../')
+from hapi.model import set_device, Input
+from hapi.vision.models import BMN, BmnLoss
-from model import set_device, Input
 from bmn_metric import BmnMetric
-from bmn_model import BMN, BmnLoss
 from reader import BmnDataset
 from config_utils import *

--- a/bmn/eval_anet_prop.py
+++ b/bmn/eval_anet_prop.py
--- a/bmn/infer.list
+++ b/bmn/infer.list
--- a/bmn/predict.py
+++ b/bmn/predict.py
@@ -18,11 +18,9 @@ import os
 import logging
 import paddle.fluid as fluid
-sys.path.append('../')
+from hapi.model import set_device, Input
+from hapi.vision.models import BMN, BmnLoss
-from model import set_device, Input
 from bmn_metric import BmnMetric
-from bmn_model import BMN, BmnLoss
 from reader import BmnDataset
 from config_utils import *

--- a/bmn/reader.py
+++ b/bmn/reader.py
--- a/bmn/run.sh
+++ b/bmn/run.sh
--- a/bmn/train.py
+++ b/bmn/train.py
@@ -18,10 +18,8 @@ import logging
 import sys
 import os
-sys.path.append('../')
+from hapi.model import set_device, Input
+from hapi.vision.models import BMN, BmnLoss
-from model import set_device, Input
-from bmn_model import BMN, BmnLoss
 from reader import BmnDataset
 from config_utils import *

--- a/image_classification/README.MD
+++ b/image_classification/README.MD
--- a/image_classification/imagenet_dataset.py
+++ b/image_classification/imagenet_dataset.py
@@ -18,8 +18,8 @@ import math
 import random
 import numpy as np
-from datasets.folder import DatasetFolder
+from hapi.datasets import DatasetFolder
-from transform import transforms
+from hapi.vision.transforms import transforms
 from paddle import fluid
@@ -45,7 +45,8 @@ class ImageNetDataset(DatasetFolder):
    def __getitem__(self, idx):
        img_path, label = self.samples[idx]
        img = cv2.imread(img_path).astype(np.float32)
-        return self.transform(img), [label]
+        label = np.array([label])
+        return self.transform(img, label)
    def __len__(self):
        return len(self.samples)
--- a/image_classification/main.py
+++ b/image_classification/main.py
@@ -24,16 +24,18 @@ sys.path.append('../')
 import time
 import math
 import numpy as np
-import models
-import paddle.fluid as fluid
-from model import CrossEntropy, Input, set_device
+import paddle.fluid as fluid
-from imagenet_dataset import ImageNetDataset
-from distributed import DistributedBatchSampler
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from metrics import Accuracy
 from paddle.io import BatchSampler, DataLoader
+from hapi.model import CrossEntropy, Input, set_device
+from hapi.distributed import DistributedBatchSampler
+from hapi.metrics import Accuracy
+import hapi.vision.models as models
+from imagenet_dataset import ImageNetDataset
 def make_optimizer(step_per_epoch, parameter_list=None):
    base_lr = FLAGS.lr

--- a/tsm/README.md
+++ b/tsm/README.md
--- a/tsm/check.py
+++ b/tsm/check.py
--- a/tsm/dataset/README.md
+++ b/tsm/dataset/README.md
--- a/tsm/dataset/kinetics/generate_label.py
+++ b/tsm/dataset/kinetics/generate_label.py
--- a/tsm/dataset/kinetics/video2pkl.py
+++ b/tsm/dataset/kinetics/video2pkl.py
--- a/tsm/images/temporal_shift.png
+++ b/tsm/images/temporal_shift.png
--- a/tsm/infer.py
+++ b/tsm/infer.py
@@ -19,8 +19,8 @@ import os
 import argparse
 import numpy as np
-from model import Input, set_device
+from hapi.model import Input, set_device
-from models import tsm_resnet50
+from hapi.vision.models import tsm_resnet50
 from check import check_gpu, check_version
 from kinetics_dataset import KineticsDataset

--- a/tsm/kinetics_dataset.py
+++ b/tsm/kinetics_dataset.py
--- a/tsm/main.py
+++ b/tsm/main.py
@@ -22,9 +22,9 @@ import numpy as np
 from paddle import fluid
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from model import Model, CrossEntropy, Input, set_device
+from hapi.model import Model, CrossEntropy, Input, set_device
-from metrics import Accuracy
+from hapi.metrics import Accuracy
-from models import tsm_resnet50
+from hapi.vision.models import tsm_resnet50
 from check import check_gpu, check_version
 from kinetics_dataset import KineticsDataset

--- a/tsm/transforms.py
+++ b/tsm/transforms.py
--- a/yolov3/README.md
+++ b/yolov3/README.md
--- a/yolov3/coco_metric.py
+++ b/yolov3/coco_metric.py
--- a/yolov3/dataset/download_voc.py
+++ b/yolov3/dataset/download_voc.py
--- a/yolov3/image/YOLOv3.jpg
+++ b/yolov3/image/YOLOv3.jpg
--- a/yolov3/image/YOLOv3_structure.jpg
+++ b/yolov3/image/YOLOv3_structure.jpg
--- a/yolov3/image/dog.jpg
+++ b/yolov3/image/dog.jpg
--- a/yolov3/infer.py
+++ b/yolov3/infer.py
@@ -24,11 +24,11 @@ from paddle import fluid
 from paddle.fluid.optimizer import Momentum
 from paddle.io import DataLoader
-from model import Model, Input, set_device
+from hapi.model import Model, Input, set_device
-from models import yolov3_darknet53, YoloLoss
+from hapi.vision.models import yolov3_darknet53, YoloLoss
+from hapi.vision.transforms import *
 from coco import COCODataset
-from transforms import *
 from visualizer import draw_bbox
 import logging
@@ -65,7 +65,8 @@ def main():
    device = set_device(FLAGS.device)
    fluid.enable_dygraph(device) if FLAGS.dynamic else None
-    inputs = [Input([None, 3], 'int32', name='img_info'),
+    inputs = [Input([None, 1], 'int64', name='img_id'),
+              Input([None, 2], 'int32', name='img_shape'),
              Input([None, 3, None, None], 'float32', name='image')]
    cat2name = load_labels(FLAGS.label_list, with_background=False)
@@ -87,9 +88,10 @@ def main():
    img -= np.array(IMAGE_MEAN)
    img /= np.array(IMAGE_STD)
    img = img.transpose((2, 0, 1))[np.newaxis, :]
-    img_info = np.array([0, h, w]).astype('int32')[np.newaxis, :]
+    img_id = np.array([0]).astype('int64')[np.newaxis, :]
+    img_shape = np.array([h, w]).astype('int32')[np.newaxis, :]
-    _, bboxes = model.test([img_info, img])
+    _, bboxes = model.test([img_id, img_shape, img])
    vis_img = draw_bbox(orig_img, cat2name, bboxes, FLAGS.draw_threshold)
    save_name = get_save_image_name(FLAGS.output_dir, FLAGS.infer_image)

--- a/yolov3/main.py
+++ b/yolov3/main.py
@@ -25,13 +25,13 @@ from paddle import fluid
 from paddle.fluid.optimizer import Momentum
 from paddle.io import DataLoader
-from model import Model, Input, set_device
+from hapi.model import Model, Input, set_device
-from distributed import DistributedBatchSampler
+from hapi.distributed import DistributedBatchSampler
-from models import yolov3_darknet53, YoloLoss
+from hapi.datasets import COCODataset
+from hapi.vision.transforms import *
+from hapi.vision.models import yolov3_darknet53, YoloLoss
 from coco_metric import COCOMetric
-from vision.datasets import COCODataset
-from vision.transforms import *
 NUM_MAX_BOXES = 50

--- a/yolov3/visualizer.py
+++ b/yolov3/visualizer.py
--- a/hapi/callbacks.py
+++ b/hapi/callbacks.py
@@ -15,7 +15,7 @@
 import six
 import copy
-from hapi.progressbar import ProgressBar
+from progressbar import ProgressBar
 from paddle.fluid.dygraph.parallel import ParallelEnv

--- a/vision/datasets/__init__.py
+++ b/vision/datasets/__init__.py
--- a/vision/datasets/coco.py
+++ b/vision/datasets/coco.py
--- a/vision/datasets/flowers.py
+++ b/vision/datasets/flowers.py
@@ -75,7 +75,6 @@ class Flowers(Dataset):
                 setid_file=None,
                 mode='train',
                 transform=None,
-                 target_transform=None,
                 download=True):
        assert mode.lower() in ['train', 'valid', 'test'], \
                "mode should be 'train', 'valid' or 'test', but got {}".format(mode)
@@ -100,7 +99,6 @@ class Flowers(Dataset):
                setid_file, SETID_URL, SETID_MD5, 'flowers', download)
        self.transform = transform
-        self.target_transform = target_transform
        # read dataset into memory
        self._load_anno()
@@ -123,9 +121,7 @@ class Flowers(Dataset):
        image = np.array(Image.open(io.BytesIO(image)))
        if self.transform is not None:
-            image = self.transform(image)
+            image, label = self.transform(image, label)
-        if self.target_transform is not None:
-            label = self.target_transform(label)
        return image, label

--- a/vision/datasets/folder.py
+++ b/vision/datasets/folder.py
@@ -78,8 +78,6 @@ class DatasetFolder(Dataset):
            both extensions and is_valid_file should not be passed.
        transform (callable|optional): A function/transform that takes in
            a sample and returns a transformed version.
-        target_transform (callable|optional): A function/transform that takes
-            in the target and transforms it.
        is_valid_file (callable|optional): A function that takes path of a file
            and check if the file is a valid file (used to check of corrupt files)
            both extensions and is_valid_file should not be passed.
@@ -96,11 +94,9 @@ class DatasetFolder(Dataset):
                 loader=None,
                 extensions=None,
                 transform=None,
-                 target_transform=None,
                 is_valid_file=None):
        self.root = root
        self.transform = transform
-        self.target_transform = target_transform
        if extensions is None:
            extensions = IMG_EXTENSIONS
        classes, class_to_idx = self._find_classes(self.root)
@@ -154,9 +150,7 @@ class DatasetFolder(Dataset):
        path, target = self.samples[index]
        sample = self.loader(path)
        if self.transform is not None:
-            sample = self.transform(sample)
+            sample, target = self.transform(sample, target)
-        if self.target_transform is not None:
-            target = self.target_transform(target)
        return sample, target

--- a/vision/datasets/mnist.py
+++ b/vision/datasets/mnist.py
@@ -72,7 +72,6 @@ class MNIST(Dataset):
                 label_path=None,
                 mode='train',
                 transform=None,
-                 target_transform=None,
                 download=True):
        assert mode.lower() in ['train', 'test'], \
                "mode should be 'train' or 'test', but got {}".format(mode)
@@ -95,7 +94,6 @@ class MNIST(Dataset):
                label_path, label_url, label_md5, 'mnist', download)
        self.transform = transform
-        self.target_transform = target_transform
        # read dataset into memory
        self._parse_dataset()
@@ -151,9 +149,7 @@ class MNIST(Dataset):
    def __getitem__(self, idx):
        image, label = self.images[idx], self.labels[idx]
        if self.transform is not None:
-            image = self.transform(image)
+            image, label = self.transform(image, label)
-        if self.target_transform is not None:
-            label = self.target_transform(label)
        return image, label
    def __len__(self):

--- a/vision/datasets/utils.py
+++ b/vision/datasets/utils.py
--- a/hapi/distributed.py
+++ b/hapi/distributed.py
@@ -23,7 +23,7 @@ import numpy as np
 from paddle import fluid
 from paddle.fluid.layers import collective
 from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
-from paddle.fluid.io import BatchSampler
+from paddle.io import BatchSampler
 _parallel_context_initialized = False
@@ -39,7 +39,7 @@ class DistributedBatchSampler(BatchSampler):
        Dataset is assumed to be of constant size.
    Args:
-        data_source: this could be a `fluid.io.Dataset` implement
+        data_source: this could be a `paddle.io.Dataset` implement
                     or other python object which implemented
                     `__len__` for BatchSampler to get sample
                     number of data source.

--- a/hapi/download.py
+++ b/hapi/download.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import os.path as osp
+import shutil
+import requests
+import tqdm
+import hashlib
+import time
+from paddle.fluid.dygraph.parallel import ParallelEnv
+import logging
+logger = logging.getLogger(__name__)
+__all__ = ['get_weights_path']
+WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights")
+DOWNLOAD_RETRY_LIMIT = 3
+def get_weights_path(url, md5sum=None):
+    """Get weights path from WEIGHTS_HOME; if it does not exist,
+    download it from url.
+    """
+    path, _ = get_path(url, WEIGHTS_HOME, md5sum)
+    return path
+def map_path(url, root_dir):
+    # parse path after download under root_dir
+    fname = osp.split(url)[-1]
+    fpath = fname
+    return osp.join(root_dir, fpath)
+def get_path(url, root_dir, md5sum=None, check_exist=True):
+    """ Download from given url to root_dir.
+    If the file specified by url already exists under root_dir
+    (and check_exist is set and the md5 check passes), return its
+    path directly; otherwise download it from url and return the path.
+    url (str): download url
+    root_dir (str): root dir for downloading, it should be
+                    WEIGHTS_HOME or DATASET_HOME
+    md5sum (str): md5 sum of download package
+    """
+    # parse path after download to decompress under root_dir
+    fullpath = map_path(url, root_dir)
+    exist_flag = False
+    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
+        exist_flag = True
+        if ParallelEnv().local_rank == 0:
+            logger.info("Found {}".format(fullpath))
+    else:
+        if ParallelEnv().local_rank == 0:
+            fullpath = _download(url, root_dir, md5sum)
+        else:
+            while not os.path.exists(fullpath):
+                time.sleep(1)
+    return fullpath, exist_flag
+def _download(url, path, md5sum=None):
+    """
+    Download from url, save to path.
+    url (str): download url
+    path (str): download to given path
+    """
+    if not osp.exists(path):
+        os.makedirs(path)
+    fname = osp.split(url)[-1]
+    fullname = osp.join(path, fname)
+    retry_cnt = 0
+    while not (osp.exists(fullname) and _md5check(fullname, md5sum)):
+        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
+            retry_cnt += 1
+        else:
+            raise RuntimeError("Download from {} failed. "
+                               "Retry limit reached".format(url))
+        if ParallelEnv().local_rank == 0:
+            logger.info("Downloading {} from {}".format(fname, url))
+        req = requests.get(url, stream=True)
+        if req.status_code != 200:
+            raise RuntimeError("Downloading from {} failed with code "
+                               "{}!".format(url, req.status_code))
+        # To guard against an interrupted download, write to
+        # tmp_fullname first, then move tmp_fullname to fullname
+        # once the download has finished
+        tmp_fullname = fullname + "_tmp"
+        total_size = req.headers.get('content-length')
+        with open(tmp_fullname, 'wb') as f:
+            if total_size:
+                for chunk in tqdm.tqdm(
+                        req.iter_content(chunk_size=1024),
+                        total=(int(total_size) + 1023) // 1024,
+                        unit='KB'):
+                    f.write(chunk)
+            else:
+                for chunk in req.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+        shutil.move(tmp_fullname, fullname)
+    return fullname
+def _md5check(fullname, md5sum=None):
+    if md5sum is None:
+        return True
+    if ParallelEnv().local_rank == 0:
+        logger.info("File {} md5 checking...".format(fullname))
+    md5 = hashlib.md5()
+    with open(fullname, 'rb') as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5.update(chunk)
+    calc_md5sum = md5.hexdigest()
+    if calc_md5sum != md5sum:
+        if ParallelEnv().local_rank == 0:
+            logger.info("File {} md5 check failed, {}(calc) != "
+                        "{}(base)".format(fullname, calc_md5sum, md5sum))
+        return False
+    return True
--- a/hapi/model.py
+++ b/hapi/model.py
@@ -32,7 +32,7 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
 from paddle.fluid.incubate.fleet.base import role_maker
-from paddle.fluid.io import DataLoader, Dataset
+from paddle.io import DataLoader, Dataset
 from hapi.distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
 from hapi.metrics import Metric
@@ -45,6 +45,14 @@ __all__ = [
 def set_device(device):
+    """
+    Args:
+        device (str): specify device type, 'cpu' or 'gpu'.
+    Returns:
+        fluid.CUDAPlace or fluid.CPUPlace: Created GPU or CPU place.
+    """
    assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \
    "Expected device in ['cpu', 'gpu'], but got {}".format(device)
@@ -117,9 +125,9 @@ class Loss(object):
    def forward(self, outputs, labels):
        raise NotImplementedError()
-    def __call__(self, outputs, labels):
+    def __call__(self, outputs, labels=None):
        labels = to_list(labels)
-        if in_dygraph_mode():
+        if in_dygraph_mode() and labels:
            labels = [to_variable(l) for l in labels]
        losses = to_list(self.forward(to_list(outputs), labels))
        if self.average:
@@ -366,10 +374,27 @@ class StaticGraphAdapter(object):
            metric_list, metric_splits = flatten_list(endpoints['metric'])
            fetch_list = endpoints['loss'] + metric_list
            num_loss = len(endpoints['loss'])
+        # if fetch Variable is same as input Variable, do not fetch
+        # from program, get it from input directly
+        pruned_fetch_list = []
+        pruned_fetch_idx_name_map = [""] * len(fetch_list)
+        for i, fetch_var in enumerate(fetch_list):
+            if fetch_var.name in feed.keys():
+                pruned_fetch_idx_name_map[i] = fetch_var.name
+            else:
+                pruned_fetch_list.append(fetch_var)
        rets = self._executor.run(compiled_prog,
                                  feed=feed,
-                                  fetch_list=fetch_list,
+                                  fetch_list=pruned_fetch_list,
                                  return_numpy=False)
+        # restore pruned fetch_list Variable from feeds
+        for i, name in enumerate(pruned_fetch_idx_name_map):
+            if len(name) > 0:
+                rets.insert(i, feed[name])
        # LoDTensor cannot be fetch as numpy directly
        rets = [np.array(v) for v in rets]
        if self.mode == 'test':
@@ -867,8 +892,6 @@ class Model(fluid.dygraph.Layer):
            if not isinstance(inputs, (list, dict, Input)):
                raise TypeError(
                    "'inputs' must be list or dict in static graph mode")
-            if loss_function and not isinstance(labels, (list, Input)):
-                raise TypeError("'labels' must be list in static graph mode")
        metrics = metrics or []
        for metric in to_list(metrics):
@@ -904,11 +927,11 @@ class Model(fluid.dygraph.Layer):
        FIXME: add more comments and usage
        Args:
            train_data (Dataset|DataLoader): An iterable data loader is used for 
-                train. An instance of paddle.fluid.io.Dataset or 
+                train. An instance of paddle.io.Dataset or 
-                paddle.fluid.io.Dataloader is recomended.
+                paddle.io.DataLoader is recommended.
            eval_data (Dataset|DataLoader): An iterable data loader is used for
                evaluation at the end of epoch. If None, will not do evaluation. 
-                An instance of paddle.fluid.io.Dataset or paddle.fluid.io.Dataloader 
+                An instance of paddle.io.Dataset or paddle.io.Dataloader 
                is recomended.
            batch_size (int): Integer number. The batch size of train_data and eval_data. 
                When train_data and eval_data are both the instance of Dataloader, this 
@@ -1032,8 +1055,8 @@ class Model(fluid.dygraph.Layer):
        FIXME: add more comments and usage
        Args:
            eval_data (Dataset|DataLoader): An iterable data loader is used for
-                evaluation. An instance of paddle.fluid.io.Dataset or 
+                evaluation. An instance of paddle.io.Dataset or 
-                paddle.fluid.io.Dataloader is recomended.
+                paddle.io.DataLoader is recommended.
            batch_size (int): Integer number. The batch size of train_data and eval_data. 
                When train_data and eval_data are both the instance of Dataloader, this 
                parameter will be ignored.
@@ -1098,12 +1121,16 @@ class Model(fluid.dygraph.Layer):
        return eval_result
-    def predict(self, test_data, batch_size=1, num_workers=0):
+    def predict(self,
+                test_data,
+                batch_size=1,
+                num_workers=0,
+                stack_outputs=True):
        """
        FIXME: add more comments and usage
        Args:
            test_data (Dataset|DataLoader): An iterable data loader is used for
-                predict. An instance of paddle.fluid.io.Dataset or paddle.fluid.io.Dataloader 
+                predict. An instance of paddle.io.Dataset or paddle.io.Dataloader 
                is recomended.
            batch_size (int): Integer number. The batch size of train_data and eval_data. 
                When train_data and eval_data are both the instance of Dataloader, this 
@@ -1111,6 +1138,12 @@ class Model(fluid.dygraph.Layer):
            num_workers (int): the number of subprocess to load data, 0 for no subprocess 
                used and loading data in main process. When train_data and eval_data are
                both the instance of Dataloader, this parameter will be ignored.
+            stack_outputs (bool): whether to stack each output field into a batch. If an
+                output field of a sample has shape [X, Y] and test_data contains N samples,
+                the predicted field has shape [N, X, Y] when stack_outputs is True, and is
+                a length-N list [[X, Y], [X, Y], ..., [X, Y]] when stack_outputs is False.
+                Set stack_outputs to False when outputs contain LoDTensor; setting it to
+                True is recommended when outputs contain no LoDTensor. Default True
        """
        if fluid.in_dygraph_mode():
@@ -1137,19 +1170,16 @@ class Model(fluid.dygraph.Layer):
        if not isinstance(test_loader, Iterable):
            loader = test_loader()
-        outputs = None
+        outputs = []
        for data in tqdm.tqdm(loader):
-            if not fluid.in_dygraph_mode():
+            data = flatten(data)
-                data = data[0]
+            outputs.append(self.test(data[:len(self._inputs)]))
-            outs = self.test(*data)
-            if outputs is None:
+        # NOTE: for lod tensor output, we should not stack outputs
-                outputs = outs
+        # for stacking may lose its detail info
-            else:
+        outputs = list(zip(*outputs))
-                outputs = [
+        if stack_outputs:
-                    np.vstack([x, outs[i]]) for i, x in enumerate(outputs)
+            outputs = [np.stack(outs, axis=0) for outs in outputs]
-                ]
        self._test_dataloader = None
        if test_loader is not None and self._adapter._nranks > 1 \
@@ -1161,8 +1191,8 @@ class Model(fluid.dygraph.Layer):
        """
        Args:
            eval_data (Dataset|DataLoader|None): An iterable data loader is used for 
-                eval. An instance of paddle.fluid.io.Dataset or 
+                eval. An instance of paddle.io.Dataset or 
-                paddle.fluid.io.Dataloader is recomended. 
+                paddle.io.DataLoader is recommended. 
        """
        assert isinstance(
            eval_data,

--- a/hapi/text/bert/dataloader.py
+++ b/hapi/text/bert/dataloader.py
@@ -25,7 +25,7 @@ from functools import partial
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.io import BatchSampler, DataLoader, Dataset
+from paddle.io import BatchSampler, DataLoader, Dataset
 from hapi.distributed import DistributedBatchSampler
 from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
 from hapi.text.bert.batching import prepare_batch_data

--- a/vision/__init__.py
+++ b/vision/__init__.py
--- a/hapi/vision/models/__init__.py
+++ b/hapi/vision/models/__init__.py
+#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+from . import resnet
+from . import vgg
+from . import mobilenetv1
+from . import mobilenetv2
+from . import darknet
+from . import yolov3
+from . import tsm
+from . import bmn
+from .resnet import *
+from .mobilenetv1 import *
+from .mobilenetv2 import *
+from .vgg import *
+from .darknet import *
+from .yolov3 import *
+from .tsm import *
+from .bmn import *
+__all__ = resnet.__all__ \
+        + vgg.__all__ \
+        + mobilenetv1.__all__ \
+        + mobilenetv2.__all__ \
+        + darknet.__all__ \
+        + yolov3.__all__ \
+        + tsm.__all__ \
+        + bmn.__all__
--- a/bmn/bmn_model.py
+++ b/bmn/bmn_model.py
@@ -17,12 +17,68 @@ from paddle.fluid import ParamAttr
 import numpy as np
 import math
-from bmn_utils import get_interp1d_mask
+from hapi.model import Model, Loss
-from model import Model, Loss
+__all__ = ["BMN", "BmnLoss"]
 DATATYPE = 'float32'
+def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
+                           num_sample_perbin):
+    """Build the sampling mask for one boundary-matching pair.
+
+    The span [seg_xmin, seg_xmax] is covered by
+    ``num_sample * num_sample_perbin`` evenly spaced points. Each point adds
+    its weight to its truncated and its ceiling integer position (when those
+    fall inside [0, tscale - 1]); every run of ``num_sample_perbin``
+    consecutive points is averaged into one column of the result.
+
+    Args:
+        seg_xmin (float): first sampling position.
+        seg_xmax (float): last sampling position.
+        tscale (int): length of each mask column.
+        num_sample (int): number of columns (bins).
+        num_sample_perbin (int): number of points averaged per bin.
+
+    Returns:
+        numpy.ndarray: mask of shape [tscale, num_sample].
+    """
+    point_count = num_sample * num_sample_perbin
+    step = float(seg_xmax - seg_xmin) / (point_count - 1.0)
+    last_index = tscale - 1
+    columns = []
+    for bin_idx in range(num_sample):
+        weights = np.zeros([tscale])
+        first_point = bin_idx * num_sample_perbin
+        for point_idx in range(first_point, first_point + num_sample_perbin):
+            position = seg_xmin + step * point_idx
+            fraction, whole = math.modf(position)
+            lower = int(whole)
+            upper = int(math.ceil(position))
+            if 0 <= lower <= last_index:
+                weights[lower] += 1 - fraction
+            if 0 <= upper <= last_index:
+                weights[upper] += fraction
+        columns.append(1.0 / num_sample_perbin * weights)
+    return np.stack(columns, axis=1)
+def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,
+                      num_sample_perbin):
+    """Build the sampling mask for each point in the Boundary-Matching Map.
+
+    For every (start, duration) pair with ``start + duration < tscale`` the
+    proposal span is widened on both sides by
+    ``(duration + 1) * prop_boundary_ratio`` and sampled with
+    ``_get_interp1d_bin_mask``; pairs that run past ``tscale`` get an
+    all-zero mask.
+
+    Args:
+        tscale (int): number of start positions (and mask column length).
+        dscale (int): number of durations considered per start position.
+        prop_boundary_ratio (float): widening ratio applied to each span.
+        num_sample (int): number of sample bins per proposal.
+        num_sample_perbin (int): number of points averaged per bin.
+
+    Returns:
+        numpy.ndarray: float32 array of shape [tscale, -1], i.e. the
+        [tscale, num_sample, dscale, tscale] stack flattened on its last
+        three axes.
+    """
+    per_start = []
+    for start in range(tscale):
+        per_duration = []
+        for duration in range(dscale):
+            end = start + duration
+            if end >= tscale:
+                # Proposal would run past the sequence: contribute zeros.
+                per_duration.append(np.zeros([tscale, num_sample]))
+                continue
+            center_len = float(end - start) + 1
+            margin = center_len * prop_boundary_ratio
+            per_duration.append(
+                _get_interp1d_bin_mask(start - margin, end + margin, tscale,
+                                       num_sample, num_sample_perbin))
+        per_start.append(np.stack(per_duration, axis=2))
+    stacked = np.stack(per_start, axis=3).astype(np.float32)
+    return np.reshape(stacked, [tscale, -1])
 # Net
 class Conv1D(fluid.dygraph.Layer):
    def __init__(self,

--- a/hapi/vision/models/darknet.py
+++ b/hapi/vision/models/darknet.py
+#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.regularizer import L2Decay
+from paddle.fluid.dygraph.nn import Conv2D, BatchNorm
+from hapi.model import Model
+from hapi.download import get_weights_path
+__all__ = ['DarkNet', 'ConvBNLayer', 'darknet53']
+# Pretrained DarkNet weights keyed by depth: {num_layers: (url, md5)}.
+# _darknet() unpacks the tuple into get_weights_path() when pretrained=True.
+pretrain_infos = {
+        53: ('https://paddlemodels.bj.bcebos.com/hapi/darknet53.pdparams',
+            '2506357a5c31e865785112fc614a487d')
+}
+class ConvBNLayer(fluid.dygraph.Layer):
+    """Conv2D (no bias, no fused activation) followed by BatchNorm.
+
+    Args:
+        ch_in (int): number of input channels.
+        ch_out (int): number of output channels (conv filters).
+        filter_size (int): convolution kernel size. Default: 3.
+        stride (int): convolution stride. Default: 1.
+        groups (int): convolution groups. Default: 1.
+        padding (int): convolution padding. Default: 0.
+        act (str): 'leaky' applies leaky ReLU (alpha=0.1) after BatchNorm;
+            any other value leaves the BatchNorm output unactivated.
+            Default: "leaky".
+    """
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 act="leaky"):
+        super(ConvBNLayer, self).__init__()
+        # Weights are drawn from Normal(0, 0.02); the conv has no bias
+        # because BatchNorm follows it.
+        self.conv = Conv2D(
+            num_channels=ch_in,
+            num_filters=ch_out,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            param_attr=ParamAttr(
+                initializer=fluid.initializer.Normal(0., 0.02)),
+            bias_attr=False,
+            act=None)
+        # BatchNorm scale/offset carry an explicit L2Decay(0.) regularizer
+        # (presumably to exempt them from weight decay -- confirm).
+        self.batch_norm = BatchNorm(
+            num_channels=ch_out,
+            param_attr=ParamAttr(
+                initializer=fluid.initializer.Normal(0., 0.02),
+                regularizer=L2Decay(0.)),
+            bias_attr=ParamAttr(
+                initializer=fluid.initializer.Constant(0.0),
+                regularizer=L2Decay(0.)))
+        self.act = act
+    def forward(self, inputs):
+        """Apply conv -> batch norm -> optional leaky ReLU to `inputs`."""
+        out = self.conv(inputs)
+        out = self.batch_norm(out)
+        if self.act == 'leaky':
+            out = fluid.layers.leaky_relu(x=out, alpha=0.1)
+        return out
+class DownSample(fluid.dygraph.Layer):
+    """Single ConvBNLayer configured by default as a 3x3, stride-2,
+    padding-1 convolution.
+
+    Args:
+        ch_in (int): number of input channels.
+        ch_out (int): number of output channels.
+        filter_size (int): convolution kernel size. Default: 3.
+        stride (int): convolution stride. Default: 2.
+        padding (int): convolution padding. Default: 1.
+    """
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=2,
+                 padding=1):
+        super(DownSample, self).__init__()
+        self.conv_bn_layer = ConvBNLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding)
+        # Kept so callers can read the output channel count.
+        self.ch_out = ch_out
+    def forward(self, inputs):
+        """Return the ConvBNLayer output for `inputs`."""
+        out = self.conv_bn_layer(inputs)
+        return out
+class BasicBlock(fluid.dygraph.Layer):
+    """Residual block: 1x1 conv (ch_in -> ch_out), 3x3 conv
+    (ch_out -> ch_out * 2), then element-wise add with the block input.
+
+    Args:
+        ch_in (int): number of input channels. Every caller in this file
+            passes ch_in == ch_out * 2, which lets the residual add line up.
+        ch_out (int): channels of the 1x1 conv; the block outputs
+            ch_out * 2 channels.
+    """
+    def __init__(self, ch_in, ch_out):
+        super(BasicBlock, self).__init__()
+        self.conv1 = ConvBNLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=1,
+            stride=1,
+            padding=0)
+        self.conv2 = ConvBNLayer(
+            ch_in=ch_out,
+            ch_out=ch_out*2,
+            filter_size=3,
+            stride=1,
+            padding=1)
+    def forward(self, inputs):
+        """Return inputs + conv2(conv1(inputs)) with no extra activation."""
+        conv1 = self.conv1(inputs)
+        conv2 = self.conv2(conv1)
+        out = fluid.layers.elementwise_add(x=inputs, y=conv2, act=None)
+        return out
+class LayerWarp(fluid.dygraph.Layer):
+    """One stage made of `count` BasicBlocks applied in sequence.
+
+    Args:
+        ch_in (int): input channels of the first block.
+        ch_out (int): `ch_out` passed to every BasicBlock; each block
+            outputs ch_out * 2 channels.
+        count (int): total number of BasicBlocks in the stage.
+    """
+    def __init__(self, ch_in, ch_out, count):
+        super(LayerWarp,self).__init__()
+        # The first block is a plain attribute; the remaining count - 1
+        # blocks are registered as "basic_block_<i>" and also kept in a
+        # Python list so forward() can iterate them in order.
+        self.basicblock0 = BasicBlock(ch_in, ch_out)
+        self.res_out_list = []
+        for i in range(1,count):
+            res_out = self.add_sublayer("basic_block_%d" % (i),
+                BasicBlock(
+                    ch_out*2,
+                    ch_out))
+            self.res_out_list.append(res_out)
+        self.ch_out = ch_out
+    def forward(self,inputs):
+        """Run `inputs` through basicblock0 and then every extra block."""
+        y = self.basicblock0(inputs)
+        for basic_block_i in self.res_out_list:
+            y = basic_block_i(y)
+        return y
+# {num_layers: number of BasicBlocks in each of the five stages}; DarkNet
+# passes each entry to LayerWarp as `count`.
+DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
+class DarkNet(Model):
+    """DarkNet model from
+    `"YOLOv3: An Incremental Improvement" <https://arxiv.org/abs/1804.02767>`_
+    Args:
+        num_layers (int): layer number of DarkNet, only 53 supported currently, default: 53.
+        ch_in (int): channel number of input data, default 3.
+
+    forward() returns a list with the outputs of the last three stages,
+    ordered from the last stage backwards.
+    """
+    def __init__(self, num_layers=53, ch_in=3):
+        super(DarkNet, self).__init__()
+        assert num_layers in DarkNet_cfg.keys(), \
+            "only support num_layers in {} currently" \
+            .format(DarkNet_cfg.keys())
+        # Per-stage BasicBlock counts; only the first five entries are used.
+        self.stages = DarkNet_cfg[num_layers]
+        self.stages = self.stages[0:5]
+        # Stem: 3x3 stride-1 conv to 32 channels, then a DownSample to 64.
+        self.conv0 = ConvBNLayer(
+            ch_in=ch_in,
+            ch_out=32,
+            filter_size=3,
+            stride=1,
+            padding=1)
+        self.downsample0 = DownSample(
+            ch_in=32,
+            ch_out=32 * 2)
+        self.darknet53_conv_block_list = []
+        self.downsample_list = []
+        # From here on `ch_in` is rebound to the per-stage input channel
+        # table (the constructor argument was already consumed by conv0).
+        ch_in = [64,128,256,512,1024]
+        for i, stage in enumerate(self.stages):
+            conv_block = self.add_sublayer(
+                "stage_%d" % (i),
+                LayerWarp(
+                int(ch_in[i]),
+                32*(2**i),
+                stage))
+            self.darknet53_conv_block_list.append(conv_block)
+        # One DownSample between consecutive stages (none after the last).
+        for i in range(len(self.stages) - 1):
+            downsample = self.add_sublayer(
+                "stage_%d_downsample" % i,
+                DownSample(
+                    ch_in = 32*(2**(i+1)),
+                    ch_out = 32*(2**(i+2))))
+            self.downsample_list.append(downsample)
+    def forward(self,inputs):
+        """Run the stem and all stages; return the last three stage outputs,
+        last stage first."""
+        out = self.conv0(inputs)
+        out = self.downsample0(out)
+        blocks = []
+        for i, conv_block_i in enumerate(self.darknet53_conv_block_list):
+            out = conv_block_i(out)
+            blocks.append(out)
+            if i < len(self.stages) - 1:
+                out = self.downsample_list[i](out)
+        # Reverse slice: blocks[-1], blocks[-2], blocks[-3].
+        return blocks[-1:-4:-1]
+def _darknet(num_layers=53, input_channels=3, pretrained=True):
+    """Build a DarkNet and optionally load its pretrained weights.
+
+    Args:
+        num_layers (int): DarkNet depth, forwarded to DarkNet. Default: 53.
+        input_channels (int): channel number of input data. Default: 3.
+        pretrained (bool): if True, fetch the weights registered in
+            `pretrain_infos` for `num_layers` and load them. Default: True.
+
+    Returns:
+        DarkNet: the constructed (and possibly weight-loaded) model.
+    """
+    net = DarkNet(num_layers, input_channels)
+    if not pretrained:
+        return net
+    assert num_layers in pretrain_infos.keys(), \
+            "DarkNet{} do not have pretrained weights now, " \
+            "pretrained should be set as False".format(num_layers)
+    weights_info = pretrain_infos[num_layers]
+    weight_path = get_weights_path(*weights_info)
+    assert weight_path.endswith('.pdparams'), \
+            "suffix of weight must be .pdparams"
+    # Strip the '.pdparams' suffix before handing the path to load().
+    net.load(weight_path[:-len('.pdparams')])
+    return net
+def darknet53(input_channels=3, pretrained=True):
+    """DarkNet 53-layer model
+    Args:
+        input_channels (int): channel number of input data, default 3.
+        pretrained (bool): If True, returns a model pre-trained on ImageNet,
+            default True.
+    Returns:
+        The DarkNet model built by `_darknet(53, input_channels, pretrained)`.
+    """
+    return _darknet(53, input_channels, pretrained)
--- a/hapi/vision/models/mobilenetv1.py
+++ b/hapi/vision/models/mobilenetv1.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from hapi.model import Model
+from hapi.download import get_weights_path
+__all__ = ['MobileNetV1', 'mobilenet_v1']
+# Pretrained weights as {arch name: (url, md5)}. mobilenet_v1() builds the
+# arch name as 'mobilenetv1_' + str(scale).
+model_urls = {
+    'mobilenetv1_1.0':
+    ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams',
+     'bf0d25cb0bed1114d9dac9384ce2b4a6')
+}
+class ConvBNLayer(fluid.dygraph.Layer):
+    """Conv2D (no bias, no fused activation) followed by BatchNorm, with the
+    activation applied by the BatchNorm layer.
+
+    Args:
+        num_channels (int): number of input channels.
+        filter_size (int): convolution kernel size.
+        num_filters (int): number of output channels.
+        stride (int): convolution stride.
+        padding (int): convolution padding.
+        channels: accepted but not used by this layer.
+        num_groups (int): convolution groups. Default: 1.
+        act (str): activation passed to BatchNorm. Default: 'relu'.
+        use_cudnn (bool): forwarded to Conv2D. Default: True.
+        name: accepted but not used by this layer.
+    """
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 padding,
+                 channels=None,
+                 num_groups=1,
+                 act='relu',
+                 use_cudnn=True,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        # Parameter names are derived from this layer's full_name().
+        self._conv = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            act=None,
+            use_cudnn=use_cudnn,
+            param_attr=ParamAttr(
+                initializer=MSRA(), name=self.full_name() + "_weights"),
+            bias_attr=False)
+        self._batch_norm = BatchNorm(
+            num_filters,
+            act=act,
+            param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
+            bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
+            moving_mean_name=self.full_name() + "_bn" + '_mean',
+            moving_variance_name=self.full_name() + "_bn" + '_variance')
+    def forward(self, inputs):
+        """Apply conv then batch norm (with its activation) to `inputs`."""
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        return y
+class DepthwiseSeparable(fluid.dygraph.Layer):
+    """A 3x3 grouped (depthwise) ConvBNLayer followed by a 1x1 (pointwise)
+    ConvBNLayer.
+
+    Args:
+        num_channels (int): number of input channels (already scaled by
+            the caller).
+        num_filters1 (int): unscaled output channels of the depthwise conv.
+        num_filters2 (int): unscaled output channels of the pointwise conv.
+        num_groups (int): unscaled group count of the depthwise conv.
+        stride (int): stride of the depthwise conv.
+        scale (float): multiplier applied to num_filters1, num_filters2 and
+            num_groups (results truncated with int()).
+        name: accepted but not used by this layer.
+    """
+    def __init__(self,
+                 num_channels,
+                 num_filters1,
+                 num_filters2,
+                 num_groups,
+                 stride,
+                 scale,
+                 name=None):
+        super(DepthwiseSeparable, self).__init__()
+        # Depthwise stage: grouped 3x3 conv, created with use_cudnn=False.
+        self._depthwise_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=int(num_filters1 * scale),
+            filter_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale),
+            use_cudnn=False)
+        # Pointwise stage: 1x1 conv that mixes channels.
+        self._pointwise_conv = ConvBNLayer(
+            num_channels=int(num_filters1 * scale),
+            filter_size=1,
+            num_filters=int(num_filters2 * scale),
+            stride=1,
+            padding=0)
+    def forward(self, inputs):
+        """Apply the depthwise conv and then the pointwise conv."""
+        y = self._depthwise_conv(inputs)
+        y = self._pointwise_conv(y)
+        return y
+class MobileNetV1(Model):
+    """MobileNetV1 model from
+    `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" <https://arxiv.org/abs/1704.04861>`_.
+    Args:
+        scale (float): scale of channels in each layer. Default: 1.0.
+        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer
+                            will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
+    """
+    def __init__(self,
+                 scale=1.0,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
+        super(MobileNetV1, self).__init__()
+        self.scale = scale
+        self.dwsl = []
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+        self.conv1 = ConvBNLayer(
+            num_channels=3,
+            filter_size=3,
+            channels=3,
+            num_filters=int(32 * scale),
+            stride=2,
+            padding=1)
+        # (sublayer name, unscaled input channels, unscaled output channels,
+        # stride). The unscaled input channel count is also used as
+        # num_filters1 and num_groups of the depthwise conv. Names and
+        # creation order are kept stable so saved weights keep matching.
+        dws_settings = [
+            ("conv2_1", 32, 64, 1),
+            ("conv2_2", 64, 128, 2),
+            ("conv3_1", 128, 128, 1),
+            ("conv3_2", 128, 256, 2),
+            ("conv4_1", 256, 256, 1),
+            ("conv4_2", 256, 512, 2),
+        ]
+        dws_settings += [("conv5_" + str(i + 1), 512, 512, 1)
+                         for i in range(5)]
+        dws_settings += [("conv5_6", 512, 1024, 2), ("conv6", 1024, 1024, 1)]
+        for layer_name, in_c, out_c, stride in dws_settings:
+            dws = self.add_sublayer(
+                sublayer=DepthwiseSeparable(
+                    num_channels=int(in_c * scale),
+                    num_filters1=in_c,
+                    num_filters2=out_c,
+                    num_groups=in_c,
+                    stride=stride,
+                    scale=scale),
+                name=layer_name)
+            self.dwsl.append(dws)
+        if with_pool:
+            self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+        # Only build the classifier for a positive class count, matching the
+        # docstring and the check in forward().
+        if num_classes > 0:
+            self.out = Linear(
+                int(1024 * scale),
+                num_classes,
+                act=classifier_activation,
+                param_attr=ParamAttr(
+                    initializer=MSRA(), name=self.full_name() + "fc7_weights"),
+                bias_attr=ParamAttr(name="fc7_offset"))
+    def forward(self, inputs):
+        """Run the backbone, then the optional global pool and classifier.
+
+        Returns the fc output when num_classes > 0, otherwise the (pooled)
+        feature map.
+        """
+        y = self.conv1(inputs)
+        for dws in self.dwsl:
+            y = dws(y)
+        if self.with_pool:
+            y = self.pool2d_avg(y)
+        if self.num_classes > 0:
+            # Flatten to the scaled feature width the fc layer was built
+            # with; a fixed 1024 is only correct for scale == 1.0.
+            y = fluid.layers.reshape(y, shape=[-1, int(1024 * self.scale)])
+            y = self.out(y)
+        return y
+def _mobilenet(arch, pretrained=False, **kwargs):
+    """Build a 1000-class MobileNetV1 and optionally load pretrained weights.
+
+    Args:
+        arch (str): key into `model_urls` identifying the weights to load.
+        pretrained (bool): if True, download and load the weights for
+            `arch`. Default: False.
+        **kwargs: extra keyword arguments forwarded to MobileNetV1.
+
+    Returns:
+        MobileNetV1: the constructed (and possibly weight-loaded) model.
+    """
+    net = MobileNetV1(num_classes=1000, with_pool=True, **kwargs)
+    if not pretrained:
+        return net
+    assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
+        arch)
+    url = model_urls[arch][0]
+    md5sum = model_urls[arch][1]
+    weight_path = get_weights_path(url, md5sum)
+    assert weight_path.endswith(
+        '.pdparams'), "suffix of weight must be .pdparams"
+    # Strip the '.pdparams' suffix before handing the path to load().
+    net.load(weight_path[:-len('.pdparams')])
+    return net
+def mobilenet_v1(pretrained=False, scale=1.0):
+    """MobileNetV1
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        scale (float): scale of channels in each layer. Default: 1.0.
+    Returns:
+        The MobileNetV1 model built by `_mobilenet`.
+    """
+    # Normalise through float() so an integer scale such as 1 still maps to
+    # the 'mobilenetv1_1.0' key used in model_urls.
+    arch = 'mobilenetv1_' + str(float(scale))
+    model = _mobilenet(arch, pretrained, scale=scale)
+    return model
--- a/hapi/vision/models/mobilenetv2.py
+++ b/hapi/vision/models/mobilenetv2.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from hapi.model import Model
+from hapi.download import get_weights_path
+__all__ = ['MobileNetV2', 'mobilenet_v2']
+# Pretrained weights as {arch name: (url, md5)}. mobilenet_v2() builds the
+# arch name as 'mobilenetv2_' + str(scale).
+model_urls = {
+    'mobilenetv2_1.0':
+    ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams',
+     '8ff74f291f72533f2a7956a4efff9d88')
+}
+class ConvBNLayer(fluid.dygraph.Layer):
+    """Conv2D (no bias, no fused activation) followed by BatchNorm; ReLU6 is
+    applied in forward() on request.
+
+    Args:
+        num_channels (int): number of input channels.
+        filter_size (int): convolution kernel size.
+        num_filters (int): number of output channels.
+        stride (int): convolution stride.
+        padding (int): convolution padding.
+        channels: accepted but not used by this layer.
+        num_groups (int): convolution groups. Default: 1.
+        use_cudnn (bool): forwarded to Conv2D. Default: True.
+    """
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 padding,
+                 channels=None,
+                 num_groups=1,
+                 use_cudnn=True):
+        super(ConvBNLayer, self).__init__()
+        # Parameter names are derived from this layer's full_name().
+        tmp_param = ParamAttr(name=self.full_name() + "_weights")
+        self._conv = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            act=None,
+            use_cudnn=use_cudnn,
+            param_attr=tmp_param,
+            bias_attr=False)
+        self._batch_norm = BatchNorm(
+            num_filters,
+            param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
+            bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
+            moving_mean_name=self.full_name() + "_bn" + '_mean',
+            moving_variance_name=self.full_name() + "_bn" + '_variance')
+    def forward(self, inputs, if_act=True):
+        """Apply conv and batch norm; add ReLU6 when `if_act` is true."""
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        if if_act:
+            y = fluid.layers.relu6(y)
+        return y
+class InvertedResidualUnit(fluid.dygraph.Layer):
+    """Inverted residual unit: 1x1 expand conv, grouped (depthwise) conv,
+    then a 1x1 linear conv without activation.
+
+    Args:
+        num_channels (int): input channels of the expand conv.
+        num_in_filter (int): channel count multiplied by `expansion_factor`
+            to get the expanded width.
+        num_filters (int): output channels of the unit.
+        stride (int): stride of the depthwise conv.
+        filter_size (int): kernel size of the depthwise conv.
+        padding (int): padding of the depthwise conv.
+        expansion_factor (number): channel expansion ratio.
+    """
+    def __init__(
+            self,
+            num_channels,
+            num_in_filter,
+            num_filters,
+            stride,
+            filter_size,
+            padding,
+            expansion_factor, ):
+        super(InvertedResidualUnit, self).__init__()
+        # Width of the expanded (hidden) representation.
+        num_expfilter = int(round(num_in_filter * expansion_factor))
+        self._expand_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_expfilter,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1)
+        # groups == channels makes this conv depthwise; created with
+        # use_cudnn=False.
+        self._bottleneck_conv = ConvBNLayer(
+            num_channels=num_expfilter,
+            num_filters=num_expfilter,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            num_groups=num_expfilter,
+            use_cudnn=False)
+        self._linear_conv = ConvBNLayer(
+            num_channels=num_expfilter,
+            num_filters=num_filters,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1)
+    def forward(self, inputs, ifshortcut):
+        """Apply the three convs; add `inputs` back when `ifshortcut` is
+        true."""
+        y = self._expand_conv(inputs, if_act=True)
+        y = self._bottleneck_conv(y, if_act=True)
+        # The projection back to num_filters has no activation.
+        y = self._linear_conv(y, if_act=False)
+        if ifshortcut:
+            y = fluid.layers.elementwise_add(inputs, y)
+        return y
+class InvresiBlocks(fluid.dygraph.Layer):
+    """A group of `n` InvertedResidualUnits.
+
+    Args:
+        in_c (int): input channels of the first unit.
+        t (number): expansion factor used by every unit.
+        c (int): output channels of every unit.
+        n (int): total number of units in the group.
+        s (int): stride of the first unit; later units use stride 1.
+    """
+    def __init__(self, in_c, t, c, n, s):
+        super(InvresiBlocks, self).__init__()
+        # First unit changes channel count / stride, so it runs without a
+        # shortcut in forward().
+        self._first_block = InvertedResidualUnit(
+            num_channels=in_c,
+            num_in_filter=in_c,
+            num_filters=c,
+            stride=s,
+            filter_size=3,
+            padding=1,
+            expansion_factor=t)
+        self._inv_blocks = []
+        # Remaining n - 1 units keep c channels and stride 1.
+        for i in range(1, n):
+            tmp = self.add_sublayer(
+                sublayer=InvertedResidualUnit(
+                    num_channels=c,
+                    num_in_filter=c,
+                    num_filters=c,
+                    stride=1,
+                    filter_size=3,
+                    padding=1,
+                    expansion_factor=t),
+                name=self.full_name() + "_" + str(i + 1))
+            self._inv_blocks.append(tmp)
+    def forward(self, inputs):
+        """Run the first unit without shortcut, the rest with shortcut."""
+        y = self._first_block(inputs, ifshortcut=False)
+        for inv_block in self._inv_blocks:
+            y = inv_block(y, ifshortcut=True)
+        return y
+class MobileNetV2(Model):
+    """MobileNetV2 model from
+    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
+    Args:
+        scale (float): scale of channels in each layer. Default: 1.0.
+        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
+                            will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
+    """
+    def __init__(self,
+                 scale=1.0,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
+        super(MobileNetV2, self).__init__()
+        self.scale = scale
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+        # One (t, c, n, s) tuple per InvresiBlocks group: expansion factor,
+        # unscaled output channels, number of units, stride of first unit.
+        bottleneck_params_list = [
+            (1, 16, 1, 1),
+            (6, 24, 2, 2),
+            (6, 32, 3, 2),
+            (6, 64, 4, 2),
+            (6, 96, 3, 1),
+            (6, 160, 3, 2),
+            (6, 320, 1, 1),
+        ]
+        self._conv1 = ConvBNLayer(
+            num_channels=3,
+            num_filters=int(32 * scale),
+            filter_size=3,
+            stride=2,
+            padding=1)
+        self._invl = []
+        # `i` numbers the groups so they are registered as conv2 .. conv8.
+        i = 1
+        in_c = int(32 * scale)
+        for layer_setting in bottleneck_params_list:
+            t, c, n, s = layer_setting
+            i += 1
+            tmp = self.add_sublayer(
+                sublayer=InvresiBlocks(
+                    in_c=in_c, t=t, c=int(c * scale), n=n, s=s),
+                name='conv' + str(i))
+            self._invl.append(tmp)
+            in_c = int(c * scale)
+        # The final feature width is only scaled up (scale > 1.0); it never
+        # drops below 1280.
+        self._out_c = int(1280 * scale) if scale > 1.0 else 1280
+        self._conv9 = ConvBNLayer(
+            num_channels=in_c,
+            num_filters=self._out_c,
+            filter_size=1,
+            stride=1,
+            padding=0)
+        if with_pool:
+            self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+        if num_classes > 0:
+            tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
+            self._fc = Linear(
+                self._out_c,
+                num_classes,
+                act=classifier_activation,
+                param_attr=tmp_param,
+                bias_attr=ParamAttr(name="fc10_offset"))
+    def forward(self, inputs):
+        """Run the backbone, then the optional global pool and classifier.
+
+        Returns the fc output when num_classes > 0, otherwise the (pooled)
+        feature map.
+        """
+        y = self._conv1(inputs, if_act=True)
+        for inv in self._invl:
+            y = inv(y)
+        y = self._conv9(y, if_act=True)
+        if self.with_pool:
+            y = self._pool2d_avg(y)
+        if self.num_classes > 0:
+            # Flatten to [batch, _out_c] for the fc layer.
+            y = fluid.layers.reshape(y, shape=[-1, self._out_c])
+            y = self._fc(y)
+        return y
+def _mobilenet(arch, pretrained=False, **kwargs):
+    """Build a 1000-class MobileNetV2 and optionally load pretrained weights.
+
+    Args:
+        arch (str): key into `model_urls` identifying the weights to load.
+        pretrained (bool): if True, download and load the weights for
+            `arch`. Default: False.
+        **kwargs: extra keyword arguments forwarded to MobileNetV2.
+
+    Returns:
+        MobileNetV2: the constructed (and possibly weight-loaded) model.
+    """
+    net = MobileNetV2(num_classes=1000, with_pool=True, **kwargs)
+    if not pretrained:
+        return net
+    assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
+        arch)
+    url = model_urls[arch][0]
+    md5sum = model_urls[arch][1]
+    weight_path = get_weights_path(url, md5sum)
+    assert weight_path.endswith(
+        '.pdparams'), "suffix of weight must be .pdparams"
+    # Strip the '.pdparams' suffix before handing the path to load().
+    net.load(weight_path[:-len('.pdparams')])
+    return net
+def mobilenet_v2(pretrained=False, scale=1.0):
+    """MobileNetV2
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        scale (float): scale of channels in each layer. Default: 1.0.
+    Returns:
+        The MobileNetV2 model built by `_mobilenet`.
+    """
+    # Normalise through float() so an integer scale such as 1 still maps to
+    # the 'mobilenetv2_1.0' key used in model_urls.
+    arch = 'mobilenetv2_' + str(float(scale))
+    model = _mobilenet(arch, pretrained, scale=scale)
+    return model
--- a/hapi/vision/models/resnet.py
+++ b/hapi/vision/models/resnet.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import division
+from __future__ import print_function
+import math
+import paddle.fluid as fluid
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.fluid.dygraph.container import Sequential
+from hapi.model import Model
+from hapi.download import get_weights_path
+__all__ = [
+    'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'
+]
+# Pretrained weights as {arch name: (url, md5)}; only 'resnet50' has an
+# entry here.
+model_urls = {
+    'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams',
+                 '0884c9087266496c41c60d14a96f8530')
+}
+class ConvBNLayer(fluid.dygraph.Layer):
+    """Conv2D (no bias, no fused activation) followed by BatchNorm, with the
+    activation applied by the BatchNorm layer.
+
+    Args:
+        num_channels (int): number of input channels.
+        num_filters (int): number of output channels.
+        filter_size (int): convolution kernel size; padding is derived from
+            it as (filter_size - 1) // 2.
+        stride (int): convolution stride. Default: 1.
+        groups (int): convolution groups. Default: 1.
+        act (str): activation passed to BatchNorm. Default: None.
+    """
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+        self._conv = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=False)
+        self._batch_norm = BatchNorm(num_filters, act=act)
+    def forward(self, inputs):
+        """Apply conv then batch norm (with its activation) to `inputs`."""
+        x = self._conv(inputs)
+        x = self._batch_norm(x)
+        return x
+class BasicBlock(fluid.dygraph.Layer):
+    """Two 3x3 ConvBNLayers plus a residual connection, followed by ReLU.
+
+    Args:
+        num_channels (int): number of input channels.
+        num_filters (int): number of output channels.
+        stride (int): stride applied in the second conv and in the
+            projection shortcut.
+        shortcut (bool): True adds the block input directly; False adds a
+            1x1 ConvBNLayer projection of the input. Default: True.
+    """
+    # Output channels are num_filters * expansion.
+    expansion = 1
+    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+        super(BasicBlock, self).__init__()
+        # NOTE(review): the stride sits on conv1 (the second conv) and conv1
+        # applies ReLU before the residual add; the reference ResNet basic
+        # block strides the first conv and has no activation before the add.
+        # Confirm this is intended.
+        self.conv0 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=3,
+            act='relu')
+        self.conv1 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act='relu')
+        if not shortcut:
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=num_filters,
+                filter_size=1,
+                stride=stride)
+        self.shortcut = shortcut
+    def forward(self, inputs):
+        """Return relu(shortcut(inputs) + conv1(conv0(inputs)))."""
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+        y = short + conv1
+        return fluid.layers.relu(y)
+class BottleneckBlock(fluid.dygraph.Layer):
+    """1x1 -> 3x3 -> 1x1 ConvBNLayers plus a residual connection, followed
+    by ReLU.
+
+    Args:
+        num_channels (int): number of input channels.
+        num_filters (int): channels of the first two convs; the block
+            outputs num_filters * expansion channels.
+        stride (int): stride applied in the 3x3 conv and in the projection
+            shortcut.
+        shortcut (bool): True adds the block input directly; False adds a
+            1x1 ConvBNLayer projection of the input. Default: True.
+    """
+    # Output channels are num_filters * expansion.
+    expansion = 4
+    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+        super(BottleneckBlock, self).__init__()
+        self.conv0 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=1,
+            act='relu')
+        self.conv1 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act='relu')
+        # Last conv has no activation; ReLU is applied after the add.
+        self.conv2 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters * self.expansion,
+            filter_size=1,
+            act=None)
+        if not shortcut:
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=num_filters * self.expansion,
+                filter_size=1,
+                stride=stride)
+        self.shortcut = shortcut
+        self._num_channels_out = num_filters * self.expansion
+    def forward(self, inputs):
+        """Return relu(shortcut(inputs) + conv2(conv1(conv0(inputs))))."""
+        x = self.conv0(inputs)
+        conv1 = self.conv1(x)
+        conv2 = self.conv2(conv1)
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+        x = fluid.layers.elementwise_add(x=short, y=conv2)
+        return fluid.layers.relu(x)
+class ResNet(Model):
+    """ResNet model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+    Args:
+        Block (BasicBlock|BottleneckBlock): block module of model.
+        depth (int): layers of resnet, one of 18, 34, 50, 101 or 152. Default: 50.
+        num_classes (int): output dim of last fc layer. If num_classes <= 0, last fc layer
+                            will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
+    """
+    def __init__(self,
+                 Block,
+                 depth=50,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
+        super(ResNet, self).__init__()
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+        # Number of residual blocks in each of the four stages, per depth.
+        layer_config = {
+            18: [2, 2, 2, 2],
+            34: [3, 4, 6, 3],
+            50: [3, 4, 6, 3],
+            101: [3, 4, 23, 3],
+            152: [3, 8, 36, 3],
+        }
+        assert depth in layer_config.keys(), \
+            "supported depth are {} but input layer is {}".format(
+                layer_config.keys(), depth)
+        layers = layer_config[depth]
+        in_channels = 64
+        out_channels = [64, 128, 256, 512]
+        # Stem: 7x7 stride-2 conv-bn-relu followed by a 3x3 stride-2 max pool.
+        self.conv = ConvBNLayer(
+            num_channels=3,
+            num_filters=64,
+            filter_size=7,
+            stride=2,
+            act='relu')
+        self.pool = Pool2D(
+            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+        self.layers = []
+        for idx, num_blocks in enumerate(layers):
+            blocks = []
+            # Only the first block of a stage projects its skip path.
+            shortcut = False
+            for b in range(num_blocks):
+                if b == 1:
+                    # From the second block on, the input is the expanded
+                    # output of the previous block of this stage.
+                    in_channels = out_channels[idx] * Block.expansion
+                block = Block(
+                    num_channels=in_channels,
+                    num_filters=out_channels[idx],
+                    stride=2 if b == 0 and idx != 0 else 1,
+                    shortcut=shortcut)
+                blocks.append(block)
+                shortcut = True
+            layer = self.add_sublayer("layer_{}".format(idx),
+                                      Sequential(*blocks))
+            self.layers.append(layer)
+        if with_pool:
+            self.global_pool = Pool2D(
+                pool_size=7, pool_type='avg', global_pooling=True)
+        if num_classes > 0:
+            stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0)
+            self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1
+            self.fc = Linear(
+                self.fc_input_dim,
+                num_classes,
+                act=classifier_activation,
+                param_attr=fluid.param_attr.ParamAttr(
+                    initializer=fluid.initializer.Uniform(-stdv, stdv)))
+    def forward(self, inputs):
+        """Run the stem, the four residual stages and the optional pool / fc head."""
+        x = self.conv(inputs)
+        x = self.pool(x)
+        for layer in self.layers:
+            x = layer(x)
+        if self.with_pool:
+            x = self.global_pool(x)
+        # Must mirror the `num_classes > 0` test in __init__: `fc` and
+        # `fc_input_dim` only exist in that case, so num_classes == 0 has to
+        # return the features instead of touching the missing head.
+        if self.num_classes > 0:
+            x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim])
+            x = self.fc(x)
+        return x
+def _resnet(arch, Block, depth, pretrained):
+    """Build a 1000-class ResNet and optionally load its pre-trained weights.
+    Args:
+        arch (str): key looked up in ``model_urls`` when ``pretrained`` is set.
+        Block (BasicBlock|BottleneckBlock): block module of the model.
+        depth (int): layers of resnet.
+        pretrained (bool): If True, fetch the weights registered for ``arch``
+            and load them into the model.
+    """
+    model = ResNet(Block, depth, num_classes=1000, with_pool=True)
+    if not pretrained:
+        return model
+    assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
+        arch)
+    url = model_urls[arch][0]
+    md5sum = model_urls[arch][1]
+    weight_path = get_weights_path(url, md5sum)
+    assert weight_path.endswith(
+        '.pdparams'), "suffix of weight must be .pdparams"
+    # model.load is given the path without its '.pdparams' suffix.
+    model.load(weight_path[:-len('.pdparams')])
+    return model
+def resnet18(pretrained=False):
+    """Construct the 18-layer ResNet, built from ``BasicBlock``.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet.
+            Default: False.
+    """
+    return _resnet(
+        arch='resnet18', Block=BasicBlock, depth=18, pretrained=pretrained)
+def resnet34(pretrained=False):
+    """Construct the 34-layer ResNet, built from ``BasicBlock``.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet.
+            Default: False.
+    """
+    return _resnet(
+        arch='resnet34', Block=BasicBlock, depth=34, pretrained=pretrained)
+def resnet50(pretrained=False):
+    """Construct the 50-layer ResNet, built from ``BottleneckBlock``.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet.
+            Default: False.
+    """
+    return _resnet(
+        arch='resnet50',
+        Block=BottleneckBlock,
+        depth=50,
+        pretrained=pretrained)
+def resnet101(pretrained=False):
+    """Construct the 101-layer ResNet, built from ``BottleneckBlock``.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet.
+            Default: False.
+    """
+    return _resnet(
+        arch='resnet101',
+        Block=BottleneckBlock,
+        depth=101,
+        pretrained=pretrained)
+def resnet152(pretrained=False):
+    """Construct the 152-layer ResNet, built from ``BottleneckBlock``.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet.
+            Default: False.
+    """
+    return _resnet(
+        arch='resnet152',
+        Block=BottleneckBlock,
+        depth=152,
+        pretrained=pretrained)
--- a/hapi/vision/models/tsm.py
+++ b/hapi/vision/models/tsm.py
+#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import math
+import paddle.fluid as fluid
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from hapi.model import Model
+from hapi.download import get_weights_path
+# Public names exported by this module.
+__all__ = ["TSM_ResNet", "tsm_resnet50"]
+# Pre-trained weight files keyed by ResNet layer number:
+# {num_layers: (download url, md5 of the weight file)}
+pretrain_infos = {
+    50: ('https://paddlemodels.bj.bcebos.com/hapi/tsm_resnet50.pdparams',
+         '5755dc538e422589f417f7b38d7cc3c7')
+}
+class ConvBNLayer(fluid.dygraph.Layer):
+    """Bias-free 2D convolution followed by batch normalization.
+    Args:
+        num_channels (int): channel number of the input.
+        num_filters (int): filter number of the convolution.
+        filter_size (int): kernel size; padding is ``(filter_size - 1) // 2``.
+        stride (int): stride of the convolution. Default: 1.
+        groups (int): group number of the convolution. Default: 1.
+        act (str|None): activation applied by the batch-norm layer.
+            Default: None.
+    """
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+        self._conv = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            # Forward the caller's value; it used to be hard-coded to None,
+            # which silently discarded the `groups` argument.
+            groups=groups,
+            act=None,
+            param_attr=fluid.param_attr.ParamAttr(),
+            bias_attr=False)
+        # The activation, if any, is applied by the batch-norm layer.
+        self._batch_norm = BatchNorm(
+            num_filters,
+            act=act,
+            param_attr=fluid.param_attr.ParamAttr(),
+            bias_attr=fluid.param_attr.ParamAttr())
+    def forward(self, inputs):
+        """Run ``inputs`` through the convolution and then the batch-norm layer."""
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        return y
+class BottleneckBlock(fluid.dygraph.Layer):
+    """Bottleneck residual block whose main path starts with a temporal shift.
+    Args:
+        num_channels (int): channel number of the block input.
+        num_filters (int): filter number of the first two convolutions; the
+            last convolution outputs ``num_filters * 4`` channels.
+        stride (int): stride of the 3x3 convolution and, when it exists, of
+            the 1x1 projection on the skip path.
+        shortcut (bool): if True the input is added back unchanged, otherwise
+            it first goes through a 1x1 conv-bn projection. Default: True.
+        seg_num (int): segment number handed to ``temporal_shift``. Default: 8.
+    """
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True,
+                 seg_num=8):
+        super(BottleneckBlock, self).__init__()
+        expanded_filters = num_filters * 4
+        self.shortcut = shortcut
+        self.seg_num = seg_num
+        self._num_channels_out = int(expanded_filters)
+        self.conv0 = ConvBNLayer(
+            act='relu',
+            filter_size=1,
+            num_channels=num_channels,
+            num_filters=num_filters)
+        self.conv1 = ConvBNLayer(
+            act='relu',
+            filter_size=3,
+            stride=stride,
+            num_channels=num_filters,
+            num_filters=num_filters)
+        self.conv2 = ConvBNLayer(
+            act=None,
+            filter_size=1,
+            num_channels=num_filters,
+            num_filters=expanded_filters)
+        if not self.shortcut:
+            # Projection that lets the skip path match the main path.
+            self.short = ConvBNLayer(
+                filter_size=1,
+                stride=stride,
+                num_channels=num_channels,
+                num_filters=expanded_filters)
+    def forward(self, inputs):
+        """Shift, convolve, then add the (un-shifted) skip path with a relu."""
+        # Only the main path sees the shifted tensor; the skip path below
+        # still consumes the original `inputs`.
+        shifted = fluid.layers.temporal_shift(inputs, self.seg_num, 1.0 / 8)
+        residual = self.conv2(self.conv1(self.conv0(shifted)))
+        identity = inputs if self.shortcut else self.short(inputs)
+        return fluid.layers.elementwise_add(
+            x=identity, y=residual, act="relu")
+class TSM_ResNet(Model):
+    """
+    TSM network built on a ResNet backbone.
+    Args:
+        num_layers (int): ResNet layer number; 50 is the only supported
+            value, anything else raises NotImplementedError. Default 50.
+        seg_num (int): segment number of each video sample. Default 8.
+        num_classes (int): video class number. Default 400.
+    """
+    def __init__(self, num_layers=50, seg_num=8, num_classes=400):
+        super(TSM_ResNet, self).__init__()
+        self.layers = num_layers
+        self.seg_num = seg_num
+        self.class_dim = num_classes
+        if self.layers != 50:
+            raise NotImplementedError
+        # Blocks per stage and the matching base filter numbers.
+        depth = [3, 4, 6, 3]
+        num_filters = [64, 128, 256, 512]
+        self.conv = ConvBNLayer(
+            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
+        self.pool2d_max = Pool2D(
+            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+        self.bottleneck_block_list = []
+        num_channels = 64
+        for stage, (num_blocks, filters) in enumerate(zip(depth, num_filters)):
+            for i in range(num_blocks):
+                is_first = (i == 0)
+                bottleneck_block = self.add_sublayer(
+                    'bb_%d_%d' % (stage, i),
+                    BottleneckBlock(
+                        num_channels=num_channels,
+                        num_filters=filters,
+                        # Every stage but the first downsamples in its
+                        # first block.
+                        stride=2 if is_first and stage != 0 else 1,
+                        # Only the first block of a stage projects its
+                        # skip path.
+                        shortcut=not is_first,
+                        seg_num=self.seg_num))
+                num_channels = int(bottleneck_block._num_channels_out)
+                self.bottleneck_block_list.append(bottleneck_block)
+        self.pool2d_avg = Pool2D(
+            pool_size=7, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(2048 * 1.0)
+        self.out = Linear(
+            2048,
+            self.class_dim,
+            act="softmax",
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Uniform(-stdv, stdv)),
+            bias_attr=fluid.param_attr.ParamAttr(
+                learning_rate=2.0, regularizer=fluid.regularizer.L2Decay(0.)))
+    def forward(self, inputs):
+        """Classify ``inputs``; its dims 2, 3 and 4 are kept, the rest merged."""
+        dim2, dim3, dim4 = inputs.shape[2], inputs.shape[3], inputs.shape[4]
+        # Fold the leading dims together so the 2D backbone can run on it.
+        y = fluid.layers.reshape(inputs, [-1, dim2, dim3, dim4])
+        y = self.pool2d_max(self.conv(y))
+        for block in self.bottleneck_block_list:
+            y = block(y)
+        y = self.pool2d_avg(y)
+        y = fluid.layers.dropout(y, dropout_prob=0.5)
+        # Regroup by seg_num and average over that axis.
+        y = fluid.layers.reshape(y, [-1, self.seg_num, y.shape[1]])
+        y = fluid.layers.reduce_mean(y, dim=1)
+        y = fluid.layers.reshape(y, shape=[-1, 2048])
+        return self.out(y)
+def _tsm_resnet(num_layers, seg_num=8, num_classes=400, pretrained=True):
+    """Build a TSM_ResNet and optionally load the weights from ``pretrain_infos``.
+    Args:
+        num_layers (int): ResNet layer number, also the ``pretrain_infos`` key.
+        seg_num (int): segment number of each video sample. Default 8.
+        num_classes (int): video class number. Default 400.
+        pretrained (bool): If True, fetch and load the registered weights.
+            Default True.
+    """
+    model = TSM_ResNet(num_layers, seg_num, num_classes)
+    if not pretrained:
+        return model
+    assert num_layers in pretrain_infos, \
+            "TSM-ResNet{} do not have pretrained weights now, " \
+            "pretrained should be set as False".format(num_layers)
+    url, md5sum = pretrain_infos[num_layers]
+    weight_path = get_weights_path(url, md5sum)
+    assert weight_path.endswith('.pdparams'), \
+            "suffix of weight must be .pdparams"
+    # model.load is given the path without its '.pdparams' suffix.
+    model.load(weight_path[:-len('.pdparams')])
+    return model
+def tsm_resnet50(seg_num=8, num_classes=400, pretrained=True):
+    """TSM model with a 50-layer ResNet backbone
+    Args:
+        seg_num (int): segment number of each video sample. Default 8.
+        num_classes (int): video class number. Default 400.
+        pretrained (bool): If True, returns a model loaded with the
+            pre-trained weights registered in ``pretrain_infos``,
+            default True
+    """
+    return _tsm_resnet(
+        num_layers=50,
+        seg_num=seg_num,
+        num_classes=num_classes,
+        pretrained=pretrained)
--- a/hapi/vision/models/vgg.py
+++ b/hapi/vision/models/vgg.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.fluid.dygraph.container import Sequential
+from hapi.model import Model
+from hapi.download import get_weights_path
+# Public names exported by this module.
+__all__ = [
+    'VGG',
+    'vgg11',
+    'vgg13',
+    'vgg16',
+    'vgg19',
+]
+# Pre-trained weight files keyed by architecture name; each value is the
+# pair of arguments handed to get_weights_path. Architectures missing here
+# have no pre-trained weights.
+model_urls = {
+    'vgg16': ('https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams',
+              'c788f453a3b999063e8da043456281ee')
+}
+class Classifier(fluid.dygraph.Layer):
+    """Three-layer fully connected head used by VGG.
+    Args:
+        num_classes (int): output dim of the last linear layer.
+        classifier_activation (str): activation of the last linear layer.
+            Default: 'softmax'.
+    """
+    def __init__(self, num_classes, classifier_activation='softmax'):
+        super(Classifier, self).__init__()
+        hidden_dim = 4096
+        self.linear1 = Linear(512 * 7 * 7, hidden_dim)
+        self.linear2 = Linear(hidden_dim, hidden_dim)
+        self.linear3 = Linear(hidden_dim, num_classes, act=classifier_activation)
+    def forward(self, x):
+        """Apply linear -> relu -> dropout twice, then the output layer."""
+        for hidden in (self.linear1, self.linear2):
+            x = hidden(x)
+            x = fluid.layers.relu(x)
+            x = fluid.layers.dropout(x, 0.5)
+        return self.linear3(x)
+class VGG(Model):
+    """VGG model from
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
+    Args:
+        features (fluid.dygraph.Layer): vgg feature extractor created by the function make_layers.
+        num_classes (int): output dim of last fc layer. If num_classes <= 0, the classifier
+                            will not be defined and forward returns the features. Default: 1000.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
+    """
+    def __init__(self,
+                 features,
+                 num_classes=1000,
+                 classifier_activation='softmax'):
+        super(VGG, self).__init__()
+        self.features = features
+        self.num_classes = num_classes
+        if num_classes > 0:
+            head = Sequential(Classifier(num_classes, classifier_activation))
+            self.classifier = self.add_sublayer("classifier", head)
+    def forward(self, x):
+        """Extract features and, when a classifier exists, classify them."""
+        x = self.features(x)
+        if self.num_classes <= 0:
+            return x
+        flat = fluid.layers.flatten(x, 1)
+        return self.classifier(flat)
+def make_layers(cfg, batch_norm=False):
+    """Turn a VGG configuration list into a Sequential feature extractor.
+    Args:
+        cfg (list): each entry is either 'M', which adds a 2x2 stride-2
+            Pool2D, or the filter number of a 3x3 convolution.
+        batch_norm (bool): If True, every convolution is followed by a
+            BatchNorm carrying the relu; otherwise the convolution applies
+            the relu itself. Default: False.
+    """
+    layers = []
+    in_channels = 3
+    for v in cfg:
+        if v == 'M':
+            layers.append(Pool2D(pool_size=2, pool_stride=2))
+            continue
+        if batch_norm:
+            layers.append(Conv2D(in_channels, v, filter_size=3, padding=1))
+            layers.append(BatchNorm(v, act='relu'))
+        else:
+            layers.append(
+                Conv2D(
+                    in_channels, v, filter_size=3, padding=1, act='relu'))
+        # The next convolution consumes this one's output channels.
+        in_channels = v
+    return Sequential(*layers)
+# Layer configurations consumed by make_layers: an int is the filter number
+# of a 3x3 convolution, 'M' inserts a pooling layer.
+# 'A', 'B', 'D' and 'E' are the configurations used by vgg11, vgg13, vgg16
+# and vgg19 respectively.
+cfgs = {
+    'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'B':
+    [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'D': [
+        64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M',
+        512, 512, 512, 'M'
+    ],
+    'E': [
+        64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512,
+        512, 'M', 512, 512, 512, 512, 'M'
+    ],
+}
+def _vgg(arch, cfg, batch_norm, pretrained, **kwargs):
+    """Build a 1000-class VGG and optionally load its pre-trained weights.
+    Args:
+        arch (str): key looked up in ``model_urls`` when ``pretrained`` is set.
+        cfg (str): key of the layer configuration in ``cfgs``.
+        batch_norm (bool): forwarded to make_layers.
+        pretrained (bool): If True, fetch the weights registered for ``arch``
+            and load them into the model.
+        **kwargs: extra keyword arguments forwarded to the VGG constructor.
+    """
+    features = make_layers(cfgs[cfg], batch_norm=batch_norm)
+    model = VGG(features, num_classes=1000, **kwargs)
+    if not pretrained:
+        return model
+    assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
+        arch)
+    url = model_urls[arch][0]
+    md5sum = model_urls[arch][1]
+    weight_path = get_weights_path(url, md5sum)
+    assert weight_path.endswith(
+        '.pdparams'), "suffix of weight must be .pdparams"
+    # model.load is given the path without its '.pdparams' suffix.
+    model.load(weight_path[:-len('.pdparams')])
+    return model
+def vgg11(pretrained=False, batch_norm=False):
+    """Construct the 11-layer VGG (configuration 'A')
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
+    """
+    # The '_bn' variant is registered under its own name in model_urls.
+    model_name = 'vgg11_bn' if batch_norm else 'vgg11'
+    return _vgg(
+        arch=model_name,
+        cfg='A',
+        batch_norm=batch_norm,
+        pretrained=pretrained)
+def vgg13(pretrained=False, batch_norm=False):
+    """Construct the 13-layer VGG (configuration 'B')
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
+    """
+    # The '_bn' variant is registered under its own name in model_urls.
+    model_name = 'vgg13_bn' if batch_norm else 'vgg13'
+    return _vgg(
+        arch=model_name,
+        cfg='B',
+        batch_norm=batch_norm,
+        pretrained=pretrained)
+def vgg16(pretrained=False, batch_norm=False):
+    """Construct the 16-layer VGG (configuration 'D')
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
+    """
+    # The '_bn' variant is registered under its own name in model_urls.
+    model_name = 'vgg16_bn' if batch_norm else 'vgg16'
+    return _vgg(
+        arch=model_name,
+        cfg='D',
+        batch_norm=batch_norm,
+        pretrained=pretrained)
+def vgg19(pretrained=False, batch_norm=False):
+    """Construct the 19-layer VGG (configuration 'E')
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
+    """
+    # The '_bn' variant is registered under its own name in model_urls.
+    model_name = 'vgg19_bn' if batch_norm else 'vgg19'
+    return _vgg(
+        arch=model_name,
+        cfg='E',
+        batch_norm=batch_norm,
+        pretrained=pretrained)
--- a/hapi/vision/models/yolov3.py
+++ b/hapi/vision/models/yolov3.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import division
+from __future__ import print_function
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import Conv2D
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.regularizer import L2Decay
+from hapi.model import Model, Loss
+from hapi.download import get_weights_path
+from .darknet import darknet53, ConvBNLayer
+# Public names exported by this module.
+__all__ = ['YoloLoss', 'YOLOv3', 'yolov3_darknet53']
+# Pre-trained weight files keyed by DarkNet layer number:
+# {num_layers: (download url, md5 of the weight file)}
+pretrain_infos = {
+    53: ('https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams',
+         'aed7dd45124ff2e844ae3bd5ba6c91d2')
+}
+class YoloDetectionBlock(fluid.dygraph.Layer):
+    """Stack of alternating 1x1 / 3x3 conv-bn layers used by each YOLOv3 head.
+    Args:
+        ch_in (int): channel number of the block input.
+        channel (int): output channels of the 1x1 layers; the 3x3 layers
+            output ``channel * 2``. Must be divisible by 2.
+    """
+    def __init__(self, ch_in, channel):
+        super(YoloDetectionBlock, self).__init__()
+        assert channel % 2 == 0, \
+            "channel {} cannot be divided by 2".format(channel)
+        wide = channel * 2
+        self.conv0 = ConvBNLayer(
+            ch_in=ch_in, ch_out=channel, filter_size=1, stride=1, padding=0)
+        self.conv1 = ConvBNLayer(
+            ch_in=channel, ch_out=wide, filter_size=3, stride=1, padding=1)
+        self.conv2 = ConvBNLayer(
+            ch_in=wide, ch_out=channel, filter_size=1, stride=1, padding=0)
+        self.conv3 = ConvBNLayer(
+            ch_in=channel, ch_out=wide, filter_size=3, stride=1, padding=1)
+        self.route = ConvBNLayer(
+            ch_in=wide, ch_out=channel, filter_size=1, stride=1, padding=0)
+        self.tip = ConvBNLayer(
+            ch_in=channel, ch_out=wide, filter_size=3, stride=1, padding=1)
+    def forward(self, inputs):
+        """Return ``(route, tip)``, where ``tip`` is computed from ``route``."""
+        out = inputs
+        for conv in (self.conv0, self.conv1, self.conv2, self.conv3):
+            out = conv(out)
+        route = self.route(out)
+        return route, self.tip(route)
+class YOLOv3(Model):
+    """YOLOv3 model from
+    `"YOLOv3: An Incremental Improvement" <https://arxiv.org/abs/1804.02767>`_
+    Args:
+        num_classes (int): class number, default 80.
+        model_mode (str): 'train', 'eval' or 'test' mode (case-insensitive).
+            The network structure differs in the output layer and data: in
+            'train' mode no output layer is appended; in 'eval' and 'test'
+            mode the output feature maps are decoded to predictions by
+            'fluid.layers.yolo_box'. 'eval' mode returns feature maps and
+            predictions, 'test' mode only returns predictions.
+            Default 'train'.
+    """
+    def __init__(self, num_classes=80, model_mode='train'):
+        super(YOLOv3, self).__init__()
+        self.num_classes = num_classes
+        assert str.lower(model_mode) in ['train', 'eval', 'test'], \
+            "model_mode should be 'train' 'eval' or 'test', but got " \
+            "{}".format(model_mode)
+        self.model_mode = str.lower(model_mode)
+        # Flat list of anchor (width, height) pairs; anchor_masks picks the
+        # three anchors used by each of the three output scales.
+        self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
+                        59, 119, 116, 90, 156, 198, 373, 326]
+        self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+        self.valid_thresh = 0.005
+        self.nms_thresh = 0.45
+        self.nms_topk = 400
+        self.nms_posk = 100
+        self.draw_thresh = 0.5
+        # Compare the normalized mode so that e.g. 'Train', which passes the
+        # case-insensitive check above, still gets the pre-trained backbone.
+        self.backbone = darknet53(pretrained=(self.model_mode == 'train'))
+        self.block_outputs = []
+        self.yolo_blocks = []
+        self.route_blocks = []
+        for idx, num_chan in enumerate([1024, 768, 384]):
+            # NOTE: the "detecton" spelling is part of the sublayer name and
+            # therefore of the saved parameter names; do not "fix" it.
+            yolo_block = self.add_sublayer(
+                "yolo_detecton_block_{}".format(idx),
+                YoloDetectionBlock(num_chan, 512 // (2**idx)))
+            self.yolo_blocks.append(yolo_block)
+            # Per anchor: num_classes class scores plus 5 more values.
+            num_filters = len(self.anchor_masks[idx]) * (self.num_classes + 5)
+            block_out = self.add_sublayer(
+                "block_out_{}".format(idx),
+                Conv2D(num_channels=1024 // (2**idx),
+                       num_filters=num_filters,
+                       filter_size=1,
+                       act=None,
+                       param_attr=ParamAttr(
+                           initializer=fluid.initializer.Normal(0., 0.02)),
+                       bias_attr=ParamAttr(
+                           initializer=fluid.initializer.Constant(0.0),
+                           regularizer=L2Decay(0.))))
+            self.block_outputs.append(block_out)
+            if idx < 2:
+                # The last scale has no following scale to feed.
+                route = self.add_sublayer(
+                    "route2_{}".format(idx),
+                    ConvBNLayer(ch_in=512 // (2**idx),
+                                ch_out=256 // (2**idx),
+                                filter_size=1,
+                                act='leaky_relu'))
+                self.route_blocks.append(route)
+    def forward(self, img_id, img_shape, inputs):
+        """Run the detector.
+        Args:
+            img_id: image ids, returned unchanged as the first prediction
+                entry in 'eval' and 'test' mode.
+            img_shape: image sizes handed to 'fluid.layers.yolo_box'.
+            inputs: image batch fed to the backbone.
+        Returns:
+            'train' mode: list of the raw output feature maps.
+            'test' mode: ``[img_id, nms_result]``.
+            'eval' mode: the feature maps followed by ``[img_id, nms_result]``.
+        """
+        outputs = []
+        boxes = []
+        scores = []
+        downsample = 32
+        feats = self.backbone(inputs)
+        route = None
+        for idx, feat in enumerate(feats):
+            if idx > 0:
+                # Merge the upsampled route of the previous scale.
+                feat = fluid.layers.concat(input=[route, feat], axis=1)
+            route, tip = self.yolo_blocks[idx](feat)
+            block_out = self.block_outputs[idx](tip)
+            outputs.append(block_out)
+            if idx < 2:
+                route = self.route_blocks[idx](route)
+                route = fluid.layers.resize_nearest(route, scale=2)
+            if self.model_mode != 'train':
+                # Collect the (w, h) pairs of the anchors used at this scale.
+                anchor_mask = self.anchor_masks[idx]
+                mask_anchors = []
+                for m in anchor_mask:
+                    mask_anchors.append(self.anchors[2 * m])
+                    mask_anchors.append(self.anchors[2 * m + 1])
+                b, s = fluid.layers.yolo_box(
+                    x=block_out,
+                    img_size=img_shape,
+                    anchors=mask_anchors,
+                    class_num=self.num_classes,
+                    conf_thresh=self.valid_thresh,
+                    downsample_ratio=downsample)
+                boxes.append(b)
+                scores.append(fluid.layers.transpose(s, perm=[0, 2, 1]))
+            # Each following scale halves the downsample ratio.
+            downsample //= 2
+        if self.model_mode == 'train':
+            return outputs
+        preds = [img_id,
+                 fluid.layers.multiclass_nms(
+                    bboxes=fluid.layers.concat(boxes, axis=1),
+                    scores=fluid.layers.concat(scores, axis=2),
+                    score_threshold=self.valid_thresh,
+                    nms_top_k=self.nms_topk,
+                    keep_top_k=self.nms_posk,
+                    nms_threshold=self.nms_thresh,
+                    background_label=-1)]
+        if self.model_mode == 'test':
+            return preds
+        # model_mode == "eval"
+        return outputs + preds
+class YoloLoss(Loss):
+    """YOLOv3 loss computed with 'fluid.layers.yolov3_loss' at every scale.
+    Args:
+        num_classes (int): class number, default 80.
+        num_max_boxes (int): stored on the instance as ``num_max_boxes``;
+            not read by ``forward``. Default 50.
+    """
+    def __init__(self, num_classes=80, num_max_boxes=50):
+        super(YoloLoss, self).__init__()
+        self.num_classes = num_classes
+        self.num_max_boxes = num_max_boxes
+        self.ignore_thresh = 0.7
+        # Flat list of anchor (width, height) pairs and, per output scale,
+        # the indices of the anchors used at that scale.
+        self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
+                        59, 119, 116, 90, 156, 198, 373, 326]
+        self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+    def forward(self, outputs, labels):
+        """Return one mean loss per output scale.
+        Args:
+            outputs (list): model outputs; only the leading entries that have
+                an anchor mask (the feature maps) are used.
+            labels (list|tuple): ``(gt_box, gt_label, gt_score)``.
+        Returns:
+            list: the reduced loss of every scale, in scale order.
+        """
+        downsample = 32
+        gt_box, gt_label, gt_score = labels
+        losses = []
+        # Bounding the loop by anchor_masks replaces the old hard-coded
+        # `idx == 3` break: in 'eval' mode YOLOv3 appends prediction entries
+        # after the feature maps, and those must not be fed to the loss.
+        for anchor_mask, out in zip(self.anchor_masks, outputs):
+            loss = fluid.layers.yolov3_loss(
+                x=out,
+                gt_box=gt_box,
+                gt_label=gt_label,
+                gt_score=gt_score,
+                anchor_mask=anchor_mask,
+                downsample_ratio=downsample,
+                anchors=self.anchors,
+                class_num=self.num_classes,
+                ignore_thresh=self.ignore_thresh,
+                use_label_smooth=True)
+            loss = fluid.layers.reduce_mean(loss)
+            losses.append(loss)
+            # Each following scale halves the downsample ratio.
+            downsample //= 2
+        return losses
+def _yolov3_darknet(num_layers=53, num_classes=80,
+                    model_mode='train', pretrained=True):
+    """Build a YOLOv3 and optionally load the weights from ``pretrain_infos``.
+    Args:
+        num_layers (int): DarkNet layer number, used as the
+            ``pretrain_infos`` key. Default 53.
+        num_classes (int): class number, default 80.
+        model_mode (str): 'train', 'eval' or 'test'. Default 'train'.
+        pretrained (bool): If True, fetch and load the registered weights.
+            Default True.
+    """
+    model = YOLOv3(num_classes, model_mode)
+    if not pretrained:
+        return model
+    assert num_layers in pretrain_infos, \
+            "YOLOv3-DarkNet{} do not have pretrained weights now, " \
+            "pretrained should be set as False".format(num_layers)
+    url, md5sum = pretrain_infos[num_layers]
+    weight_path = get_weights_path(url, md5sum)
+    assert weight_path.endswith('.pdparams'), \
+            "suffix of weight must be .pdparams"
+    # model.load is given the path without its '.pdparams' suffix.
+    model.load(weight_path[:-len('.pdparams')])
+    return model
+def yolov3_darknet53(num_classes=80, model_mode='train', pretrained=True):
+    """YOLOv3 model with a 53-layer DarkNet backbone
+    Args:
+        num_classes (int): class number, default 80.
+        model_mode (str): 'train', 'eval' or 'test' mode. The network
+            structure differs in the output layer and data: in 'train'
+            mode no output layer is appended; in 'eval' and 'test' mode
+            the output feature maps are decoded to predictions by
+            'fluid.layers.yolo_box'. 'eval' mode returns feature maps and
+            predictions, 'test' mode only returns predictions.
+            Default 'train'.
+        pretrained (bool): If True, returns a model with pre-trained model
+            on COCO, default True
+    """
+    return _yolov3_darknet(
+        num_layers=53,
+        num_classes=num_classes,
+        model_mode=model_mode,
+        pretrained=pretrained)
--- a/vision/transforms/__init__.py
+++ b/vision/transforms/__init__.py
--- a/vision/transforms/detection_transforms.py
+++ b/vision/transforms/detection_transforms.py
--- a/vision/transforms/functional.py
+++ b/vision/transforms/functional.py
--- a/vision/transforms/transforms.py
+++ b/vision/transforms/transforms.py
@@ -129,7 +129,7 @@ class Resize(object):
        self.size = size
        self.interpolation = interpolation
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        """
        Args:
            img (PIL Image): Image to be scaled.
@@ -137,7 +137,7 @@ class Resize(object):
        Returns:
            PIL Image: Rescaled image.
        """
-        return F.resize(img, self.size, self.interpolation)
+        return F.resize(img, self.size, self.interpolation), lbl
 class RandomResizedCrop(object):
@@ -199,10 +199,10 @@ class RandomResizedCrop(object):
        y = (height - h) // 2
        return x, y, w, h
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        x, y, w, h = self._get_params(img)
        cropped_img = img[y:y + h, x:x + w]
-        return F.resize(cropped_img, self.output_size, self.interpolation)
+        return F.resize(cropped_img, self.output_size, self.interpolation), lbl
 class CenterCropResize(object):
@@ -230,10 +230,10 @@ class CenterCropResize(object):
        y = (w + 1 - c) // 2
        return c, x, y
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        c, x, y = self._get_params(img)
        cropped_img = img[x:x + c, y:y + c, :]
-        return F.resize(cropped_img, self.size, self.interpolation)
+        return F.resize(cropped_img, self.size, self.interpolation), lbl
 class CenterCrop(object):
@@ -257,10 +257,10 @@ class CenterCrop(object):
        y = int(round((h - th) / 2.0))
        return x, y
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        x, y = self._get_params(img)
        th, tw = self.output_size
-        return img[y:y + th, x:x + tw]
+        return img[y:y + th, x:x + tw], lbl
 class RandomHorizontalFlip(object):
@@ -273,10 +273,10 @@ class RandomHorizontalFlip(object):
    def __init__(self, prob=0.5):
        self.prob = prob
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        if np.random.random() < self.prob:
-            return F.flip(img, code=1)
+            return F.flip(img, code=1), lbl
-        return img
+        return img, lbl
 class RandomVerticalFlip(object):
@@ -289,10 +289,10 @@ class RandomVerticalFlip(object):
    def __init__(self, prob=0.5):
        self.prob = prob
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        if np.random.random() < self.prob:
-            return F.flip(img, code=0)
+            return F.flip(img, code=0), lbl
-        return img
+        return img, lbl
 class Normalize(object):
@@ -317,8 +317,8 @@ class Normalize(object):
        self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1)
        self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1)
-    def __call__(self, img):
+    def __call__(self, img, lbl):
-        return (img - self.mean) / self.std
+        return (img - self.mean) / self.std, lbl
 class Permute(object):
@@ -337,10 +337,10 @@ class Permute(object):
        ], "Only support 'CHW' mode, but received mode: {}".format(mode)
        self.mode = mode
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        if self.mode == "CHW":
-            return img.transpose((2, 0, 1))[::-1, ...]
+            return img.transpose((2, 0, 1))[::-1, ...], lbl
-        return img
+        return img, lbl
 class GaussianNoise(object):
@@ -356,11 +356,11 @@ class GaussianNoise(object):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        dtype = img.dtype
        noise = np.random.normal(self.mean, self.std, img.shape) * 255
        img = img + noise.astype(np.float32)
-        return np.clip(img, 0, 255).astype(dtype)
+        return np.clip(img, 0, 255).astype(dtype), lbl
 class BrightnessTransform(object):
@@ -376,15 +376,15 @@ class BrightnessTransform(object):
            raise ValueError("brightness value should be non-negative")
        self.value = value
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        if self.value == 0:
-            return img
+            return img, lbl
        dtype = img.dtype
        img = img.astype(np.float32)
        alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
        img = img * alpha
-        return img.clip(0, 255).astype(dtype)
+        return img.clip(0, 255).astype(dtype), lbl
 class ContrastTransform(object):
@@ -400,16 +400,16 @@ class ContrastTransform(object):
            raise ValueError("contrast value should be non-negative")
        self.value = value
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        if self.value == 0:
-            return img
+            return img, lbl
        dtype = img.dtype
        img = img.astype(np.float32)
        alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
        img = img * alpha + cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).mean() * (
            1 - alpha)
-        return img.clip(0, 255).astype(dtype)
+        return img.clip(0, 255).astype(dtype), lbl
 class SaturationTransform(object):
@@ -425,9 +425,9 @@ class SaturationTransform(object):
            raise ValueError("saturation value should be non-negative")
        self.value = value
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        if self.value == 0:
-            return img
+            return img, lbl
        dtype = img.dtype
        img = img.astype(np.float32)
@@ -435,7 +435,7 @@ class SaturationTransform(object):
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray_img = gray_img[..., np.newaxis]
        img = img * alpha + gray_img * (1 - alpha)
-        return img.clip(0, 255).astype(dtype)
+        return img.clip(0, 255).astype(dtype), lbl
 class HueTransform(object):
@@ -451,9 +451,9 @@ class HueTransform(object):
            raise ValueError("hue value should be in [0.0, 0.5]")
        self.value = value
-    def __call__(self, img):
+    def __call__(self, img, lbl):
        if self.value == 0:
-            return img
+            return img, lbl
        dtype = img.dtype
        img = img.astype(np.uint8)
@@ -466,7 +466,7 @@ class HueTransform(object):
        with np.errstate(over="ignore"):
            h += np.uint8(alpha * 255)
        hsv_img = cv2.merge([h, s, v])
-        return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype)
+        return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype), lbl
 class ColorJitter(object):
@@ -501,5 +501,5 @@ class ColorJitter(object):
        random.shuffle(transforms)
        self.transforms = Compose(transforms)
-    def __call__(self, img):
+    def __call__(self, img, lbl):
-        return self.transforms(img)
+        return self.transforms(img, lbl)