diff --git a/.gitignore b/.gitignore
index aaa0630fc9b187a1a4760bce1e15b0302764eabf..336502e288e2e76551751ed44a2677716efea7d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@
 *.json
 output*
 *checkpoint*
+build
+dist
+hapi.egg-info
diff --git a/callbacks.py b/callbacks.py
deleted file mode 100644
index 66690cf288efe8ba0d8dcc9eec64031674c8a18b..0000000000000000000000000000000000000000
--- a/callbacks.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import six
-import copy
-
-from progressbar import ProgressBar
-from paddle.fluid.dygraph.parallel import ParallelEnv
-
-
-def config_callbacks(callbacks=None,
-                     model=None,
-                     batch_size=None,
-                     epochs=None,
-                     steps=None,
-                     log_freq=2,
-                     verbose=2,
-                     save_freq=1,
-                     save_dir=None,
-                     metrics=None,
-                     mode='train'):
-    cbks = callbacks or []
-    cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks]
-    if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose:
-        cbks = cbks + [ProgBarLogger(log_freq, verbose=verbose)]
-
-    if not any(isinstance(k, ModelCheckpoint) for k in cbks):
-        cbks = cbks + [ModelCheckpoint(save_freq, save_dir)]
-
-    cbk_list = CallbackList(cbks)
-    cbk_list.set_model(model)
-    metrics = metrics or [] if mode != 'test' else []
-    params = {
-        'batch_size': batch_size,
-        'epochs': epochs,
-        'steps': steps,
-        'verbose': verbose,
-        'metrics': metrics,
-    }
-    cbk_list.set_params(params)
-    return cbk_list
-
-
-class CallbackList(object):
-    def __init__(self, callbacks=None):
-        # copy
-        self.callbacks = [c for c in callbacks]
-        self.params = {}
-        self.model = None
-
-    def append(self, callback):
-        self.callbacks.append(callback)
-
-    def __iter__(self):
-        return iter(self.callbacks)
-
-    def set_params(self, params):
-        for c in self.callbacks:
-            c.set_params(params)
-
-    def set_model(self, model):
-        for c in self.callbacks:
-            c.set_model(model)
-
-    def _call(self, name, *args):
-        for c in self.callbacks:
-            func = getattr(c, name)
-            func(*args)
-
-    def _check_mode(self, mode):
-        assert mode in ['train', 'eval', 'test'], \
-            'mode should be train, eval or test'
-
-    def on_begin(self, mode, logs=None):
-        self._check_mode(mode)
-        name = 'on_{}_begin'.format(mode)
-        self._call(name, logs)
-
-    def on_end(self, mode, logs=None):
-        self._check_mode(mode)
-        name = 'on_{}_end'.format(mode)
-        self._call(name, logs)
-
-    def on_epoch_begin(self, epoch=None, logs=None):
-        self._call('on_epoch_begin', epoch, logs)
-
-    def on_epoch_end(self, epoch=None, logs=None):
-        self._call('on_epoch_end', epoch, logs)
-
-    def on_batch_begin(self, mode, step=None, logs=None):
-        self._check_mode(mode)
-        name = 'on_{}_batch_begin'.format(mode)
-        self._call(name, step, logs)
-
-    def on_batch_end(self, mode, step=None, logs=None):
-        self._check_mode(mode)
-        name = 'on_{}_batch_end'.format(mode)
-        self._call(name, step, logs)
-
-
-class Callback(object):
-    def __init__(self):
-        self.model = None
-        self.params = {}
-
-    def set_params(self, params):
-        self.params = params
-
-    def set_model(self, model):
-        self.model = model
-
-    def on_train_begin(self, logs=None):
-        """
-        """
-
-    def on_train_end(self, logs=None):
-        """
-        """
-
-    def on_eval_begin(self, logs=None):
-        """
-        """
-
-    def on_eval_end(self, logs=None):
-        """
-        """
-
-    def on_test_begin(self, logs=None):
-        """
-        """
-
-    def on_test_end(self, logs=None):
-        """
-        """
-
-    def on_epoch_begin(self, epoch, logs=None):
-        """
-        """
-
-    def on_epoch_end(self, epoch, logs=None):
-        """
-        """
-
-    def on_train_batch_begin(self, step, logs=None):
-        """
-        """
-
-    def on_train_batch_end(self, step, logs=None):
-        """
-        """
-
-    def on_eval_batch_begin(self, step, logs=None):
-        """
-        """
-
-    def on_eval_batch_end(self, step, logs=None):
-        """
-        """
-
-    def on_eval_batch_begin(self, step, logs=None):
-        """
-        """
-
-    def on_eval_batch_end(self, step, logs=None):
-        """
-        """
-
-
-class ProgBarLogger(Callback):
-    def __init__(self, log_freq=1, verbose=2):
-        self.epochs = None
-        self.steps = None
-        self.progbar = None
-        self.verbose = verbose
-        self.log_freq = log_freq
-
-    def on_train_begin(self, logs=None):
-        self.epochs = self.params['epochs']
-        assert self.epochs
-        self.train_metrics = self.params['metrics']
-        assert self.train_metrics
-
-    def on_epoch_begin(self, epoch=None, logs=None):
-        self.steps = self.params['steps']
-        self.epoch = epoch
-        self.train_step = 0
-        if self.verbose and self.epochs and ParallelEnv().local_rank == 0:
-            print('Epoch %d/%d' % (epoch + 1, self.epochs))
-        self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose)
-
-    def _updates(self, logs, mode):
-        values = []
-        metrics = getattr(self, '%s_metrics' % (mode))
-        progbar = getattr(self, '%s_progbar' % (mode))
-        steps = getattr(self, '%s_step' % (mode))
-        for k in metrics:
-            if k in logs:
-                values.append((k, logs[k]))
-        progbar.update(steps, values)
-
-    def on_train_batch_end(self, step, logs=None):
-        logs = logs or {}
-        self.train_step += 1
-
-        if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
-        ).local_rank == 0:
-            # if steps is not None, last step will update in on_epoch_end
-            if self.steps and self.train_step < self.steps:
-                self._updates(logs, 'train')
-            else:
-                self._updates(logs, 'train')
-
-    def on_epoch_end(self, epoch, logs=None):
-        logs = logs or {}
-        if self.verbose and ParallelEnv().local_rank == 0:
-            self._updates(logs, 'train')
-
-    def on_eval_begin(self, logs=None):
-        self.eval_steps = logs.get('steps', None)
-        self.eval_metrics = logs.get('metrics_name', [])
-        self.eval_step = 0
-        self.evaled_samples = 0
-        self.eval_progbar = ProgressBar(
-            num=self.eval_steps, verbose=self.verbose)
-        if ParallelEnv().local_rank == 0:
-            print('Eval begin...')
-
-    def on_eval_batch_end(self, step, logs=None):
-        logs = logs or {}
-        self.eval_step = step
-        samples = logs.get('batch_size', 1)
-        self.evaled_samples += samples
-
-        if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
-        ).local_rank == 0:
-            # if steps is not None, last step will update in on_epoch_end
-            if self.eval_steps and self.eval_step < self.eval_steps:
-                self._updates(logs, 'eval')
-
-    def on_eval_end(self, logs=None):
-        logs = logs or {}
-        if self.verbose and ParallelEnv().local_rank == 0:
-            self._updates(logs, 'eval')
-            print('Eval samples: %d' % (self.evaled_samples))
-
-
-class ModelCheckpoint(Callback):
-    def __init__(self, save_freq=1, save_dir=None):
-        self.save_freq = save_freq
-        self.save_dir = save_dir
-
-    def on_epoch_begin(self, epoch=None, logs=None):
-        self.epoch = epoch
-
-    def _is_save(self):
-        return self.model and self.save_dir and ParallelEnv().local_rank == 0
-
-    def on_epoch_end(self, epoch, logs=None):
-        if self._is_save() and self.epoch % self.save_freq == 0:
-            path = '{}/{}'.format(self.save_dir, epoch)
-            print('save checkpoint at {}'.format(path))
-            self.model.save(path)
-
-    def on_train_end(self, logs=None):
-        if self._is_save():
-            path = '{}/final'.format(self.save_dir)
-            print('save checkpoint at {}'.format(path))
-            self.model.save(path)
diff --git a/cyclegan/data.py b/cyclegan/data.py
index effa4eeee12a7a4905f3cc40687d8349601bc6c6..b4c0a1f011a38fba5e84b8bc669eee90f073bb0f 100644
--- a/cyclegan/data.py
+++ b/cyclegan/data.py
@@ -30,7 +30,7 @@ IMAGES_ROOT = "./data/" + DATASET + "/"
 import paddle.fluid as fluid
 
 
-class Cityscapes(fluid.io.Dataset):
+class Cityscapes(paddle.io.Dataset):
     def __init__(self, root_path, file_path, mode='train', return_name=False):
         self.root_path = root_path
         self.file_path = file_path
diff --git a/cyclegan/train.py b/cyclegan/train.py
index c2203fc19c8e0381fa27bde26a22a863130532e9..d4273a3304a2cd6673f89afdea01ee61e379a568 100644
--- a/cyclegan/train.py
+++ b/cyclegan/train.py
@@ -86,13 +86,13 @@ def main():
     if FLAGS.resume:
         g.load(FLAGS.resume)
 
-    loader_A = fluid.io.DataLoader(
+    loader_A = paddle.io.DataLoader(
         data.DataA(),
         places=place,
         shuffle=True,
         return_list=True,
         batch_size=FLAGS.batch_size)
-    loader_B = fluid.io.DataLoader(
+    loader_B = paddle.io.DataLoader(
         data.DataB(),
         places=place,
         shuffle=True,
diff --git a/distributed.py b/distributed.py
deleted file mode 100644
index 87818545671c45cf4faba234406e87762e897784..0000000000000000000000000000000000000000
--- a/distributed.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-import six
-import time
-import math
-import socket
-import contextlib
-import numpy as np
-
-from paddle import fluid
-from paddle.fluid.layers import collective
-from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
-from paddle.fluid.io import BatchSampler
-
-_parallel_context_initialized = False
-
-
-class DistributedBatchSampler(BatchSampler):
-    """Sampler that restricts data loading to a subset of the dataset.
-
-    In such case, each process can pass a DistributedBatchSampler instance 
-    as a DataLoader sampler, and load a subset of the original dataset that 
-    is exclusive to it.
-
-    .. note::
-        Dataset is assumed to be of constant size.
-        
-    Args:
-        data_source: this could be a `fluid.io.Dataset` implement
-                     or other python object which implemented
-                     `__len__` for BatchSampler to get sample
-                     number of data source.
-        batch_size(int): sample indice number in a mini-batch indices.
-        shuffle(bool): whther to shuffle indices order before genrating
-            batch indices. Default False.
-        drop_last(bool): whether drop the last incomplete batch dataset size
-            is not divisible by the batch size. Default False
-    """
-
-    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
-        self.dataset = dataset
-
-        assert isinstance(batch_size, int) and batch_size > 0, \
-                "batch_size should be a positive integer"
-        self.batch_size = batch_size
-        assert isinstance(shuffle, bool), \
-                "shuffle should be a boolean value"
-        self.shuffle = shuffle
-        assert isinstance(drop_last, bool), \
-                "drop_last should be a boolean number"
-
-        self.drop_last = drop_last
-        self.nranks = ParallelEnv().nranks
-        self.local_rank = ParallelEnv().local_rank
-        self.epoch = 0
-        self.num_samples = int(
-            math.ceil(len(self.dataset) * 1.0 / self.nranks))
-        self.total_size = self.num_samples * self.nranks
-
-    def __iter__(self):
-        num_samples = len(self.dataset)
-        indices = np.arange(num_samples).tolist()
-        indices += indices[:(self.total_size - len(indices))]
-        assert len(indices) == self.total_size
-        if self.shuffle:
-            np.random.RandomState(self.epoch).shuffle(indices)
-            self.epoch += 1
-
-        # subsample
-        def _get_indices_by_batch_size(indices):
-            subsampled_indices = []
-            last_batch_size = self.total_size % (self.batch_size * self.nranks)
-            assert last_batch_size % self.nranks == 0
-            last_local_batch_size = last_batch_size // self.nranks
-
-            for i in range(self.local_rank * self.batch_size,
-                           len(indices) - last_batch_size,
-                           self.batch_size * self.nranks):
-                subsampled_indices.extend(indices[i:i + self.batch_size])
-
-            indices = indices[len(indices) - last_batch_size:]
-            subsampled_indices.extend(indices[
-                self.local_rank * last_local_batch_size:(
-                    self.local_rank + 1) * last_local_batch_size])
-            return subsampled_indices
-
-        if self.nranks > 1:
-            indices = _get_indices_by_batch_size(indices)
-
-        assert len(indices) == self.num_samples
-        _sample_iter = iter(indices)
-
-        batch_indices = []
-        for idx in _sample_iter:
-            batch_indices.append(idx)
-            if len(batch_indices) == self.batch_size:
-                yield batch_indices
-                batch_indices = []
-        if not self.drop_last and len(batch_indices) > 0:
-            yield batch_indices
-
-    def __len__(self):
-        num_samples = self.num_samples
-        num_samples += int(not self.drop_last) * (self.batch_size - 1)
-        return num_samples // self.batch_size
-
-    def set_epoch(self, epoch):
-        self.epoch = epoch
-
-
-def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
-    return collective._c_allgather(
-        x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)
-
-
-def wait_server_ready(endpoints):
-    assert not isinstance(endpoints, six.string_types)
-    while True:
-        all_ok = True
-        not_ready_endpoints = []
-        for ep in endpoints:
-            ip_port = ep.split(":")
-            with contextlib.closing(
-                    socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-                sock.settimeout(2)
-                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
-                if result != 0:
-                    all_ok = False
-                    not_ready_endpoints.append(ep)
-        if not all_ok:
-            time.sleep(3)
-        else:
-            break
-
-
-def init_communicator(program, rank, nranks, wait_port, current_endpoint,
-                      endpoints):
-    if nranks < 2:
-        return
-    other_endpoints = endpoints[:]
-    other_endpoints.remove(current_endpoint)
-    if rank == 0 and wait_port:
-        wait_server_ready(other_endpoints)
-    block = program.global_block()
-    nccl_id_var = block.create_var(
-        name=fluid.unique_name.generate('nccl_id'),
-        persistable=True,
-        type=fluid.core.VarDesc.VarType.RAW)
-
-    block.append_op(
-        type='c_gen_nccl_id',
-        inputs={},
-        outputs={'Out': nccl_id_var},
-        attrs={
-            'rank': rank,
-            'endpoint': current_endpoint,
-            'other_endpoints': other_endpoints
-        })
-
-    block.append_op(
-        type='c_comm_init',
-        inputs={'X': nccl_id_var},
-        outputs={},
-        attrs={
-            'nranks': nranks,
-            'rank': rank,
-            'ring_id': 0,
-        })
-
-
-def prepare_distributed_context(place=None):
-    if place is None:
-        place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \
-            else fluid.CUDAPlace(0)
-
-    strategy = ParallelStrategy()
-    strategy.nranks = ParallelEnv().nranks
-    strategy.local_rank = ParallelEnv().local_rank
-    strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
-    strategy.current_endpoint = ParallelEnv().current_endpoint
-
-    if strategy.nranks < 2:
-        return
-
-    global _parallel_context_initialized
-
-    if not _parallel_context_initialized and isinstance(place,
-                                                        fluid.CUDAPlace):
-
-        def _init_context():
-            communicator_prog = fluid.Program()
-            init_communicator(communicator_prog, strategy.local_rank,
-                              strategy.nranks, True, strategy.current_endpoint,
-                              strategy.trainer_endpoints)
-            exe = fluid.Executor(place)
-            exe.run(communicator_prog)
-
-        if fluid.in_dygraph_mode():
-            fluid.disable_dygraph()
-            _init_context()
-            fluid.enable_dygraph(place)
-        else:
-            _init_context()
-
-    else:
-        assert ("Only support CUDAPlace for now.")
-
-    _parallel_context_initialized = True
-    return strategy
diff --git a/bmn/BMN.png b/examples/bmn/BMN.png
similarity index 100%
rename from bmn/BMN.png
rename to examples/bmn/BMN.png
diff --git a/bmn/README.md b/examples/bmn/README.md
similarity index 100%
rename from bmn/README.md
rename to examples/bmn/README.md
diff --git a/bmn/bmn.yaml b/examples/bmn/bmn.yaml
similarity index 100%
rename from bmn/bmn.yaml
rename to examples/bmn/bmn.yaml
diff --git a/bmn/bmn_metric.py b/examples/bmn/bmn_metric.py
similarity index 99%
rename from bmn/bmn_metric.py
rename to examples/bmn/bmn_metric.py
index a19f87c6b42b8737ddeb52c3a330f59dcc932004..f9bf101f825913572803fbb1168260f83a0d96ac 100644
--- a/bmn/bmn_metric.py
+++ b/examples/bmn/bmn_metric.py
@@ -20,7 +20,7 @@ import json
 
 sys.path.append('../')
 
-from metrics import Metric
+from hapi.metrics import Metric
 from bmn_utils import boundary_choose, bmn_post_processing
 
 
diff --git a/bmn/bmn_utils.py b/examples/bmn/bmn_utils.py
similarity index 69%
rename from bmn/bmn_utils.py
rename to examples/bmn/bmn_utils.py
index 06812e636fdaf6ccc419ca58151402ab50082112..cccf50647a55fabdfe94dd0f1f7e1370e15d0fe2 100644
--- a/bmn/bmn_utils.py
+++ b/examples/bmn/bmn_utils.py
@@ -162,56 +162,3 @@ def bmn_post_processing(video_dict, subset, output_path, result_path):
     outfile.close()
 
 
-def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
-                           num_sample_perbin):
-    """ generate sample mask for a boundary-matching pair """
-    plen = float(seg_xmax - seg_xmin)
-    plen_sample = plen / (num_sample * num_sample_perbin - 1.0)
-    total_samples = [
-        seg_xmin + plen_sample * ii
-        for ii in range(num_sample * num_sample_perbin)
-    ]
-    p_mask = []
-    for idx in range(num_sample):
-        bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *
-                                    num_sample_perbin]
-        bin_vector = np.zeros([tscale])
-        for sample in bin_samples:
-            sample_upper = math.ceil(sample)
-            sample_decimal, sample_down = math.modf(sample)
-            if int(sample_down) <= (tscale - 1) and int(sample_down) >= 0:
-                bin_vector[int(sample_down)] += 1 - sample_decimal
-            if int(sample_upper) <= (tscale - 1) and int(sample_upper) >= 0:
-                bin_vector[int(sample_upper)] += sample_decimal
-        bin_vector = 1.0 / num_sample_perbin * bin_vector
-        p_mask.append(bin_vector)
-    p_mask = np.stack(p_mask, axis=1)
-    return p_mask
-
-
-def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,
-                      num_sample_perbin):
-    """ generate sample mask for each point in Boundary-Matching Map """
-    mask_mat = []
-    for start_index in range(tscale):
-        mask_mat_vector = []
-        for duration_index in range(dscale):
-            if start_index + duration_index < tscale:
-                p_xmin = start_index
-                p_xmax = start_index + duration_index
-                center_len = float(p_xmax - p_xmin) + 1
-                sample_xmin = p_xmin - center_len * prop_boundary_ratio
-                sample_xmax = p_xmax + center_len * prop_boundary_ratio
-                p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,
-                                                tscale, num_sample,
-                                                num_sample_perbin)
-            else:
-                p_mask = np.zeros([tscale, num_sample])
-            mask_mat_vector.append(p_mask)
-        mask_mat_vector = np.stack(mask_mat_vector, axis=2)
-        mask_mat.append(mask_mat_vector)
-    mask_mat = np.stack(mask_mat, axis=3)
-    mask_mat = mask_mat.astype(np.float32)
-
-    sample_mask = np.reshape(mask_mat, [tscale, -1])
-    return sample_mask
diff --git a/bmn/config_utils.py b/examples/bmn/config_utils.py
similarity index 100%
rename from bmn/config_utils.py
rename to examples/bmn/config_utils.py
diff --git a/bmn/eval.py b/examples/bmn/eval.py
similarity index 97%
rename from bmn/eval.py
rename to examples/bmn/eval.py
index d25fc5c79d21fd55743def09445db5821e3e93af..ae6ef6d49e73d1d7c866be4d4346f2b13d262fab 100644
--- a/bmn/eval.py
+++ b/examples/bmn/eval.py
@@ -18,11 +18,9 @@ import sys
 import logging
 import paddle.fluid as fluid
 
-sys.path.append('../')
-
-from model import set_device, Input
+from hapi.model import set_device, Input
+from hapi.vision.models import BMN, BmnLoss
 from bmn_metric import BmnMetric
-from bmn_model import BMN, BmnLoss
 from reader import BmnDataset
 from config_utils import *
 
diff --git a/bmn/eval_anet_prop.py b/examples/bmn/eval_anet_prop.py
similarity index 100%
rename from bmn/eval_anet_prop.py
rename to examples/bmn/eval_anet_prop.py
diff --git a/bmn/infer.list b/examples/bmn/infer.list
similarity index 100%
rename from bmn/infer.list
rename to examples/bmn/infer.list
diff --git a/bmn/predict.py b/examples/bmn/predict.py
similarity index 97%
rename from bmn/predict.py
rename to examples/bmn/predict.py
index e52927b60562425a1f03cfea12ab6cb21e76b3ef..2fce373b87645e933f5e434128346e9d8898fc2d 100644
--- a/bmn/predict.py
+++ b/examples/bmn/predict.py
@@ -18,11 +18,9 @@ import os
 import logging
 import paddle.fluid as fluid
 
-sys.path.append('../')
-
-from model import set_device, Input
+from hapi.model import set_device, Input
+from hapi.vision.models import BMN, BmnLoss
 from bmn_metric import BmnMetric
-from bmn_model import BMN, BmnLoss
 from reader import BmnDataset
 from config_utils import *
 
diff --git a/bmn/reader.py b/examples/bmn/reader.py
similarity index 99%
rename from bmn/reader.py
rename to examples/bmn/reader.py
index e1c1da592e932b6ddbd19476e994f4267ae2f927..c58518055e9af178a85c1960d147cfafe69f7701 100644
--- a/bmn/reader.py
+++ b/examples/bmn/reader.py
@@ -22,7 +22,7 @@ import sys
 sys.path.append('../')
 
 from distributed import DistributedBatchSampler
-from paddle.fluid.io import Dataset, DataLoader
+from paddle.io import Dataset, DataLoader
 
 logger = logging.getLogger(__name__)
 
diff --git a/bmn/run.sh b/examples/bmn/run.sh
similarity index 100%
rename from bmn/run.sh
rename to examples/bmn/run.sh
diff --git a/bmn/train.py b/examples/bmn/train.py
similarity index 98%
rename from bmn/train.py
rename to examples/bmn/train.py
index fe46f6a607c6ab8f93be45ffeee11478ef862eb6..bca44177ab7d27eef57660a00eed2218c1422aaa 100644
--- a/bmn/train.py
+++ b/examples/bmn/train.py
@@ -18,10 +18,8 @@ import logging
 import sys
 import os
 
-sys.path.append('../')
-
-from model import set_device, Input
-from bmn_model import BMN, BmnLoss
+from hapi.model import set_device, Input
+from hapi.vision.models import BMN, BmnLoss
 from reader import BmnDataset
 from config_utils import *
 
diff --git a/image_classification/README.MD b/examples/image_classification/README.MD
similarity index 100%
rename from image_classification/README.MD
rename to examples/image_classification/README.MD
diff --git a/image_classification/imagenet_dataset.py b/examples/image_classification/imagenet_dataset.py
similarity index 90%
rename from image_classification/imagenet_dataset.py
rename to examples/image_classification/imagenet_dataset.py
index 158093b3aa9380547490ac5da2386695dd71dd33..6572df01440a36c21330cc905da045e03ff79700 100644
--- a/image_classification/imagenet_dataset.py
+++ b/examples/image_classification/imagenet_dataset.py
@@ -18,8 +18,8 @@ import math
 import random
 import numpy as np
 
-from datasets.folder import DatasetFolder
-from transform import transforms
+from hapi.datasets import DatasetFolder
+from hapi.vision.transforms import transforms
 from paddle import fluid
 
 
@@ -45,7 +45,8 @@ class ImageNetDataset(DatasetFolder):
     def __getitem__(self, idx):
         img_path, label = self.samples[idx]
         img = cv2.imread(img_path).astype(np.float32)
-        return self.transform(img), [label]
+        label = np.array([label])
+        return self.transform(img, label)
 
     def __len__(self):
         return len(self.samples)
diff --git a/image_classification/main.py b/examples/image_classification/main.py
similarity index 96%
rename from image_classification/main.py
rename to examples/image_classification/main.py
index 781824fa60f9d703187697825595d81889b9c53c..546991528631909d5f75caec4df96c63053e7fdb 100644
--- a/image_classification/main.py
+++ b/examples/image_classification/main.py
@@ -24,15 +24,17 @@ sys.path.append('../')
 import time
 import math
 import numpy as np
-import models
+
 import paddle.fluid as fluid
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.io import BatchSampler, DataLoader
+
+from hapi.model import CrossEntropy, Input, set_device
+from hapi.distributed import DistributedBatchSampler
+from hapi.metrics import Accuracy
+import hapi.vision.models as models
 
-from model import CrossEntropy, Input, set_device
 from imagenet_dataset import ImageNetDataset
-from distributed import DistributedBatchSampler
-from paddle.fluid.dygraph.parallel import ParallelEnv
-from metrics import Accuracy
-from paddle.fluid.io import BatchSampler, DataLoader
 
 
 def make_optimizer(step_per_epoch, parameter_list=None):
diff --git a/tsm/README.md b/examples/tsm/README.md
similarity index 100%
rename from tsm/README.md
rename to examples/tsm/README.md
diff --git a/tsm/check.py b/examples/tsm/check.py
similarity index 100%
rename from tsm/check.py
rename to examples/tsm/check.py
diff --git a/tsm/dataset/README.md b/examples/tsm/dataset/README.md
similarity index 100%
rename from tsm/dataset/README.md
rename to examples/tsm/dataset/README.md
diff --git a/tsm/dataset/kinetics/generate_label.py b/examples/tsm/dataset/kinetics/generate_label.py
similarity index 100%
rename from tsm/dataset/kinetics/generate_label.py
rename to examples/tsm/dataset/kinetics/generate_label.py
diff --git a/tsm/dataset/kinetics/video2pkl.py b/examples/tsm/dataset/kinetics/video2pkl.py
similarity index 100%
rename from tsm/dataset/kinetics/video2pkl.py
rename to examples/tsm/dataset/kinetics/video2pkl.py
diff --git a/tsm/images/temporal_shift.png b/examples/tsm/images/temporal_shift.png
similarity index 100%
rename from tsm/images/temporal_shift.png
rename to examples/tsm/images/temporal_shift.png
diff --git a/tsm/infer.py b/examples/tsm/infer.py
similarity index 97%
rename from tsm/infer.py
rename to examples/tsm/infer.py
index 78dbe2cc6ab92dc2a85fee8f186b1b1ae8d74fdd..3de1c8438fe3f35be3a527950e0fa65705defe77 100644
--- a/tsm/infer.py
+++ b/examples/tsm/infer.py
@@ -19,8 +19,8 @@ import os
 import argparse
 import numpy as np
 
-from model import Input, set_device
-from models import tsm_resnet50
+from hapi.model import Input, set_device
+from hapi.vision.models import tsm_resnet50
 
 from check import check_gpu, check_version
 from kinetics_dataset import KineticsDataset
diff --git a/tsm/kinetics_dataset.py b/examples/tsm/kinetics_dataset.py
similarity index 99%
rename from tsm/kinetics_dataset.py
rename to examples/tsm/kinetics_dataset.py
index 7e07543f37392744a2bf82ecc9b038e78d2d5524..c8570018cfbcf808917f28806ab841da874782d3 100644
--- a/tsm/kinetics_dataset.py
+++ b/examples/tsm/kinetics_dataset.py
@@ -26,7 +26,7 @@ except ImportError:
     import pickle
     from io import BytesIO
 
-from paddle.fluid.io import Dataset
+from paddle.io import Dataset
 
 import logging
 logger = logging.getLogger(__name__)
diff --git a/tsm/main.py b/examples/tsm/main.py
similarity index 97%
rename from tsm/main.py
rename to examples/tsm/main.py
index 07868dbdc43565341b19ef6fe69c693f812c6258..24b37938e82d999bfd046913d0f711bf74650cc3 100644
--- a/tsm/main.py
+++ b/examples/tsm/main.py
@@ -22,9 +22,9 @@ import numpy as np
 from paddle import fluid
 from paddle.fluid.dygraph.parallel import ParallelEnv
 
-from model import Model, CrossEntropy, Input, set_device
-from metrics import Accuracy
-from models import tsm_resnet50
+from hapi.model import Model, CrossEntropy, Input, set_device
+from hapi.metrics import Accuracy
+from hapi.vision.models import tsm_resnet50
 
 from check import check_gpu, check_version
 from kinetics_dataset import KineticsDataset
diff --git a/tsm/transforms.py b/examples/tsm/transforms.py
similarity index 100%
rename from tsm/transforms.py
rename to examples/tsm/transforms.py
diff --git a/examples/yolov3/.gitignore b/examples/yolov3/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..01cdc1cbde3507576115b3a9832be30158e56fe5
--- /dev/null
+++ b/examples/yolov3/.gitignore
@@ -0,0 +1 @@
+dataset/voc*
diff --git a/yolov3/README.md b/examples/yolov3/README.md
similarity index 100%
rename from yolov3/README.md
rename to examples/yolov3/README.md
diff --git a/yolov3/coco_metric.py b/examples/yolov3/coco_metric.py
similarity index 100%
rename from yolov3/coco_metric.py
rename to examples/yolov3/coco_metric.py
diff --git a/yolov3/dataset/download_voc.py b/examples/yolov3/dataset/download_voc.py
similarity index 97%
rename from yolov3/dataset/download_voc.py
rename to examples/yolov3/dataset/download_voc.py
index 8b064ed4034e5fa1471c8094a78266d531d9c111..9877d7cd6b4946c01f58476b6fe81c328005e711 100644
--- a/yolov3/dataset/download_voc.py
+++ b/examples/yolov3/dataset/download_voc.py
@@ -17,7 +17,7 @@ import os.path as osp
 import sys
 import tarfile
 
-from models.download import _download
+from hapi.download import _download
 
 import logging
 logger = logging.getLogger(__name__)
diff --git a/yolov3/image/YOLOv3.jpg b/examples/yolov3/image/YOLOv3.jpg
similarity index 100%
rename from yolov3/image/YOLOv3.jpg
rename to examples/yolov3/image/YOLOv3.jpg
diff --git a/yolov3/image/YOLOv3_structure.jpg b/examples/yolov3/image/YOLOv3_structure.jpg
similarity index 100%
rename from yolov3/image/YOLOv3_structure.jpg
rename to examples/yolov3/image/YOLOv3_structure.jpg
diff --git a/yolov3/image/dog.jpg b/examples/yolov3/image/dog.jpg
similarity index 100%
rename from yolov3/image/dog.jpg
rename to examples/yolov3/image/dog.jpg
diff --git a/yolov3/infer.py b/examples/yolov3/infer.py
similarity index 89%
rename from yolov3/infer.py
rename to examples/yolov3/infer.py
index f19e86615a0b1c8c57f3469f5a5bdcaa85535e9c..cc7cbdc823082520b0adb22d8dcf71ab3b8fab74 100644
--- a/yolov3/infer.py
+++ b/examples/yolov3/infer.py
@@ -22,13 +22,12 @@ from PIL import Image
 
 from paddle import fluid
 from paddle.fluid.optimizer import Momentum
-from paddle.fluid.io import DataLoader
+from paddle.io import DataLoader
 
-from model import Model, Input, set_device
-from models import yolov3_darknet53, YoloLoss
+from hapi.model import Model, Input, set_device
+from hapi.vision.models import yolov3_darknet53, YoloLoss
+from hapi.vision.transforms import *
 
-from coco import COCODataset
-from transforms import *
 from visualizer import draw_bbox
 
 import logging
@@ -65,7 +64,8 @@ def main():
     device = set_device(FLAGS.device)
     fluid.enable_dygraph(device) if FLAGS.dynamic else None
     
-    inputs = [Input([None, 3], 'int32', name='img_info'),
+    inputs = [Input([None, 1], 'int64', name='img_id'),
+              Input([None, 2], 'int32', name='img_shape'),
               Input([None, 3, None, None], 'float32', name='image')]
 
     cat2name = load_labels(FLAGS.label_list, with_background=False)
@@ -87,9 +87,10 @@ def main():
     img -= np.array(IMAGE_MEAN)
     img /= np.array(IMAGE_STD)
     img = img.transpose((2, 0, 1))[np.newaxis, :]
-    img_info = np.array([0, h, w]).astype('int32')[np.newaxis, :]
+    img_id = np.array([0]).astype('int64')[np.newaxis, :]
+    img_shape = np.array([h, w]).astype('int32')[np.newaxis, :]
 
-    _, bboxes = model.test([img_info, img])
+    _, bboxes = model.test([img_id, img_shape, img])
 
     vis_img = draw_bbox(orig_img, cat2name, bboxes, FLAGS.draw_threshold)
     save_name = get_save_image_name(FLAGS.output_dir, FLAGS.infer_image)
diff --git a/yolov3/main.py b/examples/yolov3/main.py
similarity index 96%
rename from yolov3/main.py
rename to examples/yolov3/main.py
index 9730d4fd89ed24cd1039d180a314f674131aea51..ebe85543712d82267eaded26da4c1db8800b735f 100644
--- a/yolov3/main.py
+++ b/examples/yolov3/main.py
@@ -23,15 +23,15 @@ import numpy as np
 
 from paddle import fluid
 from paddle.fluid.optimizer import Momentum
-from paddle.fluid.io import DataLoader
+from paddle.io import DataLoader
 
-from model import Model, Input, set_device
-from distributed import DistributedBatchSampler
-from models import yolov3_darknet53, YoloLoss
+from hapi.model import Model, Input, set_device
+from hapi.distributed import DistributedBatchSampler
+from hapi.datasets import COCODataset
+from hapi.vision.transforms import *
+from hapi.vision.models import yolov3_darknet53, YoloLoss
 
 from coco_metric import COCOMetric
-from coco import COCODataset
-from transforms import *
 
 NUM_MAX_BOXES = 50
 
diff --git a/yolov3/visualizer.py b/examples/yolov3/visualizer.py
similarity index 100%
rename from yolov3/visualizer.py
rename to examples/yolov3/visualizer.py
diff --git a/hapi/__init__.py b/hapi/__init__.py
index 2397ffd8c7c64c2f9eb220edce73a5f8ef786a12..eb3f008db4e690a5cf8999862432bedddbf2ef1c 100644
--- a/hapi/__init__.py
+++ b/hapi/__init__.py
@@ -12,4 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from hapi.configure import Config as Config
+from hapi.configure import Config
+from hapi import callbacks
+from hapi import datasets
+from hapi import distributed
+from hapi import download
+from hapi import metrics
+from hapi import model
+from hapi import progressbar
+from hapi import text
+from hapi import vision
+
+__all__ = [
+    'Config',
+    'callbacks',
+    'datasets',
+    'distributed',
+    'download',
+    'metrics',
+    'model',
+    'progressbar',
+    'text',
+    'vision',
+]
diff --git a/hapi/callbacks.py b/hapi/callbacks.py
index 4e76bf7209d82234f5dc6954002911a07ca55d0f..66690cf288efe8ba0d8dcc9eec64031674c8a18b 100644
--- a/hapi/callbacks.py
+++ b/hapi/callbacks.py
@@ -15,7 +15,7 @@
 import six
 import copy
 
-from hapi.progressbar import ProgressBar
+from hapi.progressbar import ProgressBar
 from paddle.fluid.dygraph.parallel import ParallelEnv
 
 
diff --git a/hapi/datasets/__init__.py b/hapi/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e982a6bc1736aa3db69fab4bc7a74c82a4a7edde
--- /dev/null
+++ b/hapi/datasets/__init__.py
@@ -0,0 +1,18 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .folder import *
+from .mnist import *
+from .flowers import *
+from .coco import *
diff --git a/yolov3/coco.py b/hapi/datasets/coco.py
similarity index 99%
rename from yolov3/coco.py
rename to hapi/datasets/coco.py
index 947f032ff9e988841be29fae7fb32558287c8135..f1ab97281a6e0e20834c33f1e6663903f25349a0 100644
--- a/yolov3/coco.py
+++ b/hapi/datasets/coco.py
@@ -20,7 +20,7 @@ import cv2
 import numpy as np
 from pycocotools.coco import COCO
 
-from paddle.fluid.io import Dataset
+from paddle.io import Dataset
 
 import logging
 logger = logging.getLogger(__name__)
diff --git a/hapi/datasets/flowers.py b/hapi/datasets/flowers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f4f707888d460260d598826ba15ca3c69455f7b
--- /dev/null
+++ b/hapi/datasets/flowers.py
@@ -0,0 +1,129 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import io
+import tarfile
+import numpy as np
+import scipy.io as scio
+from PIL import Image
+
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ["Flowers"]
+
+DATA_URL = 'http://paddlemodels.bj.bcebos.com/flowers/102flowers.tgz'
+LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
+SETID_URL = 'http://paddlemodels.bj.bcebos.com/flowers/setid.mat'
+DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
+LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
+SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+
+# In official 'readme', tstid is the flag of test data
+# and trnid is the flag of train data. But test data is more than train data.
+# So we exchange the train data and test data.
+MODE_FLAG_MAP = {'train': 'tstid', 'test': 'trnid', 'valid': "valid"}
+
+
+class Flowers(Dataset):
+    """
+    Implement of flowers dataset
+
+    Args:
+        data_file(str): path to data file, can be set None if
+            :attr:`download` is True. Default None
+        label_file(str): path to label file, can be set None if
+            :attr:`download` is True. Default None
+        setid_file(str): path to subset index file, can be set
+            None if :attr:`download` is True. Default None
+        mode(str): 'train', 'valid' or 'test' mode. Default 'train'.
+        transform(callable): called as transform(image, label). Default None
+        download(bool): whether to auto download the flowers dataset when
+            a file path above is unset. Default True
+
+    Examples:
+        
+        .. code-block:: python
+
+            from hapi.datasets import Flowers
+
+            flowers = Flowers(mode='test')
+
+            for i in range(len(flowers)):
+                sample = flowers[i]
+                print(sample[0].shape, sample[1])
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 label_file=None,
+                 setid_file=None,
+                 mode='train',
+                 transform=None,
+                 download=True):
+        assert mode.lower() in ['train', 'valid', 'test'], \
+                "mode should be 'train', 'valid' or 'test', but got {}".format(mode)
+        self.flag = MODE_FLAG_MAP[mode.lower()]
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file not set and auto download disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, DATA_URL, DATA_MD5, 'flowers', download)
+
+        self.label_file = label_file
+        if self.label_file is None:
+            assert download, "label_file not set and auto download disabled"
+            self.label_file = _check_exists_and_download(
+                label_file, LABEL_URL, LABEL_MD5, 'flowers', download)
+
+        self.setid_file = setid_file
+        if self.setid_file is None:
+            assert download, "setid_file not set and auto download disabled"
+            self.setid_file = _check_exists_and_download(
+                setid_file, SETID_URL, SETID_MD5, 'flowers', download)
+
+        self.transform = transform
+
+        # read dataset into memory
+        self._load_anno()
+
+    def _load_anno(self):
+        self.name2mem = {}
+        self.data_tar = tarfile.open(self.data_file)
+        for ele in self.data_tar.getmembers():
+            self.name2mem[ele.name] = ele
+
+        self.labels = scio.loadmat(self.label_file)['labels'][0]
+        self.indexes = scio.loadmat(self.setid_file)[self.flag][0]
+
+    def __getitem__(self, idx):
+        index = self.indexes[idx]
+        label = np.array([self.labels[index - 1]])
+        img_name = "jpg/image_%05d.jpg" % index
+        img_ele = self.name2mem[img_name]
+        image = self.data_tar.extractfile(img_ele).read()
+        image = np.array(Image.open(io.BytesIO(image)))
+
+        if self.transform is not None:
+            image, label = self.transform(image, label)
+
+        return image, label
+
+    def __len__(self):
+        return len(self.indexes)
diff --git a/datasets/folder.py b/hapi/datasets/folder.py
similarity index 93%
rename from datasets/folder.py
rename to hapi/datasets/folder.py
index 521939ac8f76a293e98b431e087cd64767e0fc33..5c728a63f8d8b0bf313d94a3d5e5c605686d6451 100644
--- a/datasets/folder.py
+++ b/hapi/datasets/folder.py
@@ -16,7 +16,9 @@ import os
 import sys
 import cv2
 
-from paddle.fluid.io import Dataset
+from paddle.io import Dataset
+
+__all__ = ["DatasetFolder"]
 
 
 def has_valid_extension(filename, extensions):
@@ -76,8 +78,6 @@ class DatasetFolder(Dataset):
             both extensions and is_valid_file should not be passed.
         transform (callable|optional): A function/transform that takes in
             a sample and returns a transformed version.
-        target_transform (callable|optional): A function/transform that takes
-            in the target and transforms it.
         is_valid_file (callable|optional): A function that takes path of a file
             and check if the file is a valid file (used to check of corrupt files)
             both extensions and is_valid_file should not be passed.
@@ -94,11 +94,9 @@ class DatasetFolder(Dataset):
                  loader=None,
                  extensions=None,
                  transform=None,
-                 target_transform=None,
                  is_valid_file=None):
         self.root = root
         self.transform = transform
-        self.target_transform = target_transform
         if extensions is None:
             extensions = IMG_EXTENSIONS
         classes, class_to_idx = self._find_classes(self.root)
@@ -152,9 +150,7 @@ class DatasetFolder(Dataset):
         path, target = self.samples[index]
         sample = self.loader(path)
         if self.transform is not None:
-            sample = self.transform(sample)
-        if self.target_transform is not None:
-            target = self.target_transform(target)
+            sample, target = self.transform(sample, target)
 
         return sample, target
 
diff --git a/hapi/datasets/mnist.py b/hapi/datasets/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..18c62901edb95fd573334a4f3fe2201be7447711
--- /dev/null
+++ b/hapi/datasets/mnist.py
@@ -0,0 +1,156 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import gzip
+import struct
+import numpy as np
+
+import paddle.dataset.common
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ["MNIST"]
+
+URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/'
+TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
+TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
+TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
+TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
+TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
+TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
+TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
+TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
+
+
+class MNIST(Dataset):
+    """
+    Implement of MNIST dataset
+
+    Args:
+        image_path(str): path to image file, can be set None if
+            :attr:`download` is True. Default None
+        label_path(str): path to label file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train' or 'test' mode. Default 'train'.
+        transform(callable): called as transform(image, label). Default None
+        download(bool): whether to auto download the mnist dataset if
+            :attr:`image_path`/:attr:`label_path` is unset. Default True
+
+    Returns:
+        Dataset: MNIST Dataset.
+
+    Examples:
+        
+        .. code-block:: python
+
+            from hapi.datasets import MNIST
+
+            mnist = MNIST(mode='test')
+
+            for i in range(len(mnist)):
+                sample = mnist[i]
+                print(sample[0].shape, sample[1])
+
+    """
+
+    def __init__(self,
+                 image_path=None,
+                 label_path=None,
+                 mode='train',
+                 transform=None,
+                 download=True):
+        assert mode.lower() in ['train', 'test'], \
+                "mode should be 'train' or 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+        # compare the normalized mode so that e.g. 'Train' selects train files
+        is_train = self.mode == 'train'
+        self.image_path = image_path
+        if self.image_path is None:
+            assert download, "image_path not set and auto download disabled"
+            image_url = TRAIN_IMAGE_URL if is_train else TEST_IMAGE_URL
+            image_md5 = TRAIN_IMAGE_MD5 if is_train else TEST_IMAGE_MD5
+            self.image_path = _check_exists_and_download(
+                image_path, image_url, image_md5, 'mnist', download)
+
+        self.label_path = label_path
+        if self.label_path is None:
+            assert download, "label_path not set and auto download disabled"
+            label_url = TRAIN_LABEL_URL if is_train else TEST_LABEL_URL
+            label_md5 = TRAIN_LABEL_MD5 if is_train else TEST_LABEL_MD5
+            self.label_path = _check_exists_and_download(
+                label_path, label_url, label_md5, 'mnist', download)
+
+        self.transform = transform
+        # read dataset into memory
+        self._parse_dataset()
+
+    def _parse_dataset(self, buffer_size=100):
+        self.images = []
+        self.labels = []
+        with gzip.GzipFile(self.image_path, 'rb') as image_file:
+            img_buf = image_file.read()
+            with gzip.GzipFile(self.label_path, 'rb') as label_file:
+                lab_buf = label_file.read()
+
+                step_label = 0
+                offset_img = 0
+                # read from Big-endian
+                # get file info from magic byte
+                # image file : 16B
+                magic_byte_img = '>IIII'
+                magic_img, image_num, rows, cols = struct.unpack_from(
+                    magic_byte_img, img_buf, offset_img)
+                offset_img += struct.calcsize(magic_byte_img)
+
+                offset_lab = 0
+                # label file : 8B
+                magic_byte_lab = '>II'
+                magic_lab, label_num = struct.unpack_from(magic_byte_lab,
+                                                          lab_buf, offset_lab)
+                offset_lab += struct.calcsize(magic_byte_lab)
+
+                while True:
+                    if step_label >= label_num:
+                        break
+                    fmt_label = '>' + str(buffer_size) + 'B'
+                    labels = struct.unpack_from(fmt_label, lab_buf, offset_lab)
+                    offset_lab += struct.calcsize(fmt_label)
+                    step_label += buffer_size
+
+                    fmt_images = '>' + str(buffer_size * rows * cols) + 'B'
+                    images_temp = struct.unpack_from(fmt_images, img_buf,
+                                                     offset_img)
+                    images = np.reshape(images_temp, (buffer_size, rows *
+                                                      cols)).astype('float32')
+                    offset_img += struct.calcsize(fmt_images)
+
+                    images = images / 255.0
+                    images = images * 2.0
+                    images = images - 1.0
+
+                    for i in range(buffer_size):
+                        self.images.append(images[i, :])
+                        self.labels.append(np.array([labels[i]]))
+
+    def __getitem__(self, idx):
+        image, label = self.images[idx], self.labels[idx]
+        if self.transform is not None:
+            image, label = self.transform(image, label)
+        return image, label
+
+    def __len__(self):
+        return len(self.labels)
diff --git a/hapi/datasets/utils.py b/hapi/datasets/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b580dd235739fe2d096a38fed16c8ef4af427ca1
--- /dev/null
+++ b/hapi/datasets/utils.py
@@ -0,0 +1,29 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import paddle.dataset.common
+
+
+def _check_exists_and_download(path, url, md5, module_name, download=True):
+    """ reuse an existing `path`, else download via paddle.dataset.common """
+    if path and os.path.exists(path):
+        return path
+    if not download:
+        # nothing usable on disk and the caller forbids fetching
+        raise FileNotFoundError(
+            '{} not exists and auto download disabled'.format(path))
+    return paddle.dataset.common.download(url, module_name, md5)
diff --git a/hapi/distributed.py b/hapi/distributed.py
index 87818545671c45cf4faba234406e87762e897784..39bf9a35e79792a1f0c9dd23d296730fdc31daf5 100644
--- a/hapi/distributed.py
+++ b/hapi/distributed.py
@@ -23,7 +23,7 @@ import numpy as np
 from paddle import fluid
 from paddle.fluid.layers import collective
 from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
-from paddle.fluid.io import BatchSampler
+from paddle.io import BatchSampler
 
 _parallel_context_initialized = False
 
@@ -39,7 +39,7 @@ class DistributedBatchSampler(BatchSampler):
         Dataset is assumed to be of constant size.
         
     Args:
-        data_source: this could be a `fluid.io.Dataset` implement
+        data_source: this could be a `paddle.io.Dataset` implement
                      or other python object which implemented
                      `__len__` for BatchSampler to get sample
                      number of data source.
diff --git a/models/download.py b/hapi/download.py
similarity index 100%
rename from models/download.py
rename to hapi/download.py
diff --git a/hapi/model.py b/hapi/model.py
index d9451084bdc5e81447eb9b2eb5fc9dbf2cadcabe..3255e614fd80529cdd7ac17ca31604c6815a11c4 100644
--- a/hapi/model.py
+++ b/hapi/model.py
@@ -32,7 +32,7 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
 from paddle.fluid.incubate.fleet.base import role_maker
-from paddle.fluid.io import DataLoader, Dataset
+from paddle.io import DataLoader, Dataset
 
 from hapi.distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
 from hapi.metrics import Metric
@@ -45,6 +45,14 @@ __all__ = [
 
 
 def set_device(device):
+    """
+    Args:
+        device (str): specify device type, 'cpu' or 'gpu'.
+        
+    Returns:
+        fluid.CUDAPlace or fluid.CPUPlace: Created GPU or CPU place.
+    """
+
     assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \
     "Expected device in ['cpu', 'gpu'], but got {}".format(device)
 
@@ -117,9 +125,9 @@ class Loss(object):
     def forward(self, outputs, labels):
         raise NotImplementedError()
 
-    def __call__(self, outputs, labels):
+    def __call__(self, outputs, labels=None):
         labels = to_list(labels)
-        if in_dygraph_mode():
+        if in_dygraph_mode() and labels:
             labels = [to_variable(l) for l in labels]
         losses = to_list(self.forward(to_list(outputs), labels))
         if self.average:
@@ -366,10 +374,27 @@ class StaticGraphAdapter(object):
             metric_list, metric_splits = flatten_list(endpoints['metric'])
             fetch_list = endpoints['loss'] + metric_list
             num_loss = len(endpoints['loss'])
+
+        # if fetch Variable is same as input Variable, do not fetch
+        # from program, get it from input directly
+        pruned_fetch_list = []
+        pruned_fetch_idx_name_map = [""] * len(fetch_list)
+        for i, fetch_var in enumerate(fetch_list):
+            if fetch_var.name in feed.keys():
+                pruned_fetch_idx_name_map[i] = fetch_var.name
+            else:
+                pruned_fetch_list.append(fetch_var)
+
         rets = self._executor.run(compiled_prog,
                                   feed=feed,
-                                  fetch_list=fetch_list,
+                                  fetch_list=pruned_fetch_list,
                                   return_numpy=False)
+
+        # restore pruned fetch_list Variable from feeds
+        for i, name in enumerate(pruned_fetch_idx_name_map):
+            if len(name) > 0:
+                rets.insert(i, feed[name])
+
         # LoDTensor cannot be fetch as numpy directly
         rets = [np.array(v) for v in rets]
         if self.mode == 'test':
@@ -867,8 +892,6 @@ class Model(fluid.dygraph.Layer):
             if not isinstance(inputs, (list, dict, Input)):
                 raise TypeError(
                     "'inputs' must be list or dict in static graph mode")
-            if loss_function and not isinstance(labels, (list, Input)):
-                raise TypeError("'labels' must be list in static graph mode")
 
         metrics = metrics or []
         for metric in to_list(metrics):
@@ -904,11 +927,11 @@ class Model(fluid.dygraph.Layer):
         FIXME: add more comments and usage
         Args:
             train_data (Dataset|DataLoader): An iterable data loader is used for 
-                train. An instance of paddle.fluid.io.Dataset or 
-                paddle.fluid.io.Dataloader is recomended.
+                train. An instance of paddle.io.Dataset or
+                paddle.io.DataLoader is recommended.
             eval_data (Dataset|DataLoader): An iterable data loader is used for
                 evaluation at the end of epoch. If None, will not do evaluation. 
-                An instance of paddle.fluid.io.Dataset or paddle.fluid.io.Dataloader 
+                An instance of paddle.io.Dataset or paddle.io.Dataloader 
                 is recomended.
             batch_size (int): Integer number. The batch size of train_data and eval_data. 
                 When train_data and eval_data are both the instance of Dataloader, this 
@@ -1032,8 +1055,8 @@ class Model(fluid.dygraph.Layer):
         FIXME: add more comments and usage
         Args:
             eval_data (Dataset|DataLoader): An iterable data loader is used for
-                evaluation. An instance of paddle.fluid.io.Dataset or 
-                paddle.fluid.io.Dataloader is recomended.
+                evaluation. An instance of paddle.io.Dataset or 
+                paddle.io.Dataloader is recomended.
             batch_size (int): Integer number. The batch size of train_data and eval_data. 
                 When train_data and eval_data are both the instance of Dataloader, this 
                 parameter will be ignored.
@@ -1098,12 +1121,16 @@ class Model(fluid.dygraph.Layer):
 
         return eval_result
 
-    def predict(self, test_data, batch_size=1, num_workers=0):
+    def predict(self,
+                test_data,
+                batch_size=1,
+                num_workers=0,
+                stack_outputs=True):
         """
         FIXME: add more comments and usage
         Args:
             test_data (Dataset|DataLoader): An iterable data loader is used for
-                predict. An instance of paddle.fluid.io.Dataset or paddle.fluid.io.Dataloader 
+                predict. An instance of paddle.io.Dataset or paddle.io.Dataloader 
                 is recomended.
             batch_size (int): Integer number. The batch size of train_data and eval_data. 
                 When train_data and eval_data are both the instance of Dataloader, this 
@@ -1111,6 +1138,12 @@ class Model(fluid.dygraph.Layer):
             num_workers (int): the number of subprocess to load data, 0 for no subprocess 
                 used and loading data in main process. When train_data and eval_data are
                 both the instance of Dataloader, this parameter will be ignored.
+            stack_outputs (bool): whether to stack each output field like a batch. If an
+                output field of a sample has shape [X, Y] and test_data contains N samples,
+                the predicted field has shape [N, X, Y] when stack_outputs is True, and is
+                a length-N list [[X, Y], [X, Y], ..., [X, Y]] when stack_outputs is False.
+                Use False for LoDTensor outputs, since stacking may lose their detail info;
+                True is recommended if outputs contain no LoDTensor. Default True
         """
 
         if fluid.in_dygraph_mode():
@@ -1137,19 +1170,16 @@ class Model(fluid.dygraph.Layer):
         if not isinstance(test_loader, Iterable):
             loader = test_loader()
 
-        outputs = None
+        outputs = []
         for data in tqdm.tqdm(loader):
-            if not fluid.in_dygraph_mode():
-                data = data[0]
-
-            outs = self.test(*data)
+            data = flatten(data)
+            outputs.append(self.test(data[:len(self._inputs)]))
 
-            if outputs is None:
-                outputs = outs
-            else:
-                outputs = [
-                    np.vstack([x, outs[i]]) for i, x in enumerate(outputs)
-                ]
+        # NOTE: for lod tensor output, we should not stack outputs
+        # for stacking may loss its detail info
+        outputs = list(zip(*outputs))
+        if stack_outputs:
+            outputs = [np.stack(outs, axis=0) for outs in outputs]
 
         self._test_dataloader = None
         if test_loader is not None and self._adapter._nranks > 1 \
@@ -1161,8 +1191,8 @@ class Model(fluid.dygraph.Layer):
         """
         Args:
             eval_data (Dataset|DataLoader|None): An iterable data loader is used for 
-                eval. An instance of paddle.fluid.io.Dataset or 
-                paddle.fluid.io.Dataloader is recomended. 
+                eval. An instance of paddle.io.Dataset or 
+                paddle.io.Dataloader is recomended. 
         """
         assert isinstance(
             eval_data,
diff --git a/hapi/text/bert/dataloader.py b/hapi/text/bert/dataloader.py
index 0f5384b27e8a1539ae24205fbb5080a797608eb7..2cbddac1d266c8ebb26d96f4a3f2a8e81781c562 100644
--- a/hapi/text/bert/dataloader.py
+++ b/hapi/text/bert/dataloader.py
@@ -25,7 +25,7 @@ from functools import partial
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.io import BatchSampler, DataLoader, Dataset
+from paddle.io import BatchSampler, DataLoader, Dataset
 from hapi.distributed import DistributedBatchSampler
 from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
 from hapi.text.bert.batching import prepare_batch_data
diff --git a/hapi/vision/__init__.py b/hapi/vision/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2be76375599071e4b5016f1e4d1cb3f679050e8
--- /dev/null
+++ b/hapi/vision/__init__.py
@@ -0,0 +1,18 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import models
+from . import transforms
+
+__all__ = ["models", "transforms"]
diff --git a/models/__init__.py b/hapi/vision/models/__init__.py
similarity index 89%
rename from models/__init__.py
rename to hapi/vision/models/__init__.py
index 26ad506c20a7395108bb1999806d8667fbb074dd..25148ff23567b8f7a01f44f15248c80fd05e585f 100644
--- a/models/__init__.py
+++ b/hapi/vision/models/__init__.py
@@ -19,6 +19,7 @@ from . import mobilenetv2
 from . import darknet
 from . import yolov3
 from . import tsm
+from . import bmn
 
 from .resnet import *
 from .mobilenetv1 import *
@@ -27,6 +28,7 @@ from .vgg import *
 from .darknet import *
 from .yolov3 import *
 from .tsm import *
+from .bmn import *
 
 __all__ = resnet.__all__ \
         + vgg.__all__ \
@@ -34,4 +36,5 @@ __all__ = resnet.__all__ \
         + mobilenetv2.__all__ \
         + darknet.__all__ \
         + yolov3.__all__ \
-        + tsm.__all__
+        + tsm.__all__ \
+        + bmn.__all__
diff --git a/bmn/bmn_model.py b/hapi/vision/models/bmn.py
similarity index 83%
rename from bmn/bmn_model.py
rename to hapi/vision/models/bmn.py
index dfde7bcc5cdaac8aa5ea3c069f580308b49ec01f..65ce6eaa0b4b2551e3542e623e6193fb25fb32d1 100644
--- a/bmn/bmn_model.py
+++ b/hapi/vision/models/bmn.py
@@ -17,12 +17,68 @@ from paddle.fluid import ParamAttr
 import numpy as np
 import math
 
-from bmn_utils import get_interp1d_mask
-from model import Model, Loss
+from hapi.model import Model, Loss
+
+__all__ = ["BMN", "BmnLoss"]
 
 DATATYPE = 'float32'
 
 
+def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
+                           num_sample_perbin):
+    """ generate sample mask for a boundary-matching pair """
+    plen = float(seg_xmax - seg_xmin)
+    plen_sample = plen / (num_sample * num_sample_perbin - 1.0)
+    total_samples = [
+        seg_xmin + plen_sample * ii
+        for ii in range(num_sample * num_sample_perbin)
+    ]
+    p_mask = []
+    for idx in range(num_sample):
+        bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *
+                                    num_sample_perbin]
+        bin_vector = np.zeros([tscale])
+        for sample in bin_samples:
+            sample_upper = math.ceil(sample)
+            sample_decimal, sample_down = math.modf(sample)
+            if int(sample_down) <= (tscale - 1) and int(sample_down) >= 0:
+                bin_vector[int(sample_down)] += 1 - sample_decimal
+            if int(sample_upper) <= (tscale - 1) and int(sample_upper) >= 0:
+                bin_vector[int(sample_upper)] += sample_decimal
+        bin_vector = 1.0 / num_sample_perbin * bin_vector
+        p_mask.append(bin_vector)
+    p_mask = np.stack(p_mask, axis=1)
+    return p_mask
+
+
+def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,
+                      num_sample_perbin):
+    """ generate sample mask for each point in Boundary-Matching Map """
+    mask_mat = []
+    for start_index in range(tscale):
+        mask_mat_vector = []
+        for duration_index in range(dscale):
+            if start_index + duration_index < tscale:
+                p_xmin = start_index
+                p_xmax = start_index + duration_index
+                center_len = float(p_xmax - p_xmin) + 1
+                sample_xmin = p_xmin - center_len * prop_boundary_ratio
+                sample_xmax = p_xmax + center_len * prop_boundary_ratio
+                p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,
+                                                tscale, num_sample,
+                                                num_sample_perbin)
+            else:
+                p_mask = np.zeros([tscale, num_sample])
+            mask_mat_vector.append(p_mask)
+        mask_mat_vector = np.stack(mask_mat_vector, axis=2)
+        mask_mat.append(mask_mat_vector)
+    mask_mat = np.stack(mask_mat, axis=3)
+    mask_mat = mask_mat.astype(np.float32)
+
+    sample_mask = np.reshape(mask_mat, [tscale, -1])
+    return sample_mask
+
+
 # Net
 class Conv1D(fluid.dygraph.Layer):
     def __init__(self,
diff --git a/models/darknet.py b/hapi/vision/models/darknet.py
similarity index 89%
rename from models/darknet.py
rename to hapi/vision/models/darknet.py
index 095cf7d63c628483b3b0842f4c54d81bba75ceb6..85e25f4e1205ea62ec878409d640ba42e7335ee2 100755
--- a/models/darknet.py
+++ b/hapi/vision/models/darknet.py
@@ -18,10 +18,10 @@ from paddle.fluid.regularizer import L2Decay
 
 from paddle.fluid.dygraph.nn import Conv2D, BatchNorm
 
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 
-__all__ = ['DarkNet53', 'ConvBNLayer', 'darknet53']
+__all__ = ['DarkNet', 'ConvBNLayer', 'darknet53']
 
 # {num_layers: (url, md5)}
 pretrain_infos = {
@@ -136,9 +136,17 @@ class LayerWarp(fluid.dygraph.Layer):
 DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
 
 
-class DarkNet53(Model):
+class DarkNet(Model):
+    """DarkNet model from
+    `"YOLOv3: An Incremental Improvement" <https://arxiv.org/abs/1804.02767>`_
+
+    Args:
+        num_layers (int): layer number of DarkNet, only 53 supported currently, default: 53.
+        ch_in (int): channel number of input data, default 3.
+    """
+
     def __init__(self, num_layers=53, ch_in=3):
-        super(DarkNet53, self).__init__()
+        super(DarkNet, self).__init__()
         assert num_layers in DarkNet_cfg.keys(), \
             "only support num_layers in {} currently" \
             .format(DarkNet_cfg.keys())
@@ -188,7 +196,7 @@ class DarkNet53(Model):
 
 
 def _darknet(num_layers=53, input_channels=3, pretrained=True):
-    model = DarkNet53(num_layers, input_channels)
+    model = DarkNet(num_layers, input_channels)
     if pretrained:
         assert num_layers in pretrain_infos.keys(), \
                 "DarkNet{} do not have pretrained weights now, " \
@@ -201,4 +209,11 @@ def _darknet(num_layers=53, input_channels=3, pretrained=True):
 
 
 def darknet53(input_channels=3, pretrained=True):
+    """DarkNet 53-layer model
+    
+    Args:
+        input_channels (int): channel number of input data, default 3.
+        pretrained (bool): If True, returns a model pre-trained on ImageNet,
+            default True.
+    """
     return _darknet(53, input_channels, pretrained)
diff --git a/models/mobilenetv1.py b/hapi/vision/models/mobilenetv1.py
similarity index 99%
rename from models/mobilenetv1.py
rename to hapi/vision/models/mobilenetv1.py
index 11f8799ac260b825997d54abb682b486ddf7a655..ff27cb9c5d7745361858c3f6ec13e5865fafa605 100644
--- a/models/mobilenetv1.py
+++ b/hapi/vision/models/mobilenetv1.py
@@ -19,8 +19,8 @@ from paddle.fluid.initializer import MSRA
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 
 __all__ = ['MobileNetV1', 'mobilenet_v1']
 
diff --git a/models/mobilenetv2.py b/hapi/vision/models/mobilenetv2.py
similarity index 99%
rename from models/mobilenetv2.py
rename to hapi/vision/models/mobilenetv2.py
index 1d592fb5bcbdd8483aa72f72f4ec2789a88ab557..02db68e569cea06dac876dd3b7bc044cd15542f7 100644
--- a/models/mobilenetv2.py
+++ b/hapi/vision/models/mobilenetv2.py
@@ -18,8 +18,8 @@ import paddle.fluid as fluid
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 
 __all__ = ['MobileNetV2', 'mobilenet_v2']
 
diff --git a/models/resnet.py b/hapi/vision/models/resnet.py
similarity index 99%
rename from models/resnet.py
rename to hapi/vision/models/resnet.py
index 6999fb72ecc4529aab3eb28f5b8273c149c23e90..804cc3534ad4c3cda4f800b41d8567922450e037 100644
--- a/models/resnet.py
+++ b/hapi/vision/models/resnet.py
@@ -22,8 +22,8 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.container import Sequential
 
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 
 __all__ = [
     'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'
diff --git a/models/tsm.py b/hapi/vision/models/tsm.py
similarity index 94%
rename from models/tsm.py
rename to hapi/vision/models/tsm.py
index 91acd16b288e7e0803e0448f0e93a484b0b92c17..8b50f7073ee6e229acf4953c778ef60e2815cdb8 100644
--- a/models/tsm.py
+++ b/hapi/vision/models/tsm.py
@@ -17,8 +17,8 @@ import paddle.fluid as fluid
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 
 __all__ = ["TSM_ResNet", "tsm_resnet50"]
 
@@ -201,4 +201,12 @@ def _tsm_resnet(num_layers, seg_num=8, num_classes=400, pretrained=True):
 
 
 def tsm_resnet50(seg_num=8, num_classes=400, pretrained=True):
+    """TSM model with 50-layer ResNet as backbone
+    
+    Args:
+        seg_num (int): segment number of each video sample. Default 8.
+        num_classes (int): video class number. Default 400.
+        pretrained (bool): If True, returns a model with pre-trained model
+            on COCO, default True
+    """
     return _tsm_resnet(50, seg_num, num_classes, pretrained)
diff --git a/models/vgg.py b/hapi/vision/models/vgg.py
similarity index 98%
rename from models/vgg.py
rename to hapi/vision/models/vgg.py
index 324ddc05a0d7991e2502b65361b5a07357414499..5ef09bd665e4308739651d868203a4a56b14de38 100644
--- a/models/vgg.py
+++ b/hapi/vision/models/vgg.py
@@ -17,8 +17,8 @@ import paddle.fluid as fluid
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.container import Sequential
 
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 
 __all__ = [
     'VGG',
diff --git a/models/yolov3.py b/hapi/vision/models/yolov3.py
similarity index 86%
rename from models/yolov3.py
rename to hapi/vision/models/yolov3.py
index c4428e37bb524441731dde63266c2ec07c67bab7..840a402e3e1b473cc3c3deffc1b2d0fe6e7a2307 100644
--- a/models/yolov3.py
+++ b/hapi/vision/models/yolov3.py
@@ -20,9 +20,9 @@ from paddle.fluid.dygraph.nn import Conv2D
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.regularizer import L2Decay
 
-from model import Model, Loss
+from hapi.model import Model, Loss
+from hapi.download import get_weights_path
 from .darknet import darknet53, ConvBNLayer
-from .download import get_weights_path
 
 __all__ = ['YoloLoss', 'YOLOv3', 'yolov3_darknet53']
 
@@ -88,6 +88,20 @@ class YoloDetectionBlock(fluid.dygraph.Layer):
 
 
 class YOLOv3(Model):
+    """YOLOv3 model from
+    `"YOLOv3: An Incremental Improvement" <https://arxiv.org/abs/1804.02767>`_
+
+    Args:
+        num_classes (int): class number, default 80.
+        model_mode (str): 'train', 'eval', 'test' mode, network structure
+            will be different in the output layer and data, in 'train' mode,
+            no output layer is appended, in 'eval' and 'test', output feature
+            maps will be decoded to predictions by 'fluid.layers.yolo_box',
+            in 'eval' mode, return feature maps and predictions, in 'test'
+            mode, only return predictions. Default 'train'.
+
+    """
+
     def __init__(self, num_classes=80, model_mode='train'):
         super(YOLOv3, self).__init__()
         self.num_classes = num_classes
@@ -245,4 +259,17 @@ def _yolov3_darknet(num_layers=53, num_classes=80,
 
 
 def yolov3_darknet53(num_classes=80, model_mode='train', pretrained=True):
+    """YOLOv3 model with 53-layer DarkNet as backbone
+    
+    Args:
+        num_classes (int): class number, default 80.
+        model_mode (str): 'train', 'eval', 'test' mode, network structure
+            will be different in the output layer and data, in 'train' mode,
+            no output layer is appended, in 'eval' and 'test', output feature
+            maps will be decoded to predictions by 'fluid.layers.yolo_box',
+            in 'eval' mode, return feature maps and predictions, in 'test'
+            mode, only return predictions. Default 'train'.
+        pretrained (bool): If True, returns a model with weights pre-trained
+            on COCO, default True
+    """
     return _yolov3_darknet(53, num_classes, model_mode, pretrained)
diff --git a/transform/__init__.py b/hapi/vision/transforms/__init__.py
similarity index 91%
rename from transform/__init__.py
rename to hapi/vision/transforms/__init__.py
index 78f618e617f4c1c94f95695e47c77509a28b7837..4367c712420fef78c6a81d681f8e4e9342f1540a 100644
--- a/transform/__init__.py
+++ b/hapi/vision/transforms/__init__.py
@@ -13,3 +13,5 @@
 # limitations under the License.
 
 from .transforms import *
+from .functional import *
+from .detection_transforms import *
diff --git a/yolov3/transforms.py b/hapi/vision/transforms/detection_transforms.py
similarity index 94%
rename from yolov3/transforms.py
rename to hapi/vision/transforms/detection_transforms.py
index f3d3ebe8487959f76641b2440c672a52f168268d..8d81c274dfb574bac52855cda95c970e4c8a444f 100644
--- a/yolov3/transforms.py
+++ b/hapi/vision/transforms/detection_transforms.py
@@ -19,48 +19,18 @@ import cv2
 import traceback
 import numpy as np
 
-import logging
-logger = logging.getLogger(__name__)
-
-__all__ = ['ColorDistort', 'RandomExpand', 'RandomCrop', 'RandomFlip',
-           'NormalizeBox', 'PadBox', 'RandomShape', 'NormalizeImage',
-           'BboxXYXY2XYWH', 'ResizeImage', 'Compose', 'BatchCompose']
-
-
-class Compose(object):
-    def __init__(self, transforms=[]):
-        self.transforms = transforms
-
-    def __call__(self, *data):
-        for f in self.transforms:
-            try:
-                data = f(*data)
-            except Exception as e:
-                stack_info = traceback.format_exc()
-                logger.info("fail to perform transform [{}] with error: "
-                        "{} and stack:\n{}".format(f, e, str(stack_info)))
-                raise e
-        return data
-
-
-class BatchCompose(object):
-    def __init__(self, transforms=[]):
-        self.transforms = transforms
-
-    def __call__(self, data):
-        for f in self.transforms:
-            try:
-                data = f(data)
-            except Exception as e:
-                stack_info = traceback.format_exc()
-                logger.info("fail to perform batch transform [{}] with error: "
-                        "{} and stack:\n{}".format(f, e, str(stack_info)))
-                raise e
-
-        # sample list to batch data
-        batch = list(zip(*data))
-
-        return batch
+__all__ = [
+    'ColorDistort',
+    'RandomExpand',
+    'RandomCrop',
+    'RandomFlip',
+    'NormalizeBox',
+    'PadBox',
+    'RandomShape',
+    'NormalizeImage',
+    'BboxXYXY2XYWH',
+    'ResizeImage',
+]
 
 
 class ColorDistort(object):
diff --git a/transform/functional.py b/hapi/vision/transforms/functional.py
similarity index 100%
rename from transform/functional.py
rename to hapi/vision/transforms/functional.py
diff --git a/transform/transforms.py b/hapi/vision/transforms/transforms.py
similarity index 84%
rename from transform/transforms.py
rename to hapi/vision/transforms/transforms.py
index d29b4fdfe8aae2e7bb4fc990ad425c59e498f6b5..79926f811fcac0844d3290bbbccd4a5d389c626e 100644
--- a/transform/transforms.py
+++ b/hapi/vision/transforms/transforms.py
@@ -24,6 +24,7 @@ import numbers
 import types
 import collections
 import warnings
+import traceback
 
 from . import functional as F
 
@@ -34,6 +35,7 @@ else:
 
 __all__ = [
     "Compose",
+    "BatchCompose",
     "Resize",
     "RandomResizedCrop",
     "CenterCropResize",
@@ -62,10 +64,16 @@ class Compose(object):
     def __init__(self, transforms):
         self.transforms = transforms
 
-    def __call__(self, img):
-        for t in self.transforms:
-            img = t(img)
-        return img
+    def __call__(self, *data):
+        for f in self.transforms:
+            try:
+                data = f(*data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                print("fail to perform transform [{}] with error: "
+                        "{} and stack:\n{}".format(f, e, str(stack_info)))
+                raise e
+        return data
 
     def __repr__(self):
         format_string = self.__class__.__name__ + '('
@@ -76,6 +84,33 @@ class Compose(object):
         return format_string
 
 
+class BatchCompose(object):
+    """Composes several batch transforms together
+
+    Args:
+        transforms (list of ``Transform`` objects): list of transforms to compose.
+                                            These transforms operate on batch data.
+
+    """
+    def __init__(self, transforms=[]):
+        self.transforms = transforms
+
+    def __call__(self, data):
+        for f in self.transforms:
+            try:
+                data = f(data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                print("fail to perform batch transform [{}] with error: "
+                        "{} and stack:\n{}".format(f, e, str(stack_info)))
+                raise e
+
+        # sample list to batch data
+        batch = list(zip(*data))
+
+        return batch
+
+
 class Resize(object):
     """Resize the input PIL Image to the given size.
 
@@ -94,7 +129,7 @@ class Resize(object):
         self.size = size
         self.interpolation = interpolation
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         """
         Args:
             img (PIL Image): Image to be scaled.
@@ -102,7 +137,7 @@ class Resize(object):
         Returns:
             PIL Image: Rescaled image.
         """
-        return F.resize(img, self.size, self.interpolation)
+        return F.resize(img, self.size, self.interpolation), lbl
 
 
 class RandomResizedCrop(object):
@@ -164,10 +199,10 @@ class RandomResizedCrop(object):
         y = (height - h) // 2
         return x, y, w, h
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         x, y, w, h = self._get_params(img)
         cropped_img = img[y:y + h, x:x + w]
-        return F.resize(cropped_img, self.output_size, self.interpolation)
+        return F.resize(cropped_img, self.output_size, self.interpolation), lbl
 
 
 class CenterCropResize(object):
@@ -195,10 +230,10 @@ class CenterCropResize(object):
         y = (w + 1 - c) // 2
         return c, x, y
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         c, x, y = self._get_params(img)
         cropped_img = img[x:x + c, y:y + c, :]
-        return F.resize(cropped_img, self.size, self.interpolation)
+        return F.resize(cropped_img, self.size, self.interpolation), lbl
 
 
 class CenterCrop(object):
@@ -222,10 +257,10 @@ class CenterCrop(object):
         y = int(round((h - th) / 2.0))
         return x, y
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         x, y = self._get_params(img)
         th, tw = self.output_size
-        return img[y:y + th, x:x + tw]
+        return img[y:y + th, x:x + tw], lbl
 
 
 class RandomHorizontalFlip(object):
@@ -238,10 +273,10 @@ class RandomHorizontalFlip(object):
     def __init__(self, prob=0.5):
         self.prob = prob
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         if np.random.random() < self.prob:
-            return F.flip(img, code=1)
-        return img
+            return F.flip(img, code=1), lbl
+        return img, lbl
 
 
 class RandomVerticalFlip(object):
@@ -254,10 +289,10 @@ class RandomVerticalFlip(object):
     def __init__(self, prob=0.5):
         self.prob = prob
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         if np.random.random() < self.prob:
-            return F.flip(img, code=0)
-        return img
+            return F.flip(img, code=0), lbl
+        return img, lbl
 
 
 class Normalize(object):
@@ -282,8 +317,8 @@ class Normalize(object):
         self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1)
         self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1)
 
-    def __call__(self, img):
-        return (img - self.mean) / self.std
+    def __call__(self, img, lbl):
+        return (img - self.mean) / self.std, lbl
 
 
 class Permute(object):
@@ -302,10 +337,10 @@ class Permute(object):
         ], "Only support 'CHW' mode, but received mode: {}".format(mode)
         self.mode = mode
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         if self.mode == "CHW":
-            return img.transpose((2, 0, 1))[::-1, ...]
-        return img
+            return img.transpose((2, 0, 1))[::-1, ...], lbl
+        return img, lbl
 
 
 class GaussianNoise(object):
@@ -321,11 +356,11 @@ class GaussianNoise(object):
         self.mean = np.array(mean, dtype=np.float32)
         self.std = np.array(std, dtype=np.float32)
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         dtype = img.dtype
         noise = np.random.normal(self.mean, self.std, img.shape) * 255
         img = img + noise.astype(np.float32)
-        return np.clip(img, 0, 255).astype(dtype)
+        return np.clip(img, 0, 255).astype(dtype), lbl
 
 
 class BrightnessTransform(object):
@@ -341,15 +376,15 @@ class BrightnessTransform(object):
             raise ValueError("brightness value should be non-negative")
         self.value = value
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         if self.value == 0:
-            return img
+            return img, lbl
 
         dtype = img.dtype
         img = img.astype(np.float32)
         alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
         img = img * alpha
-        return img.clip(0, 255).astype(dtype)
+        return img.clip(0, 255).astype(dtype), lbl
 
 
 class ContrastTransform(object):
@@ -365,16 +400,16 @@ class ContrastTransform(object):
             raise ValueError("contrast value should be non-negative")
         self.value = value
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         if self.value == 0:
-            return img
+            return img, lbl
 
         dtype = img.dtype
         img = img.astype(np.float32)
         alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
         img = img * alpha + cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).mean() * (
             1 - alpha)
-        return img.clip(0, 255).astype(dtype)
+        return img.clip(0, 255).astype(dtype), lbl
 
 
 class SaturationTransform(object):
@@ -390,9 +425,9 @@ class SaturationTransform(object):
             raise ValueError("saturation value should be non-negative")
         self.value = value
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         if self.value == 0:
-            return img
+            return img, lbl
 
         dtype = img.dtype
         img = img.astype(np.float32)
@@ -400,7 +435,7 @@ class SaturationTransform(object):
         gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         gray_img = gray_img[..., np.newaxis]
         img = img * alpha + gray_img * (1 - alpha)
-        return img.clip(0, 255).astype(dtype)
+        return img.clip(0, 255).astype(dtype), lbl
 
 
 class HueTransform(object):
@@ -416,9 +451,9 @@ class HueTransform(object):
             raise ValueError("hue value should be in [0.0, 0.5]")
         self.value = value
 
-    def __call__(self, img):
+    def __call__(self, img, lbl):
         if self.value == 0:
-            return img
+            return img, lbl
 
         dtype = img.dtype
         img = img.astype(np.uint8)
@@ -431,7 +466,7 @@ class HueTransform(object):
         with np.errstate(over="ignore"):
             h += np.uint8(alpha * 255)
         hsv_img = cv2.merge([h, s, v])
-        return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype)
+        return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype), lbl
 
 
 class ColorJitter(object):
@@ -466,5 +501,5 @@ class ColorJitter(object):
         random.shuffle(transforms)
         self.transforms = Compose(transforms)
 
-    def __call__(self, img):
-        return self.transforms(img)
+    def __call__(self, img, lbl):
+        return self.transforms(img, lbl)
diff --git a/metrics.py b/metrics.py
deleted file mode 100644
index 3350853677b62275bb0107addff3f3b3780ea81c..0000000000000000000000000000000000000000
--- a/metrics.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-import six
-import abc
-import numpy as np
-import paddle.fluid as fluid
-
-import logging
-FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
-logging.basicConfig(level=logging.INFO, format=FORMAT)
-logger = logging.getLogger(__name__)
-
-__all__ = ['Metric', 'Accuracy']
-
-
-@six.add_metaclass(abc.ABCMeta)
-class Metric(object):
-    """
-    Base class for metric, encapsulates metric logic and APIs
-
-    Usage:
-    m = SomeMetric()
-    for prediction, label in ...:
-        m.update(prediction, label)
-    m.accumulate()
-    """
-
-    @abc.abstractmethod
-    def reset(self):
-        """
-        Reset states and result
-        """
-        raise NotImplementedError("function 'reset' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    @abc.abstractmethod
-    def update(self, *args, **kwargs):
-        """
-        Update states for metric
-        """
-        raise NotImplementedError("function 'update' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    @abc.abstractmethod
-    def accumulate(self):
-        """
-        Accumulates statistics, computes and returns the metric value
-        """
-        raise NotImplementedError(
-            "function 'accumulate' not implemented in {}.".format(
-                self.__class__.__name__))
-
-    @abc.abstractmethod
-    def name(self):
-        """
-        Returns metric name
-        """
-        raise NotImplementedError("function 'name' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    def add_metric_op(self, pred, label):
-        """
-        Add process op for metric in program
-        """
-        return pred, label
-
-
-class Accuracy(Metric):
-    """
-    Encapsulates accuracy metric logic
-    """
-
-    def __init__(self, topk=(1, ), name=None, *args, **kwargs):
-        super(Accuracy, self).__init__(*args, **kwargs)
-        self.topk = topk
-        self.maxk = max(topk)
-        self._init_name(name)
-        self.reset()
-
-    def add_metric_op(self, pred, label, *args, **kwargs):
-        pred = fluid.layers.argsort(pred[0], descending=True)[1][:, :self.maxk]
-        correct = pred == label[0]
-        return correct
-
-    def update(self, correct, *args, **kwargs):
-        accs = []
-        for i, k in enumerate(self.topk):
-            num_corrects = correct[:, :k].sum()
-            num_samples = len(correct)
-            accs.append(float(num_corrects) / num_samples)
-            self.total[i] += num_corrects
-            self.count[i] += num_samples
-        return accs
-
-    def reset(self):
-        self.total = [0.] * len(self.topk)
-        self.count = [0] * len(self.topk)
-
-    def accumulate(self):
-        res = []
-        for t, c in zip(self.total, self.count):
-            res.append(float(t) / c)
-        return res
-
-    def _init_name(self, name):
-        name = name or 'acc'
-        if self.maxk != 1:
-            self._name = ['{}_top{}'.format(name, k) for k in self.topk]
-        else:
-            self._name = ['acc']
-
-    def name(self):
-        return self._name
diff --git a/mnist.py b/mnist.py
index 745dc2f06e54136756ce5ae4f3b077c24468dd1d..39f323ac6454ed7dd06359017703401321428611 100644
--- a/mnist.py
+++ b/mnist.py
@@ -24,7 +24,7 @@ import numpy as np
 from paddle import fluid
 from paddle.fluid.optimizer import Momentum
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
-from paddle.fluid.io import MNIST as MnistDataset
+from vision.datasets import MNIST as MnistDataset
 
 from model import Model, CrossEntropy, Input, set_device
 from metrics import Accuracy
diff --git a/model.py b/model.py
deleted file mode 100644
index 5bc126c4df6079c6d8238ea608c09a427c61c67d..0000000000000000000000000000000000000000
--- a/model.py
+++ /dev/null
@@ -1,1268 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-import inspect
-import os
-import pickle
-import numpy as np
-import six
-import warnings
-import tqdm
-
-from collections import Iterable
-from paddle import fluid
-from paddle.fluid.framework import in_dygraph_mode, Variable
-from paddle.fluid.executor import global_scope
-from paddle.fluid.io import is_belong_to_optimizer
-from paddle.fluid.dygraph.base import to_variable
-from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.layers.utils import flatten
-from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
-from paddle.fluid.incubate.fleet.base import role_maker
-from paddle.fluid.io import DataLoader, Dataset
-
-from distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
-from metrics import Metric
-from callbacks import config_callbacks
-
-__all__ = ['Model', 'Loss', 'CrossEntropy', 'Input', 'set_device']
-
-
-def set_device(device):
-    """
-    Args:
-        device (str): specify device type, 'cpu' or 'gpu'.
-        
-    Returns:
-        fluid.CUDAPlace or fluid.CPUPlace: Created GPU or CPU place.
-    """
-
-    assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \
-    "Expected device in ['cpu', 'gpu'], but got {}".format(device)
-
-    place = fluid.CUDAPlace(ParallelEnv().dev_id) \
-            if device.lower() == 'gpu' and fluid.is_compiled_with_cuda() \
-                else fluid.CPUPlace()
-
-    return place
-
-
-def to_list(value):
-    if value is None:
-        return value
-    if isinstance(value, (list, tuple)):
-        return value
-    return [value]
-
-
-def to_numpy(var):
-    assert isinstance(var, (Variable, fluid.core.VarBase)), "not a variable"
-    if isinstance(var, fluid.core.VarBase):
-        return var.numpy()
-    t = global_scope().find_var(var.name).get_tensor()
-    return np.array(t)
-
-
-def flatten_list(l):
-    assert isinstance(l, list), "not a list"
-    outl = []
-    splits = []
-    for sl in l:
-        assert isinstance(sl, list), "sub content not a list"
-        splits.append(len(sl))
-        outl += sl
-    return outl, splits
-
-
-def restore_flatten_list(l, splits):
-    outl = []
-    for split in splits:
-        assert len(l) >= split, "list length invalid"
-        sl, l = l[:split], l[split:]
-        outl.append(sl)
-    return outl
-
-
-def extract_args(func):
-    if hasattr(inspect, 'getfullargspec'):
-        return inspect.getfullargspec(func)[0]
-    else:
-        return inspect.getargspec(func)[0]
-
-
-class Input(fluid.dygraph.Layer):
-    def __init__(self, shape=None, dtype=None, name=None):
-        super(Input, self).__init__()
-        self.shape = shape
-        self.dtype = dtype
-        self.name = name
-
-    def forward(self):
-        return fluid.data(self.name, shape=self.shape, dtype=self.dtype)
-
-
-class Loss(object):
-    def __init__(self, average=True):
-        super(Loss, self).__init__()
-        self.average = average
-
-    def forward(self, outputs, labels):
-        raise NotImplementedError()
-
-    def __call__(self, outputs, labels=None):
-        labels = to_list(labels)
-        if in_dygraph_mode() and labels:
-            labels = [to_variable(l) for l in labels]
-        losses = to_list(self.forward(to_list(outputs), labels))
-        if self.average:
-            losses = [fluid.layers.reduce_mean(l) for l in losses]
-        else:
-            losses = [fluid.layers.reduce_sum(l) for l in losses]
-        return losses
-
-
-class CrossEntropy(Loss):
-    def __init__(self, average=True):
-        super(CrossEntropy, self).__init__()
-
-    def forward(self, outputs, labels):
-        return [
-            fluid.layers.cross_entropy(o, l) for o, l in zip(outputs, labels)
-        ]
-
-
-class StaticGraphAdapter(object):
-    def __init__(self, model):
-        super(StaticGraphAdapter, self).__init__()
-        self.model = model
-        # with `_build_once` gone, parameters are now created in `__init__`
-        # so we need to keep track of the parameters already created
-        self._startup_prog = fluid.default_startup_program()
-        self._orig_prog = fluid.default_main_program()
-
-        self._label_vars = {}  # label variables
-        self._input_vars = {}  # label variables
-        self._endpoints = {}
-        self._loss_endpoint = None
-        self._executor = None
-        self._progs = {}
-        self._compiled_progs = {}
-
-        self._merge_count = {
-            'eval_total': 0,
-            'test_total': 0,
-            'eval_batch': 0,
-            'test_batch': 0
-        }
-
-        self._nranks = ParallelEnv().nranks
-        self._local_rank = ParallelEnv().local_rank
-
-    @property
-    def mode(self):
-        return self.model.mode
-
-    @mode.setter
-    def mode(self, value):
-        self.model.mode = value
-
-    def train(self, inputs, labels=None):
-        assert self.model._optimizer, \
-            "model not ready, please call `model.prepare()` first"
-        self.mode = 'train'
-        return self._run(inputs, labels)
-
-    def eval(self, inputs, labels=None):
-        self.mode = 'eval'
-        return self._run(inputs, labels)
-
-    def test(self, inputs):
-        self.mode = 'test'
-        return self._run(inputs, None)
-
-    def parameters(self, *args, **kwargs):
-        return super(Model, self.model).parameters(*args, **kwargs)
-
-    def save(self, path):
-        def _save(state, path):
-            if not state:
-                return
-            state = {
-                k: to_numpy(v) if isinstance(v, Variable) else v
-                for k, v in state.items()
-            }
-            with open(path, 'wb') as f:
-                pickle.dump(state, f)
-
-        base = os.path.basename(path)
-        assert base != "", "path should be of 'dirname/filename' format"
-        dir_name = os.path.dirname(path)
-        if dir_name and not os.path.exists(dir_name):
-            os.makedirs(dir_name)
-        param_path = path + ".pdparams"
-        _save(self.model.state_dict(), param_path)
-        prog = self._progs.get('train', None)
-        if prog is None or self.model._optimizer is None:
-            return
-        # XXX `optimizer.state_dict()` only work in dygraph mode
-        optim_path = path + ".pdopt"
-        optim = {
-            p.name: p
-            for p in filter(is_belong_to_optimizer, prog.list_vars())
-        }
-        if not optim:
-            return
-
-        _save(optim, optim_path)
-
-    def load(self, param_state_pairs, optim_state):
-        if self._executor is None:
-            executor = fluid.Executor(fluid.CPUPlace())._default_executor
-        else:
-            executor = self._executor._default_executor
-
-        # restore parameter states
-        fluid.core._create_loaded_parameter(
-            [param for param, state in param_state_pairs],
-            global_scope(), executor)
-        for param, state in param_state_pairs:
-            self._set_var(param, state)
-
-        # restore optimizer states
-        # FIXME what if a different optimizer is used?
-        if not self.model._optimizer or not optim_state:
-            return
-        self._load_optimizer(optim_state, executor)
-
-    def _load_optimizer(self, state, executor):
-        prog = self._progs.get('train', None)
-        optim = list(filter(is_belong_to_optimizer, prog.list_vars()))
-        if not optim:
-            return
-
-        fluid.core._create_loaded_parameter(optim, global_scope(), executor)
-
-        converted_state = dict(state)
-        for var in optim:
-            if var.name in ["@LR_DECAY_COUNTER@", "global_step"]:
-                # When using learning rate scheduler, dygraph would name the
-                # global step var as "global_step" to save, while static-graph
-                # would has a state var named as "@LR_DECAY_COUNTER@".
-                # NOTE: dygraph saved global_step is 1 larger than that in
-                # static-graph, since the time of global_step to increase is
-                # different.
-                state_val = (
-                    np.array(converted_state.pop("global_step")) - 1
-                ) if "global_step" in converted_state else converted_state.pop(
-                    "@LR_DECAY_COUNTER@", None)
-                if state_val is not None:
-                    converted_state[var.name] = state_val
-            elif var.name.startswith("learning_rate_"):
-                # When using static learning rate, static-graph would make it
-                # a persistable var named 'unique_name.generate("learning_rate")',
-                # However, dygraph wouldn't save it.
-                if var.name not in state:
-                    continue
-            else:
-                # moment and other accumulators
-                if var.name not in converted_state:
-                    # try to convert from dygraph name
-                    opt_name = self.model._optimizer._name
-                    opt_cls_name = self.model._optimizer.__class__.__name__
-                    opt_unq_name = None
-                    for name in self.model._optimizer._accumulators.keys():
-                        accum_name = name if opt_name is None else name[len(
-                            opt_name) + 1:]
-                        for param_name, state_var in self.model._optimizer._accumulators[
-                                name].items():
-                            if opt_unq_name is None:
-                                # can not infer out the exact unique(opt_name),
-                                # thus try to extract rather than generate
-                                for state_key in sorted(
-                                        state.keys(),
-                                        key=lambda x: len(x),
-                                        reverse=True):
-                                    prefix = param_name + "_" + (
-                                        opt_cls_name if opt_name is None else
-                                        opt_name) + "_"
-                                    if state_key.startswith(prefix):
-                                        prefix_offset = state_key[len(
-                                            prefix):].find("_") + len(prefix)
-                                        opt_unq_name = state_key[len(
-                                            param_name + "_"):prefix_offset]
-                                        # TODO: assert
-                                        # assert opt_unq_name is None
-                                    # gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
-                                    # always end with "_0" since the unique optimizer._name
-                            dy_state_name = (param_name + "_" + opt_unq_name +
-                                             "_" + accum_name + "_0")
-                            converted_state[
-                                state_var.name] = converted_state.pop(
-                                    dy_state_name)
-
-            assert var.name in converted_state, \
-                "variable [{}] is not in optimizer state file".format(var.name)
-            self._set_var(var, converted_state[var.name])
-
-    def _set_var(self, var, ndarray):
-        t = global_scope().find_var(var.name).get_tensor()
-        p = t._place()
-        if p.is_cpu_place():
-            place = fluid.CPUPlace()
-        elif p.is_cuda_pinned_place():
-            place = fluid.CUDAPinnedPlace()
-        else:
-            p = fluid.core.Place()
-            p.set_place(t._place())
-            place = fluid.CUDAPlace(p.gpu_device_id())
-
-        t.set(ndarray, place)
-
-    def _run(self, inputs, labels=None):
-        compiled_prog = self._compiled_progs.get(self.mode, None)
-        assert compiled_prog, \
-            "Model is not ready, please call `model.prepare()` first"
-
-        inputs = to_list(inputs)
-        if labels is not None:
-            labels = to_list(labels)
-        assert len(inputs) == len(self._input_vars[self.mode]), \
-            "number of inputs" \
-            + " does not match number of arguments of `forward` method"
-
-        feed = {}
-        input_names = [v.name for v in self._input_vars[self.mode]]
-        for idx, n in enumerate(input_names):
-            # train and test may take different arguments
-            if inputs[idx] is not None:
-                feed[n] = inputs[idx]
-        if labels is not None:
-            for idx, v in enumerate(self._label_vars[self.mode]):
-                feed[v.name] = labels[idx]
-
-        endpoints = self._endpoints[self.mode]
-        if self.mode == 'test':
-            fetch_list = endpoints['output']
-        else:
-            metric_list, metric_splits = flatten_list(endpoints['metric'])
-            fetch_list = endpoints['loss'] + metric_list
-            num_loss = len(endpoints['loss'])
-
-        # if fetch Variable is same as input Variable, do not fetch
-        # from program, get it from input directly
-        pruned_fetch_list = []
-        pruned_fetch_idx_name_map = [""] * len(fetch_list)
-        for i, fetch_var in enumerate(fetch_list):
-            if fetch_var.name in feed.keys():
-                pruned_fetch_idx_name_map[i] = fetch_var.name
-            else:
-                pruned_fetch_list.append(fetch_var)
-
-        rets = self._executor.run(compiled_prog,
-                                  feed=feed,
-                                  fetch_list=pruned_fetch_list,
-                                  return_numpy=False)
-
-        # restore pruned fetch_list Variable from feeds
-        for i, name in enumerate(pruned_fetch_idx_name_map):
-            if len(name) > 0:
-                rets.insert(i, feed[name])
-
-        # LoDTensor cannot be fetch as numpy directly
-        rets = [np.array(v) for v in rets]
-        if self.mode == 'test':
-            return rets[:]
-        losses = rets[:num_loss]
-        metric_states = restore_flatten_list(rets[num_loss:], metric_splits)
-        metrics = []
-        for metric, state in zip(self.model._metrics, metric_states):
-            # cut off padding size
-            if self.mode != 'train' and self.model._test_dataloader is not None \
-                    and isinstance(self.model._test_dataloader, DataLoader) \
-                    and self._nranks > 1:
-                total_size = len(self.model._test_dataloader.dataset)
-                # TODO: fixme if have better way to get batch size
-                samples = state[0].shape[0]
-                current_count = self._merge_count.get(self.mode + '_total', 0)
-                if current_count + samples >= total_size:
-                    state = [
-                        s[:total_size - current_count, ...] for s in state
-                    ]
-                    self._merge_count[self.mode + '_total'] = 0
-                    self._merge_count[self.mode +
-                                      '_batch'] = total_size - current_count
-                else:
-                    self._merge_count[self.mode + '_total'] += samples
-                    self._merge_count[self.mode + '_batch'] = samples
-
-            metrics.append(metric.update(*state))
-        return (losses, metrics) if len(metrics) > 0 else losses
-
-    def prepare(self):
-        modes = ['train', 'eval', 'test']
-        for mode in modes:
-            self._make_program(mode)
-            self._compile_and_initialize(self._progs[mode], mode)
-
-    def _make_program(self, mode):
-        prog = self._progs.get(mode, None)
-        if prog is not None:
-            return
-
-        prog = self._orig_prog.clone()
-        # NOTE: When defining learning rate scheduling in static-graph, ops to
-        # increase the global step var and calculate learning rate would be
-        # prepended into _orig_prog. test program maked by `_orig_prog.clone`
-        # also would include these ops. Thus must prune these ops in test
-        # program, otherwise the global step would be changed in test.
-        if mode != 'train':
-            for op in list(prog.global_block().ops):
-                prog.global_block()._remove_op(0)
-        if mode == 'train' and self.model._optimizer \
-                and self.model._optimizer._learning_rate_map:
-            # HACK workaround learning rate map issue
-            lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
-            new_lr_var = prog.global_block().vars[lr_var.name]
-            self.model._optimizer._learning_rate_map[prog] = new_lr_var
-
-        losses = []
-        metrics = []
-        with fluid.program_guard(prog, self._startup_prog):
-            ins = self.model._inputs
-            lbls = self.model._labels if self.model._labels else []
-            inputs = [k.forward() for k in to_list(ins)]
-            labels = [k.forward() for k in to_list(lbls)]
-            self._label_vars[mode] = labels
-            outputs = to_list(self.model.forward(*inputs))
-
-            if mode != 'test' and self.model._loss_function:
-                losses = self.model._loss_function(outputs, labels)
-
-            if self._nranks > 1 and mode != 'train':
-                outputs = [_all_gather(o, self._nranks) for o in outputs]
-                if mode != 'test':
-                    labels = [_all_gather(l, self._nranks) for l in labels]
-
-            if mode != 'test':
-                for metric in self.model._metrics:
-                    metrics.append(
-                        to_list(metric.add_metric_op(outputs, labels)))
-
-            if mode == 'train' and self.model._optimizer:
-                self._loss_endpoint = fluid.layers.sum(losses)
-                if self._nranks > 1:
-                    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-                    fleet.init(role)
-                    dist_strategy = DistributedStrategy()
-                    dist_strategy.mode = "collective"
-                    dist_strategy.collective_mode = "grad_allreduce"
-                    self.model._optimizer = fleet.distributed_optimizer(
-                        self.model._optimizer, strategy=dist_strategy)
-
-                self.model._optimizer.minimize(self._loss_endpoint)
-
-        if mode != 'train':  # clone again to put it in test mode
-            prog = prog.clone(for_test=True)
-
-        self._input_vars[mode] = inputs
-
-        self._progs[mode] = prog
-        self._endpoints[mode] = {
-            "output": outputs,
-            "loss": losses,
-            "metric": metrics
-        }
-
-    def _compile_and_initialize(self, prog, mode):
-        compiled_prog = self._compiled_progs.get(mode, None)
-        if compiled_prog is not None:
-            return compiled_prog
-
-        assert self.model._place is not None, \
-            "device is not set, please call `model.prepare()` first"
-
-        place = self.model._place
-
-        # XXX *ALL WEIGHTS* should be initialized upon model construction
-        # even if `forward()` may run different code path for different mode
-        # therefore startup program only needs to run once
-        if self._executor is None:
-            self._executor = fluid.Executor(place)
-            # XXX incremental initialization
-            uninitialized = []
-            for var_py in self._startup_prog.list_vars():
-                var = fluid.global_scope().find_var(var_py.name)
-                if not var_py.name.startswith('nccl_id') and var and \
-                        var.get_tensor()._is_initialized():
-                    continue
-
-                uninitialized.append(var_py)
-            if uninitialized:
-                startup_prog = self._startup_prog._prune(uninitialized)
-                self._executor.run(startup_prog)
-
-        if self._nranks < 2:
-            compiled_prog = fluid.CompiledProgram(prog)
-        else:
-            compiled_prog = prog
-
-        self._compiled_progs[mode] = compiled_prog
-
-
-class DynamicGraphAdapter(object):
-    def __init__(self, model):
-        super(DynamicGraphAdapter, self).__init__()
-        self.model = model
-        self._nranks = ParallelEnv().nranks
-        self._local_rank = ParallelEnv().local_rank
-        self._merge_count = {
-            'eval_total': 0,
-            'test_total': 0,
-            'eval_batch': 0,
-            'test_batch': 0
-        }
-
-        if self._nranks > 1:
-            stradegy = fluid.dygraph.parallel.ParallelStrategy()
-            stradegy.nranks = ParallelEnv().nranks
-            stradegy.local_rank = ParallelEnv().local_rank
-            stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints
-            stradegy.current_endpoint = ParallelEnv().current_endpoint
-            self.ddp_model = fluid.dygraph.parallel.DataParallel(self.model,
-                                                                 stradegy)
-
-    @property
-    def mode(self):
-        return self.model.mode
-
-    @mode.setter
-    def mode(self, value):
-        self.model.mode = value
-
-    # TODO multi device in dygraph mode not implemented at present time
-    def train(self, inputs, labels=None):
-        assert self.model._optimizer, \
-            "model not ready, please call `model.prepare()` first"
-        super(Model, self.model).train()
-        self.mode = 'train'
-        inputs = to_list(inputs)
-        if labels is not None:
-            labels = [to_variable(l) for l in to_list(labels)]
-        if self._nranks > 1:
-            outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs])
-            losses = self.model._loss_function(outputs, labels)
-            final_loss = fluid.layers.sum(losses)
-            final_loss = self.ddp_model.scale_loss(final_loss)
-            final_loss.backward()
-            self.ddp_model.apply_collective_grads()
-        else:
-            outputs = self.model.forward(*[to_variable(x) for x in inputs])
-            losses = self.model._loss_function(outputs, labels)
-            final_loss = fluid.layers.sum(losses)
-            final_loss.backward()
-
-        self.model._optimizer.minimize(final_loss)
-        self.model.clear_gradients()
-        metrics = []
-        for metric in self.model._metrics:
-            metric_outs = metric.add_metric_op(
-                to_list(outputs), to_list(labels))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
-            metrics.append(m)
-
-        return ([to_numpy(l) for l in losses], metrics) \
-            if len(metrics) > 0 else [to_numpy(l) for l in losses]
-
-    def eval(self, inputs, labels=None):
-        super(Model, self.model).eval()
-        self.mode = 'eval'
-        inputs = to_list(inputs)
-        if labels is not None:
-            labels = [to_variable(l) for l in to_list(labels)]
-        outputs = self.model.forward(*[to_variable(x) for x in inputs])
-        if self.model._loss_function:
-            losses = self.model._loss_function(outputs, labels)
-        else:
-            losses = []
-        if self._nranks > 1:
-            outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)]
-            labels = [_all_gather(l, self._nranks) for l in labels]
-        metrics = []
-        for metric in self.model._metrics:
-            # cut off padding value.
-            if self.model._test_dataloader is not None and self._nranks > 1 \
-                    and isinstance(self.model._test_dataloader, DataLoader):
-                total_size = len(self.model._test_dataloader.dataset)
-                samples = outputs[0].shape[0]
-                current_count = self._merge_count.get(self.mode + '_total', 0)
-                if current_count + samples >= total_size:
-                    outputs = [o[:total_size - current_count] for o in outputs]
-                    labels = [l[:total_size - current_count] for l in labels]
-                    self._merge_count[self.mode + '_total'] = 0
-                    self._merge_count[self.mode +
-                                      '_batch'] = total_size - current_count
-                else:
-                    self._merge_count[self.mode + '_total'] += samples
-                    self._merge_count[self.mode + '_batch'] = samples
-
-            metric_outs = metric.add_metric_op(to_list(outputs), labels)
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
-            metrics.append(m)
-
-        # To be consistent with static graph
-        # return empty loss if loss_function is None
-        return ([to_numpy(l) for l in losses], metrics) \
-            if len(metrics) > 0 else [to_numpy(l) for l in losses]
-
-    def test(self, inputs):
-        super(Model, self.model).eval()
-        self.mode = 'test'
-        inputs = [to_variable(x) for x in to_list(inputs)]
-        outputs = self.model.forward(*inputs)
-        if self._nranks > 1 and isinstance(self.model._place, fluid.CUDAPlace):
-            outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)]
-
-        return [to_numpy(o) for o in to_list(outputs)]
-
-    def parameters(self, *args, **kwargs):
-        return super(Model, self.model).parameters(*args, **kwargs)
-
-    def save(self, path):
-        params = self.model.state_dict()
-        fluid.save_dygraph(params, path)
-        if self.model._optimizer is None:
-            return
-        if self.model._optimizer.state_dict():
-            optim = self.model._optimizer.state_dict()
-            fluid.save_dygraph(optim, path)
-
-    def load(self, param_state_pairs, optim_state):
-        # restore parameter states
-        for param, state in param_state_pairs:
-            param.set_value(state)
-
-        # resotre optimizer states
-        if not self.model._optimizer or not optim_state:
-            return
-
-        # If optimizer performs set_dict when state vars haven't been created,
-        # which would happen when set_dict before minimize, the state would be
-        # stored in optimizer._accumulators_holder and loaded lazily.
-        # To contrive this when loading from static-graph saved states, extend
-        # state dict to include keys named accoring to dygraph naming rules.
-        # TODO: if len(self.model._optimizer._accumulators) > 0
-        converted_state = dict(optim_state)
-        opt_unq_name = self.model._optimizer._name
-        opt_cls_name = self.model._optimizer.__class__.__name__
-        opt_name = opt_unq_name[:opt_unq_name.rfind("_")]  # remove suffix idx
-        param_names = [param.name for param in self.model.parameters()]
-        for var_name, state_var in sorted(
-                optim_state.items(), key=lambda x: len(x[0]), reverse=True):
-            if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
-                # NOTE: dygraph saved global_step is 1 larger than that in
-                # static-graph, since the time of global_step to increase is
-                # different.
-                if var_name == "@LR_DECAY_COUNTER@":
-                    converted_state["global_step"] = np.array(
-                        converted_state.pop("@LR_DECAY_COUNTER@")) + 1
-            else:
-                # moment and other accumulators
-                # extend state dict to include promising dygraph names
-                for param_name in param_names:
-                    if var_name.startswith(param_name + "_" + opt_name):
-                        # when init optimizer with name
-                        accum_name = var_name[len(param_name + "_" + opt_name +
-                                                  "_"):]
-                    elif var_name.startswith(param_name +
-                                             "_") and opt_name == opt_cls_name:
-                        # when init optimizer without name
-                        accum_name = var_name[len(param_name + "_"):]
-                    else:
-                        continue
-                    # remove suffix idx
-                    accum_name = accum_name[:accum_name.rfind("_")]
-                    # state names always end with "_0" in dygraph because of the
-                    # unique optimizer._name
-                    dy_state_name = (param_name + "_" + opt_unq_name + "_" +
-                                     accum_name + "_0")
-                    converted_state[dy_state_name] = state_var
-
-        self.model._optimizer.set_dict(converted_state)
-
-
-class Model(fluid.dygraph.Layer):
-    """
-    FIXME: add more comments and usage
-    """
-
-    def __init__(self):
-        super(Model, self).__init__(self.__class__.__name__)
-        self.mode = 'train'
-        self._inputs = None
-        self._labels = None
-        self._loss_function = None
-        self._loss_weights = None
-        self._optimizer = None
-        self._device = None
-        self._optimizer = None
-        self._test_dataloader = None
-
-        # init backend
-        if fluid.in_dygraph_mode():
-            self._adapter = DynamicGraphAdapter(self)
-        else:
-            self._adapter = StaticGraphAdapter(self)
-
-    def train(self, *args, **kwargs):
-        return self._adapter.train(*args, **kwargs)
-
-    def eval(self, *args, **kwargs):
-        return self._adapter.eval(*args, **kwargs)
-
-    def test(self, *args, **kwargs):
-        return self._adapter.test(*args, **kwargs)
-
-    def save(self, *args, **kwargs):
-        if ParallelEnv().local_rank == 0:
-            return self._adapter.save(*args, **kwargs)
-
-    def load(self, path, skip_mismatch=False, reset_optimizer=False):
-        """
-        Load from files storing the model states and optimizer states. The file
-        for optimizer states is not necessary if no need to restore the optimizer.
-
-        NOTE: parameters are retrieved out from the file storing model states
-        accoring to their structured names.
-
-        For fine-tuning or transfer-learning models where some of the layers have
-        changed, keep parameters needed to restore have same structured names in
-        the pre-trained model and fine-tuning model.
-
-        Args:
-            path (str): The prefix of files storing the model states and
-                optimizer states. The files would be `path.pdparams` and
-                `path.pdopt` separately, and the latter is not necessary
-                when no need to restore.
-            skip_mismatch (bool): Whether to skip the loading of mismatch
-                parameter or raise an error when mismatch happens (not found
-                the parameter in file storing model states of or receives a
-                mismatch shape).
-            reset_optimizer (bool): If True, ignore the providing file storing
-                optimizer states and initialize optimizer states from scratch.
-                Otherwise, restore optimizer states from `path.pdopt` if
-                a optimizer has been set to the model. Default False.
-        """
-
-        def _load_state_from_path(path):
-            if not os.path.exists(path):
-                return
-            with open(path, 'rb') as f:
-                return pickle.load(f) if six.PY2 else pickle.load(
-                    f, encoding='latin1')
-
-        def _check_match(key, param):
-            state = param_state.get(key, None)
-            if state is None:
-                raise ValueError(
-                    "{} is not found in the providing file.".format(key))
-            if list(state.shape) != list(param.shape):
-                raise ValueError(
-                    "{} receives a shape {}, but the expected shape is {}.".
-                    format(key, list(state.shape), list(param.shape)))
-            return param, state
-
-        param_state = _load_state_from_path(path + ".pdparams")
-        assert param_state, "Failed to load parameters, please check path."
-
-        matched_param_state = []
-        for key, param in self.state_dict().items():
-            try:
-                match_res = _check_match(key, param)
-            except ValueError as err:
-                if skip_mismatch:
-                    warnings.warn(
-                        ("Skip loading for {}. ".format(key) + err.message))
-                    # reset optimizer when mismatch happens
-                    reset_optimizer = True
-                else:
-                    raise err
-            matched_param_state.append(match_res)
-
-        optim_state = None if reset_optimizer else _load_state_from_path(
-            path + ".pdopt")
-        return self._adapter.load(matched_param_state, optim_state)
-
-    def parameters(self, *args, **kwargs):
-        return self._adapter.parameters(*args, **kwargs)
-
-    def prepare(self,
-                optimizer=None,
-                loss_function=None,
-                metrics=None,
-                inputs=None,
-                labels=None,
-                device=None):
-        """
-        FIXME: add comments
-        Args:
-            optimizer (Optimizer|None): optimizer must be set in training
-                and should be a Optimizer instance. It can be None in eval
-                and test mode.
-            loss_function (Loss|None): loss function must be set in training
-                and should be a Loss instance. It can be None when there is
-                no loss.
-            metrics (Metric|list of Metric|None): if metrics is set, all
-                metric will be calculate and output in train/eval mode.
-            inputs (Input|list|dict|None): inputs, entry points of network,
-                could be a Input layer, or lits of Input layers,
-                or dict (name: Input), or None. For static graph,
-                inputs must be set. For dynamic graph, it could be None.
-            labels (Input|list|None): labels, entry points of network,
-                could be a Input layer or lits of Input layers, or None.
-                For static graph, if set loss_function in Model.prepare(), it
-                must be set. Otherwise, it could be None.
-            device (str|None): specify device type, 'CPU' or 'GPU'.
-                If None, automatically select device according to
-                installation package version.
-        """
-
-        if isinstance(device, fluid.CUDAPlace) or \
-            (isinstance(device, six.string_types) and device.lower() == 'gpu') \
-            or (device is None and fluid.is_compiled_with_cuda()):
-            if isinstance(device, fluid.CUDAPlace):
-                self._place = device
-            else:
-                self._place = fluid.CUDAPlace(ParallelEnv().dev_id) \
-                    if ParallelEnv().nranks > 1 else fluid.CUDAPlace(0)
-
-            global _parallel_context_initialized
-            if ParallelEnv().nranks > 1 and not _parallel_context_initialized:
-                if fluid.in_dygraph_mode():
-                    fluid.disable_dygraph()
-                    fluid.enable_dygraph(self._place)
-                    fluid.dygraph.parallel.prepare_context()
-                else:
-                    prepare_distributed_context(self._place)
-
-                _parallel_context_initialized = True
-        elif isinstance(device, fluid.CPUPlace):
-            self._place = device
-        elif (isinstance(device, six.string_types) and device.lower() == 'cpu') \
-            or (device is None):
-            self._place = fluid.CPUPlace()
-        else:
-            raise ValueError(
-                "Expected device in ('gpu', 'cpu', fluid.CUDAPlace, fluid.CPUPlace, None), \
-                but got {}".format(device))
-
-        self._optimizer = optimizer
-        if loss_function:
-            if not isinstance(loss_function, Loss):
-                raise TypeError(
-                    "'loss_function' must be sub classes of 'Loss'")
-        self._loss_function = loss_function
-        if not in_dygraph_mode():
-            if not isinstance(inputs, (list, dict, Input)):
-                raise TypeError(
-                    "'inputs' must be list or dict in static graph mode")
-
-        metrics = metrics or []
-        for metric in to_list(metrics):
-            assert isinstance(metric, Metric), \
-                "{} is not sub class of Metric".format(
-                    metric.__class__.__name__)
-        self._metrics = to_list(metrics)
-
-        self._inputs = to_list(inputs) if not isinstance(inputs, dict) else [
-            inputs[n] for n in extract_args(self.forward) if n != 'self'
-        ]
-        self._labels = to_list(labels)
-
-        if not in_dygraph_mode():
-            self._adapter.prepare()
-
-    def fit(
-            self,
-            train_data=None,
-            eval_data=None,
-            batch_size=1,
-            epochs=1,
-            eval_freq=1,
-            log_freq=10,
-            save_dir=None,
-            save_freq=1,
-            verbose=2,
-            drop_last=False,
-            shuffle=True,
-            num_workers=0,
-            callbacks=None, ):
-        """
-        FIXME: add more comments and usage
-        Args:
-            train_data (Dataset|DataLoader): An iterable data loader is used for 
-                train. An instance of paddle.fluid.io.Dataset or 
-                paddle.fluid.io.Dataloader is recomended.
-            eval_data (Dataset|DataLoader): An iterable data loader is used for
-                evaluation at the end of epoch. If None, will not do evaluation. 
-                An instance of paddle.fluid.io.Dataset or paddle.fluid.io.Dataloader 
-                is recomended.
-            batch_size (int): Integer number. The batch size of train_data and eval_data. 
-                When train_data and eval_data are both the instance of Dataloader, this 
-                parameter will be ignored.
-            epochs (int): Integer number. The number of epochs to train the model.
-            eval_freq (int): The frequency, in number of epochs, an evalutation
-                is performed.
-            log_freq (int): The frequency, in number of steps, the training logs
-                are printed.
-            save_dir(str|None): The directory to save checkpoint during training.
-                If None, will not save checkpoint.
-            save_freq (int): The frequency, in number of epochs, to save checkpoint.
-            verbose (int): The verbosity mode, should be 0, 1, or 2.
-                0 = silent, 1 = progress bar, 2 = one line per epoch.
-            drop_last (bool): whether drop the last incomplete batch of train_data 
-                when dataset size is not divisible by the batch size. When train_data 
-                is an instance of Dataloader, this parameter will be ignored.
-            shuffle (bool): whther to shuffle train_data. When train_data is an instance 
-                of Dataloader, this parameter will be ignored.
-            num_workers (int): the number of subprocess to load data, 0 for no subprocess 
-                used and loading data in main process. When train_data and eval_data are
-                both the instance of Dataloader, this parameter will be ignored.
-            callbacks (Callback|None): A list of `Callback` instances to apply
-                during training. If None, `ProgBarLogger` and `ModelCheckpoint`
-                are automatically inserted.
-        """
-
-        assert train_data is not None, \
-                "train_data must be given!"
-
-        if fluid.in_dygraph_mode():
-            feed_list = None
-        else:
-            feed_list = [x.forward() for x in self._inputs + self._labels]
-
-        if isinstance(train_data, Dataset):
-            train_sampler = DistributedBatchSampler(
-                train_data,
-                batch_size=batch_size,
-                shuffle=shuffle,
-                drop_last=drop_last)
-            train_loader = DataLoader(
-                train_data,
-                batch_sampler=train_sampler,
-                places=self._place,
-                feed_list=feed_list,
-                num_workers=num_workers,
-                return_list=True)
-        else:
-            train_loader = train_data
-
-        if eval_data is not None and isinstance(eval_data, Dataset):
-            eval_sampler = DistributedBatchSampler(
-                eval_data, batch_size=batch_size)
-            eval_loader = DataLoader(
-                eval_data,
-                batch_sampler=eval_sampler,
-                places=self._place,
-                feed_list=feed_list,
-                num_workers=num_workers,
-                return_list=True)
-        elif eval_data is not None:
-            eval_loader = eval_data
-        else:
-            eval_loader = None
-
-        do_eval = eval_loader is not None
-        self._test_dataloader = eval_loader
-        metrics_name = self._metrics_name()
-        steps = len(train_loader) if hasattr(train_loader, '__len__') else None
-        cbks = config_callbacks(
-            callbacks,
-            model=self,
-            epochs=epochs,
-            steps=steps,
-            log_freq=log_freq,
-            save_freq=save_freq,
-            save_dir=save_dir,
-            verbose=verbose,
-            metrics=self._metrics_name(), )
-
-        cbks.on_begin('train')
-        for epoch in range(epochs):
-
-            # FIXME: adapt to DataLoader
-            loader = train_loader
-            if not isinstance(train_loader, Iterable):
-                loader = train_loader()
-            logs = self._run_one_epoch(
-                loader, cbks, 'train', metrics_name, epoch=epoch)
-
-            if do_eval and epoch % eval_freq == 0:
-                # FIXME: adapt to DataLoader
-                loader = eval_loader
-                if not isinstance(eval_loader, Iterable):
-                    loader = eval_loader()
-
-                eval_steps = len(loader) if hasattr(loader,
-                                                    '__len__') else None
-                cbks.on_begin('eval', {
-                    'steps': eval_steps,
-                    'metrics_name': metrics_name
-                })
-
-                logs = self._run_one_epoch(loader, cbks, 'eval', metrics_name)
-
-                cbks.on_end('eval', logs)
-
-        cbks.on_end('train', logs)
-        self._test_dataloader = None
-
-    def evaluate(
-            self,
-            eval_data,
-            batch_size=1,
-            log_freq=10,
-            verbose=2,
-            num_workers=0,
-            callbacks=None, ):
-        """
-        FIXME: add more comments and usage
-        Args:
-            eval_data (Dataset|DataLoader): An iterable data loader is used for
-                evaluation. An instance of paddle.fluid.io.Dataset or 
-                paddle.fluid.io.Dataloader is recomended.
-            batch_size (int): Integer number. The batch size of train_data and eval_data. 
-                When train_data and eval_data are both the instance of Dataloader, this 
-                parameter will be ignored.
-            log_freq (int): The frequency, in number of steps, the eval logs
-                are printed.
-            verbose (int): The verbosity mode, should be 0, 1, or 2.
-                0 = silent, 1 = progress bar, 2 = one line per epoch.
-            num_workers (int): The number of subprocess to load data, 0 for no subprocess 
-                used and loading data in main process. When train_data and eval_data are
-                both the instance of Dataloader, this parameter will be ignored.
-            callbacks (Callback|None): A list of `Callback` instances to apply
-                during training. If None, `ProgBarLogger` and `ModelCheckpoint`
-                are automatically inserted.
-        """
-
-        if fluid.in_dygraph_mode():
-            feed_list = None
-        else:
-            feed_list = [x.forward() for x in self._inputs + self._labels]
-
-        if eval_data is not None and isinstance(eval_data, Dataset):
-            eval_sampler = DistributedBatchSampler(
-                eval_data, batch_size=batch_size)
-            eval_loader = DataLoader(
-                eval_data,
-                batch_sampler=eval_sampler,
-                places=self._place,
-                feed_list=feed_list,
-                num_workers=num_workers,
-                return_list=True)
-        else:
-            eval_loader = eval_data
-
-        self._test_dataloader = eval_loader
-        metrics_name = self._metrics_name()
-
-        cbks = config_callbacks(
-            callbacks,
-            model=self,
-            log_freq=log_freq,
-            verbose=verbose,
-            metrics=self._metrics_name(), )
-
-        loader = eval_loader
-        if not isinstance(eval_loader, Iterable):
-            loader = eval_loader()
-
-        eval_steps = len(loader) if hasattr(loader, '__len__') else None
-        cbks.on_begin('eval',
-                      {'steps': eval_steps,
-                       'metrics_name': metrics_name})
-
-        logs = self._run_one_epoch(loader, cbks, 'eval', metrics_name)
-
-        cbks.on_end('eval', logs)
-
-        self._test_dataloader = None
-
-        eval_result = {}
-        for k in self._metrics_name():
-            eval_result[k] = logs[k]
-
-        return eval_result
-
-    def predict(self,
-                test_data,
-                batch_size=1,
-                num_workers=0,
-                stack_outputs=True):
-        """
-        FIXME: add more comments and usage
-        Args:
-            test_data (Dataset|DataLoader): An iterable data loader is used for
-                predict. An instance of paddle.fluid.io.Dataset or paddle.fluid.io.Dataloader 
-                is recomended.
-            batch_size (int): Integer number. The batch size of train_data and eval_data. 
-                When train_data and eval_data are both the instance of Dataloader, this 
-                parameter will be ignored.
-            num_workers (int): the number of subprocess to load data, 0 for no subprocess 
-                used and loading data in main process. When train_data and eval_data are
-                both the instance of Dataloader, this parameter will be ignored.
-            stack_output (bool): whether stack output field like a batch, as for an output
-                filed of a sample is in shape [X, Y], test_data contains N samples, predict
-                output field will be in shape [N, X, Y] if stack_output is True, and will
-                be a length N list in shape [[X, Y], [X, Y], ....[X, Y]] if stack_outputs
-                is False. stack_outputs as False is used for LoDTensor output situation,
-                it is recommended set as True if outputs contains no LoDTensor. Default False
-        """
-
-        if fluid.in_dygraph_mode():
-            feed_list = None
-        else:
-            feed_list = [x.forward() for x in self._inputs + self._labels]
-
-        if test_data is not None and isinstance(test_data, Dataset):
-            test_sampler = DistributedBatchSampler(
-                test_data, batch_size=batch_size)
-            test_loader = DataLoader(
-                test_data,
-                batch_sampler=test_sampler,
-                places=self._place,
-                feed_list=feed_list,
-                num_workers=num_workers,
-                return_list=True)
-        else:
-            test_loader = test_data
-
-        self._test_dataloader = test_loader
-
-        loader = test_loader
-        if not isinstance(test_loader, Iterable):
-            loader = test_loader()
-
-        outputs = []
-        for data in tqdm.tqdm(loader):
-            data = flatten(data)
-            outputs.append(self.test(data[:len(self._inputs)]))
-
-        # NOTE: for lod tensor output, we should not stack outputs
-        # for stacking may loss its detail info
-        outputs = list(zip(*outputs))
-        if stack_outputs:
-            outputs = [np.stack(outs, axis=0) for outs in outputs]
-
-        self._test_dataloader = None
-        if test_loader is not None and self._adapter._nranks > 1 \
-                    and isinstance(test_loader, DataLoader):
-            outputs = [o[:len(test_loader.dataset)] for o in outputs]
-        return outputs
-
-    def set_eval_data(self, eval_data):
-        """
-        Args:
-            eval_data (Dataset|DataLoader|None): An iterable data loader is used for 
-                eval. An instance of paddle.fluid.io.Dataset or 
-                paddle.fluid.io.Dataloader is recomended. 
-        """
-        assert isinstance(
-            eval_data,
-            DataLoader), "eval_data must be a instance of Dataloader!"
-        self._test_dataloader = eval_data
-
-    def _run_one_epoch(self,
-                       data_loader,
-                       callbacks,
-                       mode,
-                       metrics_name,
-                       epoch=None):
-        size = len(data_loader) if hasattr(data_loader, '__len__') else None
-        logs = {
-            'steps': size,
-            'metrics_name': metrics_name,
-        }
-
-        if mode == 'train':
-            assert epoch is not None, 'when mode is train, epoch must be given'
-            callbacks.on_epoch_begin(epoch)
-
-        for step, data in enumerate(data_loader):
-            # data might come from different types of data_loader and have
-            # different format, as following:
-            # 1. DataLoader in static graph:
-            #    [[input1, input2, ..., label1, lable2, ...]]
-            # 2. DataLoader in dygraph
-            #    [input1, input2, ..., label1, lable2, ...]
-            # 3. custumed iterator yield concated inputs and labels:
-            #   [input1, input2, ..., label1, lable2, ...]
-            # 4. custumed iterator yield seperated inputs and labels:
-            #   ([input1, input2, ...], [label1, lable2, ...])
-            # To handle all of these, flatten (nested) list to list.
-            data = flatten(data)
-            # LoDTensor.shape is callable, where LoDTensor comes from
-            # DataLoader in static graph
-            batch_size = data[0].shape()[0] if callable(data[
-                0].shape) else data[0].shape[0]
-
-            callbacks.on_batch_begin(mode, step, logs)
-            if mode == 'train':
-                outs = self.train(data[:len(self._inputs)],
-                                  data[len(self._inputs):])
-            else:
-                outs = self.eval(data[:len(self._inputs)],
-                                 data[len(self._inputs):])
-
-            # losses
-            loss = outs[0] if self._metrics else outs
-            metrics = [[l[0] for l in loss]]
-
-            # metrics
-            for metric in self._metrics:
-                res = metric.accumulate()
-                metrics.extend(to_list(res))
-
-            assert len(metrics_name) == len(metrics)
-            for k, v in zip(metrics_name, metrics):
-                logs[k] = v
-
-            logs['step'] = step
-            if mode == 'train' or self._adapter._merge_count.get(
-                    mode + '_batch', 0) <= 0:
-                logs['batch_size'] = batch_size * ParallelEnv().nranks
-            else:
-                logs['batch_size'] = self._adapter._merge_count[mode +
-                                                                '_batch']
-
-            callbacks.on_batch_end(mode, step, logs)
-        self._reset_metrics()
-
-        if mode == 'train':
-            assert epoch is not None, 'when mode is train, epoch must be given'
-            callbacks.on_epoch_end(epoch)
-
-        return logs
-
-    def _reset_metrics(self):
-        for metric in self._metrics:
-            metric.reset()
-
-    def _metrics_name(self):
-        metrics_name = ['loss']
-        for m in self._metrics:
-            metrics_name.extend(to_list(m.name()))
-        return metrics_name
diff --git a/progressbar.py b/progressbar.py
deleted file mode 100644
index bbeff68d9a1e733c8face6903481fa7bb41d908e..0000000000000000000000000000000000000000
--- a/progressbar.py
+++ /dev/null
@@ -1,163 +0,0 @@
-import sys
-import time
-import numpy as np
-
-
-class ProgressBar(object):
-    """progress bar """
-
-    def __init__(self,
-                 num=None,
-                 width=30,
-                 verbose=1,
-                 start=True,
-                 file=sys.stdout):
-        self._num = num
-        if isinstance(num, int) and num <= 0:
-            raise TypeError('num should be None or integer (> 0)')
-        max_width = self._get_max_width()
-        self._width = width if width <= max_width else max_width
-        self._total_width = 0
-        self._verbose = verbose
-        self.file = file
-        self._values = {}
-        self._values_order = []
-        if start:
-            self._start = time.time()
-        self._last_update = 0
-
-        self._dynamic_display = (
-            (hasattr(self.file, 'isatty') and
-             self.file.isatty()) or 'ipykernel' in sys.modules or
-            'posix' in sys.modules or 'PYCHARM_HOSTED' in os.environ)
-
-    def _get_max_width(self):
-        if sys.version_info > (3, 3):
-            from shutil import get_terminal_size
-        else:
-            from backports.shutil_get_terminal_size import get_terminal_size
-        terminal_width, _ = get_terminal_size()
-        max_width = min(int(terminal_width * 0.6), terminal_width - 50)
-        return max_width
-
-    def start(self):
-        self.file.flush()
-        self._start = time.time()
-
-    def update(self, current_num, values=None):
-        now = time.time()
-
-        if current_num:
-            time_per_unit = (now - self._start) / current_num
-        else:
-            time_per_unit = 0
-
-        if time_per_unit >= 1 or time_per_unit == 0:
-            fps = ' - %.0fs/%s' % (time_per_unit, 'step')
-        elif time_per_unit >= 1e-3:
-            fps = ' - %.0fms/%s' % (time_per_unit * 1e3, 'step')
-        else:
-            fps = ' - %.0fus/%s' % (time_per_unit * 1e6, 'step')
-
-        info = ''
-        if self._verbose == 1:
-            prev_total_width = self._total_width
-
-            if self._dynamic_display:
-                sys.stdout.write('\b' * prev_total_width)
-                sys.stdout.write('\r')
-            else:
-                sys.stdout.write('\n')
-
-            if self._num is not None:
-                numdigits = int(np.log10(self._num)) + 1
-
-                bar_chars = ('step %' + str(numdigits) + 'd/%d [') % (
-                    current_num, self._num)
-                prog = float(current_num) / self._num
-                prog_width = int(self._width * prog)
-
-                if prog_width > 0:
-                    bar_chars += ('=' * (prog_width - 1))
-                    if current_num < self._num:
-                        bar_chars += '>'
-                    else:
-                        bar_chars += '='
-                bar_chars += ('.' * (self._width - prog_width))
-                bar_chars += ']'
-            else:
-                bar_chars = 'step %3d' % current_num
-
-            self._total_width = len(bar_chars)
-            sys.stdout.write(bar_chars)
-
-            for k, val in values:
-                info += ' - %s:' % k
-                val = val if isinstance(val, list) else [val]
-                for i, v in enumerate(val):
-                    if isinstance(v, (float, np.float32, np.float64)):
-                        if abs(v) > 1e-3:
-                            info += ' %.4f' % v
-                        else:
-                            info += ' %.4e' % v
-                    else:
-                        info += ' %s' % v
-
-            if self._num is not None and current_num < self._num:
-                eta = time_per_unit * (self._num - current_num)
-                if eta > 3600:
-                    eta_format = '%d:%02d:%02d' % (eta // 3600, (eta % 3600) //
-                                                   60, eta % 60)
-                elif eta > 60:
-                    eta_format = '%d:%02d' % (eta // 60, eta % 60)
-                else:
-                    eta_format = '%ds' % eta
-
-                info += ' - ETA: %s' % eta_format
-
-            info += fps
-            self._total_width += len(info)
-            if prev_total_width > self._total_width:
-                info += (' ' * (prev_total_width - self._total_width))
-
-            # newline for another epoch
-            if self._num is not None and current_num >= self._num:
-                info += '\n'
-            if self._num is None:
-                info += '\n'
-
-            sys.stdout.write(info)
-            sys.stdout.flush()
-            self._last_update = now
-        elif self._verbose == 2:
-            if self._num:
-                numdigits = int(np.log10(self._num)) + 1
-                count = ('step %' + str(numdigits) + 'd/%d') % (current_num,
-                                                                self._num)
-            else:
-                count = 'step %3d' % current_num
-            info = count + info
-
-            for k, val in values:
-                info += ' - %s:' % k
-                val = val if isinstance(val, list) else [val]
-                for v in val:
-                    if isinstance(v, (float, np.float32, np.float64)):
-                        if abs(v) > 1e-3:
-                            info += ' %.4f' % v
-                        else:
-                            info += ' %.4e' % v
-                    elif isinstance(v, np.ndarray) and \
-                        v.size == 1 and \
-                        isinstance(v.dtype, (np.float32, np.float64)):
-                        if abs(v[0]) > 1e-3:
-                            info += ' %.4f' % v[0]
-                        else:
-                            info += ' %.4e' % v[0]
-                    else:
-                        info += ' %s' % v
-
-            info += fps
-            info += '\n'
-            sys.stdout.write(info)
-            sys.stdout.flush()
diff --git a/sequence_tagging/reader.py b/sequence_tagging/reader.py
index 5cdba92dde33ed534a1a71a6588875e3ba4d2d3d..d02f49eba4c16aa0e755fef930daec6b3981901b 100644
--- a/sequence_tagging/reader.py
+++ b/sequence_tagging/reader.py
@@ -168,13 +168,13 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
 
 def create_dataloader(generator, place, feed_list=None):
     if not feed_list:
-        data_loader = fluid.io.DataLoader.from_generator(
+        data_loader = paddle.io.DataLoader.from_generator(
             capacity=50,
             use_double_buffer=True,
             iterable=True,
             return_list=True)
     else:
-        data_loader = fluid.io.DataLoader.from_generator(
+        data_loader = paddle.io.DataLoader.from_generator(
             feed_list=feed_list,
             capacity=50,
             use_double_buffer=True,
diff --git a/setup.py b/setup.py
index df8acbfa872214650c9b5d588f912b5c81d2d45d..4a8c246d75e43a4182190326f5996dd3afb8ba02 100644
--- a/setup.py
+++ b/setup.py
@@ -31,15 +31,26 @@ setuptools.setup(
     description="A Paddle High-level API that supports both static and dynamic execution modes (still under development)",
     url="https://github.com/PaddlePaddle/hapi",
     packages=[
-        'hapi', 'hapi.text', 'hapi.text.tokenizer', 'hapi.text.bert',
-        'hapi.text.bert.utils'
+        'hapi',
+        'hapi.datasets',
+        'hapi.text',
+        'hapi.text.tokenizer',
+        'hapi.text.bert',
+        'hapi.text.bert.utils',
+        'hapi.vision',
+        'hapi.vision.models',
+        'hapi.vision.transforms',
     ],
     package_dir={
         'hapi': './hapi',
+        'hapi.datasets': './hapi/datasets',
         'hapi.text': './hapi/text',
         'hapi.text.tokenizer': './hapi/text/tokenizer',
         'hapi.text.bert': './hapi/text/bert',
         'hapi.text.bert.utils': './hapi/text/bert/utils',
+        'hapi.vision': './hapi/vision',
+        'hapi.vision.models': './hapi/vision/models',
+        'hapi.vision.transforms': './hapi/vision/transforms',
     },
     platforms="any",
     license='Apache 2.0',
diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py
index c133f3b174f664690b78656600b60a0d2d90913c..b9f42d977a681ce4bf0ed4fa1e28dbf80a859103 100644
--- a/tests/test_callbacks.py
+++ b/tests/test_callbacks.py
@@ -18,7 +18,7 @@ import unittest
 import time
 import random
 
-from callbacks import config_callbacks
+from hapi.callbacks import config_callbacks
 
 
 class TestCallbacks(unittest.TestCase):
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 3d32f05650fc540d91e2929fd222d546b2a035f3..6adc9b667ac12c95fce0632ce2647db15e9fd470 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -14,14 +14,16 @@
 
 # when test, you should add hapi root path to the PYTHONPATH,
 # export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
+
 import unittest
+import numpy as np
 
-from datasets.folder import DatasetFolder
+from hapi.datasets import *
 
 
 class TestFolderDatasets(unittest.TestCase):
     def test_dataset(self):
-        dataset_folder = DatasetFolder('test_data')
+        dataset_folder = DatasetFolder('tests/test_data')
 
         for _ in dataset_folder:
             pass
@@ -30,5 +32,71 @@ class TestFolderDatasets(unittest.TestCase):
         assert len(dataset_folder.classes) == 2
 
 
+class TestMNISTTest(unittest.TestCase):
+    def test_main(self):
+        mnist = MNIST(mode='test')
+        self.assertTrue(len(mnist) == 10000)
+
+        for i in range(len(mnist)):
+            image, label = mnist[i]
+            self.assertTrue(image.shape[0] == 784)
+            self.assertTrue(label.shape[0] == 1)
+            self.assertTrue(0 <= int(label) <= 9)
+
+
+class TestMNISTTrain(unittest.TestCase):
+    def test_main(self):
+        mnist = MNIST(mode='train')
+        self.assertTrue(len(mnist) == 60000)
+
+        for i in range(len(mnist)):
+            image, label = mnist[i]
+            self.assertTrue(image.shape[0] == 784)
+            self.assertTrue(label.shape[0] == 1)
+            self.assertTrue(0 <= int(label) <= 9)
+
+
+class TestFlowersTrain(unittest.TestCase):
+    def test_main(self):
+        flowers = Flowers(mode='train')
+        self.assertTrue(len(flowers) == 6149)
+
+        # traversing the whole dataset may take a
+        # long time, so randomly check 1 sample
+        idx = np.random.randint(0, 6149)
+        image, label = flowers[idx]
+        self.assertTrue(len(image.shape) == 3)
+        self.assertTrue(image.shape[2] == 3)
+        self.assertTrue(label.shape[0] == 1)
+
+
+class TestFlowersValid(unittest.TestCase):
+    def test_main(self):
+        flowers = Flowers(mode='valid')
+        self.assertTrue(len(flowers) == 1020)
+
+        # traversing the whole dataset may take a
+        # long time, so randomly check 1 sample
+        idx = np.random.randint(0, 1020)
+        image, label = flowers[idx]
+        self.assertTrue(len(image.shape) == 3)
+        self.assertTrue(image.shape[2] == 3)
+        self.assertTrue(label.shape[0] == 1)
+
+
+class TestFlowersTest(unittest.TestCase):
+    def test_main(self):
+        flowers = Flowers(mode='test')
+        self.assertTrue(len(flowers) == 1020)
+
+        # traversing the whole dataset may take a
+        # long time, so randomly check 1 sample
+        idx = np.random.randint(0, 1020)
+        image, label = flowers[idx]
+        self.assertTrue(len(image.shape) == 3)
+        self.assertTrue(image.shape[2] == 3)
+        self.assertTrue(label.shape[0] == 1)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_model.py b/tests/test_model.py
index 2ac4cbd9cfa8e611ce6b6133615df23dfd338d2b..7fe414c0c914b561cc78083f1fe89b0c79e77da2 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -28,11 +28,12 @@ import contextlib
 import paddle
 from paddle import fluid
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
-from model import Model, CrossEntropy, Input, Loss, set_device
-from metrics import Accuracy
-from callbacks import ProgBarLogger
-from paddle.fluid.io import BatchSampler, DataLoader
-from paddle.fluid.io import MNIST as MnistDataset
+from paddle.io import BatchSampler, DataLoader
+
+from hapi.model import Model, CrossEntropy, Input, Loss, set_device
+from hapi.metrics import Accuracy
+from hapi.callbacks import ProgBarLogger
+from hapi.datasets import MNIST as MnistDataset
 
 
 class SimpleImgConvPool(fluid.dygraph.Layer):
diff --git a/tests/test_progressbar.py b/tests/test_progressbar.py
index dd099ca28e87a286142c9172955f78c92b00e744..797b94a1f0cae8ee37c50803bc4b2f6f4f4afe25 100644
--- a/tests/test_progressbar.py
+++ b/tests/test_progressbar.py
@@ -18,7 +18,7 @@ import unittest
 import random
 import time
 
-from progressbar import ProgressBar
+from hapi.progressbar import ProgressBar
 
 
 class TestProgressBar(unittest.TestCase):
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
index fbe0f2cce25867f9c021eb23636ec4a4974b6231..4471470d62ee1ba88ed6bb1bcebce6252908dc03 100644
--- a/tests/test_transforms.py
+++ b/tests/test_transforms.py
@@ -16,13 +16,13 @@
 # export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
 import unittest
 
-from datasets.folder import DatasetFolder
-from transform import transforms
+from hapi.datasets import DatasetFolder
+import hapi.vision.transforms as transforms
 
 
 class TestTransforms(unittest.TestCase):
     def do_transform(self, trans):
-        dataset_folder = DatasetFolder('test_data', transform=trans)
+        dataset_folder = DatasetFolder('tests/test_data', transform=trans)
 
         for _ in dataset_folder:
             pass
diff --git a/text.py b/text.py
deleted file mode 100644
index 26fb9f7b78e19fafb497fa424b51b9b9d51b11fc..0000000000000000000000000000000000000000
--- a/text.py
+++ /dev/null
@@ -1,1296 +0,0 @@
-import collections
-import copy
-import six
-import sys
-from functools import partial, reduce
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.layers.utils as utils
-from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
-from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit
-from paddle.fluid.data_feeder import convert_dtype
-
-from paddle.fluid import layers
-from paddle.fluid.dygraph import Layer
-from paddle.fluid.layers import BeamSearchDecoder
-
-__all__ = [
-    'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
-    'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
-    'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
-    'TransformerDecoder', 'TransformerBeamSearchDecoder', 'DynamicGRU', 'BiGRU',
-    'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging']
-
-
-class RNNCell(Layer):
-    def get_initial_states(self,
-                           batch_ref,
-                           shape=None,
-                           dtype=None,
-                           init_value=0,
-                           batch_dim_idx=0):
-        """
-        Generate initialized states according to provided shape, data type and
-        value.
-
-        Parameters:
-            batch_ref: A (possibly nested structure of) tensor variable[s].
-                The first dimension of the tensor will be used as batch size to
-                initialize states.
-            shape: A (possiblely nested structure of) shape[s], where a shape is
-                represented as a list/tuple of integer). -1(for batch size) will
-                beautomatically inserted if shape is not started with it. If None,
-                property `state_shape` will be used. The default value is None.
-            dtype: A (possiblely nested structure of) data type[s]. The structure
-                must be same as that of `shape`, except when all tensors' in states
-                has the same data type, a single data type can be used. If None and
-                property `cell.state_shape` is not available, float32 will be used
-                as the data type. The default value is None.
-            init_value: A float value used to initialize states.
-
-        Returns:
-            Variable: tensor variable[s] packed in the same structure provided \
-                by shape, representing the initialized states.
-        """
-        # TODO: use inputs and batch_size
-        batch_ref = flatten(batch_ref)[0]
-
-        def _is_shape_sequence(seq):
-            if sys.version_info < (3, ):
-                integer_types = (
-                    int,
-                    long, )
-            else:
-                integer_types = (int, )
-            """For shape, list/tuple of integer is the finest-grained objection"""
-            if (isinstance(seq, list) or isinstance(seq, tuple)):
-                if reduce(
-                        lambda flag, x: isinstance(x, integer_types) and flag,
-                        seq, True):
-                    return False
-            # TODO: Add check for the illegal
-            if isinstance(seq, dict):
-                return True
-            return (isinstance(seq, collections.Sequence) and
-                    not isinstance(seq, six.string_types))
-
-        class Shape(object):
-            def __init__(self, shape):
-                self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
-
-        # nested structure of shapes
-        states_shapes = self.state_shape if shape is None else shape
-        is_sequence_ori = utils.is_sequence
-        utils.is_sequence = _is_shape_sequence
-        states_shapes = map_structure(lambda shape: Shape(shape),
-                                      states_shapes)
-        utils.is_sequence = is_sequence_ori
-
-        # nested structure of dtypes
-        try:
-            states_dtypes = self.state_dtype if dtype is None else dtype
-        except NotImplementedError:  # use fp32 as default
-            states_dtypes = "float32"
-        if len(flatten(states_dtypes)) == 1:
-            dtype = flatten(states_dtypes)[0]
-            states_dtypes = map_structure(lambda shape: dtype, states_shapes)
-
-        init_states = map_structure(
-            lambda shape, dtype: fluid.layers.fill_constant_batch_size_like(
-                input=batch_ref,
-                shape=shape.shape,
-                dtype=dtype,
-                value=init_value,
-                input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
-        return init_states
-
-    @property
-    def state_shape(self):
-        """
-        Abstract method (property).
-        Used to initialize states.
-        A (possiblely nested structure of) shape[s], where a shape is represented
-        as a list/tuple of integers (-1 for batch size would be automatically
-        inserted into a shape if shape is not started with it).
-        Not necessary to be implemented if states are not initialized by
-        `get_initial_states` or the `shape` argument is provided when using
-        `get_initial_states`.
-        """
-        raise NotImplementedError(
-            "Please add implementaion for `state_shape` in the used cell.")
-
-    @property
-    def state_dtype(self):
-        """
-        Abstract method (property).
-        Used to initialize states.
-        A (possiblely nested structure of) data types[s]. The structure must be
-        same as that of `shape`, except when all tensors' in states has the same
-        data type, a signle data type can be used.
-        Not necessary to be implemented if states are not initialized
-        by `get_initial_states` or the `dtype` argument is provided when using
-        `get_initial_states`.
-        """
-        raise NotImplementedError(
-            "Please add implementaion for `state_dtype` in the used cell.")
-
-
-class BasicLSTMCell(RNNCell):
-    """
-    ****
-    BasicLSTMUnit class, Using basic operator to build LSTM
-    The algorithm can be described as the code below.
-        .. math::
-           i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
-           f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
-           o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
-           \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
-           c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-           h_t &= o_t \odot tanh(c_t)
-        - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
-          of weights from the input gate to the input)
-        - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
-        - sigmoid is the logistic sigmoid function.
-        - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-          and cell activation vectors, respectively, all of which have the same size as
-          the cell output activation vector $h$.
-        - The :math:`\odot` is the element-wise product of the vectors.
-        - :math:`tanh` is the activation functions.
-        - :math:`\\tilde{c_t}` is also called candidate hidden state,
-          which is computed based on the current input and the previous hidden state.
-    Args:
-        name_scope(string) : The name scope used to identify parameter and bias name
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of LSTM unit.
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized as zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cells (actNode).
-                             Default: 'fluid.layers.tanh'
-        forget_bias(float|1.0): forget bias used when computing forget gate
-        dtype(string): data type used in this unit
-    """
-
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 param_attr=None,
-                 bias_attr=None,
-                 gate_activation=None,
-                 activation=None,
-                 forget_bias=1.0,
-                 dtype='float32'):
-        super(BasicLSTMCell, self).__init__()
-
-        self._hidden_size = hidden_size
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._gate_activation = gate_activation or layers.sigmoid
-        self._activation = activation or layers.tanh
-        self._forget_bias = layers.fill_constant(
-            [1], dtype=dtype, value=forget_bias)
-        self._forget_bias.stop_gradient = False
-        self._dtype = dtype
-        self._input_size = input_size
-
-        self._weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=[
-                self._input_size + self._hidden_size, 4 * self._hidden_size
-            ],
-            dtype=self._dtype)
-
-        self._bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[4 * self._hidden_size],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input, state):
-        pre_hidden, pre_cell = state
-        concat_input_hidden = layers.concat([input, pre_hidden], 1)
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
-
-        gate_input = layers.elementwise_add(gate_input, self._bias)
-        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-        new_cell = layers.elementwise_add(
-            layers.elementwise_mul(
-                pre_cell,
-                layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
-            layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
-        new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)
-
-        return new_hidden, [new_hidden, new_cell]
-
-    @property
-    def state_shape(self):
-        return [[self._hidden_size], [self._hidden_size]]
-
-
-class BasicGRUCell(RNNCell):
-    """
-    ****
-    BasicGRUUnit class, using basic operators to build GRU
-    The algorithm can be described as the equations below.
-
-        .. math::
-            u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)
-
-            r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)
-
-            m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
-
-    Args:
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, gru_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of GRU unit.
-            If it is set to None or one attribute of ParamAttr, gru_unit will 
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cell (actNode).
-                             Default: 'fluid.layers.tanh'
-        dtype(string): data type used in this unit
-    """
-
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 param_attr=None,
-                 bias_attr=None,
-                 gate_activation=None,
-                 activation=None,
-                 dtype='float32'):
-        super(BasicGRUCell, self).__init__()
-        self._input_size = input_size
-        self._hiden_size = hidden_size
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._gate_activation = gate_activation or layers.sigmoid
-        self._activation = activation or layers.tanh
-        self._dtype = dtype
-
-        if self._param_attr is not None and self._param_attr.name is not None:
-            gate_param_attr = copy.deepcopy(self._param_attr)
-            candidate_param_attr = copy.deepcopy(self._param_attr)
-            gate_param_attr.name += "_gate"
-            candidate_param_attr.name += "_candidate"
-        else:
-            gate_param_attr = self._param_attr
-            candidate_param_attr = self._param_attr
-
-        self._gate_weight = self.create_parameter(
-            attr=gate_param_attr,
-            shape=[self._input_size + self._hiden_size, 2 * self._hiden_size],
-            dtype=self._dtype)
-
-        self._candidate_weight = self.create_parameter(
-            attr=candidate_param_attr,
-            shape=[self._input_size + self._hiden_size, self._hiden_size],
-            dtype=self._dtype)
-
-        if self._bias_attr is not None and self._bias_attr.name is not None:
-            gate_bias_attr = copy.deepcopy(self._bias_attr)
-            candidate_bias_attr = copy.deepcopy(self._bias_attr)
-            gate_bias_attr.name += "_gate"
-            candidate_bias_attr.name += "_candidate"
-        else:
-            gate_bias_attr = self._bias_attr
-            candidate_bias_attr = self._bias_attr
-
-        self._gate_bias = self.create_parameter(
-            attr=gate_bias_attr,
-            shape=[2 * self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
-        self._candidate_bias = self.create_parameter(
-            attr=candidate_bias_attr,
-            shape=[self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input, state):
-        pre_hidden = state
-        concat_input_hidden = layers.concat([input, pre_hidden], axis=1)
-
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
-
-        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
-
-        gate_input = self._gate_activation(gate_input)
-        r, u = layers.split(gate_input, num_or_sections=2, dim=1)
-
-        r_hidden = r * pre_hidden
-
-        candidate = layers.matmul(
-            layers.concat([input, r_hidden], 1), self._candidate_weight)
-        candidate = layers.elementwise_add(candidate, self._candidate_bias)
-
-        c = self._activation(candidate)
-        new_hidden = u * pre_hidden + (1 - u) * c
-
-        return new_hidden
-
-    @property
-    def state_shape(self):
-        return [self._hidden_size]
-
-
-class RNN(fluid.dygraph.Layer):
-    def __init__(self, cell, is_reverse=False, time_major=False):
-        super(RNN, self).__init__()
-        self.cell = cell
-        if not hasattr(self.cell, "call"):
-            self.cell.call = self.cell.forward
-        self.is_reverse = is_reverse
-        self.time_major = time_major
-        self.batch_index, self.time_step_index = (1, 0) if time_major else (0,
-                                                                            1)
-
-    def forward(self,
-                inputs,
-                initial_states=None,
-                sequence_length=None,
-                **kwargs):
-        if fluid.in_dygraph_mode():
-
-            class ArrayWrapper(object):
-                def __init__(self, x):
-                    self.array = [x]
-
-                def append(self, x):
-                    self.array.append(x)
-                    return self
-
-            def _maybe_copy(state, new_state, step_mask):
-                # TODO: use where_op
-                new_state = fluid.layers.elementwise_mul(
-                    new_state, step_mask,
-                    axis=0) - fluid.layers.elementwise_mul(
-                        state, (step_mask - 1), axis=0)
-                return new_state
-
-            flat_inputs = flatten(inputs)
-            batch_size, time_steps = (
-                flat_inputs[0].shape[self.batch_index],
-                flat_inputs[0].shape[self.time_step_index])
-
-            if initial_states is None:
-                initial_states = self.cell.get_initial_states(
-                    batch_ref=inputs, batch_dim_idx=self.batch_index)
-
-            if not self.time_major:
-                inputs = map_structure(
-                    lambda x: fluid.layers.transpose(x, [1, 0] + list(
-                        range(2, len(x.shape)))), inputs)
-
-            if sequence_length:
-                mask = fluid.layers.sequence_mask(
-                    sequence_length,
-                    maxlen=time_steps,
-                    dtype=flatten(initial_states)[0].dtype)
-                mask = fluid.layers.transpose(mask, [1, 0])
-
-            if self.is_reverse:
-                inputs = map_structure(
-                    lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
-                mask = fluid.layers.reverse(
-                    mask, axis=[0]) if sequence_length else None
-
-            states = initial_states
-            outputs = []
-            for i in range(time_steps):
-                step_inputs = map_structure(lambda x: x[i], inputs)
-                step_outputs, new_states = self.cell(step_inputs, states,
-                                                     **kwargs)
-                if sequence_length:
-                    new_states = map_structure(
-                        partial(
-                            _maybe_copy, step_mask=mask[i]),
-                        states,
-                        new_states)
-                states = new_states
-                outputs = map_structure(
-                    lambda x: ArrayWrapper(x),
-                    step_outputs) if i == 0 else map_structure(
-                        lambda x, x_array: x_array.append(x), step_outputs,
-                        outputs)
-
-            final_outputs = map_structure(
-                lambda x: fluid.layers.stack(x.array,
-                                             axis=self.time_step_index),
-                outputs)
-
-            if self.is_reverse:
-                final_outputs = map_structure(
-                    lambda x: fluid.layers.reverse(x,
-                                                   axis=self.time_step_index),
-                    final_outputs)
-
-            final_states = new_states
-        else:
-            final_outputs, final_states = fluid.layers.rnn(
-                self.cell,
-                inputs,
-                initial_states=initial_states,
-                sequence_length=sequence_length,
-                time_major=self.time_major,
-                is_reverse=self.is_reverse,
-                **kwargs)
-        return final_outputs, final_states
-
-
-class DynamicDecode(Layer):
-    def __init__(self,
-                 decoder,
-                 max_step_num=None,
-                 output_time_major=False,
-                 impute_finished=False,
-                 is_test=False,
-                 return_length=False):
-        super(DynamicDecode, self).__init__()
-        self.decoder = decoder
-        self.max_step_num = max_step_num
-        self.output_time_major = output_time_major
-        self.impute_finished = impute_finished
-        self.is_test = is_test
-        self.return_length = return_length
-
-    def forward(self, inits=None, **kwargs):
-        if fluid.in_dygraph_mode():
-
-            class ArrayWrapper(object):
-                def __init__(self, x):
-                    self.array = [x]
-
-                def append(self, x):
-                    self.array.append(x)
-                    return self
-
-                def __getitem__(self, item):
-                    return self.array.__getitem__(item)
-
-            def _maybe_copy(state, new_state, step_mask):
-                # TODO: use where_op
-                state_dtype = state.dtype
-                if convert_dtype(state_dtype) in ["bool"]:
-                    state = layers.cast(state, dtype="float32")
-                    new_state = layers.cast(new_state, dtype="float32")
-                if step_mask.dtype != state.dtype:
-                    step_mask = layers.cast(step_mask, dtype=state.dtype)
-                    # otherwise, renamed bool gradients of would be summed up leading
-                    # to sum(bool) error.
-                    step_mask.stop_gradient = True
-                new_state = layers.elementwise_mul(
-                    state, step_mask, axis=0) - layers.elementwise_mul(
-                        new_state, (step_mask - 1), axis=0)
-                if convert_dtype(state_dtype) in ["bool"]:
-                    new_state = layers.cast(new_state, dtype=state_dtype)
-                return new_state
-
-            initial_inputs, initial_states, initial_finished = self.decoder.initialize(
-                inits)
-            inputs, states, finished = (initial_inputs, initial_states,
-                                        initial_finished)
-            cond = layers.logical_not((layers.reduce_all(initial_finished)))
-            sequence_lengths = layers.cast(
-                layers.zeros_like(initial_finished), "int64")
-            outputs = None
-
-            step_idx = 0
-            step_idx_tensor = layers.fill_constant(
-                shape=[1], dtype="int64", value=step_idx)
-            while cond.numpy():
-                (step_outputs, next_states, next_inputs,
-                 next_finished) = self.decoder.step(step_idx_tensor, inputs,
-                                                    states, **kwargs)
-                if not self.decoder.tracks_own_finished:
-                    # BeamSearchDecoder would track it own finished, since
-                    # beams would be reordered and the finished status of each
-                    # entry might change. Otherwise, perform logical OR which
-                    # would not change the already finished.
-                    next_finished = layers.logical_or(next_finished, finished)
-                    # To confirm states.finished/finished be consistent with
-                    # next_finished.
-                    layers.assign(next_finished, finished)
-                next_sequence_lengths = layers.elementwise_add(
-                    sequence_lengths,
-                    layers.cast(
-                        layers.logical_not(finished), sequence_lengths.dtype))
-
-                if self.impute_finished:  # rectify the states for the finished.
-                    next_states = map_structure(
-                        lambda x, y: _maybe_copy(x, y, finished), states,
-                        next_states)
-                outputs = map_structure(
-                    lambda x: ArrayWrapper(x),
-                    step_outputs) if step_idx == 0 else map_structure(
-                        lambda x, x_array: x_array.append(x), step_outputs,
-                        outputs)
-                inputs, states, finished, sequence_lengths = (
-                    next_inputs, next_states, next_finished,
-                    next_sequence_lengths)
-
-                layers.increment(x=step_idx_tensor, value=1.0, in_place=True)
-                step_idx += 1
-
-                layers.logical_not(layers.reduce_all(finished), cond)
-                if self.max_step_num is not None and step_idx > self.max_step_num:
-                    break
-
-            final_outputs = map_structure(
-                lambda x: fluid.layers.stack(x.array, axis=0), outputs)
-            final_states = states
-
-            try:
-                final_outputs, final_states = self.decoder.finalize(
-                    final_outputs, final_states, sequence_lengths)
-            except NotImplementedError:
-                pass
-
-            if not self.output_time_major:
-                final_outputs = map_structure(
-                    lambda x: layers.transpose(x, [1, 0] + list(
-                        range(2, len(x.shape)))), final_outputs)
-
-            return (final_outputs, final_states,
-                    sequence_lengths) if self.return_length else (
-                        final_outputs, final_states)
-        else:
-            return fluid.layers.dynamic_decode(
-                self.decoder,
-                inits,
-                max_step_num=self.max_step_num,
-                output_time_major=self.output_time_major,
-                impute_finished=self.impute_finished,
-                is_test=self.is_test,
-                return_length=self.return_length,
-                **kwargs)
-
-
-class TransfomerCell(object):
-    """
-    Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be
-    used as RNNCell
-    """
-
-    def __init__(self, decoder):
-        self.decoder = decoder
-
-    def __call__(self, inputs, states, trg_src_attn_bias, enc_output,
-                 static_caches):
-        trg_word, trg_pos = inputs
-        for cache, static_cache in zip(states, static_caches):
-            cache.update(static_cache)
-        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
-                              enc_output, states)
-        new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
-        return logits, new_states
-
-
-class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
-    def __init__(self, cell, start_token, end_token, beam_size,
-                 var_dim_in_state):
-        super(TransformerBeamSearchDecoder,
-              self).__init__(cell, start_token, end_token, beam_size)
-        self.cell = cell
-        self.var_dim_in_state = var_dim_in_state
-
-    def _merge_batch_beams_with_var_dim(self, x):
-        # init length of cache is 0, and it increases with decoding carrying on,
-        # thus need to reshape elaborately
-        var_dim_in_state = self.var_dim_in_state + 1  # count in beam dim
-        x = layers.transpose(x,
-                             list(range(var_dim_in_state, len(x.shape))) +
-                             list(range(0, var_dim_in_state)))
-        x = layers.reshape(
-            x, [0] * (len(x.shape) - var_dim_in_state
-                      ) + [self.batch_size * self.beam_size] +
-            [int(size) for size in x.shape[-var_dim_in_state + 2:]])
-        x = layers.transpose(
-            x,
-            list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) +
-            list(range(0, (len(x.shape) + 1 - var_dim_in_state))))
-        return x
-
-    def _split_batch_beams_with_var_dim(self, x):
-        var_dim_size = layers.shape(x)[self.var_dim_in_state]
-        x = layers.reshape(
-            x, [-1, self.beam_size] +
-            [int(size)
-             for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] +
-            [int(size) for size in x.shape[self.var_dim_in_state + 1:]])
-        return x
-
-    def step(self, time, inputs, states, **kwargs):
-        # compared to RNN, Transformer has 3D data at every decoding step
-        inputs = layers.reshape(inputs, [-1, 1])  # token
-        pos = layers.ones_like(inputs) * time  # pos
-        cell_states = map_structure(self._merge_batch_beams_with_var_dim,
-                                    states.cell_states)
-
-        cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states,
-                                                   **kwargs)
-        cell_outputs = map_structure(self._split_batch_beams, cell_outputs)
-        next_cell_states = map_structure(self._split_batch_beams_with_var_dim,
-                                         next_cell_states)
-
-        beam_search_output, beam_search_state = self._beam_search_step(
-            time=time,
-            logits=cell_outputs,
-            next_cell_states=next_cell_states,
-            beam_state=states)
-        next_inputs, finished = (beam_search_output.predicted_ids,
-                                 beam_search_state.finished)
-
-        return (beam_search_output, beam_search_state, next_inputs, finished)
-
-
-### Transformer Modules ###
-class PrePostProcessLayer(Layer):
-    """
-    PrePostProcessLayer
-    """
-
-    def __init__(self, process_cmd, d_model, dropout_rate):
-        super(PrePostProcessLayer, self).__init__()
-        self.process_cmd = process_cmd
-        self.functors = []
-        for cmd in self.process_cmd:
-            if cmd == "a":  # add residual connection
-                self.functors.append(lambda x, y: x + y if y else x)
-            elif cmd == "n":  # add layer normalization
-                self.functors.append(
-                    self.add_sublayer(
-                        "layer_norm_%d" % len(
-                            self.sublayers(include_sublayers=False)),
-                        LayerNorm(
-                            normalized_shape=d_model,
-                            param_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(1.)),
-                            bias_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(0.)))))
-            elif cmd == "d":  # add dropout
-                self.functors.append(lambda x: layers.dropout(
-                    x, dropout_prob=dropout_rate, is_test=False)
-                                     if dropout_rate else x)
-
-    def forward(self, x, residual=None):
-        for i, cmd in enumerate(self.process_cmd):
-            if cmd == "a":
-                x = self.functors[i](x, residual)
-            else:
-                x = self.functors[i](x)
-        return x
-
-
-class MultiHeadAttention(Layer):
-    """
-    Multi-Head Attention
-    """
-
-    def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
-        super(MultiHeadAttention, self).__init__()
-        self.n_head = n_head
-        self.d_key = d_key
-        self.d_value = d_value
-        self.d_model = d_model
-        self.dropout_rate = dropout_rate
-        self.q_fc = Linear(
-            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
-        self.k_fc = Linear(
-            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
-        self.v_fc = Linear(
-            input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
-        self.proj_fc = Linear(
-            input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
-
-    def _prepare_qkv(self, queries, keys, values, cache=None):
-        if keys is None:  # self-attention
-            keys, values = queries, queries
-            static_kv = False
-        else:  # cross-attention
-            static_kv = True
-
-        q = self.q_fc(queries)
-        q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
-        q = layers.transpose(x=q, perm=[0, 2, 1, 3])
-
-        if cache is not None and static_kv and "static_k" in cache:
-            # for encoder-decoder attention in inference and has cached
-            k = cache["static_k"]
-            v = cache["static_v"]
-        else:
-            k = self.k_fc(keys)
-            v = self.v_fc(values)
-            k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
-            k = layers.transpose(x=k, perm=[0, 2, 1, 3])
-            v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
-            v = layers.transpose(x=v, perm=[0, 2, 1, 3])
-
-        if cache is not None:
-            if static_kv and not "static_k" in cache:
-                # for encoder-decoder attention in inference and has not cached
-                cache["static_k"], cache["static_v"] = k, v
-            elif not static_kv:
-                # for decoder self-attention in inference
-                cache_k, cache_v = cache["k"], cache["v"]
-                k = layers.concat([cache_k, k], axis=2)
-                v = layers.concat([cache_v, v], axis=2)
-                cache["k"], cache["v"] = k, v
-
-        return q, k, v
-
-    def forward(self, queries, keys, values, attn_bias, cache=None):
-        # compute q ,k ,v
-        q, k, v = self._prepare_qkv(queries, keys, values, cache)
-
-        # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
-        if attn_bias:
-            product += attn_bias
-        weights = layers.softmax(product)
-        if self.dropout_rate:
-            weights = layers.dropout(
-                weights, dropout_prob=self.dropout_rate, is_test=False)
-
-        out = layers.matmul(weights, v)
-
-        # combine heads
-        out = layers.transpose(out, perm=[0, 2, 1, 3])
-        out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
-
-        # project to output
-        out = self.proj_fc(out)
-        return out
-
-    def cal_kv(self, keys, values):
-        k = self.k_fc(keys)
-        v = self.v_fc(values)
-        k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
-        k = layers.transpose(x=k, perm=[0, 2, 1, 3])
-        v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
-        v = layers.transpose(x=v, perm=[0, 2, 1, 3])
-        return k, v
-
-
-class FFN(Layer):
-    """
-    Feed-Forward Network
-    """
-
-    def __init__(self, d_inner_hid, d_model, dropout_rate):
-        super(FFN, self).__init__()
-        self.dropout_rate = dropout_rate
-        self.fc1 = Linear(
-            input_dim=d_model, output_dim=d_inner_hid, act="relu")
-        self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
-
-    def forward(self, x):
-        hidden = self.fc1(x)
-        if self.dropout_rate:
-            hidden = layers.dropout(
-                hidden, dropout_prob=self.dropout_rate, is_test=False)
-        out = self.fc2(hidden)
-        return out
-
-
-class TransformerEncoderLayer(Layer):
-    """
-    EncoderLayer
-    """
-
-    def __init__(self,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
-
-        super(TransformerEncoderLayer, self).__init__()
-
-        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
-                                            attention_dropout)
-        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
-        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-    def forward(self, enc_input, attn_bias):
-        attn_output = self.self_attn(
-            self.preprocesser1(enc_input), None, None, attn_bias)
-        attn_output = self.postprocesser1(attn_output, enc_input)
-
-        ffn_output = self.ffn(self.preprocesser2(attn_output))
-        ffn_output = self.postprocesser2(ffn_output, attn_output)
-        return ffn_output
-
-
-class TransformerEncoder(Layer):
-    """
-    encoder
-    """
-
-    def __init__(self,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
-
-        super(TransformerEncoder, self).__init__()
-
-        self.encoder_layers = list()
-        for i in range(n_layer):
-            self.encoder_layers.append(
-                self.add_sublayer(
-                    "layer_%d" % i,
-                    TransformerEncoderLayer(
-                        n_head, d_key, d_value, d_model, d_inner_hid,
-                        prepostprocess_dropout, attention_dropout,
-                        relu_dropout, preprocess_cmd, postprocess_cmd)))
-        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
-                                             prepostprocess_dropout)
-
-    def forward(self, enc_input, attn_bias):
-        for encoder_layer in self.encoder_layers:
-            enc_output = encoder_layer(enc_input, attn_bias)
-            enc_input = enc_output
-
-        return self.processer(enc_output)
-
-
-class TransformerDecoderLayer(Layer):
-    """
-    decoder
-    """
-
-    def __init__(self,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
-        super(TransformerDecoderLayer, self).__init__()
-
-        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
-                                            attention_dropout)
-        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
-                                             attention_dropout)
-        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-        self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
-        self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-    def forward(self,
-                dec_input,
-                enc_output,
-                self_attn_bias,
-                cross_attn_bias,
-                cache=None):
-        self_attn_output = self.self_attn(
-            self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
-        self_attn_output = self.postprocesser1(self_attn_output, dec_input)
-
-        cross_attn_output = self.cross_attn(
-            self.preprocesser2(self_attn_output), enc_output, enc_output,
-            cross_attn_bias, cache)
-        cross_attn_output = self.postprocesser2(cross_attn_output,
-                                                self_attn_output)
-
-        ffn_output = self.ffn(self.preprocesser3(cross_attn_output))
-        ffn_output = self.postprocesser3(ffn_output, cross_attn_output)
-
-        return ffn_output
-
-
-class TransformerDecoder(Layer):
-    """
-    decoder
-    """
-
-    def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
-                 prepostprocess_dropout, attention_dropout, relu_dropout,
-                 preprocess_cmd, postprocess_cmd):
-        super(TransformerDecoder, self).__init__()
-
-        self.decoder_layers = list()
-        for i in range(n_layer):
-            self.decoder_layers.append(
-                self.add_sublayer(
-                    "layer_%d" % i,
-                    TransformerDecoderLayer(
-                        n_head, d_key, d_value, d_model, d_inner_hid,
-                        prepostprocess_dropout, attention_dropout,
-                        relu_dropout, preprocess_cmd, postprocess_cmd)))
-        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
-                                             prepostprocess_dropout)
-
-    def forward(self,
-                dec_input,
-                enc_output,
-                self_attn_bias,
-                cross_attn_bias,
-                caches=None):
-        for i, decoder_layer in enumerate(self.decoder_layers):
-            dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
-                                       cross_attn_bias, None
-                                       if caches is None else caches[i])
-            dec_input = dec_output
-
-        return self.processer(dec_output)
-
-    def prepare_static_cache(self, enc_output):
-        return [
-            dict(
-                zip(("static_k", "static_v"),
-                    decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
-            for decoder_layer in self.decoder_layers
-        ]
-
-
-class DynamicGRU(fluid.dygraph.Layer):
-    def __init__(self,
-                 size,
-                 h_0=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 is_reverse=False,
-                 gate_activation='sigmoid',
-                 candidate_activation='tanh',
-                 origin_mode=False,
-                 init_size=None):
-        super(DynamicGRU, self).__init__()
-
-        self.gru_unit = GRUUnit(
-            size * 3,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            activation=candidate_activation,
-            gate_activation=gate_activation,
-            origin_mode=origin_mode)
-
-        self.size = size
-        self.h_0 = h_0
-        self.is_reverse = is_reverse
-
-    def forward(self, inputs):
-        hidden = self.h_0
-        res = []
-
-        for i in range(inputs.shape[1]):
-            if self.is_reverse:
-                i = inputs.shape[1] - 1 - i
-            input_ = inputs[:, i:i + 1, :]
-            input_ = fluid.layers.reshape(
-                input_, [-1, input_.shape[2]], inplace=False)
-            hidden, reset, gate = self.gru_unit(input_, hidden)
-            hidden_ = fluid.layers.reshape(
-                hidden, [-1, 1, hidden.shape[1]], inplace=False)
-            res.append(hidden_)
-        if self.is_reverse:
-            res = res[::-1]
-        res = fluid.layers.concat(res, axis=1)
-        return res
-
-
-class BiGRU(fluid.dygraph.Layer):
-    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
-        super(BiGRU, self).__init__()
-
-        self.pre_gru = Linear(
-            input_dim=input_dim,
-            output_dim=grnn_hidden_dim * 3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.gru = DynamicGRU(
-            size=grnn_hidden_dim,
-            h_0=h_0,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.pre_gru_r = Linear(
-            input_dim=input_dim,
-            output_dim=grnn_hidden_dim * 3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.gru_r = DynamicGRU(
-            size=grnn_hidden_dim,
-            is_reverse=True,
-            h_0=h_0,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-    def forward(self, input_feature):
-        res_pre_gru = self.pre_gru(input_feature)
-        res_gru = self.gru(res_pre_gru)
-        res_pre_gru_r = self.pre_gru_r(input_feature)
-        res_gru_r = self.gru_r(res_pre_gru_r)
-        bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1)
-        return bi_merge
-
-
-class Linear_chain_crf(fluid.dygraph.Layer):
-    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Linear_chain_crf, self).__init__()
-
-        self._param_attr = param_attr
-        self._dtype = dtype
-        self._size = size
-        self._is_test = is_test
-        self._transition = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._size + 2, self._size],
-            dtype=self._dtype)
-
-    @property
-    def weight(self):
-        return self._transition
-
-    @weight.setter
-    def weight(self, value):
-        self._transition = value
-
-    def forward(self, input, label, length=None):
-
-        alpha = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        emission_exps = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        transition_exps = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        log_likelihood = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        this_inputs = {
-            "Emission": [input],
-            "Transition": self._transition,
-            "Label": [label]
-        }
-        if length:
-            this_inputs['Length'] = [length]
-        self._helper.append_op(
-            type='linear_chain_crf',
-            inputs=this_inputs,
-            outputs={
-                "Alpha": [alpha],
-                "EmissionExps": [emission_exps],
-                "TransitionExps": transition_exps,
-                "LogLikelihood": log_likelihood
-            },
-            attrs={"is_test": self._is_test, })
-        return log_likelihood
-
-
-class Crf_decoding(fluid.dygraph.Layer):
-    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Crf_decoding, self).__init__()
-
-        self._dtype = dtype
-        self._size = size
-        self._is_test = is_test
-        self._param_attr = param_attr
-        self._transition = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._size + 2, self._size],
-            dtype=self._dtype)
-
-    @property
-    def weight(self):
-        return self._transition
-
-    @weight.setter
-    def weight(self, value):
-        self._transition = value
-
-    def forward(self, input, label=None, length=None):
-
-        viterbi_path = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        this_inputs = {
-            "Emission": [input],
-            "Transition": self._transition,
-            "Label": label
-        }
-        if length:
-            this_inputs['Length'] = [length]
-        self._helper.append_op(
-            type='crf_decoding',
-            inputs=this_inputs,
-            outputs={"ViterbiPath": [viterbi_path]},
-            attrs={"is_test": self._is_test, })
-        return viterbi_path
-
-
-class SequenceTagging(fluid.dygraph.Layer):
-    def __init__(self, 
-             vocab_size,
-             num_labels,
-             batch_size, 
-             word_emb_dim=128,
-             grnn_hidden_dim=128,
-             emb_learning_rate=0.1,
-             crf_learning_rate=0.1,
-             bigru_num=2,
-             init_bound=0.1,
-             length=None):
-        super(SequenceTagging, self).__init__()
-        """
-        define the sequence tagging network structure
-        word: stores the input of the model
-        for_infer: a boolean value, indicating if the model to be created is for training or predicting.
-
-        return:
-            for infer: return the prediction
-            otherwise: return the prediction
-        """
-        self.word_emb_dim = word_emb_dim
-        self.vocab_size = vocab_size
-        self.num_labels = num_labels
-        self.grnn_hidden_dim = grnn_hidden_dim
-        self.emb_lr = emb_learning_rate
-        self.crf_lr = crf_learning_rate
-        self.bigru_num = bigru_num
-        self.batch_size = batch_size
-        self.init_bound = 0.1
-
-        self.word_embedding = Embedding(
-            size=[self.vocab_size, self.word_emb_dim],
-            dtype='float32',
-            param_attr=fluid.ParamAttr(
-                learning_rate=self.emb_lr,
-                name="word_emb",
-                initializer=fluid.initializer.Uniform(
-                    low=-self.init_bound, high=self.init_bound)))
-
-        h_0 = fluid.layers.create_global_var(
-            shape=[self.batch_size, self.grnn_hidden_dim],
-            value=0.0,
-            dtype='float32',
-            persistable=True,
-            force_cpu=True,
-            name='h_0')
-
-        self.bigru_units = []
-        for i in range(self.bigru_num):
-            if i == 0:
-                self.bigru_units.append(
-                    self.add_sublayer(
-                        "bigru_units%d" % i,
-                        BiGRU(
-                            self.grnn_hidden_dim,
-                            self.grnn_hidden_dim,
-                            self.init_bound,
-                            h_0=h_0)))
-            else:
-                self.bigru_units.append(
-                    self.add_sublayer(
-                        "bigru_units%d" % i,
-                        BiGRU(
-                            self.grnn_hidden_dim * 2,
-                            self.grnn_hidden_dim,
-                            self.init_bound,
-                            h_0=h_0)))
-
-        self.fc = Linear(
-            input_dim=self.grnn_hidden_dim * 2,
-            output_dim=self.num_labels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-self.init_bound, high=self.init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.linear_chain_crf = Linear_chain_crf(
-            param_attr=fluid.ParamAttr(
-                name='linear_chain_crfw', learning_rate=self.crf_lr),
-            size=self.num_labels)
-
-        self.crf_decoding = Crf_decoding(
-            param_attr=fluid.ParamAttr(
-                name='crfw', learning_rate=self.crf_lr),
-            size=self.num_labels)
-
-    def forward(self, word, target, lengths):
-        """
-        Configure the network
-        """
-        word_embed = self.word_embedding(word)
-        input_feature = word_embed
-
-        for i in range(self.bigru_num):
-            bigru_output = self.bigru_units[i](input_feature)
-            input_feature = bigru_output
-
-        emission = self.fc(bigru_output)
-
-        crf_cost = self.linear_chain_crf(
-            input=emission, label=target, length=lengths)
-        avg_cost = fluid.layers.mean(x=crf_cost)
-        self.crf_decoding.weight = self.linear_chain_crf.weight
-        crf_decode = self.crf_decoding(input=emission, length=lengths)
-        return crf_decode, avg_cost, lengths
diff --git a/transformer/predict.py b/transformer/predict.py
index 7a47ccdaef7426505ab69ee93ec20bfb2f765513..b83d5403486c1e661a939663bad154735b29b37e 100644
--- a/transformer/predict.py
+++ b/transformer/predict.py
@@ -22,7 +22,7 @@ from functools import partial
 import numpy as np
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid.io import DataLoader
+from paddle.io import DataLoader
 from paddle.fluid.layers.utils import flatten
 
 from utils.configure import PDConfig
diff --git a/transformer/reader.py b/transformer/reader.py
index 66fb8dc02b99f345f337d8a91b6c7eeaff71fe18..c0d02dcfb5b526ff8407f9320f31836d42ae5e4b 100644
--- a/transformer/reader.py
+++ b/transformer/reader.py
@@ -22,7 +22,7 @@ from functools import partial
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.io import BatchSampler, DataLoader, Dataset
+from paddle.io import BatchSampler, DataLoader, Dataset
 
 
 def create_data_loader(args, device):
diff --git a/transformer/train.py b/transformer/train.py
index 58df6afb2cadc3b471aaffd4ed4caebbbc0bbc3d..04a61f83a0191a944d9b2611b3bca61f0bcf2a0a 100644
--- a/transformer/train.py
+++ b/transformer/train.py
@@ -21,7 +21,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import numpy as np
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid.io import DataLoader
+from paddle.io import DataLoader
 
 from utils.configure import PDConfig
 from utils.check import check_gpu, check_version