diff --git a/examples/bert/bert.yaml b/bert/bert.yaml similarity index 100% rename from examples/bert/bert.yaml rename to bert/bert.yaml diff --git a/examples/bert/bert_classifier.py b/bert/bert_classifier.py similarity index 100% rename from examples/bert/bert_classifier.py rename to bert/bert_classifier.py diff --git a/examples/bert/readme.md b/bert/readme.md similarity index 89% rename from examples/bert/readme.md rename to bert/readme.md index f8df579ec8302b890f7fb523facc417fc784536a..e367d881ec9e82f2735db47f8cae90fe9b005885 100644 --- a/examples/bert/readme.md +++ b/bert/readme.md @@ -6,4 +6,4 @@ 4. unzip pretrained parameters: tar -zvxf bert_uncased_L-12_H-768_A-12.tar.gz -4. bash run_classifier_single_gpu.sh +4. bash run_classifier_single_gpu.sh diff --git a/examples/bert/run_classifier_multi_gpu.sh b/bert/run_classifier_multi_gpu.sh similarity index 100% rename from examples/bert/run_classifier_multi_gpu.sh rename to bert/run_classifier_multi_gpu.sh diff --git a/examples/bert/run_classifier_single_gpu.sh b/bert/run_classifier_single_gpu.sh similarity index 100% rename from examples/bert/run_classifier_single_gpu.sh rename to bert/run_classifier_single_gpu.sh diff --git a/examples/bert_leveldb/bert.yaml b/bert_leveldb/bert.yaml similarity index 100% rename from examples/bert_leveldb/bert.yaml rename to bert_leveldb/bert.yaml diff --git a/examples/bert_leveldb/bert_classifier.py b/bert_leveldb/bert_classifier.py similarity index 100% rename from examples/bert_leveldb/bert_classifier.py rename to bert_leveldb/bert_classifier.py diff --git a/examples/bert_leveldb/readme.md b/bert_leveldb/readme.md similarity index 90% rename from examples/bert_leveldb/readme.md rename to bert_leveldb/readme.md index 77241a168946b3fa4c4eb203dee039fb53004f50..fbcb7e70d9e9be7def58f06dc305077c3136076c 100644 --- a/examples/bert_leveldb/readme.md +++ b/bert_leveldb/readme.md @@ -8,4 +8,4 @@ 4. unzip pretrained parameters: tar -zvxf bert_uncased_L-12_H-768_A-12.tar.gz -4. bash run_classifier_single_gpu.sh +4. 
bash run_classifier_single_gpu.sh diff --git a/examples/bert_leveldb/run_classifier_multi_gpu.sh b/bert_leveldb/run_classifier_multi_gpu.sh similarity index 100% rename from examples/bert_leveldb/run_classifier_multi_gpu.sh rename to bert_leveldb/run_classifier_multi_gpu.sh diff --git a/examples/bert_leveldb/run_classifier_single_gpu.sh b/bert_leveldb/run_classifier_single_gpu.sh similarity index 100% rename from examples/bert_leveldb/run_classifier_single_gpu.sh rename to bert_leveldb/run_classifier_single_gpu.sh diff --git a/examples/bmn/BMN.png b/bmn/BMN.png similarity index 100% rename from examples/bmn/BMN.png rename to bmn/BMN.png diff --git a/examples/bmn/README.md b/bmn/README.md similarity index 100% rename from examples/bmn/README.md rename to bmn/README.md diff --git a/examples/bmn/bmn.yaml b/bmn/bmn.yaml similarity index 100% rename from examples/bmn/bmn.yaml rename to bmn/bmn.yaml diff --git a/examples/bmn/bmn_metric.py b/bmn/bmn_metric.py similarity index 100% rename from examples/bmn/bmn_metric.py rename to bmn/bmn_metric.py diff --git a/examples/bmn/bmn_utils.py b/bmn/bmn_utils.py similarity index 99% rename from examples/bmn/bmn_utils.py rename to bmn/bmn_utils.py index cccf50647a55fabdfe94dd0f1f7e1370e15d0fe2..61ea308cb56c8ddbda3c5ea5208a97e4f6046e86 100644 --- a/examples/bmn/bmn_utils.py +++ b/bmn/bmn_utils.py @@ -160,5 +160,3 @@ def bmn_post_processing(video_dict, subset, output_path, result_path): json.dump(output_dict, outfile) outfile.close() - - diff --git a/examples/bmn/config_utils.py b/bmn/config_utils.py similarity index 100% rename from examples/bmn/config_utils.py rename to bmn/config_utils.py diff --git a/examples/bmn/eval.py b/bmn/eval.py similarity index 100% rename from examples/bmn/eval.py rename to bmn/eval.py diff --git a/examples/bmn/eval_anet_prop.py b/bmn/eval_anet_prop.py similarity index 100% rename from examples/bmn/eval_anet_prop.py rename to bmn/eval_anet_prop.py diff --git a/examples/bmn/infer.list b/bmn/infer.list similarity index 100% rename from examples/bmn/infer.list rename to bmn/infer.list diff --git a/examples/bmn/modeling.py b/bmn/modeling.py similarity index 100% rename from examples/bmn/modeling.py rename to bmn/modeling.py diff --git a/examples/bmn/predict.py b/bmn/predict.py similarity index 100% rename from examples/bmn/predict.py rename to bmn/predict.py diff --git a/examples/bmn/reader.py b/bmn/reader.py similarity index 100% rename from examples/bmn/reader.py rename to bmn/reader.py diff --git a/examples/bmn/run.sh b/bmn/run.sh similarity index 100% rename from examples/bmn/run.sh rename to bmn/run.sh diff --git a/examples/bmn/train.py b/bmn/train.py similarity index 100% rename from examples/bmn/train.py rename to bmn/train.py diff --git a/examples/cyclegan/README.md b/cyclegan/README.md similarity index 100% rename from examples/cyclegan/README.md rename to cyclegan/README.md diff --git a/examples/cyclegan/__init__.py b/cyclegan/__init__.py similarity index 100% rename from examples/cyclegan/__init__.py rename to cyclegan/__init__.py diff --git a/examples/cyclegan/check.py b/cyclegan/check.py similarity index 100% rename from examples/cyclegan/check.py rename to cyclegan/check.py diff --git a/examples/cyclegan/cyclegan.py b/cyclegan/cyclegan.py similarity index 100% rename from examples/cyclegan/cyclegan.py rename to cyclegan/cyclegan.py diff --git a/examples/cyclegan/data.py b/cyclegan/data.py similarity index 100% rename from examples/cyclegan/data.py rename to cyclegan/data.py diff --git a/examples/cyclegan/image/A2B.png 
b/cyclegan/image/A2B.png similarity index 100% rename from examples/cyclegan/image/A2B.png rename to cyclegan/image/A2B.png diff --git a/examples/cyclegan/image/B2A.png b/cyclegan/image/B2A.png similarity index 100% rename from examples/cyclegan/image/B2A.png rename to cyclegan/image/B2A.png diff --git a/examples/cyclegan/image/net.png b/cyclegan/image/net.png similarity index 100% rename from examples/cyclegan/image/net.png rename to cyclegan/image/net.png diff --git a/examples/cyclegan/image/testA/123_A.jpg b/cyclegan/image/testA/123_A.jpg similarity index 100% rename from examples/cyclegan/image/testA/123_A.jpg rename to cyclegan/image/testA/123_A.jpg diff --git a/examples/cyclegan/image/testB/78_B.jpg b/cyclegan/image/testB/78_B.jpg similarity index 100% rename from examples/cyclegan/image/testB/78_B.jpg rename to cyclegan/image/testB/78_B.jpg diff --git a/examples/cyclegan/infer.py b/cyclegan/infer.py similarity index 100% rename from examples/cyclegan/infer.py rename to cyclegan/infer.py diff --git a/examples/cyclegan/layers.py b/cyclegan/layers.py similarity index 100% rename from examples/cyclegan/layers.py rename to cyclegan/layers.py diff --git a/examples/cyclegan/test.py b/cyclegan/test.py similarity index 100% rename from examples/cyclegan/test.py rename to cyclegan/test.py diff --git a/examples/cyclegan/train.py b/cyclegan/train.py similarity index 100% rename from examples/cyclegan/train.py rename to cyclegan/train.py diff --git a/examples/handwritten_number_recognition/README.md b/handwritten_number_recognition/README.md similarity index 100% rename from examples/handwritten_number_recognition/README.md rename to handwritten_number_recognition/README.md diff --git a/examples/handwritten_number_recognition/mnist.py b/handwritten_number_recognition/mnist.py similarity index 100% rename from examples/handwritten_number_recognition/mnist.py rename to handwritten_number_recognition/mnist.py diff --git a/hapi/__init__.py b/hapi/__init__.py deleted file mode 100644 index 3860aafc7306c764cfc055745038a78ba99de1fd..0000000000000000000000000000000000000000 --- a/hapi/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from hapi import logger -from hapi.configure import Config -from hapi import callbacks -from hapi import datasets -from hapi import distributed -from hapi import download -from hapi import metrics -from hapi import model -from hapi import progressbar -from hapi import text -from hapi import vision -from hapi import loss - -logger.setup_logger() - -__all__ = [ - 'Config', 'callbacks', 'datasets', 'distributed', 'download', 'metrics', - 'model', 'progressbar', 'text', 'vision', 'loss' -] diff --git a/hapi/callbacks.py b/hapi/callbacks.py deleted file mode 100644 index c13fd431dc3327d90d89f71612523e4b5ce1ba0a..0000000000000000000000000000000000000000 --- a/hapi/callbacks.py +++ /dev/null @@ -1,386 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import six -import copy - -from .progressbar import ProgressBar -from paddle.fluid.dygraph.parallel import ParallelEnv - - -def config_callbacks(callbacks=None, - model=None, - batch_size=None, - epochs=None, - steps=None, - log_freq=2, - verbose=2, - save_freq=1, - save_dir=None, - metrics=None, - mode='train'): - cbks = callbacks or [] - cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks] - if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose: - cbks = [ProgBarLogger(log_freq, verbose=verbose)] + cbks - - if not any(isinstance(k, ModelCheckpoint) for k in cbks): - cbks = cbks + [ModelCheckpoint(save_freq, save_dir)] - - cbk_list = CallbackList(cbks) - cbk_list.set_model(model) - metrics = metrics or [] if mode != 'test' else [] - params = { - 'batch_size': batch_size, - 'epochs': epochs, - 'steps': steps, - 'verbose': verbose, - 'metrics': metrics, - } - cbk_list.set_params(params) - return cbk_list - - -class CallbackList(object): - def __init__(self, callbacks=None): - # copy - self.callbacks = [c for c in callbacks] - self.params = {} - self.model = None - - def append(self, callback): - self.callbacks.append(callback) - - def __iter__(self): - return iter(self.callbacks) - - def set_params(self, params): - for c in self.callbacks: - c.set_params(params) - - def set_model(self, model): - for c in self.callbacks: - c.set_model(model) - - def _call(self, name, *args): - for c in self.callbacks: - func = getattr(c, name) - func(*args) - - def _check_mode(self, mode): - assert mode in ['train', 'eval', 'test'], \ - 'mode should be train, eval or test' - - def on_begin(self, mode, logs=None): - self._check_mode(mode) - name = 'on_{}_begin'.format(mode) - self._call(name, logs) - - def on_end(self, mode, logs=None): - self._check_mode(mode) - name = 'on_{}_end'.format(mode) - self._call(name, logs) - - def on_epoch_begin(self, epoch=None, logs=None): - self._call('on_epoch_begin', epoch, logs) - - def on_epoch_end(self, epoch=None, logs=None): - self._call('on_epoch_end', epoch, logs) - - def on_batch_begin(self, mode, step=None, logs=None): - self._check_mode(mode) - name = 'on_{}_batch_begin'.format(mode) - self._call(name, step, logs) - - def on_batch_end(self, mode, step=None, logs=None): - self._check_mode(mode) - name = 'on_{}_batch_end'.format(mode) - self._call(name, step, logs) - - -class Callback(object): - """Base class used to build new callbacks. - """ - - def __init__(self): - self.model = None - self.params = {} - - def set_params(self, params): - self.params = params - - def set_model(self, model): - self.model = model - - def on_train_begin(self, logs=None): - """Called at the start of training. - """ - - def on_train_end(self, logs=None): - """Called at the end of training. - """ - - def on_eval_begin(self, logs=None): - """Called at the start of evaluation. - """ - - def on_eval_end(self, logs=None): - """Called at the end of evaluation. 
- """ - - def on_test_begin(self, logs=None): - """Called at the beginning of predict. - """ - - def on_test_end(self, logs=None): - """Called at the end of predict. - """ - - def on_epoch_begin(self, epoch, logs=None): - """Called at the beginning of each epoch. - """ - - def on_epoch_end(self, epoch, logs=None): - """Called at the end of each epoch. - """ - - def on_train_batch_begin(self, step, logs=None): - """Called at the beginning of each batch in training. - """ - - def on_train_batch_end(self, step, logs=None): - """Called at the end of each batch in training. - """ - - def on_eval_batch_begin(self, step, logs=None): - """Called at the beginning of each batch in evaluation. - """ - - def on_eval_batch_end(self, step, logs=None): - """Called at the end of each batch in evaluation. - """ - - def on_test_batch_begin(self, step, logs=None): - """Called at the beginning of each batch in predict. - """ - - def on_test_batch_end(self, step, logs=None): - """Called at the end of each batch in predict. - """ - - -class ProgBarLogger(Callback): - """Logger callback function - Args: - log_freq (int): The frequency, in number of steps, the logs such as `loss`, - `metrics` are printed. Default: 1. - verbose (int): The verbosity mode, should be 0, 1, or 2. - 0 = silent, 1 = progress bar, 2 = one line per epoch. Default: 2. - - Examples: - .. code-block:: python - - import numpy as np - from paddle import fluid - from hapi.metrics import Accuracy - from hapi.loss import CrossEntropy - from hapi.datasets import MNIST - from hapi.vision.transforms import Compose, Resize - from hapi.vision.models import LeNet - from hapi.callbacks import ProgBarLogger - from hapi.model import Input, set_device - - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - train_dataset = MNIST(mode='train') - - model = LeNet() - - optim = fluid.optimizer.Adam(0.001) - model.prepare(optimizer=optim, - loss_function=CrossEntropy(), - metrics=Accuracy(), - inputs=inputs, - labels=labels) - - callback = ProgBarLogger(log_freq=10) - model.fit(train_dataset, batch_size=64, callbacks=callback) - """ - - def __init__(self, log_freq=1, verbose=2): - self.epochs = None - self.steps = None - self.progbar = None - self.verbose = verbose - self.log_freq = log_freq - - def _is_print(self): - return self.verbose and ParallelEnv().local_rank == 0 - - def on_train_begin(self, logs=None): - self.epochs = self.params['epochs'] - assert self.epochs - self.train_metrics = self.params['metrics'] - assert self.train_metrics - - def on_epoch_begin(self, epoch=None, logs=None): - self.steps = self.params['steps'] - self.epoch = epoch - self.train_step = 0 - if self.epochs and self._is_print(): - print('Epoch %d/%d' % (epoch + 1, self.epochs)) - self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose) - - def _updates(self, logs, mode): - values = [] - metrics = getattr(self, '%s_metrics' % (mode)) - progbar = getattr(self, '%s_progbar' % (mode)) - steps = getattr(self, '%s_step' % (mode)) - - for k in metrics: - if k in logs: - values.append((k, logs[k])) - - progbar.update(steps, values) - - def on_train_batch_end(self, step, logs=None): - logs = logs or {} - self.train_step += 1 - - if self._is_print() and self.train_step % self.log_freq == 0: - if self.steps is None or self.train_step < self.steps: - self._updates(logs, 'train') - - def on_epoch_end(self, epoch, logs=None): - logs = logs or {} - if self._is_print() and (self.steps is not None): - self._updates(logs, 
'train') - - def on_eval_begin(self, logs=None): - self.eval_steps = logs.get('steps', None) - self.eval_metrics = logs.get('metrics_name', []) - self.eval_step = 0 - self.evaled_samples = 0 - - self.eval_progbar = ProgressBar( - num=self.eval_steps, verbose=self.verbose) - if self._is_print(): - print('Eval begin...') - - def on_eval_batch_end(self, step, logs=None): - logs = logs or {} - self.eval_step += 1 - samples = logs.get('batch_size', 1) - self.evaled_samples += samples - - if self._is_print() and self.eval_step % self.log_freq == 0: - if self.eval_steps is None or self.eval_step < self.eval_steps: - self._updates(logs, 'eval') - - def on_test_begin(self, logs=None): - self.test_steps = logs.get('steps', None) - self.test_metrics = logs.get('metrics_name', []) - self.test_step = 0 - self.tested_samples = 0 - self.test_progbar = ProgressBar( - num=self.test_steps, verbose=self.verbose) - if self._is_print(): - print('Predict begin...') - - def on_test_batch_end(self, step, logs=None): - logs = logs or {} - self.test_step += 1 - samples = logs.get('batch_size', 1) - self.tested_samples += samples - - if self.test_step % self.log_freq == 0 and self._is_print(): - if self.test_steps is None or self.test_step < self.test_steps: - self._updates(logs, 'test') - - def on_eval_end(self, logs=None): - logs = logs or {} - - if self._is_print() and (self.eval_steps is not None): - self._updates(logs, 'eval') - print('Eval samples: %d' % (self.evaled_samples)) - - def on_test_end(self, logs=None): - logs = logs or {} - if self._is_print(): - if self.test_step % self.log_freq != 0 or self.verbose == 1: - self._updates(logs, 'test') - print('Predict samples: %d' % (self.tested_samples)) - - -class ModelCheckpoint(Callback): - """Model checkpoint callback function - Args: - save_freq(int): The frequency, in number of epochs, the model checkpoint - are saved. Default: 1. - save_dir(str|None): The directory to save checkpoint during training. - If None, will not save checkpoint. Default: None. - - Examples: - .. 
code-block:: python - - import numpy as np - from paddle import fluid - from hapi.metrics import Accuracy - from hapi.loss import CrossEntropy - from hapi.datasets import MNIST - from hapi.vision.transforms import Compose, Resize - from hapi.vision.models import LeNet - from hapi.callbacks import ModelCheckpoint - from hapi.model import Input, set_device - - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - train_dataset = MNIST(mode='train') - - model = LeNet() - - optim = fluid.optimizer.Adam(0.001) - model.prepare(optimizer=optim, - loss_function=CrossEntropy(), - metrics=Accuracy(), - inputs=inputs, - labels=labels) - - callback = ModelCheckpoint(save_dir='./temp') - model.fit(train_dataset, batch_size=64, callbacks=callback) - """ - - def __init__(self, save_freq=1, save_dir=None): - self.save_freq = save_freq - self.save_dir = save_dir - - def on_epoch_begin(self, epoch=None, logs=None): - self.epoch = epoch - - def _is_save(self): - return self.model and self.save_dir and ParallelEnv().local_rank == 0 - - def on_epoch_end(self, epoch, logs=None): - if self._is_save() and self.epoch % self.save_freq == 0: - path = '{}/{}'.format(self.save_dir, epoch) - print('save checkpoint at {}'.format(path)) - self.model.save(path) - - def on_train_end(self, logs=None): - if self._is_save(): - path = '{}/final'.format(self.save_dir) - print('save checkpoint at {}'.format(path)) - self.model.save(path) diff --git a/hapi/configure.py b/hapi/configure.py deleted file mode 100644 index 3b4ae0f7363bcaad2c0ec407f868f42678cd485a..0000000000000000000000000000000000000000 --- a/hapi/configure.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import argparse -import json -import yaml -import six -import logging - -logging_only_message = "%(message)s" -logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s" - - -class JsonConfig(object): - """ - A high-level api for handling json configure file. 
- """ - - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except: - raise IOError("Error in parsing bert model config file '%s'" % - config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class ArgumentGroup(object): - def __init__(self, parser, title, des): - self._group = parser.add_argument_group(title=title, description=des) - - def add_arg(self, name, type, default, help, **kwargs): - type = str2bool if type == bool else type - self._group.add_argument( - "--" + name, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -class ArgConfig(object): - """ - A high-level api for handling argument configs. - """ - - def __init__(self): - parser = argparse.ArgumentParser() - - custom_g = ArgumentGroup(parser, "customize", "customized options.") - - self.custom_g = custom_g - - self.parser = parser - - def add_arg(self, name, dtype, default, descrip): - self.custom_g.add_arg(name, dtype, default, descrip) - - def build_conf(self): - return self.parser.parse_args() - - -def str2bool(v): - # because argparse does not support to parse "true, False" as python - # boolean directly - return v.lower() in ("true", "t", "1") - - -def print_arguments(args, log=None): - if not log: - print('----------- Configuration Arguments -----------') - for arg, value in sorted(six.iteritems(vars(args))): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - else: - log.info('----------- Configuration Arguments -----------') - for arg, value in sorted(six.iteritems(vars(args))): - log.info('%s: %s' % (arg, value)) - log.info('------------------------------------------------') - - -class Config(object): - """ - A high-level API for managing configuration files in PaddlePaddle. - Can jointly work with command-line-arugment, json files and yaml files. - """ - - def __init__(self, json_file="", yaml_file="", fuse_args=True): - """ - Init funciton for PDConfig. - json_file: the path to the json configure file. - yaml_file: the path to the yaml configure file. - fuse_args: if fuse the json/yaml configs with argparse. - """ - assert isinstance(json_file, str) - assert isinstance(yaml_file, str) - - if json_file != "" and yaml_file != "": - raise Warning( - "json_file and yaml_file can not co-exist for now. please only use one configure file type." - ) - return - - self.args = None - self.arg_config = {} - self.json_config = {} - self.yaml_config = {} - - parser = argparse.ArgumentParser() - - self.default_g = ArgumentGroup(parser, "default", "default options.") - self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.") - self.json_g = ArgumentGroup(parser, "json", "options from json.") - self.com_g = ArgumentGroup(parser, "custom", "customized options.") - - self.parser = parser - - if json_file != "": - self.load_json(json_file, fuse_args=fuse_args) - - if yaml_file: - self.load_yaml(yaml_file, fuse_args=fuse_args) - - def load_json(self, file_path, fuse_args=True): - - if not os.path.exists(file_path): - raise Warning("the json file %s does not exist." 
% file_path) - return - - with open(file_path, "r") as fin: - self.json_config = json.loads(fin.read()) - fin.close() - - if fuse_args: - for name in self.json_config: - if isinstance(self.json_config[name], list): - self.json_g.add_arg( - name, - type(self.json_config[name][0]), - self.json_config[name], - "This is from %s" % file_path, - nargs=len(self.json_config[name])) - continue - if not isinstance(self.json_config[name], int) \ - and not isinstance(self.json_config[name], float) \ - and not isinstance(self.json_config[name], str) \ - and not isinstance(self.json_config[name], bool): - - continue - - self.json_g.add_arg(name, - type(self.json_config[name]), - self.json_config[name], - "This is from %s" % file_path) - - def load_yaml(self, file_path, fuse_args=True): - - if not os.path.exists(file_path): - raise Warning("the yaml file %s does not exist." % file_path) - return - - with open(file_path, "r") as fin: - self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) - fin.close() - - if fuse_args: - for name in self.yaml_config: - if isinstance(self.yaml_config[name], list): - self.yaml_g.add_arg( - name, - type(self.yaml_config[name][0]), - self.yaml_config[name], - "This is from %s" % file_path, - nargs=len(self.yaml_config[name])) - continue - - if not isinstance(self.yaml_config[name], int) \ - and not isinstance(self.yaml_config[name], float) \ - and not isinstance(self.yaml_config[name], str) \ - and not isinstance(self.yaml_config[name], bool): - - continue - - self.yaml_g.add_arg(name, - type(self.yaml_config[name]), - self.yaml_config[name], - "This is from %s" % file_path) - - def build(self): - self.args = self.parser.parse_args() - self.arg_config = vars(self.args) - - def __add__(self, new_arg): - assert isinstance(new_arg, list) or isinstance(new_arg, tuple) - assert len(new_arg) >= 3 - assert self.args is None - - name = new_arg[0] - dtype = new_arg[1] - dvalue = new_arg[2] - desc = new_arg[3] if len( - new_arg) == 4 else "Description is not provided." - - self.com_g.add_arg(name, dtype, dvalue, desc) - - return self - - def __getattr__(self, name): - if name in self.arg_config: - return self.arg_config[name] - - if name in self.json_config: - return self.json_config[name] - - if name in self.yaml_config: - return self.yaml_config[name] - - raise Warning("The argument %s is not defined." 
% name) - - def Print(self): - - print("-" * 70) - for name in self.arg_config: - print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name]))) - - for name in self.json_config: - if name not in self.arg_config: - print("%s:\t\t\t\t%s" % - (str(name), str(self.json_config[name]))) - - for name in self.yaml_config: - if name not in self.arg_config: - print("%s:\t\t\t\t%s" % - (str(name), str(self.yaml_config[name]))) - - print("-" * 70) - - -if __name__ == "__main__": - """ - pd_config = PDConfig(json_file = "./test/bert_config.json") - pd_config.build() - - print(pd_config.do_train) - print(pd_config.hidden_size) - - pd_config = PDConfig(yaml_file = "./test/bert_config.yaml") - pd_config.build() - - print(pd_config.do_train) - print(pd_config.hidden_size) - """ - - config = Config(yaml_file="./bert.yaml") - config += ("my_age", int, 18, "I am forever 18.") - config.build() - - print(config.data_dir) - print(config.my_age) diff --git a/hapi/datasets/__init__.py b/hapi/datasets/__init__.py deleted file mode 100644 index fc5df6401992def4bc37329794e534a832924da3..0000000000000000000000000000000000000000 --- a/hapi/datasets/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import folder -from . import mnist -from . import flowers - -from .folder import * -from .mnist import * -from .flowers import * - -__all__ = folder.__all__ \ - + mnist.__all__ \ - + flowers.__all__ diff --git a/hapi/datasets/flowers.py b/hapi/datasets/flowers.py deleted file mode 100644 index 9d543c318dff1540122842aaa5e8a0ae9592988b..0000000000000000000000000000000000000000 --- a/hapi/datasets/flowers.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import os -import io -import tarfile -import numpy as np -import scipy.io as scio -from PIL import Image - -from paddle.io import Dataset -from .utils import _check_exists_and_download - -__all__ = ["Flowers"] - -DATA_URL = 'http://paddlemodels.bj.bcebos.com/flowers/102flowers.tgz' -LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat' -SETID_URL = 'http://paddlemodels.bj.bcebos.com/flowers/setid.mat' -DATA_MD5 = '52808999861908f626f3c1f4e79d11fa' -LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' -SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' - -# In official 'readme', tstid is the flag of test data -# and trnid is the flag of train data. But test data is more than train data. -# So we exchange the train data and test data. -MODE_FLAG_MAP = {'train': 'tstid', 'test': 'trnid', 'valid': "valid"} - - -class Flowers(Dataset): - """ - Implement of flowers dataset - - Args: - data_file(str): path to data file, can be set None if - :attr:`download` is True. Default None - label_file(str): path to label file, can be set None if - :attr:`download` is True. Default None - setid_file(str): path to subset index file, can be set - None if :attr:`download` is True. Default None - mode(str): 'train', 'valid' or 'test' mode. Default 'train'. - download(bool): whether auto download mnist dataset if - :attr:`image_path`/:attr:`label_path` unset. Default - True - - Examples: - - .. code-block:: python - - from hapi.vision.datasets import Flowers - - flowers = Flowers(mode='test') - - for i in range(len(flowers)): - sample = flowers[i] - print(sample[0].shape, sample[1]) - - """ - - def __init__(self, - data_file=None, - label_file=None, - setid_file=None, - mode='train', - transform=None, - download=True): - assert mode.lower() in ['train', 'valid', 'test'], \ - "mode should be 'train', 'valid' or 'test', but got {}".format(mode) - self.flag = MODE_FLAG_MAP[mode.lower()] - - self.data_file = data_file - if self.data_file is None: - assert download, "data_file not set and auto download disabled" - self.data_file = _check_exists_and_download( - data_file, DATA_URL, DATA_MD5, 'flowers', download) - - self.label_file = label_file - if self.label_file is None: - assert download, "label_file not set and auto download disabled" - self.label_file = _check_exists_and_download( - label_file, LABEL_URL, LABEL_MD5, 'flowers', download) - - self.setid_file = setid_file - if self.setid_file is None: - assert download, "setid_file not set and auto download disabled" - self.setid_file = _check_exists_and_download( - setid_file, SETID_URL, SETID_MD5, 'flowers', download) - - self.transform = transform - - # read dataset into memory - self._load_anno() - - def _load_anno(self): - self.name2mem = {} - self.data_tar = tarfile.open(self.data_file) - for ele in self.data_tar.getmembers(): - self.name2mem[ele.name] = ele - - self.labels = scio.loadmat(self.label_file)['labels'][0] - self.indexes = scio.loadmat(self.setid_file)[self.flag][0] - - def __getitem__(self, idx): - index = self.indexes[idx] - label = np.array([self.labels[index - 1]]) - img_name = "jpg/image_%05d.jpg" % index - img_ele = self.name2mem[img_name] - image = self.data_tar.extractfile(img_ele).read() - image = np.array(Image.open(io.BytesIO(image))) - - if self.transform is not None: - image = self.transform(image) - - return image, label.astype('int64') - - def __len__(self): - return len(self.indexes) diff --git a/hapi/datasets/folder.py b/hapi/datasets/folder.py deleted file mode 100644 index 
c0b7c08794c4ca00301e5aee623f5d44db251bf1..0000000000000000000000000000000000000000 --- a/hapi/datasets/folder.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import cv2 - -from paddle.io import Dataset - -__all__ = ["DatasetFolder", "ImageFolder"] - - -def has_valid_extension(filename, extensions): - """Checks if a file is a vilid extension. - - Args: - filename (str): path to a file - extensions (tuple of str): extensions to consider (lowercase) - - Returns: - bool: True if the filename ends with one of given extensions - """ - return filename.lower().endswith(extensions) - - -def make_dataset(dir, class_to_idx, extensions, is_valid_file=None): - images = [] - dir = os.path.expanduser(dir) - - if extensions is not None: - - def is_valid_file(x): - return has_valid_extension(x, extensions) - - for target in sorted(class_to_idx.keys()): - d = os.path.join(dir, target) - if not os.path.isdir(d): - continue - for root, _, fnames in sorted(os.walk(d, followlinks=True)): - for fname in sorted(fnames): - path = os.path.join(root, fname) - if is_valid_file(path): - item = (path, class_to_idx[target]) - images.append(item) - - return images - - -class DatasetFolder(Dataset): - """A generic data loader where the samples are arranged in this way: - - root/class_a/1.ext - root/class_a/2.ext - root/class_a/3.ext - - root/class_b/123.ext - root/class_b/456.ext - root/class_b/789.ext - - Args: - root (string): Root directory path. - loader (callable|optional): A function to load a sample given its path. - extensions (tuple[str]|optional): A list of allowed extensions. - both extensions and is_valid_file should not be passed. - transform (callable|optional): A function/transform that takes in - a sample and returns a transformed version. - is_valid_file (callable|optional): A function that takes path of a file - and check if the file is a valid file (used to check of corrupt files) - both extensions and is_valid_file should not be passed. - - Attributes: - classes (list): List of the class names. - class_to_idx (dict): Dict with items (class_name, class_index). 
- samples (list): List of (sample path, class_index) tuples - targets (list): The class_index value for each image in the dataset - """ - - def __init__(self, - root, - loader=None, - extensions=None, - transform=None, - is_valid_file=None): - self.root = root - self.transform = transform - if extensions is None: - extensions = IMG_EXTENSIONS - classes, class_to_idx = self._find_classes(self.root) - samples = make_dataset(self.root, class_to_idx, extensions, - is_valid_file) - if len(samples) == 0: - raise (RuntimeError( - "Found 0 files in subfolders of: " + self.root + "\n" - "Supported extensions are: " + ",".join(extensions))) - - self.loader = cv2_loader if loader is None else loader - self.extensions = extensions - - self.classes = classes - self.class_to_idx = class_to_idx - self.samples = samples - self.targets = [s[1] for s in samples] - - def _find_classes(self, dir): - """ - Finds the class folders in a dataset. - - Args: - dir (string): Root directory path. - - Returns: - tuple: (classes, class_to_idx) where classes are relative to (dir), - and class_to_idx is a dictionary. - - """ - if sys.version_info >= (3, 5): - # Faster and available in Python 3.5 and above - classes = [d.name for d in os.scandir(dir) if d.is_dir()] - else: - classes = [ - d for d in os.listdir(dir) - if os.path.isdir(os.path.join(dir, d)) - ] - classes.sort() - class_to_idx = {classes[i]: i for i in range(len(classes))} - return classes, class_to_idx - - def __getitem__(self, index): - """ - Args: - index (int): Index - - Returns: - tuple: (sample, target) where target is class_index of the target class. - """ - path, target = self.samples[index] - sample = self.loader(path) - if self.transform is not None: - sample = self.transform(sample) - - return sample, target - - def __len__(self): - return len(self.samples) - - -IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', - '.tiff', '.webp') - - -def cv2_loader(path): - return cv2.imread(path) - - -class ImageFolder(Dataset): - """A generic data loader where the samples are arranged in this way: - - root/1.ext - root/2.ext - root/sub_dir/3.ext - - Args: - root (string): Root directory path. - loader (callable, optional): A function to load a sample given its path. - extensions (tuple[string], optional): A list of allowed extensions. - both extensions and is_valid_file should not be passed. - transform (callable, optional): A function/transform that takes in - a sample and returns a transformed version. - is_valid_file (callable, optional): A function that takes path of a file - and check if the file is a valid file (used to check of corrupt files) - both extensions and is_valid_file should not be passed. 
- - Attributes: - samples (list): List of sample path - """ - - def __init__(self, - root, - loader=None, - extensions=None, - transform=None, - is_valid_file=None): - self.root = root - if extensions is None: - extensions = IMG_EXTENSIONS - - samples = [] - path = os.path.expanduser(root) - - if extensions is not None: - - def is_valid_file(x): - return has_valid_extension(x, extensions) - - for root, _, fnames in sorted(os.walk(path, followlinks=True)): - for fname in sorted(fnames): - f = os.path.join(root, fname) - if is_valid_file(f): - samples.append(f) - - if len(samples) == 0: - raise (RuntimeError( - "Found 0 files in subfolders of: " + self.root + "\n" - "Supported extensions are: " + ",".join(extensions))) - - self.loader = cv2_loader if loader is None else loader - self.extensions = extensions - self.samples = samples - self.transform = transform - - def __getitem__(self, index): - """ - Args: - index (int): Index - - Returns: - tuple: (sample, target) where target is class_index of the target class. - """ - path = self.samples[index] - sample = self.loader(path) - if self.transform is not None: - sample = self.transform(sample) - return [sample] - - def __len__(self): - return len(self.samples) diff --git a/hapi/datasets/mnist.py b/hapi/datasets/mnist.py deleted file mode 100644 index 3f09bb355de7b6050912d748bca6cbfbfd08f442..0000000000000000000000000000000000000000 --- a/hapi/datasets/mnist.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -import gzip -import struct -import numpy as np - -import paddle.dataset.common -from paddle.io import Dataset -from .utils import _check_exists_and_download - -__all__ = ["MNIST"] - -URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/' -TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' -TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3' -TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz' -TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c' -TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz' -TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' -TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz' -TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' - - -class MNIST(Dataset): - """ - Implement of MNIST dataset - - Args: - image_path(str): path to image file, can be set None if - :attr:`download` is True. Default None - label_path(str): path to label file, can be set None if - :attr:`download` is True. Default None - chw_format(bool): If set True, the output shape is [1, 28, 28], - otherwise, output shape is [1, 784]. Default True. - mode(str): 'train' or 'test' mode. Default 'train'. - download(bool): whether auto download mnist dataset if - :attr:`image_path`/:attr:`label_path` unset. Default - True - - Returns: - Dataset: MNIST Dataset. - - Examples: - - .. 
code-block:: python - - from hapi.vision.datasets import MNIST - - mnist = MNIST(mode='test') - - for i in range(len(mnist)): - sample = mnist[i] - print(sample[0].shape, sample[1]) - - """ - - def __init__(self, - image_path=None, - label_path=None, - chw_format=True, - mode='train', - transform=None, - download=True): - assert mode.lower() in ['train', 'test'], \ - "mode should be 'train' or 'test', but got {}".format(mode) - self.mode = mode.lower() - self.chw_format = chw_format - self.image_path = image_path - if self.image_path is None: - assert download, "image_path not set and auto download disabled" - image_url = TRAIN_IMAGE_URL if mode == 'train' else TEST_IMAGE_URL - image_md5 = TRAIN_IMAGE_MD5 if mode == 'train' else TEST_IMAGE_MD5 - self.image_path = _check_exists_and_download( - image_path, image_url, image_md5, 'mnist', download) - - self.label_path = label_path - if self.label_path is None: - assert download, "label_path not set and auto download disabled" - label_url = TRAIN_LABEL_URL if mode == 'train' else TEST_LABEL_URL - label_md5 = TRAIN_LABEL_MD5 if mode == 'train' else TEST_LABEL_MD5 - self.label_path = _check_exists_and_download( - label_path, label_url, label_md5, 'mnist', download) - - self.transform = transform - - # read dataset into memory - self._parse_dataset() - - def _parse_dataset(self, buffer_size=100): - self.images = [] - self.labels = [] - with gzip.GzipFile(self.image_path, 'rb') as image_file: - img_buf = image_file.read() - with gzip.GzipFile(self.label_path, 'rb') as label_file: - lab_buf = label_file.read() - - step_label = 0 - offset_img = 0 - # read from Big-endian - # get file info from magic byte - # image file : 16B - magic_byte_img = '>IIII' - magic_img, image_num, rows, cols = struct.unpack_from( - magic_byte_img, img_buf, offset_img) - offset_img += struct.calcsize(magic_byte_img) - - offset_lab = 0 - # label file : 8B - magic_byte_lab = '>II' - magic_lab, label_num = struct.unpack_from(magic_byte_lab, - lab_buf, offset_lab) - offset_lab += struct.calcsize(magic_byte_lab) - - while True: - if step_label >= label_num: - break - fmt_label = '>' + str(buffer_size) + 'B' - labels = struct.unpack_from(fmt_label, lab_buf, offset_lab) - offset_lab += struct.calcsize(fmt_label) - step_label += buffer_size - - fmt_images = '>' + str(buffer_size * rows * cols) + 'B' - images_temp = struct.unpack_from(fmt_images, img_buf, - offset_img) - images = np.reshape(images_temp, (buffer_size, rows * - cols)).astype('float32') - offset_img += struct.calcsize(fmt_images) - - images = images / 255.0 - images = images * 2.0 - images = images - 1.0 - - for i in range(buffer_size): - self.images.append(images[i, :]) - self.labels.append( - np.array([labels[i]]).astype('int64')) - - def __getitem__(self, idx): - image, label = self.images[idx], self.labels[idx] - if self.chw_format: - image = np.reshape(image, [1, 28, 28]) - if self.transform is not None: - image = self.transform(image) - return image, label - - def __len__(self): - return len(self.labels) diff --git a/hapi/datasets/utils.py b/hapi/datasets/utils.py deleted file mode 100644 index 171f794ba9df4270727a23cc6cd039a9faa81970..0000000000000000000000000000000000000000 --- a/hapi/datasets/utils.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -import paddle.dataset.common - - -def _check_exists_and_download(path, url, md5, module_name, download=True): - if path and os.path.exists(path): - return path - - if download: - return paddle.dataset.common.download(url, module_name, md5) - else: - raise ValueError('{} not exists and auto download disabled'.format( - path)) diff --git a/hapi/distributed.py b/hapi/distributed.py deleted file mode 100644 index 5460cd435f3ebb67cdfeff4188fe2b22d179c277..0000000000000000000000000000000000000000 --- a/hapi/distributed.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import sys -import six -import time -import math -import socket -import contextlib -import numpy as np - -from paddle import fluid -from paddle.fluid.layers import collective -from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy -from paddle.io import BatchSampler - -_parallel_context_initialized = False - - -class DistributedBatchSampler(BatchSampler): - """Sampler that restricts data loading to a subset of the dataset. - - In such case, each process can pass a DistributedBatchSampler instance - as a DataLoader sampler, and load a subset of the original dataset that - is exclusive to it. - - .. note:: - Dataset is assumed to be of constant size. - - Args: - data_source: this could be a `paddle.io.Dataset` implement - or other python object which implemented - `__len__` for BatchSampler to get sample - number of data source. - batch_size(int): sample indice number in a mini-batch indices. - shuffle(bool): whther to shuffle indices order before genrating - batch indices. Default False. - drop_last(bool): whether drop the last incomplete batch dataset size - is not divisible by the batch size. Default False - - Examples: - .. 
code-block:: python - - import numpy as np - - from hapi.datasets import MNIST - from hapi.distributed import DistributedBatchSampler - - class MnistDataset(MNIST): - def __init__(self, mode, return_label=True): - super(MnistDataset, self).__init__(mode=mode) - self.return_label = return_label - - def __getitem__(self, idx): - img = np.reshape(self.images[idx], [1, 28, 28]) - if self.return_label: - return img, np.array(self.labels[idx]).astype('int64') - return img, - - def __len__(self): - return len(self.images) - - train_dataset = MnistDataset(mode='train') - dist_train_dataloader = DistributedBatchSampler(train_dataset, batch_size=64) - - for data in dist_train_dataloader: - # do something - break - """ - - def __init__(self, dataset, batch_size, shuffle=False, drop_last=False): - self.dataset = dataset - - assert isinstance(batch_size, int) and batch_size > 0, \ - "batch_size should be a positive integer" - self.batch_size = batch_size - assert isinstance(shuffle, bool), \ - "shuffle should be a boolean value" - self.shuffle = shuffle - assert isinstance(drop_last, bool), \ - "drop_last should be a boolean number" - - self.drop_last = drop_last - self.nranks = ParallelEnv().nranks - self.local_rank = ParallelEnv().local_rank - self.epoch = 0 - self.num_samples = int( - math.ceil(len(self.dataset) * 1.0 / self.nranks)) - self.total_size = self.num_samples * self.nranks - - def __iter__(self): - num_samples = len(self.dataset) - indices = np.arange(num_samples).tolist() - indices += indices[:(self.total_size - len(indices))] - assert len(indices) == self.total_size - if self.shuffle: - np.random.RandomState(self.epoch).shuffle(indices) - self.epoch += 1 - - # subsample - def _get_indices_by_batch_size(indices): - subsampled_indices = [] - last_batch_size = self.total_size % (self.batch_size * self.nranks) - assert last_batch_size % self.nranks == 0 - last_local_batch_size = last_batch_size // self.nranks - - for i in range(self.local_rank * self.batch_size, - len(indices) - last_batch_size, - self.batch_size * self.nranks): - subsampled_indices.extend(indices[i:i + self.batch_size]) - - indices = indices[len(indices) - last_batch_size:] - subsampled_indices.extend(indices[ - self.local_rank * last_local_batch_size:( - self.local_rank + 1) * last_local_batch_size]) - return subsampled_indices - - if self.nranks > 1: - indices = _get_indices_by_batch_size(indices) - - assert len(indices) == self.num_samples - _sample_iter = iter(indices) - - batch_indices = [] - for idx in _sample_iter: - batch_indices.append(idx) - if len(batch_indices) == self.batch_size: - yield batch_indices - batch_indices = [] - if not self.drop_last and len(batch_indices) > 0: - yield batch_indices - - def __len__(self): - num_samples = self.num_samples - num_samples += int(not self.drop_last) * (self.batch_size - 1) - return num_samples // self.batch_size - - def set_epoch(self, epoch): - self.epoch = epoch - - -def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): - return collective._c_allgather( - x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream) - - -def wait_server_ready(endpoints): - assert not isinstance(endpoints, six.string_types) - while True: - all_ok = True - not_ready_endpoints = [] - for ep in endpoints: - ip_port = ep.split(":") - with contextlib.closing( - socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - sock.settimeout(2) - result = sock.connect_ex((ip_port[0], int(ip_port[1]))) - if result != 0: - all_ok = False - not_ready_endpoints.append(ep) - if not all_ok: - 
time.sleep(3) - else: - break - - -def init_communicator(program, rank, nranks, wait_port, current_endpoint, - endpoints): - if nranks < 2: - return - other_endpoints = endpoints[:] - other_endpoints.remove(current_endpoint) - if rank == 0 and wait_port: - wait_server_ready(other_endpoints) - block = program.global_block() - nccl_id_var = block.create_var( - name=fluid.unique_name.generate('nccl_id'), - persistable=True, - type=fluid.core.VarDesc.VarType.RAW) - - block.append_op( - type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints - }) - - block.append_op( - type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': 0, - }) - - -def prepare_distributed_context(place=None): - if place is None: - place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \ - else fluid.CUDAPlace(0) - - strategy = ParallelStrategy() - strategy.nranks = ParallelEnv().nranks - strategy.local_rank = ParallelEnv().local_rank - strategy.trainer_endpoints = ParallelEnv().trainer_endpoints - strategy.current_endpoint = ParallelEnv().current_endpoint - - if strategy.nranks < 2: - return - - global _parallel_context_initialized - - if not _parallel_context_initialized and isinstance(place, - fluid.CUDAPlace): - - def _init_context(): - communicator_prog = fluid.Program() - init_communicator(communicator_prog, strategy.local_rank, - strategy.nranks, True, strategy.current_endpoint, - strategy.trainer_endpoints) - exe = fluid.Executor(place) - exe.run(communicator_prog) - - if fluid.in_dygraph_mode(): - fluid.disable_dygraph() - _init_context() - fluid.enable_dygraph(place) - else: - _init_context() - - else: - assert ("Only support CUDAPlace for now.") - - _parallel_context_initialized = True - return strategy diff --git a/hapi/download.py b/hapi/download.py deleted file mode 100644 index b084d0af69239df4d86c8c94c1ad07c5c7f3da38..0000000000000000000000000000000000000000 --- a/hapi/download.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import os.path as osp -import shutil -import requests -import hashlib -import time -from collections import OrderedDict -from paddle.fluid.dygraph.parallel import ParallelEnv - -try: - from tqdm import tqdm -except: - - class tqdm(object): - def __init__(self, total=None): - self.total = total - self.n = 0 - - def update(self, n): - self.n += n - if self.total is None: - sys.stderr.write("\r{0:.1f} bytes".format(self.n)) - else: - sys.stderr.write("\r{0:.1f}%".format(100 * self.n / float( - self.total))) - sys.stderr.flush() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - sys.stderr.write('\n') - - -import logging -logger = logging.getLogger(__name__) - -__all__ = ['get_weights_path_from_url', 'is_url'] - -WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights") - -DOWNLOAD_RETRY_LIMIT = 3 - -nlp_models = OrderedDict(( - ('RoBERTa-zh-base', - 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz' - ), - ('RoBERTa-zh-large', - 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz' - ), - ('ERNIE-v2-en-base', - 'https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz'), - ('ERNIE-v2-en-large', - 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz'), - ('XLNet-cased-base', - 'https://xlnet.bj.bcebos.com/xlnet_cased_L-12_H-768_A-12.tgz'), - ('XLNet-cased-large', - 'https://xlnet.bj.bcebos.com/xlnet_cased_L-24_H-1024_A-16.tgz'), - ('ERNIE-v1-zh-base', - 'https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz'), - ('ERNIE-v1-zh-base-max-len-512', - 'https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz'), - ('BERT-en-uncased-large-whole-word-masking', - 'https://bert-models.bj.bcebos.com/wwm_uncased_L-24_H-1024_A-16.tar.gz'), - ('BERT-en-cased-large-whole-word-masking', - 'https://bert-models.bj.bcebos.com/wwm_cased_L-24_H-1024_A-16.tar.gz'), - ('BERT-en-uncased-base', - 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz'), - ('BERT-en-uncased-large', - 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz'), - ('BERT-en-cased-base', - 'https://bert-models.bj.bcebos.com/cased_L-12_H-768_A-12.tar.gz'), - ('BERT-en-cased-large', - 'https://bert-models.bj.bcebos.com/cased_L-24_H-1024_A-16.tar.gz'), - ('BERT-multilingual-uncased-base', - 'https://bert-models.bj.bcebos.com/multilingual_L-12_H-768_A-12.tar.gz'), - ('BERT-multilingual-cased-base', - 'https://bert-models.bj.bcebos.com/multi_cased_L-12_H-768_A-12.tar.gz'), - ('BERT-zh-base', - 'https://bert-models.bj.bcebos.com/chinese_L-12_H-768_A-12.tar.gz'), )) - - -def is_url(path): - """ - Whether path is URL. - Args: - path (string): URL string or not. - """ - return path.startswith('http://') or path.startswith('https://') - - -def get_weights_path_from_url(url, md5sum=None): - """Get weights path from WEIGHT_HOME, if not exists, - download it from url. - - Args: - url (str): download url - md5sum (str): md5 sum of download package - - Returns: - str: a local path to save downloaded weights. - - Examples: - .. 
code-block:: python - - from hapi.download import get_weights_path_from_url - - resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' - local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) - - """ - path = get_path_from_url(url, WEIGHTS_HOME, md5sum) - return path - - -def _map_path(url, root_dir): - # parse path after download under root_dir - fname = osp.split(url)[-1] - fpath = fname - return osp.join(root_dir, fpath) - - -def get_path_from_url(url, root_dir, md5sum=None, check_exist=True): - """ Download from given url to root_dir. - if file or directory specified by url is exists under - root_dir, return the path directly, otherwise download - from url and decompress it, return the path. - - Args: - url (str): download url - root_dir (str): root dir for downloading, it should be - WEIGHTS_HOME or DATASET_HOME - md5sum (str): md5 sum of download package - - Returns: - str: a local path to save downloaded models & weights & datasets. - """ - assert is_url(url), "downloading from {} not a url".format(url) - # parse path after download to decompress under root_dir - fullpath = _map_path(url, root_dir) - - if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): - logger.info("Found {}".format(fullpath)) - else: - if ParallelEnv().local_rank == 0: - fullpath = _download(url, root_dir, md5sum) - else: - while not os.path.exists(fullpath): - time.sleep(1) - return fullpath - - -def _download(url, path, md5sum=None): - """ - Download from url, save to path. - - url (str): download url - path (str): download to given path - """ - if not osp.exists(path): - os.makedirs(path) - - fname = osp.split(url)[-1] - fullname = osp.join(path, fname) - retry_cnt = 0 - - while not (osp.exists(fullname) and _md5check(fullname, md5sum)): - if retry_cnt < DOWNLOAD_RETRY_LIMIT: - retry_cnt += 1 - else: - raise RuntimeError("Download from {} failed. " - "Retry limit reached".format(url)) - - logger.info("Downloading {} from {}".format(fname, url)) - - req = requests.get(url, stream=True) - if req.status_code != 200: - raise RuntimeError("Downloading from {} failed with code " - "{}!".format(url, req.status_code)) - - # For protecting download interupted, download to - # tmp_fullname firstly, move tmp_fullname to fullname - # after download finished - tmp_fullname = fullname + "_tmp" - total_size = req.headers.get('content-length') - with open(tmp_fullname, 'wb') as f: - if total_size: - with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: - for chunk in req.iter_content(chunk_size=1024): - f.write(chunk) - pbar.update(1) - else: - for chunk in req.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - shutil.move(tmp_fullname, fullname) - - return fullname - - -def _md5check(fullname, md5sum=None): - if md5sum is None: - return True - - logger.info("File {} md5 checking...".format(fullname)) - md5 = hashlib.md5() - with open(fullname, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b""): - md5.update(chunk) - calc_md5sum = md5.hexdigest() - - if calc_md5sum != md5sum: - logger.info("File {} md5 check failed, {}(calc) != " - "{}(base)".format(fullname, calc_md5sum, md5sum)) - return False - return True diff --git a/hapi/logger.py b/hapi/logger.py deleted file mode 100644 index 8b9239008445b2cb93c81c57379fbebf444f9807..0000000000000000000000000000000000000000 --- a/hapi/logger.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
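The removed `hapi/download.py` resolves a URL to a file cached under `~/.cache/paddle/hapi/weights`, downloads only on local rank 0 (other ranks poll until the file appears), retries up to `DOWNLOAD_RETRY_LIMIT` times, and checks an optional MD5 sum. A usage sketch built from the docstring of the deleted `get_weights_path_from_url` (the ResNet-18 URL is the one given there):

.. code-block:: python

    from hapi.download import get_weights_path_from_url, is_url

    url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams'
    assert is_url(url)
    # First call downloads to ~/.cache/paddle/hapi/weights/, later calls
    # return the cached path (optionally re-verified against md5sum).
    local_weight_path = get_weights_path_from_url(url)
    print(local_weight_path)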
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import logging - -from paddle.fluid.dygraph.parallel import ParallelEnv - - -def setup_logger(output=None, name="hapi", log_level=logging.INFO): - """ - Initialize logger of hapi and set its verbosity level to "INFO". - - Args: - output (str): a file name or a directory to save log. If None, will not save log file. - If ends with ".txt" or ".log", assumed to be a file name. - Otherwise, logs will be saved to `output/log.txt`. - name (str): the root module name of this logger. Default: 'hapi'. - log_level (enum): log level. eg.'INFO', 'DEBUG', 'ERROR'. Default: logging.INFO. - Returns: - logging.Logger: a logger - """ - logger = logging.getLogger(name) - logger.propagate = False - logger.setLevel(log_level) - - format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - # stdout logging: only local rank==0 - local_rank = ParallelEnv().local_rank - if local_rank == 0 and len(logger.handlers) == 0: - ch = logging.StreamHandler(stream=sys.stdout) - ch.setLevel(log_level) - - ch.setFormatter(logging.Formatter(format_str)) - logger.addHandler(ch) - - # file logging if output is not None: all workers - if output is not None: - if output.endswith(".txt") or output.endswith(".log"): - filename = output - else: - filename = os.path.join(output, "log.txt") - - if local_rank > 0: - filename = filename + ".rank{}".format(local_rank) - - if not os.path.exists(os.path.dirname(filename)): - os.makedirs(os.path.dirname(filename)) - - fh = logging.StreamHandler(filename) - fh.setLevel(log_level) - fh.setFormatter(logging.Formatter(format_str)) - logger.addHandler(fh) - - return logger diff --git a/hapi/loss.py b/hapi/loss.py deleted file mode 100644 index 9bdf04af4e356c9e2312ba28e0627f38355b00c3..0000000000000000000000000000000000000000 --- a/hapi/loss.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
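The deleted `setup_logger` attaches a stdout handler only on local rank 0 and, when `output` is given, a per-rank file handler (`log.txt.rank{N}` for workers other than rank 0); note that the removed implementation passes the file name to `logging.StreamHandler`, where `logging.FileHandler` is presumably intended. A short sketch of the stdout-only path, which needs no output directory:

.. code-block:: python

    import logging
    from hapi.logger import setup_logger

    # output=None (the default) means no log file is written; rank 0 logs
    # to stdout, other ranks stay silent.
    logger = setup_logger(name='hapi', log_level=logging.INFO)
    logger.info('training started')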
- -from __future__ import absolute_import - -import os - -from paddle import fluid -from paddle.fluid.framework import in_dygraph_mode, Variable -from paddle.fluid.dygraph.base import to_variable - -from hapi.utils import to_list - -__all__ = ['Loss', 'CrossEntropy', 'SoftmaxWithCrossEntropy'] - - -class Loss(object): - """ - Base class for loss, encapsulates loss logic and APIs - - Usage: - custom_loss = CustomLoss() - loss = custom_loss(inputs, labels) - """ - - def __init__(self, average=True): - super(Loss, self).__init__() - self.average = average - - def forward(self, outputs, labels): - raise NotImplementedError() - - def __call__(self, outputs, labels=None): - labels = to_list(labels) - if in_dygraph_mode() and labels: - labels = [to_variable(l) for l in labels] - losses = to_list(self.forward(to_list(outputs), labels)) - if self.average: - losses = [fluid.layers.reduce_mean(l) for l in losses] - else: - losses = [fluid.layers.reduce_sum(l) for l in losses] - return losses - - -class CrossEntropy(Loss): - """ - Args: - input (list[Variable]): Input tensor, the data type is float32, - float64, int32, int64. - label (list[Variable]): Label tensor, the data type is float32, - float64, int32, int64. - average (bool, optional): Indicate whether to average the loss, Default: True. - Returns: - list[Variable]: The tensor variable storing the cross_entropy_loss of inputs and labels. - - Examples: - .. code-block:: python - - from hapi.model import Input - from hapi.vision.models import LeNet - from hapi.loss import CrossEntropy - - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - model = LeNet() - loss = CrossEntropy() - model.prepare(loss_function=loss, inputs=inputs, labels=labels) - - """ - - def __init__(self, average=True): - super(CrossEntropy, self).__init__(average) - - def forward(self, outputs, labels): - return [ - fluid.layers.cross_entropy(o, l) for o, l in zip(outputs, labels) - ] - - -class SoftmaxWithCrossEntropy(Loss): - """ - this op combined softmax and cross entropy. - Args: - input (list[Variable]): Input tensor, the data type is float32, - float64, int32, int64. - label (list[Variable]): Label tensor, the data type is float32, - float64, int32, int64. - average (bool, optional): Indicate whether to average the loss, Default: True. - Returns: - list[Variable]: The tensor variable storing the cross_entropy_loss of inputs and labels. - - Examples: - .. code-block:: python - - from hapi.model import Input - from hapi.vision.models import LeNet - from hapi.loss import SoftmaxWithCrossEntropy - - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - model = LeNet(classifier_activation=None) - loss = SoftmaxWithCrossEntropy() - model.prepare(loss_function=loss, inputs=inputs, labels=labels) - """ - - def __init__(self, average=True): - super(SoftmaxWithCrossEntropy, self).__init__(average) - - def forward(self, outputs, labels): - return [ - fluid.layers.softmax_with_cross_entropy( - o, l, return_softmax=False) for o, l in zip(outputs, labels) - ] diff --git a/hapi/metrics.py b/hapi/metrics.py deleted file mode 100644 index 11a174caaeeb4b320663f68ef60b4500244f9adc..0000000000000000000000000000000000000000 --- a/hapi/metrics.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
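The deleted `Loss` base class does the bookkeeping in `__call__`: labels are converted to variables in dygraph, `forward` returns one loss tensor per output/label pair, and each is reduced by mean (`average=True`) or sum. A hedged sketch of a custom subclass (this `L1Loss` is illustrative only, not part of the removed module):

.. code-block:: python

    from paddle import fluid
    from hapi.loss import Loss

    class L1Loss(Loss):
        """Illustrative subclass: absolute error per output/label pair."""

        def forward(self, outputs, labels):
            # outputs and labels arrive as lists; return one loss per pair.
            # Loss.__call__ then applies reduce_mean (average=True) to each.
            return [
                fluid.layers.abs(fluid.layers.elementwise_sub(o, l))
                for o, l in zip(outputs, labels)
            ]

    # Can be passed to model.prepare(loss_function=L1Loss(), ...) just like
    # the built-in CrossEntropy shown above.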
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import - -import six -import abc -import numpy as np -import paddle.fluid as fluid - -import logging -FORMAT = '%(asctime)s-%(levelname)s: %(message)s' -logging.basicConfig(level=logging.INFO, format=FORMAT) -logger = logging.getLogger(__name__) - -__all__ = ['Metric', 'Accuracy'] - - -@six.add_metaclass(abc.ABCMeta) -class Metric(object): - """ - Base class for metric, encapsulates metric logic and APIs - - Usage: - m = SomeMetric() - for prediction, label in ...: - m.update(prediction, label) - m.accumulate() - """ - - @abc.abstractmethod - def reset(self): - """ - Reset states and result - """ - raise NotImplementedError("function 'reset' not implemented in {}.". - format(self.__class__.__name__)) - - @abc.abstractmethod - def update(self, *args): - """ - Update states for metric - - Inputs of :code:`update` is the outputs of :code:`Metric.add_metric_op`, - if :code:`add_metric_op` is not defined, the inputs of :code:`update` - will be flatten arguments of **output** of mode and **label** from data: - :code:`update(output1, output2, ..., label1, label2,...)` - - see :code:`Metric.add_metric_op` - """ - raise NotImplementedError("function 'update' not implemented in {}.". - format(self.__class__.__name__)) - - @abc.abstractmethod - def accumulate(self): - """ - Accumulates statistics, computes and returns the metric value - """ - raise NotImplementedError( - "function 'accumulate' not implemented in {}.".format( - self.__class__.__name__)) - - @abc.abstractmethod - def name(self): - """ - Returns metric name - """ - raise NotImplementedError("function 'name' not implemented in {}.". - format(self.__class__.__name__)) - - def add_metric_op(self, *args): - """ - This API is advanced usage to accelerate metric calculating, calulations - from outputs of model to the states which should be updated by Metric can - be defined here, where Paddle OPs is also supported. Outputs of this API - will be the inputs of "Metric.update". 
- - If :code:`add_metric_op` is defined, it will be called with **outputs** - of model and **labels** from data as arguments, all outputs and labels - will be concatenated and flatten and each filed as a separate argument - as follows: - :code:`add_metric_op(output1, output2, ..., label1, label2,...)` - - If :code:`add_metric_op` is not defined, default behaviour is to pass - input to output, so output format will be: - :code:`return output1, output2, ..., label1, label2,...` - - see :code:`Metric.update` - """ - return args - - -class Accuracy(Metric): - """ - Encapsulates accuracy metric logic - """ - - def __init__(self, topk=(1, ), name=None, *args, **kwargs): - super(Accuracy, self).__init__(*args, **kwargs) - self.topk = topk - self.maxk = max(topk) - self._init_name(name) - self.reset() - - def add_metric_op(self, pred, label, *args): - pred = fluid.layers.argsort(pred, descending=True)[1][:, :self.maxk] - correct = pred == label - return fluid.layers.cast(correct, dtype='float32') - - def update(self, correct, *args): - accs = [] - for i, k in enumerate(self.topk): - num_corrects = correct[:, :k].sum() - num_samples = len(correct) - accs.append(float(num_corrects) / num_samples) - self.total[i] += num_corrects - self.count[i] += num_samples - return accs - - def reset(self): - self.total = [0.] * len(self.topk) - self.count = [0] * len(self.topk) - - def accumulate(self): - res = [] - for t, c in zip(self.total, self.count): - res.append(float(t) / c) - return res - - def _init_name(self, name): - name = name or 'acc' - if self.maxk != 1: - self._name = ['{}_top{}'.format(name, k) for k in self.topk] - else: - self._name = [name] - - def name(self): - return self._name diff --git a/hapi/model.py b/hapi/model.py deleted file mode 100644 index 9bb307a3f1de9819d3ac2c86f293254e61a14e4b..0000000000000000000000000000000000000000 --- a/hapi/model.py +++ /dev/null @@ -1,1658 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
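The deleted `Accuracy` metric splits the work between `add_metric_op`, which runs as Paddle ops on the raw model outputs and yields a per-sample 0/1 "correct" matrix for the requested top-k values, and the pure-Python `update`/`accumulate`/`reset` that aggregate those matrices across batches. A numpy-only sketch of the Python side (the `correct` matrix below is hand-written to stand in for the output of `add_metric_op`):

.. code-block:: python

    import numpy as np
    from hapi.metrics import Accuracy

    acc = Accuracy(topk=(1, 2))
    # Rows mimic add_metric_op output: column 0 flags a top-1 hit,
    # column 1 a hit at rank 2 (at most one flag per sample).
    correct = np.array([[1, 0], [0, 1], [0, 0], [1, 0]], dtype='float32')

    batch_acc = acc.update(correct)      # per-batch accuracies
    print(acc.name())                    # ['acc_top1', 'acc_top2']
    print(acc.accumulate())              # [0.5, 0.75] for this single batch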
- -from __future__ import absolute_import - -import inspect -import os -import pickle -import numpy as np -import six -import warnings - -from collections import Iterable -from paddle import fluid -from paddle.fluid.framework import in_dygraph_mode, Variable -from paddle.fluid.executor import global_scope -from paddle.fluid.io import is_belong_to_optimizer -from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.fluid.layers.utils import flatten -from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy -from paddle.fluid.incubate.fleet.base import role_maker -from paddle.io import DataLoader, Dataset - -from hapi.loss import Loss -from hapi.distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized -from hapi.metrics import Metric -from hapi.callbacks import config_callbacks -from hapi.utils import to_list, to_numpy, flatten_list, restore_flatten_list - -__all__ = [ - 'Model', - 'Input', - 'set_device', -] - - -def set_device(device): - """ - Args: - device (str): specify device type, 'cpu' or 'gpu'. - - Returns: - fluid.CUDAPlace or fluid.CPUPlace: Created GPU or CPU place. - """ - - assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \ - "Expected device in ['cpu', 'gpu'], but got {}".format(device) - - place = fluid.CUDAPlace(ParallelEnv().dev_id) \ - if device.lower() == 'gpu' and fluid.is_compiled_with_cuda() \ - else fluid.CPUPlace() - - return place - - -class Input(fluid.dygraph.Layer): - def __init__(self, shape=None, dtype=None, name=None): - super(Input, self).__init__() - self.shape = shape - self.dtype = dtype - self.name = name - - def forward(self): - return fluid.data(self.name, shape=self.shape, dtype=self.dtype) - - -class StaticGraphAdapter(object): - def __init__(self, model): - super(StaticGraphAdapter, self).__init__() - self.model = model - # with `_build_once` gone, parameters are now created in `__init__` - # so we need to keep track of the parameters already created - self._startup_prog = fluid.default_startup_program() - self._orig_prog = fluid.default_main_program() - - self._label_vars = {} # label variables - self._input_vars = {} # label variables - self._endpoints = {} - self._loss_endpoint = None - self._executor = None - self._progs = {} - self._compiled_progs = {} - - self._merge_count = { - 'eval_total': 0, - 'test_total': 0, - 'eval_batch': 0, - 'test_batch': 0 - } - - self._nranks = ParallelEnv().nranks - self._local_rank = ParallelEnv().local_rank - - @property - def mode(self): - return self.model.mode - - @mode.setter - def mode(self, value): - self.model.mode = value - - def train_batch(self, inputs, labels=None): - assert self.model._optimizer, \ - "model not ready, please call `model.prepare()` first" - self.mode = 'train' - return self._run(inputs, labels) - - def eval_batch(self, inputs, labels=None): - self.mode = 'eval' - return self._run(inputs, labels) - - def test_batch(self, inputs): - self.mode = 'test' - return self._run(inputs, None) - - def parameters(self, *args, **kwargs): - return super(Model, self.model).parameters(*args, **kwargs) - - def save(self, path): - def _save(state, path): - if not state: - return - state = { - k: to_numpy(v) if isinstance(v, Variable) else v - for k, v in state.items() - } - with open(path, 'wb') as f: - pickle.dump(state, f) - - base = os.path.basename(path) - assert base != "", "path should be of 'dirname/filename' format" - dir_name 
= os.path.dirname(path) - if dir_name and not os.path.exists(dir_name): - os.makedirs(dir_name) - param_path = path + ".pdparams" - _save(self.model.state_dict(), param_path) - prog = self._progs.get('train', None) - if prog is None or self.model._optimizer is None: - return - # XXX `optimizer.state_dict()` only work in dygraph mode - optim_path = path + ".pdopt" - optim = { - p.name: p - for p in filter(is_belong_to_optimizer, prog.list_vars()) - } - if not optim: - return - - _save(optim, optim_path) - - def load(self, param_state_pairs, optim_state): - if self._executor is None: - executor = fluid.Executor(fluid.CPUPlace())._default_executor - else: - executor = self._executor._default_executor - - # restore parameter states - fluid.core._create_loaded_parameter( - [param for param, state in param_state_pairs], - global_scope(), executor) - for param, state in param_state_pairs: - self._set_var(param, state) - - # restore optimizer states - # FIXME what if a different optimizer is used? - if not self.model._optimizer or not optim_state: - return - self._load_optimizer(optim_state, executor) - - def _load_optimizer(self, state, executor): - prog = self._progs.get('train', None) - optim = list(filter(is_belong_to_optimizer, prog.list_vars())) - if not optim: - return - - fluid.core._create_loaded_parameter(optim, global_scope(), executor) - - converted_state = dict(state) - for var in optim: - if var.name in ["@LR_DECAY_COUNTER@", "global_step"]: - # When using learning rate scheduler, dygraph would name the - # global step var as "global_step" to save, while static-graph - # would has a state var named as "@LR_DECAY_COUNTER@". - # NOTE: dygraph saved global_step is 1 larger than that in - # static-graph, since the time of global_step to increase is - # different. - state_val = ( - np.array(converted_state.pop("global_step")) - 1 - ) if "global_step" in converted_state else converted_state.pop( - "@LR_DECAY_COUNTER@", None) - if state_val is not None: - converted_state[var.name] = state_val - elif var.name.startswith("learning_rate_"): - # When using static learning rate, static-graph would make it - # a persistable var named 'unique_name.generate("learning_rate")', - # However, dygraph wouldn't save it. 
- if var.name not in state: - continue - else: - # moment and other accumulators - if var.name not in converted_state: - # try to convert from dygraph name - opt_name = self.model._optimizer._name - opt_cls_name = self.model._optimizer.__class__.__name__ - opt_unq_name = None - for name in self.model._optimizer._accumulators.keys(): - accum_name = name if opt_name is None else name[len( - opt_name) + 1:] - for param_name, state_var in self.model._optimizer._accumulators[ - name].items(): - if opt_unq_name is None: - # can not infer out the exact unique(opt_name), - # thus try to extract rather than generate - for state_key in sorted( - state.keys(), - key=lambda x: len(x), - reverse=True): - prefix = param_name + "_" + ( - opt_cls_name if opt_name is None else - opt_name) + "_" - if state_key.startswith(prefix): - prefix_offset = state_key[len( - prefix):].find("_") + len(prefix) - opt_unq_name = state_key[len( - param_name + "_"):prefix_offset] - # TODO: assert - # assert opt_unq_name is None - # gen(param.name + "_" + gen(opt_name) + "_" + accum_name) - # always end with "_0" since the unique optimizer._name - dy_state_name = (param_name + "_" + opt_unq_name + - "_" + accum_name + "_0") - converted_state[ - state_var.name] = converted_state.pop( - dy_state_name) - - assert var.name in converted_state, \ - "variable [{}] is not in optimizer state file".format(var.name) - self._set_var(var, converted_state[var.name]) - - def _set_var(self, var, ndarray): - t = global_scope().find_var(var.name).get_tensor() - p = t._place() - if p.is_cpu_place(): - place = fluid.CPUPlace() - elif p.is_cuda_pinned_place(): - place = fluid.CUDAPinnedPlace() - else: - p = fluid.core.Place() - p.set_place(t._place()) - place = fluid.CUDAPlace(p.gpu_device_id()) - - t.set(ndarray, place) - - def _run(self, inputs, labels=None): - compiled_prog = self._compiled_progs.get(self.mode, None) - assert compiled_prog, \ - "Model is not ready, please call `model.prepare()` first" - - inputs = to_list(inputs) - if labels is not None: - labels = to_list(labels) - assert len(inputs) == len(self._input_vars[self.mode]), \ - "number of inputs" \ - + " does not match number of arguments of `forward` method" - - feed = {} - input_names = [v.name for v in self._input_vars[self.mode]] - for idx, n in enumerate(input_names): - # train and test may take different arguments - if inputs[idx] is not None: - feed[n] = inputs[idx] - if labels is not None: - for idx, v in enumerate(self._label_vars[self.mode]): - feed[v.name] = labels[idx] - - endpoints = self._endpoints[self.mode] - if self.mode == 'test': - fetch_list = endpoints['output'] - else: - metric_list, metric_splits = flatten_list(endpoints['metric']) - fetch_list = endpoints['loss'] + metric_list - num_loss = len(endpoints['loss']) - - # if fetch Variable is same as input Variable, do not fetch - # from program, get it from input directly - pruned_fetch_list = [] - pruned_fetch_idx_name_map = [""] * len(fetch_list) - for i, fetch_var in enumerate(fetch_list): - if fetch_var.name in feed.keys(): - pruned_fetch_idx_name_map[i] = fetch_var.name - else: - pruned_fetch_list.append(fetch_var) - - rets = self._executor.run(compiled_prog, - feed=feed, - fetch_list=pruned_fetch_list, - return_numpy=False) - - # restore pruned fetch_list Variable from feeds - for i, name in enumerate(pruned_fetch_idx_name_map): - if len(name) > 0: - rets.insert(i, feed[name]) - - # LoDTensor cannot be fetch as numpy directly - rets = [np.array(v) for v in rets] - if self.mode == 'test': - return 
rets[:] - losses = rets[:num_loss] - metric_states = restore_flatten_list(rets[num_loss:], metric_splits) - metrics = [] - for metric, state in zip(self.model._metrics, metric_states): - # cut off padding size - if self.mode != 'train' and self.model._test_dataloader is not None \ - and isinstance(self.model._test_dataloader, DataLoader) \ - and self._nranks > 1: - total_size = len(self.model._test_dataloader.dataset) - # TODO: fixme if have better way to get batch size - samples = state[0].shape[0] - current_count = self._merge_count.get(self.mode + '_total', 0) - if current_count + samples >= total_size: - state = [ - s[:total_size - current_count, ...] for s in state - ] - self._merge_count[self.mode + '_total'] = 0 - self._merge_count[self.mode + - '_batch'] = total_size - current_count - else: - self._merge_count[self.mode + '_total'] += samples - self._merge_count[self.mode + '_batch'] = samples - - metrics.append(metric.update(*state)) - return (losses, metrics) if len(metrics) > 0 else losses - - def prepare(self): - modes = ['train', 'eval', 'test'] - for mode in modes: - self._make_program(mode) - self._compile_and_initialize(self._progs[mode], mode) - - def _make_program(self, mode): - prog = self._progs.get(mode, None) - if prog is not None: - return - - prog = self._orig_prog.clone() - # NOTE: When defining learning rate scheduling in static-graph, ops to - # increase the global step var and calculate learning rate would be - # prepended into _orig_prog. test program maked by `_orig_prog.clone` - # also would include these ops. Thus must prune these ops in test - # program, otherwise the global step would be changed in test. - if mode != 'train': - for op in list(prog.global_block().ops): - prog.global_block()._remove_op(0) - if mode == 'train' and self.model._optimizer \ - and self.model._optimizer._learning_rate_map: - # HACK workaround learning rate map issue - lr_var = self.model._optimizer._learning_rate_map[self._orig_prog] - new_lr_var = prog.global_block().vars[lr_var.name] - self.model._optimizer._learning_rate_map[prog] = new_lr_var - - losses = [] - metrics = [] - with fluid.program_guard(prog, self._startup_prog): - ins = self.model._inputs - lbls = self.model._labels if self.model._labels else [] - inputs = [k.forward() for k in to_list(ins)] - labels = [k.forward() for k in to_list(lbls)] - self._label_vars[mode] = labels - outputs = to_list(self.model.forward(*inputs)) - - if mode != 'test' and self.model._loss_function: - losses = self.model._loss_function(outputs, labels) - - if self._nranks > 1 and mode != 'train': - outputs = [_all_gather(o, self._nranks) for o in outputs] - if mode != 'test': - labels = [_all_gather(l, self._nranks) for l in labels] - - if mode != 'test': - for metric in self.model._metrics: - metrics.append( - to_list(metric.add_metric_op(*(outputs + labels)))) - - if mode == 'train' and self.model._optimizer: - self._loss_endpoint = fluid.layers.sum(losses) - if self._nranks > 1: - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - dist_strategy = DistributedStrategy() - dist_strategy.mode = "collective" - dist_strategy.collective_mode = "grad_allreduce" - self.model._optimizer = fleet.distributed_optimizer( - self.model._optimizer, strategy=dist_strategy) - - self.model._optimizer.minimize(self._loss_endpoint) - - if mode != 'train': # clone again to put it in test mode - prog = prog.clone(for_test=True) - - self._input_vars[mode] = inputs - - self._progs[mode] = prog - self._endpoints[mode] = { - "output": 
outputs, - "loss": losses, - "metric": metrics - } - - def _compile_and_initialize(self, prog, mode): - compiled_prog = self._compiled_progs.get(mode, None) - if compiled_prog is not None: - return compiled_prog - - assert self.model._place is not None, \ - "device is not set, please call `model.prepare()` first" - - place = self.model._place - - # XXX *ALL WEIGHTS* should be initialized upon model construction - # even if `forward()` may run different code path for different mode - # therefore startup program only needs to run once - if self._executor is None: - self._executor = fluid.Executor(place) - # XXX incremental initialization - uninitialized = [] - for var_py in self._startup_prog.list_vars(): - var = fluid.global_scope().find_var(var_py.name) - if not var_py.name.startswith('nccl_id') and var and \ - var.get_tensor()._is_initialized(): - continue - - uninitialized.append(var_py) - if uninitialized: - startup_prog = self._startup_prog._prune(uninitialized) - self._executor.run(startup_prog) - - if self._nranks < 2: - compiled_prog = fluid.CompiledProgram(prog) - else: - compiled_prog = prog - - self._compiled_progs[mode] = compiled_prog - - -class DynamicGraphAdapter(object): - def __init__(self, model): - super(DynamicGraphAdapter, self).__init__() - self.model = model - self._nranks = ParallelEnv().nranks - self._local_rank = ParallelEnv().local_rank - self._merge_count = { - 'eval_total': 0, - 'test_total': 0, - 'eval_batch': 0, - 'test_batch': 0 - } - - if self._nranks > 1: - stradegy = fluid.dygraph.parallel.ParallelStrategy() - stradegy.nranks = ParallelEnv().nranks - stradegy.local_rank = ParallelEnv().local_rank - stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints - stradegy.current_endpoint = ParallelEnv().current_endpoint - self.ddp_model = fluid.dygraph.parallel.DataParallel(self.model, - stradegy) - - @property - def mode(self): - return self.model.mode - - @mode.setter - def mode(self, value): - self.model.mode = value - - # TODO multi device in dygraph mode not implemented at present time - def train_batch(self, inputs, labels=None): - assert self.model._optimizer, \ - "model not ready, please call `model.prepare()` first" - super(Model, self.model).train() - self.mode = 'train' - inputs = to_list(inputs) - if labels is not None: - labels = [to_variable(l) for l in to_list(labels)] - if self._nranks > 1: - outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs]) - losses = self.model._loss_function(outputs, labels) - final_loss = fluid.layers.sum(losses) - final_loss = self.ddp_model.scale_loss(final_loss) - final_loss.backward() - self.ddp_model.apply_collective_grads() - else: - outputs = self.model.forward(*[to_variable(x) for x in inputs]) - losses = self.model._loss_function(outputs, labels) - final_loss = fluid.layers.sum(losses) - final_loss.backward() - - self.model._optimizer.minimize(final_loss) - self.model.clear_gradients() - metrics = [] - for metric in self.model._metrics: - metric_outs = metric.add_metric_op(*( - to_list(outputs) + to_list(labels))) - m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) - metrics.append(m) - - return ([to_numpy(l) for l in losses], metrics) \ - if len(metrics) > 0 else [to_numpy(l) for l in losses] - - def eval_batch(self, inputs, labels=None): - super(Model, self.model).eval() - self.mode = 'eval' - inputs = to_list(inputs) - if labels is not None: - labels = [to_variable(l) for l in to_list(labels)] - outputs = self.model.forward(*[to_variable(x) for x in inputs]) - if 
self.model._loss_function: - losses = self.model._loss_function(outputs, labels) - else: - losses = [] - if self._nranks > 1: - outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)] - labels = [_all_gather(l, self._nranks) for l in labels] - metrics = [] - for metric in self.model._metrics: - # cut off padding value. - if self.model._test_dataloader is not None and self._nranks > 1 \ - and isinstance(self.model._test_dataloader, DataLoader): - total_size = len(self.model._test_dataloader.dataset) - samples = outputs[0].shape[0] - current_count = self._merge_count.get(self.mode + '_total', 0) - if current_count + samples >= total_size: - outputs = [o[:total_size - current_count] for o in outputs] - labels = [l[:total_size - current_count] for l in labels] - self._merge_count[self.mode + '_total'] = 0 - self._merge_count[self.mode + - '_batch'] = total_size - current_count - else: - self._merge_count[self.mode + '_total'] += samples - self._merge_count[self.mode + '_batch'] = samples - - metric_outs = metric.add_metric_op(*( - to_list(outputs) + to_list(labels))) - m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) - metrics.append(m) - - # To be consistent with static graph - # return empty loss if loss_function is None - return ([to_numpy(l) for l in losses], metrics) \ - if len(metrics) > 0 else [to_numpy(l) for l in losses] - - def test_batch(self, inputs): - super(Model, self.model).eval() - self.mode = 'test' - inputs = [to_variable(x) for x in to_list(inputs)] - outputs = self.model.forward(*inputs) - if self._nranks > 1 and isinstance(self.model._place, fluid.CUDAPlace): - outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)] - - return [to_numpy(o) for o in to_list(outputs)] - - def parameters(self, *args, **kwargs): - return super(Model, self.model).parameters(*args, **kwargs) - - def save(self, path): - params = self.model.state_dict() - fluid.save_dygraph(params, path) - if self.model._optimizer is None: - return - if self.model._optimizer.state_dict(): - optim = self.model._optimizer.state_dict() - fluid.save_dygraph(optim, path) - - def load(self, param_state_pairs, optim_state): - # restore parameter states - for param, state in param_state_pairs: - param.set_value(state) - - # resotre optimizer states - if not self.model._optimizer or not optim_state: - return - - # If optimizer performs set_dict when state vars haven't been created, - # which would happen when set_dict before minimize, the state would be - # stored in optimizer._accumulators_holder and loaded lazily. - # To contrive this when loading from static-graph saved states, extend - # state dict to include keys named accoring to dygraph naming rules. - # TODO: if len(self.model._optimizer._accumulators) > 0 - converted_state = dict(optim_state) - opt_unq_name = self.model._optimizer._name - opt_cls_name = self.model._optimizer.__class__.__name__ - opt_name = opt_unq_name[:opt_unq_name.rfind("_")] # remove suffix idx - param_names = [param.name for param in self.model.parameters()] - for var_name, state_var in sorted( - optim_state.items(), key=lambda x: len(x[0]), reverse=True): - if var_name in ["@LR_DECAY_COUNTER@", "global_step"]: - # NOTE: dygraph saved global_step is 1 larger than that in - # static-graph, since the time of global_step to increase is - # different. 
- if var_name == "@LR_DECAY_COUNTER@": - converted_state["global_step"] = np.array( - converted_state.pop("@LR_DECAY_COUNTER@")) + 1 - else: - # moment and other accumulators - # extend state dict to include promising dygraph names - for param_name in param_names: - if var_name.startswith(param_name + "_" + opt_name): - # when init optimizer with name - accum_name = var_name[len(param_name + "_" + opt_name + - "_"):] - elif var_name.startswith(param_name + - "_") and opt_name == opt_cls_name: - # when init optimizer without name - accum_name = var_name[len(param_name + "_"):] - else: - continue - # remove suffix idx - accum_name = accum_name[:accum_name.rfind("_")] - # state names always end with "_0" in dygraph because of the - # unique optimizer._name - dy_state_name = (param_name + "_" + opt_unq_name + "_" + - accum_name + "_0") - converted_state[dy_state_name] = state_var - - self.model._optimizer.set_dict(converted_state) - - -class Model(fluid.dygraph.Layer): - """ - An Model object is network with training and inference features. - Dynamic graph and static graph are supported at the same time, - switched by `fluid.enable_dygraph()`. The usage is as follows. - But note, the switching between dynamic and static should be before - instantiating a Model. The input description, i.e, hapi.Input, - must be required for static graph. - - Usage: - .. code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - #import paddle.incubate.hapi as hapi - from hapi import Model, Input, set_device - from hapi.loss import CrossEntropy - from hapi.dataset import MNIST - - class MyModel(Model): - def __init__(self): - super(MyModel, self).__init__() - self._fc = fluid.dygraph.Linear(784, 10, act='softmax') - def forward(self, x): - y = self._fc(x) - return y - device = set_device('gpu') - # if use static graph, do not set - fluid.enable_dygraph(device) - model = MyModel() - optim = fluid.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) - - inputs = [Input([None, 784], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] - - mnist_data = MNIST(mode='train') - model.prepare(optim, - CrossEntropy(average=True), - hapi.metrics.Accuracy(), - inputs, - labels, - device=device) - model.fit(mnist_data, epochs=2, batch_size=32, verbose=1) - """ - - def __init__(self): - super(Model, self).__init__(self.__class__.__name__) - self.mode = 'train' - self._inputs = None - self._labels = None - self._loss_function = None - self._loss_weights = None - self._optimizer = None - self._device = None - self._optimizer = None - self._test_dataloader = None - - # init backend - if fluid.in_dygraph_mode(): - self._adapter = DynamicGraphAdapter(self) - else: - self._adapter = StaticGraphAdapter(self) - - def train_batch(self, inputs, labels=None): - """ - Run one training step on a batch of data. - - Args: - inputs (list): A list of numpy.ndarray, each is a batch of - input data. - labels (list): A list of numpy.ndarray, each is a batch of - input label. If has no labels, set None. Default is None. - - Returns: - A list of scalar training loss if the model has no metrics, - or a tuple (list of scalar loss, list of metrics) if the model - set metrics. - - Examples: - - .. 
code-block:: python - - import numpy as np - import paddle.fluid as fluid - from hapi import Model, Input, set_device - - class MyModel(Model): - def __init__(self): - super(MyModel, self).__init__() - self._fc = Linear(784, 1, act='softmax') - def forward(self, x): - y = self._fc(x) - return y - - device = hapi.set_device('gpu') - fluid.enable_dygraph(device) - - model = MyModel() - optim = fluid.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) - - inputs = [Input([None, 784], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] - model.prepare(optim, - CrossEntropy(average=True), - inputs=inputs, - labels=labels, - device=device) - data = np.random.random(size=(4,784)).astype(np.float32) - label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64) - loss = model.train_batch([data], [label]) - print(loss) - """ - return self._adapter.train_batch(inputs, labels) - - def eval_batch(self, inputs, labels=None): - """ - Run one evaluating step on a batch of data. - - Args: - inputs (list): A list of numpy.ndarray, each is a batch of - input data. - labels (list): A list of numpy.ndarray, each is a batch of - input label. If has no labels, set None. Default is None. - - Returns: - A list of scalar testing loss if the model has no metrics, - or a tuple (list of scalar loss, list of metrics) if the model - set metrics. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle.fluid as fluid - from hapi import Model, Input, set_device - - class MyModel(Model): - def __init__(self): - super(MyModel, self).__init__() - self._fc = fluid.dygraph.Linear(784, 1, act='softmax') - def forward(self, x): - y = self._fc(x) - return y - - device = set_device('gpu') - fluid.enable_dygraph(device) - - model = MyModel() - optim = fluid.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) - - inputs = [Input([None, 784], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] - model.prepare(optim, - CrossEntropy(average=True), - inputs=inputs, - labels=labels, - device=device) - data = np.random.random(size=(4,784)).astype(np.float32) - label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64) - loss = model.eval_batch([data], [label]) - print(loss) - """ - return self._adapter.eval_batch(inputs, labels) - - def test_batch(self, inputs): - """ - Run one testing step on a batch of data. - - Args: - inputs (list): A list of numpy.ndarray, each is a batch of - input data. - - Returns: - A list of numpy.ndarray of predictions, that is the outputs - of Model forward. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle.fluid as fluid - from hapi import Model, Input, set_device - - class MyModel(Model): - def __init__(self): - super(MyModel, self).__init__() - self._fc = fluid.dygraph.Linear(784, 1, act='softmax') - def forward(self, x): - y = self._fc(x) - return y - - device = set_device('gpu') - fluid.enable_dygraph(device) - - model = MyModel() - inputs = [Input([None, 784], 'float32', name='x')] - model.prepare(inputs=inputs, - device=device) - data = np.random.random(size=(4,784)).astype(np.float32) - out = model.eval_batch([data]) - print(out) - """ - return self._adapter.test_batch(inputs) - - def save(self, path): - """ - This function saves parameters, optimizer infomation to path. - - The parameters contains all the trainable Variable, will save to - a file with suffix ".pdparams". - The optimizer information contains all the variable used by optimizer. 
- For Adam optimizer, contains beta1, beta2, momentum etc. All the - information will save to a file with suffix ".pdopt". (If the optimizer - have no variable need to save (like SGD), the fill will not generated). - - This function will silently overwrite existing file - at the target location. - - Args: - path (str): The file prefix to save model. The format is - 'dirname/file_prefix' or 'file_prefix'. if empty str. A exception - will be raised. - - Returns: - None - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - from hapi import Model, set_device - - class MyModel(Model): - def __init__(self): - super(MyModel, self).__init__() - self._fc = fluid.dygraph.Linear(784, 1, act='softmax') - def forward(self, x): - y = self._fc(x) - return y - - device = set_device('cpu') - fluid.enable_dygraph(device) - model = MyModel() - model.save('checkpoint/test') - """ - if ParallelEnv().local_rank == 0: - self._adapter.save(path) - - def load(self, path, skip_mismatch=False, reset_optimizer=False): - """ - Load from files storing the model states and optimizer states. The file - for optimizer states is not necessary if no need to restore the optimizer. - - NOTE: parameters are retrieved out from the file storing model states - accoring to their structured names. - - For fine-tuning or transfer-learning models where some of the layers have - changed, keep parameters needed to restore have same structured names in - the pre-trained model and fine-tuning model. - - Args: - path (str): The prefix of files storing the model states and - optimizer states. The files would be `path.pdparams` and - `path.pdopt` separately, and the latter is not necessary - when no need to restore. - skip_mismatch (bool): Whether to skip the loading of mismatch - parameter or raise an error when mismatch happens (not found - the parameter in file storing model states of or receives a - mismatch shape). - reset_optimizer (bool): If True, ignore the providing file storing - optimizer states and initialize optimizer states from scratch. - Otherwise, restore optimizer states from `path.pdopt` if - a optimizer has been set to the model. Default False. - - Returns: - None - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - from hapi import Model, set_device - - class MyModel(Model): - def __init__(self): - super(MyModel, self).__init__() - self._fc = fluid.dygraph.Linear(784, 1, act='softmax') - def forward(self, x): - y = self._fc(x) - return y - - device = set_device('cpu') - fluid.enable_dygraph(device) - model = MyModel() - model.load('checkpoint/test') - """ - - def _load_state_from_path(path): - if not os.path.exists(path): - return - with open(path, 'rb') as f: - return pickle.load(f) if six.PY2 else pickle.load( - f, encoding='latin1') - - def _check_match(key, param): - state = param_state.get(key, None) - if state is None: - raise ValueError( - "{} is not found in the providing file.".format(key)) - if list(state.shape) != list(param.shape): - raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". - format(key, list(state.shape), list(param.shape))) - return param, state - - def _strip_postfix(path): - path, ext = os.path.splitext(path) - assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ - "Unknown postfix {} from weights".format(ext) - return path - - path = _strip_postfix(path) - param_state = _load_state_from_path(path + ".pdparams") - assert param_state, "Failed to load parameters, please check path." 
- - matched_param_state = [] - for key, param in self.state_dict().items(): - try: - match_res = _check_match(key, param) - except ValueError as err: - if skip_mismatch: - warnings.warn( - ("Skip loading for {}. ".format(key) + str(err))) - # reset optimizer when mismatch happens - reset_optimizer = True - else: - raise err - matched_param_state.append(match_res) - - optim_state = None if reset_optimizer else _load_state_from_path( - path + ".pdopt") - return self._adapter.load(matched_param_state, optim_state) - - def parameters(self, *args, **kwargs): - """ - Returns a list of parameters of the model. - - Returns: - A list of Parameter in static graph. - A list of ParamBase in dynamic graph. - - Examples: - - .. code-block:: python - - from hapi.model import Model, Input, set_device - class MyModel(Model): - def __init__(self): - super(MyModel, self).__init__() - self._fc = fluid.dygraph.Linear(20, 10, act='softmax') - def forward(self, x): - y = self._fc(x) - return y - - fluid.enable_dygraph() - model = MyModel() - params = model.parameters() - """ - return self._adapter.parameters() - - def prepare(self, - optimizer=None, - loss_function=None, - metrics=None, - inputs=None, - labels=None, - device=None): - """ - Configures the model before runing. - - Args: - optimizer (Optimizer|None): Optimizer must be set in training - and should be a Optimizer instance. It can be None in eval - and test mode. - loss_function (Loss|None): Loss function must be set in training - and should be a Loss instance. It can be None when there is - no loss. - metrics (Metric|list of Metric|None): If metrics is set, all - metrics will be calculated and output in train/eval mode. - inputs (Input|list|dict|None): `inputs`, entry points of network, - could be a Input layer, or lits of Input layers, - or dict (name: Input), or None. For static graph, - inputs must be set. For dynamic graph, it could be None. - labels (Input|list|None): `labels`, entry points of network, - could be a Input layer or lits of Input layers, or None. - For static graph, if labels is required in loss_function, - labels must be set. Otherwise, it could be None. - device (str|fluid.CUDAPlace|fluid.CPUPlace|None): Specify device - type, 'CPU', 'GPU', fluid.CUDAPlace or fluid.CPUPlace. - If None, automatically select device according to - installation package version. 
- - Returns: - None - """ - - if isinstance(device, fluid.CUDAPlace) or \ - (isinstance(device, six.string_types) and device.lower() == 'gpu') \ - or (device is None and fluid.is_compiled_with_cuda()): - if isinstance(device, fluid.CUDAPlace): - self._place = device - else: - self._place = fluid.CUDAPlace(ParallelEnv().dev_id) \ - if ParallelEnv().nranks > 1 else fluid.CUDAPlace(0) - - global _parallel_context_initialized - if ParallelEnv().nranks > 1 and not _parallel_context_initialized: - if fluid.in_dygraph_mode(): - main_prog_seed = fluid.default_main_program().random_seed - startup_prog_seed = fluid.default_startup_program( - ).random_seed - fluid.disable_dygraph() - fluid.enable_dygraph(self._place) - # enable_dygraph would create and switch to a new program, - # thus also copy seed to the new program - fluid.default_main_program().random_seed = main_prog_seed - fluid.default_startup_program( - ).random_seed = startup_prog_seed - fluid.dygraph.parallel.prepare_context() - else: - prepare_distributed_context(self._place) - - _parallel_context_initialized = True - elif isinstance(device, fluid.CPUPlace): - self._place = device - elif (isinstance(device, six.string_types) and device.lower() == 'cpu') \ - or (device is None): - self._place = fluid.CPUPlace() - else: - raise ValueError( - "Expected device in ('gpu', 'cpu', fluid.CUDAPlace, fluid.CPUPlace, None), \ - but got {}".format(device)) - - self._optimizer = optimizer - if loss_function: - if not isinstance(loss_function, Loss): - raise TypeError( - "'loss_function' must be sub classes of 'Loss'") - self._loss_function = loss_function - if not in_dygraph_mode(): - if not isinstance(inputs, (list, dict, Input)): - raise TypeError( - "'inputs' must be list or dict in static graph mode") - - metrics = metrics or [] - for metric in to_list(metrics): - assert isinstance(metric, Metric), \ - "{} is not sub class of Metric".format( - metric.__class__.__name__) - self._metrics = to_list(metrics) - - self._inputs = to_list(inputs) if not isinstance(inputs, dict) else [ - inputs[n] for n in extract_args(self.forward) if n != 'self' - ] - self._labels = to_list(labels) - - if not in_dygraph_mode(): - self._adapter.prepare() - - def fit( - self, - train_data=None, - eval_data=None, - batch_size=1, - epochs=1, - eval_freq=1, - log_freq=10, - save_dir=None, - save_freq=1, - verbose=2, - drop_last=False, - shuffle=True, - num_workers=0, - callbacks=None, ): - """ - Trains the model for a fixed number of epochs. If `eval_data` is set, - evaluation will be done at the end of each epoch. - - Args: - train_data (Dataset|DataLoader): An iterable data loader is used for - train. An instance of paddle paddle.io.Dataset or - paddle.io.Dataloader is recomended. Default: None. - eval_data (Dataset|DataLoader): An iterable data loader is used for - evaluation at the end of epoch. If None, will not do evaluation. - An instance of paddle.io.Dataset or paddle.io.Dataloader - is recomended. Default: None. - batch_size (int): Integer number. The batch size of train_data - and eval_data. When train_data and eval_data are both the - instance of Dataloader, this parameter will be ignored. - Default: 1. - epochs (int): Integer number. The number of epochs to train - the model. Default: 1. - eval_freq (int): The frequency, in number of epochs, an evalutation - is performed. Default: 1. - log_freq (int): The frequency, in number of steps, the training logs - are printed. Default: 10. - save_dir(str|None): The directory to save checkpoint during training. 
- If None, will not save checkpoint. Default: None. - save_freq (int): The frequency, in number of epochs, to save - checkpoint. Default: 1. - verbose (int): The verbosity mode, should be 0, 1, or 2. 0 = silent, - 1 = progress bar, 2 = one line per epoch. Default: 2. - drop_last (bool): Whether drop the last incomplete batch of - train_data when dataset size is not divisible by the batch size. - When train_data is an instance of Dataloader, this parameter - will be ignored. Default: False. - shuffle (bool): Whther to shuffle train_data. When train_data is - an instance of Dataloader, this parameter will be ignored. - Default: True. - num_workers (int): The number of subprocess to load data, 0 for no - subprocess used and loading data in main process. - When train_data and eval_data are both the instance of - Dataloader, this parameter will be ignored. Default: 0. - callbacks (Callback|None): A list of `Callback` instances to apply - during training. If None, `ProgBarLogger` and `ModelCheckpoint` - are automatically inserted. Default: None. - - Returns: - None - - Examples: - 1. An example use Dataset and set btch size, shuffle in fit. - How to make a batch is done internally. - - .. code-block:: python - - from hapi.model import Model, Input, set_device - from hapi.loss import CrossEntropy - from hapi.metrics import Accuracy - from hapi.datasets import MNIST - from hapi.vision.models import LeNet - - dynamic = True - device = set_device(FLAGS.device) - fluid.enable_dygraph(device) if dynamic else None - - train_dataset = MNIST(mode='train') - val_dataset = MNIST(mode='test') - - inputs = [Input([None, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - model = LeNet() - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters()) - model.prepare( - optim, - CrossEntropy(), - Accuracy(topk=(1, 2)), - inputs=inputs, - labels=labels, - device=device) - model.fit(train_dataset, - val_dataset, - epochs=2, - batch_size=64, - save_dir='mnist_checkpoint') - - 2. An example use DataLoader, batch size and shuffle is set in - DataLoader. - - .. code-block:: python - - from hapi.model import Model, Input, set_device - from hapi.loss import CrossEntropy - from hapi.metrics import Accuracy - from hapi.datasets import MNIST - from hapi.vision.models import LeNet - - dynamic = True - device = set_device(FLAGS.device) - fluid.enable_dygraph(device) if dynamic else None - - train_dataset = MNIST(mode='train') - train_loader = fluid.io.DataLoader(train_dataset, - places=device, batch_size=64) - val_dataset = MNIST(mode='test') - val_loader = fluid.io.DataLoader(val_dataset, - places=device, batch_size=64) - - inputs = [Input([None, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - model = LeNet() - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters()) - model.prepare( - optim, - CrossEntropy(), - Accuracy(topk=(1, 2)), - inputs=inputs, - labels=labels, - device=device) - model.fit(train_loader, - val_loader, - epochs=2, - save_dir='mnist_checkpoint') - """ - - assert train_data is not None, \ - "train_data must be given!" 
- - if isinstance(train_data, Dataset): - train_sampler = DistributedBatchSampler( - train_data, - batch_size=batch_size, - shuffle=shuffle, - drop_last=drop_last) - train_loader = DataLoader( - train_data, - batch_sampler=train_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) - else: - train_loader = train_data - - if eval_data is not None and isinstance(eval_data, Dataset): - eval_sampler = DistributedBatchSampler( - eval_data, batch_size=batch_size) - eval_loader = DataLoader( - eval_data, - batch_sampler=eval_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) - elif eval_data is not None: - eval_loader = eval_data - else: - eval_loader = None - - do_eval = eval_loader is not None - self._test_dataloader = eval_loader - - steps = self._len_data_loader(train_loader) - cbks = config_callbacks( - callbacks, - model=self, - epochs=epochs, - steps=steps, - log_freq=log_freq, - save_freq=save_freq, - save_dir=save_dir, - verbose=verbose, - metrics=self._metrics_name(), ) - - cbks.on_begin('train') - for epoch in range(epochs): - - cbks.on_epoch_begin(epoch) - logs = self._run_one_epoch(train_loader, cbks, 'train') - cbks.on_epoch_end(epoch, logs) - - if do_eval and epoch % eval_freq == 0: - - eval_steps = self._len_data_loader(eval_loader) - cbks.on_begin('eval', { - 'steps': eval_steps, - 'metrics_name': self._metrics_name() - }) - - logs = self._run_one_epoch(eval_loader, cbks, 'eval') - - cbks.on_end('eval', logs) - - cbks.on_end('train', logs) - self._test_dataloader = None - - def evaluate( - self, - eval_data, - batch_size=1, - log_freq=10, - verbose=2, - num_workers=0, - callbacks=None, ): - """ - Evaluate the loss and metrics of the model on input dataset. - - Args: - eval_data (Dataset|DataLoader): An iterable data loader is used for - evaluation. An instance of paddle.io.Dataset or - paddle.io.Dataloader is recomended. - batch_size (int): Integer number. The batch size of train_data - and eval_data. When eval_data is the instance of Dataloader, - this argument will be ignored. Default: 1. - log_freq (int): The frequency, in number of steps, the eval logs - are printed. Default: 10. - verbose (int): The verbosity mode, should be 0, 1, or 2. 0 = silent, - 1 = progress bar, 2 = one line per epoch. Default: 2. - num_workers (int): The number of subprocess to load data, - 0 for no subprocess used and loading data in main process. When - train_data and eval_data are both the instance of Dataloader, - this parameter will be ignored. Default: 0. - callbacks (Callback|None): A list of `Callback` instances to apply - during training. If None, `ProgBarLogger` and `ModelCheckpoint` - are automatically inserted. Default: None. - Returns: - dict: Result of metric. The key is the names of Metric, - value is a scalar or numpy.array. - - Examples: - .. 
code-block:: python - - # declarative mode - import numpy as np - from hapi.metrics import Accuracy - from hapi.datasets import MNIST - from hapi.vision.transforms import Compose, Resize - from hapi.vision.models import LeNet - from hapi.model import Input, set_device - - - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - val_dataset = MNIST(mode='test') - - model = LeNet() - model.prepare(metrics=Accuracy(), inputs=inputs, labels=labels) - - result = model.evaluate(val_dataset, batch_size=64) - print(result) - - # imperative mode - import paddle.fluid.dygraph as dg - place = set_device('cpu') - with dg.guard(place) as g: - model = LeNet() - model.prepare(metrics=Accuracy(), inputs=inputs, labels=labels) - - result = model.evaluate(val_dataset, batch_size=64) - print(result) - - """ - - if eval_data is not None and isinstance(eval_data, Dataset): - eval_sampler = DistributedBatchSampler( - eval_data, batch_size=batch_size) - eval_loader = DataLoader( - eval_data, - batch_sampler=eval_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) - else: - eval_loader = eval_data - - self._test_dataloader = eval_loader - - cbks = config_callbacks( - callbacks, - model=self, - log_freq=log_freq, - verbose=verbose, - metrics=self._metrics_name(), ) - - eval_steps = self._len_data_loader(eval_loader) - cbks.on_begin('eval', { - 'steps': eval_steps, - 'metrics_name': self._metrics_name() - }) - - logs = self._run_one_epoch(eval_loader, cbks, 'eval') - - cbks.on_end('eval', logs) - - self._test_dataloader = None - - eval_result = {} - for k in self._metrics_name(): - eval_result[k] = logs[k] - - return eval_result - - def predict(self, - test_data, - batch_size=1, - num_workers=0, - stack_outputs=False, - callbacks=None): - """ - Compute the output predictions on testing data. - - Args: - test_data (Dataset|DataLoader): An iterable data loader is used for - prediction. An instance of paddle.io.Dataset or paddle.io.DataLoader - is recommended. - batch_size (int): Integer number. The batch size of test_data. - When test_data is an instance of DataLoader, this - argument will be ignored. Default: 1. - num_workers (int): The number of subprocesses used to load data, 0 for no subprocess - used and loading data in the main process. When test_data is an - instance of DataLoader, this argument will be ignored. Default: 0. - stack_outputs (bool): Whether to stack each output field across batches. If an - output field of a single sample has shape [X, Y] and test_data contains N - samples, the corresponding field of the predict output will have shape - [N, X, Y] when stack_outputs is True, and will be a list of length N with - items of shape [X, Y] when stack_outputs is False. stack_outputs as False - is intended for LoDTensor outputs; setting it to True is recommended when - the outputs contain no LoDTensor. Default: False. - Returns: - list: output of the model. - - Examples: - ..
code-block:: python - - # declarative mode - import numpy as np - from hapi.metrics import Accuracy - from hapi.datasets import MNIST - from hapi.vision.transforms import Compose, Resize - from hapi.vision.models import LeNet - from hapi.model import Input, set_device - - class MnistDataset(MNIST): - def __init__(self, mode, return_label=True): - super(MnistDataset, self).__init__(mode=mode) - self.return_label = return_label - - def __getitem__(self, idx): - img = np.reshape(self.images[idx], [1, 28, 28]) - if self.return_label: - return img, np.array(self.labels[idx]).astype('int64') - return img, - - def __len__(self): - return len(self.images) - - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - - test_dataset = MnistDataset(mode='test', return_label=False) - - model = LeNet() - model.prepare(inputs=inputs) - - result = model.predict(test_dataset, batch_size=64) - print(result) - - # imperative mode - import paddle.fluid.dygraph as dg - place = set_device('cpu') - with dg.guard(place) as g: - model = LeNet() - model.prepare(inputs=inputs) - - result = model.predict(test_dataset, batch_size=64) - print(result) - """ - - if test_data is not None and isinstance(test_data, Dataset): - test_sampler = DistributedBatchSampler( - test_data, batch_size=batch_size) - test_loader = DataLoader( - test_data, - batch_sampler=test_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) - else: - test_loader = test_data - - self._test_dataloader = test_loader - - cbks = config_callbacks(callbacks, model=self, verbose=1) - - test_steps = self._len_data_loader(test_loader) - logs = {'steps': test_steps} - - cbks.on_begin('test', logs) - - outputs = [] - - logs, outputs = self._run_one_epoch(test_loader, cbks, 'test') - - outputs = list(zip(*outputs)) - - # NOTE: for LoDTensor output, we should not stack outputs, - # since stacking may lose its detail info - if stack_outputs: - outputs = [np.vstack(outs) for outs in outputs] - - self._test_dataloader = None - - cbks.on_end('test', logs) - return outputs - - def save_inference_model(self, - save_dir, - model_filename=None, - params_filename=None, - model_only=False): - """ - Save an inference model. This must be called in static graph mode. - - Args: - save_dir(str): The directory path to save the inference model. - model_filename(str|None): The name of the file to save the inference - model itself. If it is set to None, a default filename - :code:`__model__` will be used. - params_filename(str|None): The name of the file to save all related - parameters. If it is set to None, parameters will be saved - in separate files. - model_only(bool): If True, only the inference model will be saved, - without its parameters. Default: False. - - Returns: - list: The name list of the fetch variables. - """ - assert not fluid.in_dygraph_mode( - ), 'Saving the inference model must be done in static mode!'
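The `save_inference_model` docstring above gives no usage example, so here is a minimal sketch of exporting a prepared model in static graph mode and reloading it with `fluid.io.load_inference_model`. The `resnet18` network, the `'inference_dir'` output directory and the random input batch are assumptions made purely for illustration.

.. code-block:: python

    import numpy as np
    from paddle import fluid

    from hapi.model import Input
    from hapi.vision.models import resnet18

    # stay in static graph mode: do not call fluid.enable_dygraph()
    model = resnet18()
    inputs = [Input([None, 3, 224, 224], 'float32', name='image')]
    model.prepare(inputs=inputs)

    # export the inference program and its parameters
    model.save_inference_model('inference_dir')

    # reload with the plain fluid inference API and run one batch
    exe = fluid.Executor(fluid.CPUPlace())
    program, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname='inference_dir', executor=exe)
    img = np.random.random((1, 3, 224, 224)).astype('float32')
    results = exe.run(program,
                      feed={feed_names[0]: img},
                      fetch_list=fetch_targets)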
- - prog = self._adapter._progs.get('test', None) - assert prog, \ - "Model is not ready, please call `model.prepare()` first" - - infer_prog = prog.clone(for_test=True) - - input_names = [v.name for v in self._adapter._input_vars['test']] - endpoints = self._adapter._endpoints['test']['output'] - - return fluid.io.save_inference_model( - save_dir, - input_names, - endpoints, - self._adapter._executor, - main_program=infer_prog, - model_filename=model_filename, - params_filename=params_filename, - program_only=model_only) - - def _run_one_epoch(self, data_loader, callbacks, mode, logs={}): - outputs = [] - for step, data in enumerate(data_loader): - # data might come from different types of data_loader and have - # different formats, as follows: - # 1. DataLoader in static graph: - # [[input1, input2, ..., label1, label2, ...]] - # 2. DataLoader in dygraph: - # [input1, input2, ..., label1, label2, ...] - # 3. customized iterator yielding concatenated inputs and labels: - # [input1, input2, ..., label1, label2, ...] - # 4. customized iterator yielding separated inputs and labels: - # ([input1, input2, ...], [label1, label2, ...]) - # To handle all of these, flatten the (nested) list to a flat list. - data = flatten(data) - # LoDTensor.shape is callable, where LoDTensor comes from - # DataLoader in static graph - batch_size = data[0].shape()[0] if callable(data[ - 0].shape) else data[0].shape[0] - - callbacks.on_batch_begin(mode, step, logs) - - if mode != 'test': - outs = getattr(self, mode + '_batch')(data[:len(self._inputs)], - data[len(self._inputs):]) - # losses - loss = outs[0] if self._metrics else outs - metrics = [[l[0] for l in loss]] - - # metrics - for metric in self._metrics: - res = metric.accumulate() - metrics.extend(to_list(res)) - - assert len(self._metrics_name()) == len(metrics) - for k, v in zip(self._metrics_name(), metrics): - logs[k] = v - else: - outs = getattr(self, mode + '_batch')(data[:len(self._inputs)]) - outputs.append(outs) - - logs['step'] = step - if mode == 'train' or self._adapter._merge_count.get( - mode + '_batch', 0) <= 0: - logs['batch_size'] = batch_size * ParallelEnv().nranks - else: - logs['batch_size'] = self._adapter._merge_count[mode + - '_batch'] - - callbacks.on_batch_end(mode, step, logs) - self._reset_metrics() - - if mode == 'test': - return logs, outputs - return logs - - def _reset_metrics(self): - for metric in self._metrics: - metric.reset() - - def _metrics_name(self): - metrics_name = ['loss'] - for m in self._metrics: - metrics_name.extend(to_list(m.name())) - return metrics_name - - def _len_data_loader(self, data_loader): - try: - steps = len(data_loader) - except Exception: - steps = None - return steps diff --git a/hapi/progressbar.py b/hapi/progressbar.py deleted file mode 100644 index bbeff68d9a1e733c8face6903481fa7bb41d908e..0000000000000000000000000000000000000000 --- a/hapi/progressbar.py +++ /dev/null @@ -1,163 +0,0 @@ -import os -import sys -import time -import numpy as np - - -class ProgressBar(object): - """progress bar """ - - def __init__(self, - num=None, - width=30, - verbose=1, - start=True, - file=sys.stdout): - self._num = num - if isinstance(num, int) and num <= 0: - raise TypeError('num should be None or integer (> 0)') - max_width = self._get_max_width() - self._width = width if width <= max_width else max_width - self._total_width = 0 - self._verbose = verbose - self.file = file - self._values = {} - self._values_order = [] - if start: - self._start = time.time() - self._last_update = 0 - - self._dynamic_display = ( - (hasattr(self.file,
'isatty') and - self.file.isatty()) or 'ipykernel' in sys.modules or - 'posix' in sys.modules or 'PYCHARM_HOSTED' in os.environ) - - def _get_max_width(self): - if sys.version_info > (3, 3): - from shutil import get_terminal_size - else: - from backports.shutil_get_terminal_size import get_terminal_size - terminal_width, _ = get_terminal_size() - max_width = min(int(terminal_width * 0.6), terminal_width - 50) - return max_width - - def start(self): - self.file.flush() - self._start = time.time() - - def update(self, current_num, values=None): - now = time.time() - - if current_num: - time_per_unit = (now - self._start) / current_num - else: - time_per_unit = 0 - - if time_per_unit >= 1 or time_per_unit == 0: - fps = ' - %.0fs/%s' % (time_per_unit, 'step') - elif time_per_unit >= 1e-3: - fps = ' - %.0fms/%s' % (time_per_unit * 1e3, 'step') - else: - fps = ' - %.0fus/%s' % (time_per_unit * 1e6, 'step') - - info = '' - if self._verbose == 1: - prev_total_width = self._total_width - - if self._dynamic_display: - sys.stdout.write('\b' * prev_total_width) - sys.stdout.write('\r') - else: - sys.stdout.write('\n') - - if self._num is not None: - numdigits = int(np.log10(self._num)) + 1 - - bar_chars = ('step %' + str(numdigits) + 'd/%d [') % ( - current_num, self._num) - prog = float(current_num) / self._num - prog_width = int(self._width * prog) - - if prog_width > 0: - bar_chars += ('=' * (prog_width - 1)) - if current_num < self._num: - bar_chars += '>' - else: - bar_chars += '=' - bar_chars += ('.' * (self._width - prog_width)) - bar_chars += ']' - else: - bar_chars = 'step %3d' % current_num - - self._total_width = len(bar_chars) - sys.stdout.write(bar_chars) - - for k, val in values: - info += ' - %s:' % k - val = val if isinstance(val, list) else [val] - for i, v in enumerate(val): - if isinstance(v, (float, np.float32, np.float64)): - if abs(v) > 1e-3: - info += ' %.4f' % v - else: - info += ' %.4e' % v - else: - info += ' %s' % v - - if self._num is not None and current_num < self._num: - eta = time_per_unit * (self._num - current_num) - if eta > 3600: - eta_format = '%d:%02d:%02d' % (eta // 3600, (eta % 3600) // - 60, eta % 60) - elif eta > 60: - eta_format = '%d:%02d' % (eta // 60, eta % 60) - else: - eta_format = '%ds' % eta - - info += ' - ETA: %s' % eta_format - - info += fps - self._total_width += len(info) - if prev_total_width > self._total_width: - info += (' ' * (prev_total_width - self._total_width)) - - # newline for another epoch - if self._num is not None and current_num >= self._num: - info += '\n' - if self._num is None: - info += '\n' - - sys.stdout.write(info) - sys.stdout.flush() - self._last_update = now - elif self._verbose == 2: - if self._num: - numdigits = int(np.log10(self._num)) + 1 - count = ('step %' + str(numdigits) + 'd/%d') % (current_num, - self._num) - else: - count = 'step %3d' % current_num - info = count + info - - for k, val in values: - info += ' - %s:' % k - val = val if isinstance(val, list) else [val] - for v in val: - if isinstance(v, (float, np.float32, np.float64)): - if abs(v) > 1e-3: - info += ' %.4f' % v - else: - info += ' %.4e' % v - elif isinstance(v, np.ndarray) and \ - v.size == 1 and \ - isinstance(v.dtype, (np.float32, np.float64)): - if abs(v[0]) > 1e-3: - info += ' %.4f' % v[0] - else: - info += ' %.4e' % v[0] - else: - info += ' %s' % v - - info += fps - info += '\n' - sys.stdout.write(info) - sys.stdout.flush() diff --git a/hapi/tests/dist_mnist.py b/hapi/tests/dist_mnist.py deleted file mode 100644 index 
e6106f6109902b45a34b8acf51e1ba54be51abdc..0000000000000000000000000000000000000000 --- a/hapi/tests/dist_mnist.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import unittest - -import os - -import numpy as np -import contextlib - -import paddle -from paddle import fluid - -from hapi.model import Model, Input, set_device -from hapi.loss import CrossEntropy -from hapi.vision.models import LeNet -from hapi.metrics import Accuracy -from hapi.callbacks import ProgBarLogger -from hapi.datasets import MNIST - - -class MnistDataset(MNIST): - def __init__(self, mode, return_label=True): - super(MnistDataset, self).__init__(mode=mode) - self.return_label = return_label - - def __getitem__(self, idx): - img = np.reshape(self.images[idx], [1, 28, 28]) - if self.return_label: - return img, np.array(self.labels[idx]).astype('int64') - return img, - - def __len__(self): - return len(self.images) - - -def compute_accuracy(pred, gt): - pred = np.argmax(pred, -1) - gt = np.array(gt) - - correct = pred[:, np.newaxis] == gt - - return np.sum(correct) / correct.shape[0] - - -class TestModel(unittest.TestCase): - def run(self, dynamic): - device = set_device('gpu') - fluid.enable_dygraph(device) if dynamic else None - - im_shape = (-1, 784) - batch_size = 128 - - inputs = [Input(im_shape, 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - train_dataset = MnistDataset(mode='train') - val_dataset = MnistDataset(mode='test') - test_dataset = MnistDataset(mode='test', return_label=False) - - model = LeNet() - optim = fluid.optimizer.Momentum( - learning_rate=0.001, - momentum=.9, - parameter_list=model.parameters()) - loss = CrossEntropy() - model.prepare(optim, loss, Accuracy(), inputs, labels, device=device) - cbk = ProgBarLogger(50) - - model.fit(train_dataset, - val_dataset, - epochs=2, - batch_size=batch_size, - callbacks=cbk) - - eval_result = model.evaluate(val_dataset, batch_size=batch_size) - - output = model.predict( - test_dataset, batch_size=batch_size, stack_outputs=True) - - np.testing.assert_equal(output[0].shape[0], len(test_dataset)) - - acc = compute_accuracy(output[0], val_dataset.labels) - - np.testing.assert_allclose(acc, eval_result['acc']) - - def test_multiple_gpus_static(self): - self.run(False) - - def test_multiple_gpus_dygraph(self): - self.run(True) - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/tests/test_bert_dataloader.py b/hapi/tests/test_bert_dataloader.py deleted file mode 100644 index f4a303b57f9443e5bbf7bb8106861989aa8120d9..0000000000000000000000000000000000000000 --- a/hapi/tests/test_bert_dataloader.py +++ /dev/null @@ -1,20 +0,0 @@ -import paddle -from hapi.model import set_device -from hapi.text.bert.dataloader import SingleSentenceDataLoader -import hapi.text.tokenizer.tokenization as tokenization - -device = set_device("cpu") 
-paddle.fluid.enable_dygraph(device) - -tokenizer = tokenization.FullTokenizer( - vocab_file="./tmp/hapi/data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt", - do_lower_case=True) - -bert_dataloader = SingleSentenceDataLoader( - "./tmp/hapi/aaa.txt", - tokenizer, ["1", "2"], - max_seq_length=32, - batch_size=1) - -for data in bert_dataloader.dataloader(): - print(data) diff --git a/hapi/tests/test_callbacks.py b/hapi/tests/test_callbacks.py deleted file mode 100644 index bb0e6f22b0d7160800a465e9aaa19d4b5f9edc3e..0000000000000000000000000000000000000000 --- a/hapi/tests/test_callbacks.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import time -import random -import tempfile -import shutil - -from hapi.model import Input -from hapi.vision.models import LeNet -from hapi.callbacks import config_callbacks - - -class TestCallbacks(unittest.TestCase): - def setUp(self): - self.save_dir = tempfile.mkdtemp() - - def tearDown(self): - shutil.rmtree(self.save_dir) - - def run_callback(self): - epochs = 2 - steps = 50 - freq = 2 - eval_steps = 20 - - lenet = LeNet() - inputs = [Input([None, 1, 28, 28], 'float32', name='image')] - lenet.prepare(inputs=inputs) - - cbks = config_callbacks( - model=lenet, - batch_size=128, - epochs=epochs, - steps=steps, - log_freq=freq, - verbose=self.verbose, - metrics=['loss', 'acc'], - save_dir=self.save_dir) - cbks.on_begin('train') - - logs = {'loss': 50.341673, 'acc': 0.00256} - for epoch in range(epochs): - cbks.on_epoch_begin(epoch) - for step in range(steps): - cbks.on_batch_begin('train', step, logs) - logs['loss'] -= random.random() * 0.1 - logs['acc'] += random.random() * 0.1 - time.sleep(0.005) - cbks.on_batch_end('train', step, logs) - cbks.on_epoch_end(epoch, logs) - - eval_logs = {'eval_loss': 20.341673, 'eval_acc': 0.256} - params = { - 'steps': eval_steps, - 'metrics_name': ['eval_loss', 'eval_acc'], - } - cbks.on_begin('eval', params) - for step in range(eval_steps): - cbks.on_batch_begin('eval', step, eval_logs) - eval_logs['eval_loss'] -= random.random() * 0.1 - eval_logs['eval_acc'] += random.random() * 0.1 - eval_logs['batch_size'] = 2 - time.sleep(0.005) - cbks.on_batch_end('eval', step, eval_logs) - cbks.on_end('eval', eval_logs) - - test_logs = {} - params = {'steps': eval_steps} - cbks.on_begin('test', params) - for step in range(eval_steps): - cbks.on_batch_begin('test', step, test_logs) - test_logs['batch_size'] = 2 - time.sleep(0.005) - cbks.on_batch_end('test', step, test_logs) - cbks.on_end('test', test_logs) - - cbks.on_end('train') - - def test_callback_verbose_0(self): - self.verbose = 0 - self.run_callback() - - def test_callback_verbose_1(self): - self.verbose = 1 - self.run_callback() - - def test_callback_verbose_2(self): - self.verbose = 2 - self.run_callback() - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/tests/test_datasets.py b/hapi/tests/test_datasets.py deleted file mode 100644 index 
cec6f1e748de65508e73fa4464aaac6b9f2acba7..0000000000000000000000000000000000000000 --- a/hapi/tests/test_datasets.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import os -import numpy as np -import tempfile -import shutil -import cv2 - -from hapi.datasets import * -from hapi.datasets.utils import _check_exists_and_download -from hapi.vision.transforms import Compose - - -class TestFolderDatasets(unittest.TestCase): - def setUp(self): - self.data_dir = tempfile.mkdtemp() - self.empty_dir = tempfile.mkdtemp() - for i in range(2): - sub_dir = os.path.join(self.data_dir, 'class_' + str(i)) - if not os.path.exists(sub_dir): - os.makedirs(sub_dir) - for j in range(2): - fake_img = (np.random.random( - (32, 32, 3)) * 255).astype('uint8') - cv2.imwrite(os.path.join(sub_dir, str(j) + '.jpg'), fake_img) - - def tearDown(self): - shutil.rmtree(self.data_dir) - - def test_dataset(self): - dataset_folder = DatasetFolder(self.data_dir) - - for _ in dataset_folder: - pass - - assert len(dataset_folder) == 4 - assert len(dataset_folder.classes) == 2 - - transform = Compose([]) - dataset_folder = DatasetFolder(self.data_dir, transform=transform) - for _ in dataset_folder: - pass - - def test_folder(self): - loader = ImageFolder(self.data_dir) - - for _ in loader: - pass - - transform = Compose([]) - loader = ImageFolder(self.data_dir, transform=transform) - for _ in loader: - pass - - def test_errors(self): - with self.assertRaises(RuntimeError): - ImageFolder(self.empty_dir) - with self.assertRaises(RuntimeError): - DatasetFolder(self.empty_dir) - - with self.assertRaises(ValueError): - _check_exists_and_download('temp_paddle', None, None, None, False) - - -class TestMNISTTest(unittest.TestCase): - def test_main(self): - mnist = MNIST(mode='test') - self.assertTrue(len(mnist) == 10000) - - for i in range(len(mnist)): - image, label = mnist[i] - self.assertTrue(image.shape[0] == 784) - self.assertTrue(label.shape[0] == 1) - self.assertTrue(0 <= int(label) <= 9) - - -class TestMNISTTrain(unittest.TestCase): - def test_main(self): - mnist = MNIST(mode='train') - self.assertTrue(len(mnist) == 60000) - - for i in range(len(mnist)): - image, label = mnist[i] - self.assertTrue(image.shape[0] == 784) - self.assertTrue(label.shape[0] == 1) - self.assertTrue(0 <= int(label) <= 9) - - -class TestFlowersTrain(unittest.TestCase): - def test_main(self): - flowers = Flowers(mode='train') - self.assertTrue(len(flowers) == 6149) - - # traversal whole dataset may cost a - # long time, randomly check 1 sample - idx = np.random.randint(0, 6149) - image, label = flowers[idx] - self.assertTrue(len(image.shape) == 3) - self.assertTrue(image.shape[2] == 3) - self.assertTrue(label.shape[0] == 1) - - -class TestFlowersValid(unittest.TestCase): - def test_main(self): - flowers = Flowers(mode='valid') - self.assertTrue(len(flowers) == 1020) - - # traversal whole dataset may cost a - # long time, randomly check 1 
sample - idx = np.random.randint(0, 1020) - image, label = flowers[idx] - self.assertTrue(len(image.shape) == 3) - self.assertTrue(image.shape[2] == 3) - self.assertTrue(label.shape[0] == 1) - - -class TestFlowersTest(unittest.TestCase): - def test_main(self): - flowers = Flowers(mode='test') - self.assertTrue(len(flowers) == 1020) - - # traversal whole dataset may cost a - # long time, randomly check 1 sample - idx = np.random.randint(0, 1020) - image, label = flowers[idx] - self.assertTrue(len(image.shape) == 3) - self.assertTrue(image.shape[2] == 3) - self.assertTrue(label.shape[0] == 1) - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/tests/test_distributed.py b/hapi/tests/test_distributed.py deleted file mode 100644 index 362600b60aa1534a937f60c4a365b857b361a93c..0000000000000000000000000000000000000000 --- a/hapi/tests/test_distributed.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import os -import time -import six -import copy -from argparse import ArgumentParser, REMAINDER -import paddle -import paddle.fluid as fluid - -from paddle.distributed.utils import * -import paddle.distributed.cloud_utils as cloud_utils - - -def get_cluster_from_args(selected_gpus): - cluster_node_ips = '127.0.0.1' - node_ip = '127.0.0.1' - - node_ips = [x.strip() for x in cluster_node_ips.split(',')] - - node_ips.index(node_ip) - - free_ports = None - - free_ports = find_free_ports(len(selected_gpus)) - if free_ports is not None: - free_ports = list(free_ports) - return get_cluster(node_ips, node_ip, free_ports, selected_gpus) - - -def get_gpus(selected_gpus): - selected_gpus = [x.strip() for x in selected_gpus.split(',')] - return selected_gpus - - -def start_local_trainers(cluster, - pod, - training_script, - training_script_args, - log_dir=None): - current_env = copy.copy(os.environ.copy()) - #paddle broadcast ncclUniqueId use socket, and - #proxy maybe make trainers unreachable, so delete them. - #if we set them to "", grpc will log error message "bad uri" - #so just delete them. 
- current_env.pop("http_proxy", None) - current_env.pop("https_proxy", None) - - procs = [] - for idx, t in enumerate(pod.trainers): - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]), - "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - - current_env.update(proc_env) - - print("trainer proc env:{}".format(current_env)) - - cmd = "python -u " + training_script - - print("start trainer proc:{} env:{}".format(cmd, proc_env)) - - fn = None - - proc = subprocess.Popen(cmd.split(" "), env=current_env) - - tp = TrainerProc() - tp.proc = proc - tp.rank = t.rank - tp.log_fn = fn - tp.cmd = cmd - - procs.append(tp) - - return procs - - -class TestMultipleGpus(unittest.TestCase): - def test_mnist_2gpu(self): - if fluid.core.get_cuda_device_count() == 0: - return - - selected_gpus = get_gpus('0,1') - cluster = None - pod = None - - cluster, pod = get_cluster_from_args(selected_gpus) - - procs = start_local_trainers( - cluster, - pod, - training_script='dist_mnist.py', - training_script_args=[]) - - while True: - alive = watch_local_trainers(procs, cluster.trainers_nranks()) - - if not alive: - print("Local procs complete, POD info:{}".format(pod)) - break - time.sleep(3) - - -if __name__ == "__main__": - unittest.main() diff --git a/hapi/tests/test_logger.py b/hapi/tests/test_logger.py deleted file mode 100644 index 561253dd32f3e88e17656e23d1338b2e4ad74200..0000000000000000000000000000000000000000 --- a/hapi/tests/test_logger.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import unittest -import os -import numpy as np -import shutil -import tempfile - -from hapi.logger import setup_logger - - -class TestSetupLogger(unittest.TestCase): - def setUp(self): - self.save_dir = tempfile.mkdtemp() - self.save_file = os.path.join(self.save_dir, 'logger.txt') - - def tearDown(self): - shutil.rmtree(self.save_dir) - - def logger(self, output=None): - setup_logger(output=output) - - def test_logger_no_output(self): - self.logger() - - def test_logger_dir(self): - self.logger(self.save_dir) - - def test_logger_file(self): - self.logger(self.save_file) - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/tests/test_loss.py b/hapi/tests/test_loss.py deleted file mode 100644 index 0bf8a48a47d3b36f49df4437fa12dcee40519a7c..0000000000000000000000000000000000000000 --- a/hapi/tests/test_loss.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import unittest -import os -import six -import numpy as np -import shutil -import copy - -import paddle -from paddle import fluid - -from hapi.model import Model, Input -from hapi.vision.models import resnet18 -from hapi.loss import CrossEntropy, SoftmaxWithCrossEntropy - - -def stable_softmax(x): - """Compute the softmax of vector x in a numerically stable way.""" - # clip to shiftx, otherwise, when calc loss with - # log(exp(shiftx)), may get log(0)=INF - shiftx = (x - np.max(x)).clip(-64.) - exps = np.exp(shiftx) - return exps / np.sum(exps) - - -def randomize_probability(batch_size, class_num, dtype='float32'): - prob = np.random.uniform( - 0.1, 1.0, size=(batch_size, class_num)).astype(dtype) - prob_sum = prob.sum(axis=1) - for i in six.moves.xrange(len(prob)): - prob[i] /= prob_sum[i] - return prob - - -def numpy_ce(x, label): - return np.asmatrix( - [[-np.log(x[i][label[i][0]])] for i in range(x.shape[0])], - dtype="float32").mean() - - -class TestLoss(unittest.TestCase): - def test_cross_entropy(self): - class_num = 100 - batch_size = 128 - inputs = [randomize_probability(128, class_num) for _ in range(2)] - - labels = [ - np.random.randint( - 0, class_num, (batch_size, 1), dtype="int64") for _ in range(2) - ] - - gt_out = [numpy_ce(inputs[i], labels[i]) for i in range(2)] - - fluid.enable_dygraph() - cross_entropy = CrossEntropy() - out = cross_entropy( - [fluid.dygraph.to_variable(x) for x in inputs], - [fluid.dygraph.to_variable(label) for label in labels]) - out = [o.numpy() for o in out] - - for o, g in zip(out, gt_out): - np.testing.assert_allclose(o, g, atol=1e-5) - - def test_soft_cross_entronpy(self): - class_num = 100 - batch_size = 128 - - inputs = [randomize_probability(128, class_num) for _ in range(2)] - - labels = [ - np.random.randint( - 0, class_num, (batch_size, 1), dtype="int64") for _ in range(2) - ] - - fluid.enable_dygraph() - softmax_cross_entropy = SoftmaxWithCrossEntropy() - - softmax_cross_entropy( - [fluid.dygraph.to_variable(x) for x in inputs], - [fluid.dygraph.to_variable(label) for label in labels]) - - softmax_cross_entropy = SoftmaxWithCrossEntropy(average=False) - - inputs = [randomize_probability(128, class_num)] - - labels = [ - np.random.randint( - 0, class_num, (batch_size, 1), dtype="int64") - ] - - softmax_cross_entropy([fluid.dygraph.to_variable(x) for x in inputs], - fluid.dygraph.to_variable(labels[0])) - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/tests/test_metrics.py b/hapi/tests/test_metrics.py deleted file mode 100644 index 4dad4835c68070146c0a601c61f57ef56a6123b7..0000000000000000000000000000000000000000 --- a/hapi/tests/test_metrics.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import os -import unittest -import numpy as np - -import paddle.fluid as fluid -from paddle.fluid.dygraph.base import to_variable - -from hapi.metrics import * -from hapi.utils import to_list - - -def accuracy(pred, label, topk=(1, )): - maxk = max(topk) - pred = np.argsort(pred)[:, ::-1][:, :maxk] - correct = (pred == np.repeat(label, maxk, 1)) - - batch_size = label.shape[0] - res = [] - for k in topk: - correct_k = correct[:, :k].sum() - res.append(correct_k / batch_size) - return res - - -def convert_to_one_hot(y, C): - oh = np.random.random((y.shape[0], C)).astype('float32') * .5 - for i in range(y.shape[0]): - oh[i, int(y[i])] = 1. - return oh - - -class TestAccuracyDynamic(unittest.TestCase): - def setUp(self): - self.topk = (1, ) - self.class_num = 5 - self.sample_num = 1000 - self.name = None - - def random_pred_label(self): - label = np.random.randint(0, self.class_num, (self.sample_num, 1)).astype('int64') - pred = np.random.randint(0, self.class_num, (self.sample_num, 1)).astype('int32') - pred_one_hot = convert_to_one_hot(pred, self.class_num) - pred_one_hot = pred_one_hot.astype('float32') - - return label, pred_one_hot - - def test_main(self): - with fluid.dygraph.guard(fluid.CPUPlace()): - acc = Accuracy(topk=self.topk, name=self.name) - for i in range(10): - label, pred = self.random_pred_label() - label_var = to_variable(label) - pred_var = to_variable(pred) - state = to_list(acc.add_metric_op(pred_var, label_var)) - acc.update(*[s.numpy() for s in state]) - res_m = acc.accumulate() - res_f = accuracy(pred, label, self.topk) - assert np.all(np.isclose(np.array(res_m), np.array(res_f), rtol=1e-3)), \ - "Accuracy precision error: {} != {}".format(res_m, res_f) - acc.reset() - assert np.sum(acc.total) == 0 - assert np.sum(acc.count) == 0 - - -class TestAccuracyDynamicMultiTopk(TestAccuracyDynamic): - def setUp(self): - self.topk = (1, 5) - self.class_num = 10 - self.sample_num = 1000 - self.name = "accuracy" - - -class TestAccuracyStatic(TestAccuracyDynamic): - def test_main(self): - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - pred = fluid.data(name='pred', shape=[None, self.class_num], dtype='float32') - label = fluid.data(name='label', shape=[None, 1], dtype='int64') - acc = Accuracy(topk=self.topk, name=self.name) - state = acc.add_metric_op(pred, label) - - exe = fluid.Executor(fluid.CPUPlace()) - compiled_main_prog = fluid.CompiledProgram(main_prog) - - for i in range(10): - label, pred = self.random_pred_label() - state_ret = exe.run(compiled_main_prog, - feed={'pred': pred, 'label': label}, - fetch_list=[s.name for s in to_list(state)], - return_numpy=True) - acc.update(*state_ret) - res_m = acc.accumulate() - res_f = accuracy(pred, label, self.topk) - assert np.all(np.isclose(np.array(res_m), np.array(res_f), rtol=1e-3)), \ - "Accuracy precision error: {} != {}".format(res_m, res_f) - acc.reset() - assert np.sum(acc.total) == 0 - assert np.sum(acc.count) == 0 - - -class 
TestAccuracyStaticMultiTopk(TestAccuracyStatic): - def setUp(self): - self.topk = (1, 5) - self.class_num = 10 - self.sample_num = 1000 - self.name = "accuracy" - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/tests/test_model.py b/hapi/tests/test_model.py deleted file mode 100644 index db62bd20aade6a9d04f7871b0ef9cfd955aa5d73..0000000000000000000000000000000000000000 --- a/hapi/tests/test_model.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import unittest - -import os -import numpy as np -import shutil -import tempfile - -import paddle -from paddle import fluid -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear -from paddle.fluid.dygraph.container import Sequential -from paddle.io import DataLoader -from paddle.fluid.dygraph.base import to_variable - -from hapi.model import Model, Input, set_device -from hapi.loss import CrossEntropy -from hapi.metrics import Accuracy -from hapi.datasets import MNIST -from hapi.vision.models import LeNet - - -class LeNetDygraph(fluid.dygraph.Layer): - def __init__(self, num_classes=10, classifier_activation='softmax'): - super(LeNetDygraph, self).__init__() - self.num_classes = num_classes - self.features = Sequential( - Conv2D( - 1, 6, 3, stride=1, padding=1), - Pool2D(2, 'max', 2), - Conv2D( - 6, 16, 5, stride=1, padding=0), - Pool2D(2, 'max', 2)) - - if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), - Linear(120, 84), - Linear( - 84, 10, act=classifier_activation)) - - def forward(self, inputs): - x = self.features(inputs) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - -class MnistDataset(MNIST): - def __init__(self, mode, return_label=True, sample_num=None): - super(MnistDataset, self).__init__(mode=mode) - self.return_label = return_label - if sample_num: - self.images = self.images[:sample_num] - self.labels = self.labels[:sample_num] - - def __getitem__(self, idx): - img, label = self.images[idx], self.labels[idx] - img = np.reshape(img, [1, 28, 28]) - if self.return_label: - return img, np.array(self.labels[idx]).astype('int64') - return img, - - def __len__(self): - return len(self.images) - - -def compute_acc(pred, label): - pred = np.argmax(pred, -1) - label = np.array(label) - correct = pred[:, np.newaxis] == label - return np.sum(correct) / correct.shape[0] - - -def dynamic_train(model, dataloader): - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters()) - model.train() - for inputs, labels in dataloader: - outputs = model(inputs) - loss = fluid.layers.cross_entropy(outputs, labels) - avg_loss = fluid.layers.reduce_sum(loss) - avg_loss.backward() - optim.minimize(avg_loss) - model.clear_gradients() - - -def dynamic_evaluate(model, dataloader): - with fluid.dygraph.no_grad(): - model.eval() - cnt = 0 - for inputs, labels in dataloader: - outputs = 
model(inputs) - - cnt += (np.argmax(outputs.numpy(), -1)[:, np.newaxis] == - labels.numpy()).astype('int').sum() - - return cnt / len(dataloader.dataset) - - -class TestModel(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.device = set_device('gpu') - fluid.enable_dygraph(cls.device) - - sp_num = 1280 - cls.train_dataset = MnistDataset(mode='train', sample_num=sp_num) - cls.val_dataset = MnistDataset(mode='test', sample_num=sp_num) - cls.test_dataset = MnistDataset( - mode='test', return_label=False, sample_num=sp_num) - - cls.train_loader = fluid.io.DataLoader( - cls.train_dataset, places=cls.device, batch_size=64) - cls.val_loader = fluid.io.DataLoader( - cls.val_dataset, places=cls.device, batch_size=64) - cls.test_loader = fluid.io.DataLoader( - cls.test_dataset, places=cls.device, batch_size=64) - - seed = 333 - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - - dy_lenet = LeNetDygraph() - cls.init_param = dy_lenet.state_dict() - dynamic_train(dy_lenet, cls.train_loader) - - cls.acc1 = dynamic_evaluate(dy_lenet, cls.val_loader) - - cls.inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - cls.labels = [Input([None, 1], 'int64', name='label')] - - cls.save_dir = tempfile.mkdtemp() - cls.weight_path = os.path.join(cls.save_dir, 'lenet') - fluid.dygraph.save_dygraph(dy_lenet.state_dict(), cls.weight_path) - - fluid.disable_dygraph() - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.save_dir) - - def test_fit_dygraph(self): - self.fit(True) - - def test_fit_static(self): - self.fit(False) - - def test_evaluate_dygraph(self): - self.evaluate(True) - - def test_evaluate_static(self): - self.evaluate(False) - - def test_predict_dygraph(self): - self.predict(True) - - def test_predict_static(self): - self.predict(False) - - def predict(self, dynamic): - fluid.enable_dygraph(self.device) if dynamic else None - - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - test_dataloader = fluid.io.DataLoader( - self.test_dataset, - places=self.device, - batch_size=64, - return_list=True) - - model = LeNet() - - model.load(self.weight_path) - - model.prepare(metrics=Accuracy(), inputs=inputs, labels=labels) - - output = model.predict(test_dataloader, stack_outputs=True) - - def fit(self, dynamic): - fluid.enable_dygraph(self.device) if dynamic else None - seed = 333 - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - - model = LeNet() - optim_new = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters()) - model.prepare( - optim_new, - loss_function=CrossEntropy(average=False), - metrics=Accuracy(), - inputs=self.inputs, - labels=self.labels) - model.fit(self.train_dataset, batch_size=64, shuffle=False) - - result = model.evaluate(self.val_dataset, batch_size=64) - np.testing.assert_allclose(result['acc'], self.acc1) - fluid.disable_dygraph() if dynamic else None - - def evaluate(self, dynamic): - fluid.enable_dygraph(self.device) if dynamic else None - model = LeNet() - model.prepare( - metrics=Accuracy(), inputs=self.inputs, labels=self.labels) - model.load(self.weight_path) - result = model.evaluate(self.val_dataset, batch_size=64) - np.testing.assert_allclose(result['acc'], self.acc1) - fluid.disable_dygraph() if dynamic else None - - def predict(self, dynamic): - fluid.enable_dygraph(self.device) if dynamic else None - model = LeNet() - model.prepare(inputs=self.inputs) - 
model.load(self.weight_path) - output = model.predict( - self.test_dataset, batch_size=64, stack_outputs=True) - np.testing.assert_equal(output[0].shape[0], len(self.test_dataset)) - - acc = compute_acc(output[0], self.val_dataset.labels) - np.testing.assert_allclose(acc, self.acc1) - fluid.disable_dygraph() if dynamic else None - - -class MyModel(Model): - def __init__(self): - super(MyModel, self).__init__() - self._fc = Linear(20, 10, act='softmax') - - def forward(self, x): - y = self._fc(x) - return y - - -class TestModelFunction(unittest.TestCase): - def set_seed(self, seed=1024): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - - def test_train_batch(self, dynamic=True): - dim = 20 - data = np.random.random(size=(4, dim)).astype(np.float32) - label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64) - - def get_expect(): - fluid.enable_dygraph(fluid.CPUPlace()) - self.set_seed() - m = MyModel() - optim = fluid.optimizer.SGD(learning_rate=0.001, - parameter_list=m.parameters()) - m.train() - output = m(to_variable(data)) - l = to_variable(label) - loss = fluid.layers.cross_entropy(output, l) - avg_loss = fluid.layers.reduce_sum(loss) - avg_loss.backward() - optim.minimize(avg_loss) - m.clear_gradients() - fluid.disable_dygraph() - return avg_loss.numpy() - - ref = get_expect() - for dynamic in [True, False]: - device = set_device('cpu') - fluid.enable_dygraph(device) if dynamic else None - self.set_seed() - model = MyModel() - - optim2 = fluid.optimizer.SGD(learning_rate=0.001, - parameter_list=model.parameters()) - - inputs = [Input([None, dim], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] - model.prepare( - optim2, - loss_function=CrossEntropy(average=False), - inputs=inputs, - labels=labels, - device=device) - loss, = model.train_batch([data], [label]) - - np.testing.assert_allclose(loss.flatten(), ref.flatten()) - fluid.disable_dygraph() if dynamic else None - - def test_test_batch(self, dynamic=True): - dim = 20 - data = np.random.random(size=(4, dim)).astype(np.float32) - - def get_expect(): - fluid.enable_dygraph(fluid.CPUPlace()) - self.set_seed() - m = MyModel() - m.eval() - output = m(to_variable(data)) - fluid.disable_dygraph() - return output.numpy() - - ref = get_expect() - for dynamic in [True, False]: - device = set_device('cpu') - fluid.enable_dygraph(device) if dynamic else None - self.set_seed() - model = MyModel() - inputs = [Input([None, dim], 'float32', name='x')] - model.prepare(inputs=inputs, device=device) - out, = model.test_batch([data]) - - np.testing.assert_allclose(out, ref) - fluid.disable_dygraph() if dynamic else None - - def test_save_load(self): - path = tempfile.mkdtemp() - for dynamic in [True, False]: - device = set_device('cpu') - fluid.enable_dygraph(device) if dynamic else None - model = MyModel() - inputs = [Input([None, 20], 'float32', name='x')] - model.prepare(inputs=inputs) - model.save(path + '/test') - model.load(path + '/test') - shutil.rmtree(path) - fluid.disable_dygraph() if dynamic else None - - def test_parameters(self): - for dynamic in [True, False]: - device = set_device('cpu') - fluid.enable_dygraph(device) if dynamic else None - model = MyModel() - inputs = [Input([None, 20], 'float32', name='x')] - model.prepare(inputs=inputs) - params = model.parameters() - self.assertTrue(params[0].shape[0] == 20) - self.assertTrue(params[0].shape[1] == 10) - fluid.disable_dygraph() if dynamic else None - - -if __name__ == '__main__': - 
unittest.main() diff --git a/hapi/tests/test_progressbar.py b/hapi/tests/test_progressbar.py deleted file mode 100644 index 797b94a1f0cae8ee37c50803bc4b2f6f4f4afe25..0000000000000000000000000000000000000000 --- a/hapi/tests/test_progressbar.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# when test, you should add hapi root path to the PYTHONPATH, -# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH -import unittest -import random -import time - -from hapi.progressbar import ProgressBar - - -class TestProgressBar(unittest.TestCase): - def prog_bar(self, num, epoch, width, verbose=1): - for epoch in range(epoch): - progbar = ProgressBar(num, verbose=verbose) - values = [ - ['loss', 50.341673], - ['acc', 0.00256], - ] - for step in range(1, num + 1): - values[0][1] -= random.random() * 0.1 - values[1][1] += random.random() * 0.1 - if step % 10 == 0: - progbar.update(step, values) - time.sleep(0.002) - progbar.update(step, values) - - def test1(self): - self.prog_bar(50, 1, 30) - - def test2(self): - self.prog_bar(50, 2, 30) - - def test4(self): - self.prog_bar(50, 2, 30, verbose=2) - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/tests/test_save_inference_model.py b/hapi/tests/test_save_inference_model.py deleted file mode 100644 index 51d8cb533c7d5ec638a68575a12ea7cb79a8d9cf..0000000000000000000000000000000000000000 --- a/hapi/tests/test_save_inference_model.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import division -from __future__ import print_function - -import unittest -import os -import numpy as np -import shutil -import tempfile - -import paddle -from paddle import fluid - -from hapi.model import Model, Input -from hapi.vision.models import resnet18 - - -class TestSaveInferenceModel(unittest.TestCase): - def tearDown(self): - shutil.rmtree(self.save_dir) - - def export_deploy_model(self): - model = resnet18() - - inputs = [Input([None, 3, 224, 224], 'float32', name='image')] - - model.prepare(inputs=inputs) - - self.save_dir = tempfile.mkdtemp() - if not os.path.exists(self.save_dir): - os.makedirs(self.save_dir) - - model.save_inference_model(self.save_dir) - - place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=self.save_dir, executor=exe)) - tensor_img = np.array( - np.random.random((1, 3, 224, 224)), dtype=np.float32) - ori_results = model.test_batch(tensor_img) - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - - np.testing.assert_allclose(results, ori_results) - - def test_save_inference_model(self): - self.export_deploy_model() - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/tests/test_transforms.py b/hapi/tests/test_transforms.py deleted file mode 100644 index 5fc8d5067b7b1cb6501d5f1ed9dea69624dea2db..0000000000000000000000000000000000000000 --- a/hapi/tests/test_transforms.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# when test, you should add hapi root path to the PYTHONPATH, -# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH -import unittest -import os -import tempfile -import cv2 -import shutil -import numpy as np - -from hapi.datasets import DatasetFolder -import hapi.vision.transforms as transforms - - -class TestTransforms(unittest.TestCase): - def setUp(self): - self.data_dir = tempfile.mkdtemp() - for i in range(2): - sub_dir = os.path.join(self.data_dir, 'class_' + str(i)) - if not os.path.exists(sub_dir): - os.makedirs(sub_dir) - for j in range(2): - if j == 0: - fake_img = (np.random.random( - (280, 350, 3)) * 255).astype('uint8') - else: - fake_img = (np.random.random( - (400, 300, 3)) * 255).astype('uint8') - cv2.imwrite(os.path.join(sub_dir, str(j) + '.jpg'), fake_img) - - def tearDown(self): - shutil.rmtree(self.data_dir) - - def do_transform(self, trans): - dataset_folder = DatasetFolder(self.data_dir, transform=trans) - - for _ in dataset_folder: - pass - - def test_trans_all(self): - normalize = transforms.Normalize( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.120, 57.375]) - trans = transforms.Compose([ - transforms.RandomResizedCrop(224), transforms.GaussianNoise(), - transforms.ColorJitter( - brightness=0.4, contrast=0.4, saturation=0.4, - hue=0.4), transforms.RandomHorizontalFlip(), - transforms.Permute(mode='CHW'), normalize - ]) - - self.do_transform(trans) - - def test_trans_resize(self): - trans = transforms.Compose([ - transforms.Resize(300, [0, 1]), - transforms.RandomResizedCrop((280, 280)), - transforms.Resize(280, [0, 1]), - transforms.Resize((256, 200)), - transforms.Resize((180, 160)), - transforms.CenterCrop(128), - transforms.CenterCrop((128, 128)), - ]) - self.do_transform(trans) - - def test_trans_centerCrop(self): - trans = transforms.Compose([ - transforms.CenterCropResize(224), - transforms.CenterCropResize(128, 160), - ]) - self.do_transform(trans) - - def test_flip(self): - trans = transforms.Compose([ - transforms.RandomHorizontalFlip(1.0), - transforms.RandomHorizontalFlip(0.0), - transforms.RandomVerticalFlip(0.0), - transforms.RandomVerticalFlip(1.0), - ]) - self.do_transform(trans) - - def test_color_jitter(self): - trans = transforms.BatchCompose([ - transforms.BrightnessTransform(0.0), - transforms.HueTransform(0.0), - transforms.SaturationTransform(0.0), - transforms.ContrastTransform(0.0), - ]) - self.do_transform(trans) - - def test_exception(self): - trans = transforms.Compose([transforms.Resize(-1)]) - - trans_batch = transforms.BatchCompose([transforms.Resize(-1)]) - - with self.assertRaises(Exception): - self.do_transform(trans) - - with self.assertRaises(Exception): - self.do_transform(trans_batch) - - with self.assertRaises(ValueError): - transforms.ContrastTransform(-1.0) - - with self.assertRaises(ValueError): - transforms.SaturationTransform(-1.0), - - with self.assertRaises(ValueError): - transforms.HueTransform(-1.0) - - with self.assertRaises(ValueError): - transforms.BrightnessTransform(-1.0) - - def test_info(self): - str(transforms.Compose([transforms.Resize((224, 224))])) - str(transforms.BatchCompose([transforms.Resize((224, 224))])) - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/tests/test_vison_models.py b/hapi/tests/test_vison_models.py deleted file mode 100644 index 05d3a10b34573cbf095c8e95f149f26edbbdfeec..0000000000000000000000000000000000000000 --- a/hapi/tests/test_vison_models.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import numpy as np - -import hapi.vision.models as models -from hapi.model import Input - - -class TestVisonModels(unittest.TestCase): - def models_infer(self, arch, pretrained=False, batch_norm=False): - - x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) - if batch_norm: - model = models.__dict__[arch](pretrained=pretrained, - batch_norm=True) - else: - model = models.__dict__[arch](pretrained=pretrained) - inputs = [Input([None, 3, 224, 224], 'float32', name='image')] - - model.prepare(inputs=inputs) - - model.test_batch(x) - - def test_mobilenetv2_pretrained(self): - self.models_infer('mobilenet_v2', pretrained=True) - - def test_mobilenetv1(self): - self.models_infer('mobilenet_v1') - - def test_vgg11(self): - self.models_infer('vgg11') - - def test_vgg13(self): - self.models_infer('vgg13') - - def test_vgg16(self): - self.models_infer('vgg16') - - def test_vgg16_bn(self): - self.models_infer('vgg16', batch_norm=True) - - def test_vgg19(self): - self.models_infer('vgg19') - - def test_resnet18(self): - self.models_infer('resnet18') - - def test_resnet34(self): - self.models_infer('resnet34') - - def test_resnet50(self): - self.models_infer('resnet50') - - def test_resbet101(self): - self.models_infer('resnet101') - - def test_resbet152(self): - self.models_infer('resnet152') - - def test_darknet53(self): - self.models_infer('darknet53') - - def test_lenet(self): - lenet = models.__dict__['LeNet']() - - inputs = [Input([None, 1, 28, 28], 'float32', name='x')] - lenet.prepare(inputs=inputs) - - x = np.array(np.random.random((2, 1, 28, 28)), dtype=np.float32) - lenet.test_batch(x) - - -if __name__ == '__main__': - unittest.main() diff --git a/hapi/text/__init__.py b/hapi/text/__init__.py deleted file mode 100644 index 2177ada5c0c7135e3feea0772b609d0ab29a7ba2..0000000000000000000000000000000000000000 --- a/hapi/text/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from hapi.text.text import RNNCell as RNNCell -from hapi.text.text import BasicLSTMCell as BasicLSTMCell -from hapi.text.text import BasicGRUCell as BasicGRUCell -from hapi.text.text import RNN as RNN -from hapi.text.text import DynamicDecode as DynamicDecode -from hapi.text.text import BeamSearchDecoder as BeamSearchDecoder -from hapi.text.text import MultiHeadAttention as MultiHeadAttention -from hapi.text.text import FFN as FFN -from hapi.text.text import TransformerEncoderLayer as TransformerEncoderLayer -from hapi.text.text import TransformerDecoderLayer as TransformerDecoderLayer -from hapi.text.text import TransformerEncoder as TransformerEncoder -from hapi.text.text import TransformerDecoder as TransformerDecoder -from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder -from hapi.text.text import GRUCell as GRUCell -from hapi.text.text import GRUEncoderCell as GRUEncoderCell -from hapi.text.text import BiGRU as BiGRU -from hapi.text.text import Linear_chain_crf as Linear_chain_crf -from hapi.text.text import Crf_decoding as Crf_decoding -from hapi.text.text import SequenceTagging as SequenceTagging diff --git a/hapi/text/bert/__init__.py b/hapi/text/bert/__init__.py deleted file mode 100644 index b634f9a6adce27d5dcd4d552799c4e0771d8950d..0000000000000000000000000000000000000000 --- a/hapi/text/bert/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from hapi.text.bert.bert import BertConfig as BertConfig -from hapi.text.bert.dygraph_optimization import DyOptimizer as DyOptimizer -from hapi.text.bert.static_optimization import StOptimizer as StOptimizer -from hapi.text.bert.optimization import make_optimizer as make_optimizer -from hapi.text.bert.dataloader import BertDataLoader as BertDataLoader -from hapi.text.bert.dataloader import BertInputExample as BertInputExample -from hapi.text.tokenizer import tokenization as tokenization -from hapi.text.bert.bert import BertEncoder as BertEncoder diff --git a/hapi/text/bert/batching.py b/hapi/text/bert/batching.py deleted file mode 100644 index f9e3106856a51e0b49d61a7e01835a46cc9a4db2..0000000000000000000000000000000000000000 --- a/hapi/text/bert/batching.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
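The two package __init__ files above exist only to re-export public names, so downstream code could import from the package roots instead of the submodules. A short sketch of the import style they enabled (names taken verbatim from the re-export lists above):

# Enabled by hapi/text/__init__.py and hapi/text/bert/__init__.py above.
from hapi.text import TransformerEncoder, BeamSearchDecoder, SequenceTagging
from hapi.text.bert import BertConfig, BertEncoder, BertDataLoader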
-"""Mask, padding and batching.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - - -def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3): - """ - Add mask for batch_tokens, return out, mask_label, mask_pos; - Note: mask_pos responding the batch_tokens after padded; - """ - max_len = max([len(sent) for sent in batch_tokens]) - mask_label = [] - mask_pos = [] - prob_mask = np.random.rand(total_token_num) - # Note: the first token is [CLS], so [low=1] - replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num) - pre_sent_len = 0 - prob_index = 0 - for sent_index, sent in enumerate(batch_tokens): - mask_flag = False - prob_index += pre_sent_len - for token_index, token in enumerate(sent): - prob = prob_mask[prob_index + token_index] - if prob > 0.15: - continue - elif 0.03 < prob <= 0.15: - # mask - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = MASK - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - elif 0.015 < prob <= 0.03: - # random replace - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = replace_ids[prob_index + token_index] - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - else: - # keep the original token - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - mask_pos.append(sent_index * max_len + token_index) - pre_sent_len = len(sent) - - # ensure at least mask one word in a sentence - while not mask_flag: - token_index = int(np.random.randint(1, high=len(sent) - 1, size=1)) - if sent[token_index] != SEP and sent[token_index] != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = MASK - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - mask_label = np.array(mask_label).astype("int64").reshape([-1, 1]) - mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1]) - return batch_tokens, mask_label, mask_pos - - -def prepare_batch_data(insts, - total_token_num, - voc_size=0, - pad_id=None, - cls_id=None, - sep_id=None, - mask_id=None, - return_input_mask=True, - return_max_len=True, - return_num_token=False): - """ - 1. generate Tensor of data - 2. generate Tensor of position - 3. 
generate self attention mask, [shape: batch_size * max_len * max_len] - """ - - batch_src_ids = [inst[0] for inst in insts] - batch_pos_ids = [inst[1] for inst in insts] - batch_sent_ids = [inst[2] for inst in insts] - labels_list = [] - # compatible with squad, whose example includes start/end positions, - # or unique id - - for i in range(3, len(insts[0]), 1): - labels = [inst[i] for inst in insts] - labels = np.array(labels).astype("int64").reshape([-1, 1]) - labels_list.append(labels) - - # First step: do mask without padding - if mask_id >= 0: - out, mask_label, mask_pos = mask( - batch_src_ids, - total_token_num, - vocab_size=voc_size, - CLS=cls_id, - SEP=sep_id, - MASK=mask_id) - else: - out = batch_src_ids - # Second step: padding - src_id, self_input_mask = pad_batch_data( - out, pad_idx=pad_id, return_input_mask=True) - pos_id = pad_batch_data( - batch_pos_ids, - pad_idx=pad_id, - return_pos=False, - return_input_mask=False) - sent_id = pad_batch_data( - batch_sent_ids, - pad_idx=pad_id, - return_pos=False, - return_input_mask=False) - - if mask_id >= 0: - return_list = [ - src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos - ] + labels_list - else: - return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list - - return return_list if len(return_list) > 1 else return_list[0] - - -def pad_batch_data(insts, - pad_idx=0, - return_pos=False, - return_input_mask=False, - return_max_len=False, - return_num_token=False): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and input mask. - """ - return_list = [] - max_len = max(len(inst) for inst in insts) - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. - - inst_data = np.array([ - list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts - ]) - return_list += [inst_data.astype("int64").reshape([-1, max_len])] - - # position data - if return_pos: - inst_pos = np.array([ - list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) - for inst in insts - ]) - - return_list += [inst_pos.astype("int64").reshape([-1, max_len])] - - if return_input_mask: - # This is used to avoid attention on paddings. - input_mask_data = np.array([[1] * len(inst) + [0] * - (max_len - len(inst)) for inst in insts]) - input_mask_data = np.expand_dims(input_mask_data, axis=-1) - return_list += [input_mask_data.astype("float32")] - - if return_max_len: - return_list += [max_len] - - if return_num_token: - num_token = 0 - for inst in insts: - num_token += len(inst) - return_list += [num_token] - - return return_list if len(return_list) > 1 else return_list[0] - - -if __name__ == "__main__": - pass diff --git a/hapi/text/bert/bert.py b/hapi/text/bert/bert.py deleted file mode 100644 index be0203d9c8ac435dd8ec6a225bc71ad8b121b91d..0000000000000000000000000000000000000000 --- a/hapi/text/bert/bert.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
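The mask() helper above implements BERT-style masked-LM corruption: each token draws a uniform probability, roughly 15% of tokens become prediction targets, and those targets are split roughly 80/10/10 between [MASK] replacement, random-token replacement, and keeping the original token. [CLS]/[SEP] are never replaced, and mask_pos is an offset into the padded batch layout (sent_index * max_len + token_index). A pure-Python restatement of the probability bands:

def masking_decision(prob):
    """Mirror of the bands in the deleted mask() above; prob ~ U[0, 1) per token."""
    if prob > 0.15:
        return "left untouched"                    # ~85% of tokens
    if prob > 0.03:
        return "replaced with [MASK]"              # (0.03, 0.15] ~ 80% of targets
    if prob > 0.015:
        return "replaced with a random token id"   # (0.015, 0.03] ~ 10% of targets
    return "kept as-is but still predicted"        # <= 0.015 ~ 10% of targets

for p in (0.50, 0.10, 0.02, 0.01):
    print(p, "->", masking_decision(p))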
-# See the License for the specific language governing permissions and -# limitations under the License. -"bert" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import json -import numpy as np - -import paddle -import paddle.fluid as fluid -from hapi.model import Model -from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard -from hapi.text.text import PrePostProcessLayer, TransformerEncoder -from hapi.text.bert.utils.init import init_from_static_model - - -class BertConfig(object): - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing bert model config file '%s'" % - config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class BertEncoder(Model): - """ - bert - """ - - def __init__(self, config, return_pooled_out=True, use_fp16=False): - super(BertEncoder, self).__init__() - - self.config = config - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self.return_pooled_out = return_pooled_out - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range']) - - self._src_emb = Embedding( - size=[self._voc_size, self._emb_size], - param_attr=fluid.ParamAttr( - name=self._word_emb_name, initializer=self._param_initializer), - dtype=self._dtype) - - self._pos_emb = Embedding( - size=[self._max_position_seq_len, self._emb_size], - param_attr=fluid.ParamAttr( - name=self._pos_emb_name, initializer=self._param_initializer), - dtype=self._dtype) - - self._sent_emb = Embedding( - size=[self._sent_types, self._emb_size], - param_attr=fluid.ParamAttr( - name=self._sent_emb_name, initializer=self._param_initializer), - dtype=self._dtype) - - self.pooled_fc = Linear( - input_dim=self._emb_size, - output_dim=self._emb_size, - param_attr=fluid.ParamAttr( - name="pooled_fc.w_0", initializer=self._param_initializer), - bias_attr="pooled_fc.b_0", - act="tanh") - - self.pre_process_layer = PrePostProcessLayer( - "nd", self._emb_size, self._prepostprocess_dropout, None) - - self._encoder = TransformerEncoder( - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - preprocess_cmd="", - postprocess_cmd="dan", - ffn_fc1_act=self._hidden_act) - - def init_parameters(self, param_path="", verbose=False): - 
init_from_static_model(param_path, self, self.config, verbose) - - def forward(self, src_ids, position_ids, sentence_ids, input_mask): - """ - forward - """ - src_emb = self._src_emb(src_ids) - pos_emb = self._pos_emb(position_ids) - sent_emb = self._sent_emb(sentence_ids) - - emb_out = src_emb + pos_emb - emb_out = emb_out + sent_emb - - emb_out = self.pre_process_layer(emb_out) - - self_attn_mask = fluid.layers.matmul( - x=input_mask, y=input_mask, transpose_y=True) - self_attn_mask = fluid.layers.scale( - x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack( - x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - enc_output = self._encoder(emb_out, n_head_self_attn_mask) - - if not self.return_pooled_out: - return enc_output - next_sent_feat = fluid.layers.slice( - input=enc_output, axes=[1], starts=[0], ends=[1]) - next_sent_feat = self.pooled_fc(next_sent_feat) - next_sent_feat = fluid.layers.reshape( - next_sent_feat, shape=[-1, self._emb_size]) - - return enc_output, next_sent_feat diff --git a/hapi/text/bert/data_processor.py b/hapi/text/bert/data_processor.py deleted file mode 100644 index 7429fcb66d12eef7c4b3664d6fc14b3db817476c..0000000000000000000000000000000000000000 --- a/hapi/text/bert/data_processor.py +++ /dev/null @@ -1,676 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
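forward() above turns the padding mask into an additive attention bias: matmul(input_mask, input_mask transposed) is 1 only where both positions are real tokens, and scale(..., scale=10000, bias=-1, bias_after_scale=False) maps that to 0 for real/real pairs and -10000 for any pair involving padding, which drives the corresponding softmax weights to (near) zero. A numpy sketch of the same computation:

import numpy as np

# One example with 4 real tokens and 2 pads, shaped [batch, seq_len, 1]
# like the input_mask fed to BertEncoder.forward() above.
input_mask = np.array([[[1.], [1.], [1.], [1.], [0.], [0.]]], dtype='float32')

pair_mask = np.matmul(input_mask, input_mask.transpose(0, 2, 1))  # [1, 6, 6]
attn_bias = (pair_mask - 1.0) * 10000.0   # 0 for real/real, -10000 otherwise

# Stacked once per head before being added to the attention logits.
n_head = 12
n_head_attn_bias = np.stack([attn_bias] * n_head, axis=1)
print(n_head_attn_bias.shape)             # (1, 12, 6, 6)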
- -import io -import os -import types -import csv -import numpy as np - -import hapi.text.tokenizer.tokenization as tokenization -from hapi.text.bert.batching import prepare_batch_data - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def __init__(self, tokenizer, max_seq_len, in_tokens, random_seed=None): - - self.max_seq_len = max_seq_len - self.tokenizer = tokenizer - self.vocab = self.tokenizer.vocab - - self.in_tokens = in_tokens - np.random.seed(random_seed) - - self.current_train_example = -1 - self.num_examples = {'train': -1, 'dev': -1, 'test': -1} - self.current_train_epoch = -1 - - def get_train_iter(self, - data_dir, - epoch_num=1, - shuffle=True, - shuffle_seed=None): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_iter(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_test_iter(self, data_dir): - """Gets a collection of `InputExample`s for prediction.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - def convert_example(self, index, example, labels, max_seq_len, tokenizer): - """Converts a single `InputExample` into a single `InputFeatures`.""" - feature = convert_single_example(index, example, labels, max_seq_len, - tokenizer) - return feature - - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with io.open(input_file, "r", encoding="utf8") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines - - def generate_instance(self, feature): - """ - generate instance with given feature - - Args: - feature: InputFeatures(object). A single set of features of data. - """ - input_pos = list(range(len(feature.input_ids))) - return [ - feature.input_ids, feature.segment_ids, input_pos, feature.label_id - ] - - def generate_batch_data(self, - batch_data, - total_token_num, - voc_size=-1, - mask_id=-1, - return_input_mask=True, - return_max_len=False, - return_num_token=False): - return prepare_batch_data( - batch_data, - total_token_num, - voc_size=-1, - pad_id=self.vocab["[PAD]"], - cls_id=self.vocab["[CLS]"], - sep_id=self.vocab["[SEP]"], - mask_id=-1, - return_input_mask=True, - return_max_len=False, - return_num_token=False) - - def get_num_examples(self, phase): - """Get number of examples for train, dev or test.""" - if phase not in ['train', 'dev', 'test']: - raise ValueError( - "Unknown phase, which should be in ['train', 'dev', 'test'].") - if phase == 'train': - return len(self.train_examples) - elif phase == 'dev': - return len(self.dev_examples) - elif phase == 'test': - return len(self.test_examples) - else: - raise ValueError( - "Unknown phase, which should be in ['train', 'dev', 'test'].") - - def get_train_progress(self): - """Gets progress for training phase.""" - return self.current_train_example, self.current_train_epoch - - def data_generator(self, data_iter, batch_size, phase='train', - dev_count=1): - """ - Generate data for train, dev or test. - - Args: - batch_size: int. The batch size of generated data. - phase: string. The phase for which to generate data. 
- """ - assert phase in ['train', 'dev', 'test'] - if phase == 'train': - sample_num = len(self.train_examples) - elif phase == 'dev': - sample_num = len(self.dev_examples) - elif phase == 'test': - sample_num = len(self.test_examples) - else: - sample_num = -1 - self.num_examples[phase] = sample_num - - def instance_reader(): - for epoch_idx, example_idx, example in data_iter(): - if phase == 'train': - self.current_train_epoch = epoch_idx - self.current_train_example = example_idx - feature = self.convert_example( - example_idx, example, - self.get_labels(), self.max_seq_len, self.tokenizer) - - instance = self.generate_instance(feature) - yield instance - - def batch_reader(reader, batch_size, in_tokens): - batch, total_token_num, max_len = [], 0, 0 - for instance in reader(): - token_ids, sent_ids, pos_ids, label = instance[:4] - max_len = max(max_len, len(token_ids)) - if in_tokens: - to_append = (len(batch) + 1) * max_len <= batch_size - else: - to_append = len(batch) < batch_size - if to_append: - batch.append(instance) - total_token_num += len(token_ids) - else: - yield batch, total_token_num - batch, total_token_num, max_len = [instance], len( - token_ids), len(token_ids) - - if len(batch) > 0: - yield batch, total_token_num - - def wrapper(): - all_dev_batches = [] - for batch_data, total_token_num in batch_reader( - instance_reader, batch_size, self.in_tokens): - batch_data = self.generate_batch_data( - batch_data, - total_token_num, - voc_size=-1, - mask_id=-1, - return_input_mask=True, - return_max_len=False, - return_num_token=False) - if len(all_dev_batches) < dev_count: - all_dev_batches.append(batch_data) - - if len(all_dev_batches) == dev_count: - for batch in all_dev_batches: - yield batch - all_dev_batches = [] - - return wrapper - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - - -class XnliProcessor(DataProcessor): - """Processor for the XNLI data set.""" - - def get_train_iter(self, - data_dir, - epoch_num=1, - shuffle=True, - shuffle_seed=None): - """See base class.""" - self.language = "zh" - lines = self._read_tsv( - os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % - self.language)) - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "train-%d" % (i) - text_a = tokenization.convert_to_unicode(line[0]) - text_b = tokenization.convert_to_unicode(line[1]) - label = tokenization.convert_to_unicode(line[2]) - if label == tokenization.convert_to_unicode("contradictory"): - label = tokenization.convert_to_unicode("contradiction") - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - - self.train_examples = examples - - def wrapper(): - if shuffle: - if shuffle_seed is not None: - np.random.seed(shuffle_seed) - for epoch_idx in range(epoch_num): - if shuffle: - np.random.shuffle(examples) - for (example_idx, example) in enumerate(examples): - yield epoch_idx, example_idx, example - - return wrapper - - def get_dev_iter(self, data_dir): - """See base class.""" - self.language = "zh" - lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "dev-%d" % (i) - language = tokenization.convert_to_unicode(line[0]) - if language != tokenization.convert_to_unicode(self.language): - continue - text_a = tokenization.convert_to_unicode(line[6]) - text_b = tokenization.convert_to_unicode(line[7]) - label = tokenization.convert_to_unicode(line[1]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - - self.dev_examples = examples - - def wrapper(): - for (example_idx, example) in enumerate(examples): - yield 0, example_idx, example - - return wrapper - - def get_test_iter(self, data_dir): - """See base class.""" - self.language = "zh" - lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv")) - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "test-%d" % (i) - language = tokenization.convert_to_unicode(line[0]) - if language != tokenization.convert_to_unicode(self.language): - continue - text_a = tokenization.convert_to_unicode(line[6]) - text_b = tokenization.convert_to_unicode(line[7]) - label = tokenization.convert_to_unicode(line[1]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - - self.test_examples = examples - - def wrapper(): - for (example_idx, example) in enumerate(examples): - yield 0, example_idx, example - - return wrapper - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_iter(self, - data_dir, - epoch_num=1, - shuffle=True, - shuffle_seed=None): - """See base class.""" - examples = self._create_examples( - self._read_tsv(os.path.join(data_dir, 
"train.tsv")), "train") - - self.train_examples = examples - - def wrapper(): - if shuffle: - if shuffle_seed is not None: - np.random.seed(shuffle_seed) - for epoch_idx in range(epoch_num): - if shuffle: - np.random.shuffle(examples) - for (example_idx, example) in enumerate(examples): - yield epoch_idx, example_idx, example - - return wrapper - - def get_dev_iter(self, data_dir): - """See base class.""" - examples = self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") - - self.dev_examples = examples - - def wrapper(): - for (example_idx, example) in enumerate(examples): - yield 0, example_idx, example - - return wrapper - - def get_test_iter(self, data_dir): - """See base class.""" - examples = self._create_examples( - self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") - - self.test_examples = examples - - def wrapper(): - for (example_idx, example) in enumerate(examples): - yield 0, example_idx, example - - return wrapper - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, - tokenization.convert_to_unicode(line[0])) - text_a = tokenization.convert_to_unicode(line[8]) - text_b = tokenization.convert_to_unicode(line[9]) - if set_type == "test": - label = "contradiction" - else: - label = tokenization.convert_to_unicode(line[-1]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_iter(self, - data_dir, - epoch_num=1, - shuffle=True, - shuffle_seed=None): - """See base class.""" - examples = self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - self.train_examples = examples - - def wrapper(): - if shuffle: - if shuffle_seed is not None: - np.random.seed(shuffle_seed) - for epoch_idx in range(epoch_num): - if shuffle: - np.random.shuffle(examples) - for (example_idx, example) in enumerate(examples): - yield epoch_idx, example_idx, example - - return wrapper - - def get_dev_examples(self, data_dir): - """See base class.""" - examples = self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - self.dev_examples = examples - - def wrapper(): - for (example_idx, example) in enumerate(examples): - yield 0, example_idx, example - - return wrapper - - def get_test_examples(self, data_dir): - """See base class.""" - examples = self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - self.test_examples = examples - - def wrapper(): - for (example_idx, example) in enumerate(examples): - yield 0, example_idx, example - - return wrapper - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = tokenization.convert_to_unicode(line[3]) - text_b = tokenization.convert_to_unicode(line[4]) - if set_type == "test": - label = "0" - else: - label = tokenization.convert_to_unicode(line[0]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return 
examples - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_iter(self, - data_dir, - epoch_num=1, - shuffle=True, - shuffle_seed=None): - """See base class.""" - examples = self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - self.train_examples = examples - - def wrapper(): - if shuffle: - if shuffle_seed is not None: - np.random.seed(shuffle_seed) - for epoch_idx in range(epoch_num): - if shuffle: - np.random.shuffle(examples) - for (example_idx, example) in enumerate(examples): - yield epoch_idx, example_idx, example - - return wrapper - - def get_dev_iter(self, data_dir): - """See base class.""" - examples = self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - self.dev_examples = examples - - def wrapper(): - for (example_idx, example) in enumerate(examples): - yield 0, example_idx, example - - return wrapper - - def get_test_iter(self, data_dir): - """See base class.""" - examples = self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - self.test_examples = examples - - def wrapper(): - for (example_idx, example) in enumerate(examples): - yield 0, example_idx, example - - return wrapper - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - # Only the test set has a header - if set_type == "test" and i == 0: - continue - guid = "%s-%s" % (set_type, i) - if set_type == "test": - text_a = tokenization.convert_to_unicode(line[1]) - label = "0" - else: - text_a = tokenization.convert_to_unicode(line[3]) - label = tokenization.convert_to_unicode(line[1]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -def convert_single_example_to_unicode(guid, single_example): - text_a = tokenization.convert_to_unicode(single_example[0]) - text_b = tokenization.convert_to_unicode(single_example[1]) - label = tokenization.convert_to_unicode(single_example[2]) - return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label) - - -def convert_single_example(ex_index, example, label_list, max_seq_length, - tokenizer): - """Converts a single `InputExample` into a single `InputFeatures`.""" - label_map = {} - for (i, label) in enumerate(label_list): - label_map[label] = i - - tokens_a = tokenizer.tokenize(example.text_a) - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. 
The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - label_id = label_map[example.label] - - feature = InputFeatures( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id) - return feature - - -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer): - """Convert a set of `InputExample`s to a list of `InputFeatures`.""" - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - print("Writing example %d of %d" % (ex_index, len(examples))) - - feature = convert_single_example(ex_index, example, label_list, - max_seq_length, tokenizer) - - features.append(feature) - return features - - -if __name__ == '__main__': - print("hello world") - pass diff --git a/hapi/text/bert/dataloader.py b/hapi/text/bert/dataloader.py deleted file mode 100644 index 838ad50b370c7ff4c9247880adc994be3aedf501..0000000000000000000000000000000000000000 --- a/hapi/text/bert/dataloader.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
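convert_single_example() above builds features in the standard BERT layout: [CLS] + tokens_a + [SEP] (+ tokens_b + [SEP]), segment id 0 for the first segment and 1 for the second, and an input mask of 1 for every real token (padding is added later in the batching step); _truncate_seq_pair keeps len(tokens_a) + len(tokens_b) <= max_seq_length - 3 so the three special tokens always fit. A worked example of the resulting parallel lists (token strings are illustrative; real ids come from tokenizer.convert_tokens_to_ids):

tokens_a = ['is', 'this', 'jack', '##son', '##ville', '?']
tokens_b = ['no', 'it', 'is', 'not', '.']

tokens      = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
input_mask  = [1] * len(tokens)   # 1 for every real (non-padding) token

assert len(tokens) == len(segment_ids) == len(input_mask) == 14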
- -import io -import os -import six -import csv -import glob -import tarfile -import itertools -import leveldb -from functools import partial - -import numpy as np -import paddle.fluid as fluid -from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.io import BatchSampler, DataLoader, Dataset -from hapi.distributed import DistributedBatchSampler -from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor -from hapi.text.bert.batching import prepare_batch_data -import hapi.text.tokenizer.tokenization as tokenization -from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy - -__all__ = [ - 'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset', - 'SentencePairDataset', 'BertDataLoader' -] - - -class BertInputExample(object): - def __init__(self, uid, text_a, text_b=None, label=None): - self.uid = uid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class BertInputFeatures(object): - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.pos_ids = list(range(len(self.input_ids))) - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def convert_single_example_to_unicode(guid, single_example): - text_a = tokenization.convert_to_unicode(single_example[0]) - text_b = tokenization.convert_to_unicode(single_example[1]) - label = tokenization.convert_to_unicode(single_example[2]) - return BertInputExample(uid=uid, text_a=text_a, text_b=text_b, label=label) - - -def convert_single_example(ex_index, example, label_list, max_seq_length, - tokenizer): - """Converts a single `BertInputExample` into a single `BertInputFeatures`.""" - label_map = {} - for (i, label) in enumerate(label_list): - label_map[label] = i - - tokens_a = tokenizer.tokenize(example.text_a) - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. 
- # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] - - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - input_mask = [1] * len(input_ids) - label_id = label_map[example.label] - - feature = BertInputFeatures( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id) - - return feature - - -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer): - """Convert a set of `InputExample`s to a list of `InputFeatures`.""" - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - print("Writing example %d of %d" % (ex_index, len(examples))) - - feature = convert_single_example(ex_index, example, label_list, - max_seq_length, tokenizer) - - features.append(feature) - - return features - - -def _read_tsv(input_file, delimiter="\t", quotechar=None): - """Reads a tab separated value file.""" - with io.open(input_file, "r", encoding="utf8") as f: - reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines - - -class SingleSentenceDataset(Dataset): - def __init__(self, - tokenizer, - label_list, - max_seq_length, - mode="all_in_memory"): - - assert isinstance(mode, - str), "mode of SingleSentenceDataset should be str" - assert mode in [ - "all_in_memory", "leveldb", "streaming" - ], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb, streaming], but get" % mode - - self.delimiter = None - self.mode = mode - self.examples = [] - self._db = None - self._line_processor = None - - def load_all_data_in_memory(self, - input_file, - label_list, - max_seq_length, - tokenizer, - line_processor=None, - delimiter="\t", - quotechar=None): - lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar) - - def default_line_processor(line_id, line): - assert len(line) == 2 - text_a = line[0] - label = line[1] - - return BertInputExample( - str(line_id), text_a=text_a, text_b=None, label=label) - - if line_processor is None: - line_processor = default_line_processor - - for (line_id, line) in enumerate(lines): - input_example = line_processor(line_id, line) - if not input_example: - continue - input_feature = convert_single_example( - str(line_id), input_example, label_list, max_seq_length, - tokenizer) - self.examples.append(input_feature) - - def prepare_leveldb(self, - input_file, - leveldb_file, - label_list, - max_seq_length, - tokenizer, - line_processor=None, - delimiter="\t", - quotechar=None): - def default_line_processor(line_id, line): - assert len(line) == 2 - text_a = line[0] - label = line[1] - - return BertInputExample( - str(line_id), text_a=text_a, text_b=None, label=label) - - if line_processor is None: - line_processor = default_line_processor - - if ParallelEnv().nranks > 1: - leveldb_file = leveldb_file + "_" + str(ParallelEnv().local_rank) - - if not os.path.exists(leveldb_file): - print("putting data %s into leveldb %s" % - (input_file, leveldb_file)) - 
_example_num = 0 - _db = leveldb.LevelDB(leveldb_file, create_if_missing=True) - with io.open(input_file, "r", encoding="utf8") as f: - reader = csv.reader( - f, delimiter=delimiter, quotechar=quotechar) - line_id = 0 - for (_line_id, line) in enumerate(reader): - if line_processor(str(_line_id), line) is None: - continue - - line_str = delimiter.join(line) - _db.Put( - str(line_id).encode("utf8"), line_str.encode("utf8")) - line_id += 1 - _example_num += 1 - _db.Put("_example_num_".encode("utf8"), - str(_example_num).encode("utf8")) - else: - _db = leveldb.LevelDB(leveldb_file, create_if_missing=False) - - self.label_list = label_list - self.max_seq_length = max_seq_length - self.tokenizer = tokenizer - self.delimiter = delimiter - self._db = _db - self._line_processor = line_processor - - def __getitem__(self, idx): - - if self.mode == "all_in_memory": - return self.examples[idx].input_ids, self.examples[ - idx].pos_ids, self.examples[idx].segment_ids, self.examples[ - idx].label_id - - if self.mode == "leveldb": - assert self._db is not None, "you shold call prepare_leveldb before you run dataloader" - line_str = self._db.Get(str(idx).encode("utf8")) - line_str = line_str.decode("utf8") - - line = line_str.split(self.delimiter) - input_example = self._line_processor(str(idx + 1), line) - - input_example = convert_single_example( - str(idx + 1), input_example, self.label_list, - self.max_seq_length, self.tokenizer) - - return input_example.input_ids, input_example.pos_ids, input_example.segment_ids, input_example.label_id - - def __len__(self): - if self.mode == "all_in_memory": - return len(self.examples) - - if self.mode == "leveldb": - assert self._db is not None, "you shold call prepare_leveldb before you run dataloader" - - exmaple_num = self._db.Get("_example_num_".encode("utf8")) - exmaple_num = exmaple_num.decode("utf8") - return int(exmaple_num) - - -class SentencePairDataset(Dataset): - def __init__(self, - tokenizer, - label_ist, - max_seq_length, - mode="all_in_memory"): - - assert isinstance(mode, - str), "mode of SentencePairDataset should be str" - assert mode in [ - "all_in_memory", "leveldb" - ], "mode of SentencePairDataset should be in [all_in_memory, leveldb], but get" % mode - - self.examples = [] - - def load_all_data_in_memory(self, - input_file, - label_list, - max_seq_length, - tokenizer, - line_processor=None, - delimiter="\t", - quotechar=None): - lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar) - - def default_line_processor(line_id, line): - assert len(line) == 3 - text_a = line[0] - text_b = line[1] - label = line[2] - - return BertInputExample( - str(line_id), text_a=text_a, text_b=text_b, label=label) - - if line_processor is None: - line_processor = default_line_processor - - for (line_id, line) in enumerate(lines): - input_example = line_processor(line_id, line) - if not input_example: - continue - input_feature = convert_single_example( - str(line_id), input_example, label_list, max_seq_length, - tokenizer) - self.examples.append(input_feature) - - def __getitem__(self, idx): - return self.examples[idx].input_ids, self.examples[ - idx].pos_ids, self.examples[idx].segment_ids, self.examples[ - idx].label_id - - def __len__(self): - return len(self.examples) - - -def _prepare_train_batch(insts, - vocab_size=0, - pad_id=None, - cls_id=None, - sep_id=None, - mask_id=-1, - return_input_mask=True, - return_max_len=True, - return_num_token=False): - - return prepare_batch_data( - insts, - 0, - voc_size=vocab_size, - pad_id=pad_id, - 
cls_id=cls_id, - sep_id=sep_id, - mask_id=mask_id, - return_input_mask=return_input_mask, - return_max_len=return_max_len, - return_num_token=return_num_token) - - -class BertDataLoader(object): - def __init__(self, - input_file, - tokenizer, - label_list, - max_seq_length, - batch_size, - shuffle=False, - drop_last=False, - mode="all_in_memory", - leveldb_file="./leveldb", - line_processor=None, - delimiter="\t", - quotechar=None, - device=fluid.CPUPlace(), - num_workers=0, - return_list=True, - phase="train"): - - assert phase in [ - "train", "predict", "test" - ], "phase of BertDataLoader should be in [train, predict, test], but get %s" % phase - - self.dataset = SingleSentenceDataset(tokenizer, label_list, - max_seq_length, mode) - - if mode == "all_in_memory": - self.dataset.load_all_data_in_memory( - input_file, label_list, max_seq_length, tokenizer, - line_processor, delimiter, quotechar) - elif mode == "leveldb": - self.dataset.prepare_leveldb(input_file, leveldb_file, label_list, - max_seq_length, tokenizer, - line_processor, delimiter, quotechar) - else: - raise ValueError("mode should be in [all_in_memory, leveldb]") - - if phase == "train": - self.sampler = DistributedBatchSampler( - self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last) - elif phase == "test" or phase == "predict": - self.sampler = BatchSampler( - dataset=self.dataset, - batch_size=batch_size, - shuffle=shuffle, - drop_last=drop_last) - - self.dataloader = DataLoader( - dataset=self.dataset, - batch_sampler=self.sampler, - places=device, - collate_fn=partial( - _prepare_train_batch, - vocab_size=-1, - pad_id=tokenizer.vocab["[PAD]"], - cls_id=tokenizer.vocab["[CLS]"], - sep_id=tokenizer.vocab["[SEP]"], - mask_id=-1, - return_input_mask=True, - return_max_len=False, - return_num_token=False), - num_workers=num_workers, - return_list=return_list) - - -if __name__ == "__main__": - print("hello world.") diff --git a/hapi/text/bert/dygraph_optimization.py b/hapi/text/bert/dygraph_optimization.py deleted file mode 100755 index af84f3cce8139fd08e7e3506b8f828f6a213fdd7..0000000000000000000000000000000000000000 --- a/hapi/text/bert/dygraph_optimization.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
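BertDataLoader above wires the pieces together: a SingleSentenceDataset (in-memory or LevelDB-backed), a (Distributed)BatchSampler, and a paddle DataLoader whose collate function is _prepare_train_batch, so every batch comes out already padded and with its input mask. A hedged usage sketch; the data file, vocab path, and the FullTokenizer name are assumptions for illustration (the tokenization module is referenced above but its API is not shown in this diff):

import paddle.fluid as fluid

from hapi.text.bert import BertDataLoader
import hapi.text.tokenizer.tokenization as tokenization

# Assumption: the bundled tokenization module exposes the usual BERT
# FullTokenizer(vocab_file=..., do_lower_case=...). Paths are illustrative.
tokenizer = tokenization.FullTokenizer(
    vocab_file='./bert_uncased_L-12_H-768_A-12/vocab.txt', do_lower_case=True)

train_loader = BertDataLoader(
    input_file='./data/train.tsv',   # hypothetical 2-column TSV: text \t label
    tokenizer=tokenizer,
    label_list=['0', '1'],
    max_seq_length=128,
    batch_size=32,
    shuffle=True,
    mode='all_in_memory',
    device=fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(),
    phase='train')

for batch in train_loader.dataloader:
    # batch holds the padded src/pos/sent ids, the input mask and the labels
    # produced by _prepare_train_batch above.
    break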
-"""Optimization and learning rate scheduling.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle.fluid as fluid - -from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay - - -class ConstantLR(LearningRateDecay): - def __init__(self, learning_rate, begin=0, step=1, dtype='float32'): - super(ConstantLR, self).__init__(begin, step, dtype) - self.learning_rate = learning_rate - - def step(self): - return self.learning_rate - - -class LinearDecay(LearningRateDecay): - def __init__(self, - learning_rate, - warmup_steps, - decay_steps, - end_learning_rate=0.0001, - power=1.0, - cycle=False, - begin=0, - step=1, - dtype='float32'): - super(LinearDecay, self).__init__(begin, step, dtype) - self.learning_rate = learning_rate - self.warmup_steps = warmup_steps - self.decay_steps = decay_steps - self.end_learning_rate = end_learning_rate - self.power = power - self.cycle = cycle - - def step(self): - if self.step_num < self.warmup_steps: - decayed_lr = self.learning_rate * (self.step_num / - self.warmup_steps) - decayed_lr = self.create_lr_var(decayed_lr) - else: - tmp_step_num = self.step_num - tmp_decay_steps = self.decay_steps - if self.cycle: - div_res = fluid.layers.ceil( - self.create_lr_var(tmp_step_num / float(self.decay_steps))) - if tmp_step_num == 0: - div_res = self.create_lr_var(1.0) - tmp_decay_steps = self.decay_steps * div_res - else: - tmp_step_num = self.create_lr_var( - tmp_step_num - if tmp_step_num < self.decay_steps else self.decay_steps) - decayed_lr = (self.learning_rate - self.end_learning_rate) * \ - ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate - - return decayed_lr - - -class DyOptimizer(object): - def __init__(self, - warmup_steps, - num_train_steps, - learning_rate, - model_cls, - weight_decay, - scheduler='linear_warmup_decay', - loss_scaling=1.0, - parameter_list=None): - self.warmup_steps = warmup_steps - self.num_train_steps = num_train_steps - self.learning_rate = learning_rate - self.model_cls = model_cls - self.weight_decay = weight_decay - self.scheduler = scheduler - self.loss_scaling = loss_scaling - self.parameter_list = parameter_list - - self.scheduled_lr = 0.0 - self.optimizer = self.lr_schedule() - - def lr_schedule(self): - if self.warmup_steps > 0: - if self.scheduler == 'noam_decay': - self.scheduled_lr = fluid.dygraph.NoamDecay(1 / ( - self.warmup_steps * (self.learning_rate**2)), - self.warmup_steps) - elif self.scheduler == 'linear_warmup_decay': - self.scheduled_lr = LinearDecay(self.learning_rate, - self.warmup_steps, - self.num_train_steps, 0.0) - else: - raise ValueError("Unkown learning rate scheduler, should be " - "'noam_decay' or 'linear_warmup_decay'") - optimizer = fluid.optimizer.Adam( - learning_rate=self.scheduled_lr, - parameter_list=self.parameter_list) - else: - self.scheduled_lr = ConstantLR(self.learning_rate) - optimizer = fluid.optimizer.Adam( - learning_rate=self.scheduled_lr, - parameter_list=self.parameter_list) - - return optimizer - - def exclude_from_weight_decay(self, name): - if name.find("layer_norm") > -1: - return True - bias_suffix = ["_bias", "_b", ".b_0"] - for suffix in bias_suffix: - if name.endswith(suffix): - return True - return False - - def state_dict(self): - return self.optimizer.state_dict() - - def set_dict(self, state_dict): - return self.optimizer.set_dict(state_dict) - - def get_opti_var_name_list(self): - return 
self.optimizer.get_opti_var_name_list() - - def current_step_lr(self): - return self.optimizer.current_step_lr() - - def minimize(self, loss, use_data_parallel=False, model=None): - param_list = dict() - - clip_norm_thres = 1.0 - #grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres) - - if use_data_parallel: - loss = model.scale_loss(loss) - - loss.backward() - - if self.weight_decay > 0: - for param in self.model_cls.parameters(): - param_list[param.name] = param * 1.0 - param_list[param.name].stop_gradient = True - - if use_data_parallel: - assert model is not None - model.apply_collective_grads() - - #_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip) - _, param_grads = self.optimizer.minimize(loss) - - if self.weight_decay > 0: - for param, grad in param_grads: - if self.exclude_from_weight_decay(param.name): - continue - if isinstance(self.scheduled_lr.step(), float): - updated_param = param.numpy() - param_list[ - param.name].numpy( - ) * self.weight_decay * self.scheduled_lr.step() - else: - updated_param = param.numpy( - ) - param_list[param.name].numpy( - ) * self.weight_decay * self.scheduled_lr.step().numpy() - updated_param_var = fluid.dygraph.to_variable(updated_param) - param = updated_param_var - #param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape)) diff --git a/hapi/text/bert/optimization.py b/hapi/text/bert/optimization.py deleted file mode 100644 index d9f8d277ca9d64486aa3304ffb91657da287a5f6..0000000000000000000000000000000000000000 --- a/hapi/text/bert/optimization.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.fluid.framework import in_dygraph_mode -from hapi.text.bert.dygraph_optimization import DyOptimizer as DyOptimizer -from hapi.text.bert.static_optimization import StOptimizer as StOptimizer - - -def make_optimizer(warmup_steps, - num_train_steps, - learning_rate, - weight_decay, - model, - scheduler='linear_warmup_decay', - loss_scaling=1.0, - parameter_list=None): - - if in_dygraph_mode(): - return DyOptimizer( - warmup_steps=warmup_steps, - num_train_steps=num_train_steps, - learning_rate=learning_rate, - model_cls=model, - weight_decay=weight_decay, - scheduler=scheduler, - loss_scaling=loss_scaling, - parameter_list=parameter_list) - else: - return StOptimizer( - warmup_steps=warmup_steps, - num_train_steps=num_train_steps, - learning_rate=learning_rate, - weight_decay=weight_decay, - scheduler=scheduler) diff --git a/hapi/text/bert/static_optimization.py b/hapi/text/bert/static_optimization.py deleted file mode 100644 index adc8f87ce3269478cab81a5434d9d71fff30e20e..0000000000000000000000000000000000000000 --- a/hapi/text/bert/static_optimization.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Optimization and learning rate scheduling.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle.fluid as fluid - - -def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps): - """ Applies linear warmup of learning rate from 0 and decay to 0.""" - with fluid.default_main_program()._lr_schedule_guard(): - lr = fluid.layers.tensor.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="scheduled_learning_rate") - - global_step = fluid.layers.learning_rate_scheduler._decay_step_counter( - ) - - with fluid.layers.control_flow.Switch() as switch: - with switch.case(global_step < warmup_steps): - warmup_lr = learning_rate * (global_step / warmup_steps) - fluid.layers.tensor.assign(warmup_lr, lr) - with switch.default(): - decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay( - learning_rate=learning_rate, - decay_steps=num_train_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False) - fluid.layers.tensor.assign(decayed_lr, lr) - - return lr - - -class StOptimizer(fluid.optimizer.Optimizer): - def __init__(self, - warmup_steps, - num_train_steps, - learning_rate, - weight_decay, - scheduler='linear_warmup_decay'): - super(StOptimizer, self).__init__( - learning_rate=learning_rate, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None) - self.warmup_steps = warmup_steps - self.num_train_steps = num_train_steps - self.learning_rate = learning_rate - self.weight_decay = weight_decay - self.scheduler = scheduler - - def minimize(self, loss): - - train_program = fluid.default_main_program() - startup_program = fluid.default_startup_program() - - if self.scheduler == 'noam_decay': - if self.warmup_steps > 0: - scheduled_lr = fluid.layers.learning_rate_scheduler\ - .noam_decay(1/(self.warmup_steps *(self.learning_rate ** 2)), - self.warmup_steps) - else: - print( - "WARNING: noam decay of learning rate should have postive warmup " - "steps but given {}, using constant learning rate instead!" 
- .format(self.warmup_steps)) - scheduled_lr = fluid.layers.create_global_var( - name=fluid.unique_name.generate("learning_rate"), - shape=[1], - value=self.learning_rate, - dtype='float32', - persistable=True) - elif self.scheduler == 'linear_warmup_decay': - if self.warmup_steps > 0: - scheduled_lr = linear_warmup_decay(self.learning_rate, - self.warmup_steps, - self.num_train_steps) - else: - print( - "WARNING: linear warmup decay of learning rate should have " - "postive warmup steps but given {}, use constant learning rate " - "instead!".format(self.warmup_steps)) - scheduled_lr = fluid.layers.create_global_var( - name=fluid.unique_name.generate("learning_rate"), - shape=[1], - value=self.learning_rate, - dtype='float32', - persistable=True) - else: - raise ValueError("Unkown learning rate scheduler, should be " - "'noam_decay' or 'linear_warmup_decay'") - - optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) - - def exclude_from_weight_decay(param): - name = param.name.rstrip(".master") - if name.find("layer_norm") > -1: - return True - bias_suffix = ["_bias", "_b", ".b_0"] - for suffix in bias_suffix: - if name.endswith(suffix): - return True - return False - - param_list = dict() - - if self.weight_decay > 0: - for param in train_program.all_parameters(): - param_list[param.name] = param * 1.0 - param_list[param.name].stop_gradient = True - - _, param_grads = optimizer.minimize(loss) - - if self.weight_decay > 0: - for param, grad in param_grads: - if exclude_from_weight_decay(param): - continue - with param.block.program._optimized_guard( - [param, grad]), fluid.framework.name_scope("weight_decay"): - updated_param = param - param_list[ - param.name] * self.weight_decay * scheduled_lr - fluid.layers.assign(output=param, input=updated_param) diff --git a/hapi/text/bert/utils/__init__.py b/hapi/text/bert/utils/__init__.py deleted file mode 100644 index 89b95c137b760b01623a2b78d66ecebb1b2a5e43..0000000000000000000000000000000000000000 --- a/hapi/text/bert/utils/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
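Both optimizer variants above share two ideas: a learning-rate schedule that warms up linearly from zero and then decays polynomially toward zero over num_train_steps, and a decoupled weight-decay step that, after the Adam update, shrinks every parameter except LayerNorm weights and biases using its pre-update value. A plain-Python restatement of both rules as encoded in LinearDecay / linear_warmup_decay and the two minimize() implementations:

def linear_warmup_then_decay(step, learning_rate, warmup_steps, decay_steps,
                             end_learning_rate=0.0, power=1.0):
    """Linear warmup from 0, then polynomial decay to end_learning_rate."""
    if step < warmup_steps:
        return learning_rate * step / warmup_steps
    step = min(step, decay_steps)
    return ((learning_rate - end_learning_rate) *
            (1 - step / decay_steps) ** power + end_learning_rate)

def decoupled_weight_decay(param_after_adam, param_before_update, name,
                           weight_decay, lr):
    """Post-Adam shrink toward zero, skipping LayerNorm and bias parameters."""
    if 'layer_norm' in name or name.endswith(('_bias', '_b', '.b_0')):
        return param_after_adam
    return param_after_adam - param_before_update * weight_decay * lr

print([round(linear_warmup_then_decay(s, 5e-5, 100, 1000), 8)
       for s in (0, 50, 100, 500, 1000)])   # 0 -> peak near warmup end -> 0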
- -from hapi.text.bert.utils.args import str2bool as str2bool -from hapi.text.bert.utils.args import ArgumentGroup as ArgumentGroup -from hapi.text.bert.utils.args import print_arguments as print_arguments -from hapi.text.bert.utils.args import check_cuda as check_cuda - -from hapi.text.bert.utils.cards import get_cards as get_cards - -from hapi.text.bert.utils.fp16 import cast_fp16_to_fp32 as cast_fp16_to_fp32 -from hapi.text.bert.utils.fp16 import cast_fp32_to_fp16 as cast_fp32_to_fp16 -from hapi.text.bert.utils.fp16 import copy_to_master_param as copy_to_master_param -from hapi.text.bert.utils.fp16 import create_master_params_grads as create_master_params_grads -from hapi.text.bert.utils.fp16 import master_param_to_train_param as master_param_to_train_param - -from hapi.text.bert.utils.init import init_checkpoint as init_checkpoint -from hapi.text.bert.utils.init import init_pretraining_params as init_pretraining_params -from hapi.text.bert.utils.init import init_from_static_model as init_from_static_model diff --git a/hapi/text/bert/utils/args.py b/hapi/text/bert/utils/args.py deleted file mode 100644 index 66e9bb81a35bb4cc4c8c79cac4631841742bdeb8..0000000000000000000000000000000000000000 --- a/hapi/text/bert/utils/args.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Arguments for configuration.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import argparse - -import paddle.fluid as fluid - - -def str2bool(v): - # because argparse does not support to parse "true, False" as python - # boolean directly - return v.lower() in ("true", "t", "1") - - -class ArgumentGroup(object): - def __init__(self, parser, title, des): - self._group = parser.add_argument_group(title=title, description=des) - - def add_arg(self, name, type, default, help, **kwargs): - type = str2bool if type == bool else type - self._group.add_argument( - "--" + name, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -def print_arguments(args): - print('----------- Configuration Arguments -----------') - for arg, value in sorted(six.iteritems(vars(args))): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - -def check_cuda(use_cuda, err = \ - "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \ - Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. 
Set use_cuda = False to run models on CPU.\n" - ): - try: - if use_cuda == True and fluid.is_compiled_with_cuda() == False: - print(err) - sys.exit(1) - except Exception as e: - pass diff --git a/hapi/text/bert/utils/cards.py b/hapi/text/bert/utils/cards.py deleted file mode 100644 index 70c58ee30da7f68f00d12af0b5dc1025dad42630..0000000000000000000000000000000000000000 --- a/hapi/text/bert/utils/cards.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - - -def get_cards(): - """ - get gpu cards number - """ - num = 0 - cards = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cards != '': - num = len(cards.split(",")) - return num diff --git a/hapi/text/bert/utils/convert_static_to_dygraph.py b/hapi/text/bert/utils/convert_static_to_dygraph.py deleted file mode 100755 index cbd4f7f74003cbcb1f7f800e7f72e69fbbb3a5f9..0000000000000000000000000000000000000000 --- a/hapi/text/bert/utils/convert_static_to_dygraph.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
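The args.py and cards.py helpers deleted above mainly wrap argparse so that boolean flags can be passed as strings such as "true" or "False" and grouped options get a uniform " Default: ..." suffix in their help text. A minimal standalone sketch of that pattern with hypothetical flag names:

    import argparse

    def str2bool(v):
        # argparse does not turn the strings "True"/"false" into booleans by itself
        return v.lower() in ("true", "t", "1")

    parser = argparse.ArgumentParser()
    run_g = parser.add_argument_group(title="run_type", description="running type options.")
    run_g.add_argument("--use_cuda", type=str2bool, default=True,
                       help="If set, use GPU for training. Default: %(default)s.")
    run_g.add_argument("--epoch", type=int, default=3,
                       help="Number of epochs. Default: %(default)s.")

    args = parser.parse_args(["--use_cuda", "False", "--epoch", "10"])
    print(args.use_cuda, args.epoch)  # -> False 10

(The deleted check_cuda also calls sys.exit without importing sys; its surrounding try/except happens to swallow the resulting NameError, so it prints the error but never actually exits on CPU-only installs.)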
-import shutil -import sys -import os - - -def usage(): - """ - usage information - """ - print - print("please use command: ") - print( - "python convert_static_to_dygraph.py input_params_dir output_params_dir" - ) - print - - -def convert_static_to_dygraph(static_model_path, dygraph_model_path): - """ - convert paddle static bert model to dygraph model - """ - - def mkdir(path): - if not os.path.isdir(path): - if os.path.split(path)[0]: - mkdir(os.path.split(path)[0]) - else: - return - os.mkdir(path) - - if os.path.exists(dygraph_model_path): - shutil.rmtree(dygraph_model_path) - mkdir(dygraph_model_path) - - if not os.path.exists(static_model_path): - print("paddle static model path doesn't exist.....") - return -1 - - file_list = [] - for root, dirs, files in os.walk(static_model_path): - file_list.extend(files) - - os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0")) - os.makedirs( - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0")) - os.makedirs( - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/PrePostProcessLayer_0")) - os.makedirs( - os.path.join( - dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0")) - - #os.chdir(static_model_path) - #convert embedding file - embedding_type = ["word", "pos", "sent"] - for i in range(3): - src_name = embedding_type[i] + "_embedding" - trg_name = "Embedding_" + str(i) + "." + src_name - shutil.copyfile( - os.path.join(static_model_path, src_name), - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0/" + trg_name)) - - #convert pre_encoder file - shutil.copyfile( - os.path.join(static_model_path, "pre_encoder_layer_norm_scale"), - os.path.join( - dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale" - )) - shutil.copyfile( - os.path.join(static_model_path, "pre_encoder_layer_norm_bias"), - os.path.join( - dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias" - )) - - #convert mask lm params file - shutil.copyfile( - os.path.join(static_model_path, "mask_lm_out_fc.b_0"), - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0")) - shutil.copyfile( - os.path.join(static_model_path, "mask_lm_trans_fc.b_0"), - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0")) - shutil.copyfile( - os.path.join(static_model_path, "mask_lm_trans_fc.w_0"), - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0")) - shutil.copyfile( - os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"), - os.path.join( - dygraph_model_path, - "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias" - )) - shutil.copyfile( - os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"), - os.path.join( - dygraph_model_path, - "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale" - )) - shutil.copyfile( - os.path.join(static_model_path, "next_sent_fc.b_0"), - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/FC_1.next_sent_fc.b_0")) - shutil.copyfile( - os.path.join(static_model_path, "next_sent_fc.w_0"), - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/FC_1.next_sent_fc.w_0")) - shutil.copyfile( - os.path.join(static_model_path, "pooled_fc.b_0"), - os.path.join( - dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0")) - shutil.copyfile( - os.path.join(static_model_path, 
"pooled_fc.w_0"), - os.path.join( - dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0")) - - encoder_num = 0 - for f in file_list: - if not f.startswith("encoder_layer"): - continue - layer_num = f.split('_')[2] - if int(layer_num) > encoder_num: - encoder_num = int(layer_num) - - encoder_num += 1 - for i in range(encoder_num): - encoder_dir = "EncoderSubLayer_" + str(i) - os.makedirs( - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0/" + - "EncoderLayer_0/", encoder_dir)) - os.makedirs( - os.path.join(dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0/" + - "EncoderLayer_0/", encoder_dir + - "/PositionwiseFeedForwardLayer_0")) - os.makedirs( - os.path.join( - dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" + - "EncoderLayer_0/", encoder_dir + "/MultiHeadAttentionLayer_0")) - os.makedirs( - os.path.join( - dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" + - "EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_1")) - os.makedirs( - os.path.join( - dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" + - "EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_3")) - - encoder_map_dict = { - "ffn_fc_0.b_0": - ("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.b_0"), - "ffn_fc_0.w_0": - ("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.w_0"), - "ffn_fc_1.b_0": - ("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.b_0"), - "ffn_fc_1.w_0": - ("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.w_0"), - "multi_head_att_key_fc.b_0": - ("MultiHeadAttentionLayer_0", "FC_1.key_fc.b_0"), - "multi_head_att_key_fc.w_0": - ("MultiHeadAttentionLayer_0", "FC_1.key_fc.w_0"), - "multi_head_att_output_fc.b_0": - ("MultiHeadAttentionLayer_0", "FC_3.output_fc.b_0"), - "multi_head_att_output_fc.w_0": - ("MultiHeadAttentionLayer_0", "FC_3.output_fc.w_0"), - "multi_head_att_query_fc.b_0": - ("MultiHeadAttentionLayer_0", "FC_0.query_fc.b_0"), - "multi_head_att_query_fc.w_0": - ("MultiHeadAttentionLayer_0", "FC_0.query_fc.w_0"), - "multi_head_att_value_fc.b_0": - ("MultiHeadAttentionLayer_0", "FC_2.value_fc.b_0"), - "multi_head_att_value_fc.w_0": - ("MultiHeadAttentionLayer_0", "FC_2.value_fc.w_0"), - "post_att_layer_norm_bias": - ("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_bias"), - "post_att_layer_norm_scale": - ("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_scale"), - "post_ffn_layer_norm_bias": - ("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_bias"), - "post_ffn_layer_norm_scale": - ("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_scale") - } - - for f in file_list: - if not f.startswith("encoder_layer"): - continue - layer_num = f.split('_')[2] - suffix_name = "_".join(f.split('_')[3:]) - in_dir = encoder_map_dict[suffix_name][0] - rename = encoder_map_dict[suffix_name][1] - encoder_layer = "EncoderSubLayer_" + layer_num - shutil.copyfile( - os.path.join(static_model_path, f), - os.path.join( - dygraph_model_path, - "PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" + - encoder_layer + "/" + in_dir + "/" + rename)) - - -if __name__ == "__main__": - - if len(sys.argv) < 3: - usage() - exit(1) - static_model_path = sys.argv[1] - dygraph_model_path = sys.argv[2] - convert_static_to_dygraph(static_model_path, dygraph_model_path) diff --git a/hapi/text/bert/utils/fp16.py b/hapi/text/bert/utils/fp16.py deleted file mode 100644 index e153c2b9a1029897def264278c5dbe72e1f369f5..0000000000000000000000000000000000000000 --- a/hapi/text/bert/utils/fp16.py +++ /dev/null @@ -1,97 
+0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import paddle -import paddle.fluid as fluid - - -def cast_fp16_to_fp32(i, o, prog): - prog.global_block().append_op( - type="cast", - inputs={"X": i}, - outputs={"Out": o}, - attrs={ - "in_dtype": fluid.core.VarDesc.VarType.FP16, - "out_dtype": fluid.core.VarDesc.VarType.FP32 - }) - - -def cast_fp32_to_fp16(i, o, prog): - prog.global_block().append_op( - type="cast", - inputs={"X": i}, - outputs={"Out": o}, - attrs={ - "in_dtype": fluid.core.VarDesc.VarType.FP32, - "out_dtype": fluid.core.VarDesc.VarType.FP16 - }) - - -def copy_to_master_param(p, block): - v = block.vars.get(p.name, None) - if v is None: - raise ValueError("no param name %s found!" % p.name) - new_p = fluid.framework.Parameter( - block=block, - shape=v.shape, - dtype=fluid.core.VarDesc.VarType.FP32, - type=v.type, - lod_level=v.lod_level, - stop_gradient=p.stop_gradient, - trainable=p.trainable, - optimize_attr=p.optimize_attr, - regularizer=p.regularizer, - gradient_clip_attr=p.gradient_clip_attr, - error_clip=p.error_clip, - name=v.name + ".master") - return new_p - - -def create_master_params_grads(params_grads, main_prog, startup_prog, - loss_scaling): - master_params_grads = [] - tmp_role = main_prog._current_role - OpRole = fluid.core.op_proto_and_checker_maker.OpRole - main_prog._current_role = OpRole.Backward - for p, g in params_grads: - # create master parameters - master_param = copy_to_master_param(p, main_prog.global_block()) - startup_master_param = startup_prog.global_block()._clone_variable( - master_param) - startup_p = startup_prog.global_block().var(p.name) - cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog) - # cast fp16 gradients to fp32 before apply gradients - if g.name.find("layer_norm") > -1: - if loss_scaling > 1: - scaled_g = g / float(loss_scaling) - else: - scaled_g = g - master_params_grads.append([p, scaled_g]) - continue - master_grad = fluid.layers.cast(g, "float32") - if loss_scaling > 1: - master_grad = master_grad / float(loss_scaling) - master_params_grads.append([master_param, master_grad]) - main_prog._current_role = tmp_role - return master_params_grads - - -def master_param_to_train_param(master_params_grads, params_grads, main_prog): - for idx, m_p_g in enumerate(master_params_grads): - train_p, _ = params_grads[idx] - if train_p.name.find("layer_norm") > -1: - continue - with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): - cast_fp32_to_fp16(m_p_g[0], train_p, main_prog) diff --git a/hapi/text/bert/utils/init.py b/hapi/text/bert/utils/init.py deleted file mode 100644 index 48087ee750e637ca3201f2b9115b8ca7df60d54b..0000000000000000000000000000000000000000 --- a/hapi/text/bert/utils/init.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -import six -import ast -import copy - -import numpy as np -import paddle.fluid as fluid - - -def cast_fp32_to_fp16(exe, main_program): - print("Cast parameters to float16 data format.") - for param in main_program.global_block().all_parameters(): - if not param.name.endswith(".master"): - param_t = fluid.global_scope().find_var(param.name).get_tensor() - data = np.array(param_t) - if param.name.find("layer_norm") == -1: - param_t.set(np.float16(data).view(np.uint16), exe.place) - master_param_var = fluid.global_scope().find_var(param.name + - ".master") - if master_param_var is not None: - master_param_var.get_tensor().set(data, exe.place) - - -def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False): - assert os.path.exists( - init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path - - def existed_persitables(var): - if not fluid.io.is_persistable(var): - return False - return os.path.exists(os.path.join(init_checkpoint_path, var.name)) - - fluid.io.load_vars( - exe, - init_checkpoint_path, - main_program=main_program, - predicate=existed_persitables) - print("Load model from {}".format(init_checkpoint_path)) - - if use_fp16: - cast_fp32_to_fp16(exe, main_program) - - -def init_pretraining_params(exe, - pretraining_params_path, - main_program, - use_fp16=False): - assert os.path.exists(pretraining_params_path - ), "[%s] cann't be found." 
% pretraining_params_path - - def existed_params(var): - if not isinstance(var, fluid.framework.Parameter): - return False - return os.path.exists(os.path.join(pretraining_params_path, var.name)) - - fluid.io.load_vars( - exe, - pretraining_params_path, - main_program=main_program, - predicate=existed_params) - print("Load pretraining parameters from {}.".format( - pretraining_params_path)) - - if use_fp16: - cast_fp32_to_fp16(exe, main_program) - - -def init_from_static_model(dir_path, - backbone_model, - bert_config, - verbose=False): - def load_numpy_weight(file_name): - if six.PY2: - res = np.load(os.path.join(dir_path, file_name), allow_pickle=True) - else: - res = np.load( - os.path.join(dir_path, file_name), - allow_pickle=True, - encoding='latin1') - assert res is not None - return res - - # load word embedding - _param = load_numpy_weight("word_embedding") - backbone_model._src_emb.set_dict({"weight": _param}) - if verbose: - print("INIT word embedding") - - _param = load_numpy_weight("pos_embedding") - backbone_model._pos_emb.set_dict({"weight": _param}) - if verbose: - print("INIT pos embedding") - - _param = load_numpy_weight("sent_embedding") - backbone_model._sent_emb.set_dict({"weight": _param}) - if verbose: - print("INIT sent embedding") - - _param0 = load_numpy_weight("pooled_fc.w_0") - _param1 = load_numpy_weight("pooled_fc.b_0") - backbone_model.pooled_fc.set_dict({"weight": _param0, "bias": _param1}) - if verbose: - print("INIT pooled_fc") - - _param0 = load_numpy_weight("pre_encoder_layer_norm_scale") - _param1 = load_numpy_weight("pre_encoder_layer_norm_bias") - backbone_model.pre_process_layer._sub_layers["layer_norm_0"].set_dict({ - "weight": _param0, - "bias": _param1 - }) - if verbose: - print("INIT pre_encoder layer norm") - - for _i in range(bert_config["num_hidden_layers"]): - _param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i - _param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i - - _param_weight = load_numpy_weight(_param_weight) - _param_bias = load_numpy_weight(_param_bias) - - backbone_model._encoder._sub_layers["layer_%d" % - _i].self_attn.q_fc.set_dict({ - "weight": _param_weight, - "bias": _param_bias - }) - if verbose: - print("INIT multi_head_att_query_fc %d" % _i) - - _param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i - _param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i - - _param_weight = load_numpy_weight(_param_weight) - _param_bias = load_numpy_weight(_param_bias) - - backbone_model._encoder._sub_layers["layer_%d" % - _i].self_attn.k_fc.set_dict({ - "weight": _param_weight, - "bias": _param_bias - }) - if verbose: - print("INIT multi_head_att_key_fc %d" % _i) - - _param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i - _param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i - - _param_weight = load_numpy_weight(_param_weight) - _param_bias = load_numpy_weight(_param_bias) - - backbone_model._encoder._sub_layers["layer_%d" % - _i].self_attn.v_fc.set_dict({ - "weight": _param_weight, - "bias": _param_bias - }) - if verbose: - print("INIT multi_head_att_value_fc %d" % _i) - - # init output fc - _param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i - _param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i - - _param_weight = load_numpy_weight(_param_weight) - _param_bias = load_numpy_weight(_param_bias) - - backbone_model._encoder._sub_layers["layer_%d" % - _i].self_attn.proj_fc.set_dict({ - "weight": _param_weight, - "bias": _param_bias - }) 
- if verbose: - print("INIT multi_head_att_output_fc %d" % _i) - - # init layer_norm 1 - _param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i - _param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i - - _param_weight = load_numpy_weight(_param_weight) - _param_bias = load_numpy_weight(_param_bias) - - backbone_model._encoder._sub_layers[ - "layer_%d" % _i].postprocesser1.layer_norm_0.set_dict({ - "weight": _param_weight, - "bias": _param_bias - }) - if verbose: - print("INIT layer norm in attention at %d layer" % _i) - - # init layer_norm 2 - _param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i - _param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i - - _param_weight = load_numpy_weight(_param_weight) - _param_bias = load_numpy_weight(_param_bias) - - backbone_model._encoder._sub_layers[ - "layer_%d" % _i].postprocesser2.layer_norm_0.set_dict({ - "weight": _param_weight, - "bias": _param_bias - }) - if verbose: - print("INIT layer norm in FFN at %d layer" % _i) - - # init FFN 1 - _param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i - _param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i - - _param_weight = load_numpy_weight(_param_weight) - _param_bias = load_numpy_weight(_param_bias) - - backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc1.set_dict({ - "weight": _param_weight, - "bias": _param_bias - }) - if verbose: - print("INIT FFN-1 at %d layer" % _i) - - # init FFN 2 - _param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i - _param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i - - _param_weight = load_numpy_weight(_param_weight) - _param_bias = load_numpy_weight(_param_bias) - - backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc2.set_dict({ - "weight": _param_weight, - "bias": _param_bias - }) - if verbose: - print("INIT FFN-2 at %d layer" % _i) - - return True diff --git a/hapi/text/senta/__init__.py b/hapi/text/senta/__init__.py deleted file mode 100644 index fb3894939ed6ceff1b33147bb755dcfff305d675..0000000000000000000000000000000000000000 --- a/hapi/text/senta/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from hapi.text.senta.data_processer import SentaProcessor diff --git a/hapi/text/senta/data_processer.py b/hapi/text/senta/data_processer.py deleted file mode 100644 index a850c0d03af038451ae080b1bf0d3485f8dd3879..0000000000000000000000000000000000000000 --- a/hapi/text/senta/data_processer.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -from hapi.text.senta.data_reader import load_vocab -from hapi.text.senta.data_reader import data_reader -from paddle.io import DataLoader - - -class SentaProcessor(object): - def __init__(self, data_dir, vocab_path, random_seed=None): - self.data_dir = data_dir - self.vocab = load_vocab(vocab_path) - self.num_examples = {"train": -1, "dev": -1, "infer": -1} - np.random.seed(random_seed) - - def get_train_examples(self, data_dir, epoch, shuffle, batch_size, places, padding_size): - train_reader = data_reader((self.data_dir + "/train.tsv"), self.vocab, - self.num_examples, "train", epoch, padding_size, shuffle) - loader = DataLoader.from_generator(capacity=50, return_list=True) - loader.set_sample_generator(train_reader, batch_size=batch_size, drop_last=False, places=places) - return loader - - - def get_dev_examples(self, data_dir, epoch, shuffle, batch_size, places, padding_size): - dev_reader = data_reader((self.data_dir + "/dev.tsv"), self.vocab, - self.num_examples, "dev", epoch, padding_size, shuffle) - loader = DataLoader.from_generator(capacity=50, return_list=True) - loader.set_sample_generator(dev_reader, batch_size=batch_size, drop_last=False, places=places) - return loader - - def get_test_examples(self, data_dir, epoch, batch_size, places, padding_size): - test_reader = data_reader((self.data_dir + "/test.tsv"), self.vocab, - self.num_examples, "infer", epoch, padding_size) - loader = DataLoader.from_generator(capacity=50, return_list=True) - loader.set_sample_generator(test_reader, batch_size=batch_size, drop_last=False, places=places) - return loader - - def get_labels(self): - return ["0", "1"] - - def get_num_examples(self, phase): - if phase not in ['train', 'dev', 'infer']: - raise ValueError( - "Unknown phase, which should be in ['train', 'dev', 'infer'].") - return self.num_examples[phase] - - def get_train_progress(self): - return self.current_train_example, self.current_train_epoch - - def data_generator(self, padding_size, batch_size, places, phase='train', epoch=1, shuffle=True): - if phase == "train": - return self.get_train_examples(self.data_dir, epoch, shuffle, batch_size, places, padding_size) - elif phase == "dev": - return self.get_dev_examples(self.data_dir, epoch, shuffle, batch_size, places, padding_size) - elif phase == "infer": - return self.get_test_examples(self.data_dir, epoch, batch_size, places, padding_size) - else: - raise ValueError( - "Unknown phase, which should be in ['train', 'dev', 'infer'].") diff --git a/hapi/text/senta/data_reader.py b/hapi/text/senta/data_reader.py deleted file mode 100644 index 47fbec7fc78fb3058342481e53177d943950d419..0000000000000000000000000000000000000000 --- a/hapi/text/senta/data_reader.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import io -import sys -import random - - -def str2bool(v): - return v.lower() in ("true", "t", "1") - - -def data_reader(file_path, word_dict, num_examples, phrase, epoch, padding_size, shuffle=False): - unk_id = len(word_dict) - all_data = [] - with io.open(file_path, "r", encoding='utf8') as fin: - for line in fin: - if line.startswith('text_a'): - continue - cols = line.strip().split("\t") - if len(cols) != 2: - sys.stderr.write("[NOTICE] Error Format Line!") - continue - label = [int(cols[1])] - wids = [ - word_dict[x] if x in word_dict else unk_id - for x in cols[0].split(" ") - ] - wids = wids[:padding_size] - while len(wids) < padding_size: - wids.append(unk_id) - all_data.append((wids, label)) - - if shuffle: - if phrase == "train": - random.shuffle(all_data) - - num_examples[phrase] = len(all_data) - - def reader(): - for epoch_index in range(epoch): - for doc, label in all_data: - yield doc, label - - return reader - - -def load_vocab(file_path): - vocab = {} - with io.open(file_path, 'r', encoding='utf8') as f: - wid = 0 - for line in f: - if line.strip() not in vocab: - vocab[line.strip()] = wid - wid += 1 - vocab[""] = len(vocab) - return vocab diff --git a/hapi/text/sequence_tagging/__init__.py b/hapi/text/sequence_tagging/__init__.py deleted file mode 100644 index 18d16eeefe522f30afc8afba5c6cf3be86db65ae..0000000000000000000000000000000000000000 --- a/hapi/text/sequence_tagging/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
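The senta data_reader deleted above maps each whitespace-split word to an id, sends out-of-vocabulary words to one reserved extra id, then truncates or right-pads every example to padding_size. A minimal sketch of that encoding step with a made-up helper name and toy vocabulary:

    def encode_example(text, vocab, padding_size):
        """Word ids, truncated to padding_size and right-padded with the OOV id."""
        unk_id = len(vocab)                              # reserved for unknown words and padding
        ids = [vocab.get(w, unk_id) for w in text.split(" ")]
        ids = ids[:padding_size]                         # truncate long sequences
        ids += [unk_id] * (padding_size - len(ids))      # right-pad short sequences
        return ids

    vocab = {"the": 0, "movie": 1, "was": 2, "great": 3}
    print(encode_example("the movie was great fun", vocab, padding_size=8))
    # -> [0, 1, 2, 3, 4, 4, 4, 4]

In the deleted code the vocabulary comes from load_vocab, which assigns consecutive ids to the lines of the vocab file, and data_reader uses len(word_dict) as the shared unknown/padding id, exactly as the sketch does.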
- -from hapi.text.sequence_tagging.reader import LacDataset as LacDataset -from hapi.text.sequence_tagging.reader import LacDataLoader as LacDataLoader -from hapi.text.sequence_tagging.sequence_tagging import SeqTagging as SeqTagging -from hapi.text.sequence_tagging.sequence_tagging import Chunk_eval as Chunk_eval -from hapi.text.sequence_tagging.sequence_tagging import LacLoss as LacLoss -from hapi.text.sequence_tagging.sequence_tagging import ChunkEval as ChunkEval -from hapi.text.sequence_tagging.utils.configure import PDConfig as PDConfig -from hapi.text.sequence_tagging.utils.check import check_gpu as check_gpu -from hapi.text.sequence_tagging.utils.check import check_version as check_version - diff --git a/hapi/text/sequence_tagging/reader.py b/hapi/text/sequence_tagging/reader.py deleted file mode 100644 index 991a24e867c5d171247a1497f744cb96d0758f8b..0000000000000000000000000000000000000000 --- a/hapi/text/sequence_tagging/reader.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -SequenceTagging dataset -""" - -from __future__ import division -from __future__ import print_function - -import io -import os -import numpy as np -import shutil -from functools import partial - -import paddle -from paddle.io import BatchSampler, DataLoader, Dataset -from paddle.fluid.dygraph.parallel import ParallelEnv -from hapi.distributed import DistributedBatchSampler - - -class LacDataset(Dataset): - """ - Load lexical analysis dataset - """ - - def __init__(self, args): - self.word_dict_path = args.word_dict_path - self.label_dict_path = args.label_dict_path - self.word_rep_dict_path = args.word_rep_dict_path - self._load_dict() - self.examples = [] - - def _load_dict(self): - self.word2id_dict = self.load_kv_dict( - self.word_dict_path, reverse=True, value_func=np.int64) - self.id2word_dict = self.load_kv_dict(self.word_dict_path) - self.label2id_dict = self.load_kv_dict( - self.label_dict_path, reverse=True, value_func=np.int64) - self.id2label_dict = self.load_kv_dict(self.label_dict_path) - if self.word_rep_dict_path is None: - self.word_replace_dict = dict() - else: - self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path) - - def load_kv_dict(self, - dict_path, - reverse=False, - delimiter="\t", - key_func=None, - value_func=None): - """ - Load key-value dict from file - """ - result_dict = {} - for line in io.open(dict_path, "r", encoding='utf8'): - terms = line.strip("\n").split(delimiter) - if len(terms) != 2: - continue - if reverse: - value, key = terms - else: - key, value = terms - if key in result_dict: - raise KeyError("key duplicated with [%s]" % (key)) - if key_func: - key = key_func(key) - if value_func: - value = value_func(value) - result_dict[key] = value - return result_dict - - @property - def vocab_size(self): - return max(self.word2id_dict.values()) + 1 - - @property - def num_labels(self): - return max(self.label2id_dict.values()) + 1 - - def get_num_examples(self, 
filename): - """num of line of file""" - return sum(1 for line in io.open(filename, "r", encoding='utf8')) - - def word_to_ids(self, words): - """convert word to word index""" - word_ids = [] - for word in words: - word = self.word_replace_dict.get(word, word) - if word not in self.word2id_dict: - word = "OOV" - word_id = self.word2id_dict[word] - word_ids.append(word_id) - - return word_ids - - def label_to_ids(self, labels): - """convert label to label index""" - label_ids = [] - for label in labels: - if label not in self.label2id_dict: - label = "O" - label_id = self.label2id_dict[label] - label_ids.append(label_id) - return label_ids - - def file_reader(self, filename, phase="train"): - """ - yield (word_idx, target_idx) one by one from file, - or yield (word_idx, ) in `infer` mode - """ - self.phase = phase - with io.open(filename, "r", encoding="utf8") as fr: - if phase in ["train", "test"]: - headline = next(fr) - headline = headline.strip().split('\t') - assert len(headline) == 2 and headline[ - 0] == "text_a" and headline[1] == "label" - - for line in fr: - line_str = line.strip("\n") - if len(line_str) < 1 and len(line_str.split('\t')) < 2: - continue - - self.examples.append(line_str) - else: - for idx, line in enumerate(fr): - words = line.strip("\n").split("\t")[0] - self.examples.append(words) - - def __getitem__(self, idx): - line_str = self.examples[idx] - if self.phase in ["train", "test"]: - words, labels = line_str.split('\t') - word_ids = self.word_to_ids(words.split("\002")) - label_ids = self.label_to_ids(labels.split("\002")) - assert len(word_ids) == len(label_ids) - return word_ids, label_ids - else: - words = [w for w in line_str] - word_ids = self.word_to_ids(words) - return word_ids - - def __len__(self): - - return len(self.examples) - - -def create_lexnet_data_generator(args, insts, phase="train"): - def padding_data(max_len, batch_data, if_len=False): - padding_batch_data = [] - padding_lens = [] - for data in batch_data: - data = data[:max_len] - if if_len: - seq_len = np.int64(len(data)) - padding_lens.append(seq_len) - data += [0 for _ in range(max_len - len(data))] - padding_batch_data.append(data) - if if_len: - return np.array(padding_batch_data), np.array(padding_lens) - else: - return np.array(padding_batch_data) - - if phase == "train": - batch_words = [inst[0] for inst in insts] - batch_labels = [inst[1] for inst in insts] - padding_batch_words, padding_lens = padding_data( - args.max_seq_len, batch_words, if_len=True) - padding_batch_labels = padding_data(args.max_seq_len, batch_labels) - return [ - padding_batch_words, padding_lens, padding_batch_labels, - padding_batch_labels - ] - elif phase == "test": - batch_words = [inst[0] for inst in insts] - seq_len = [len(inst[0]) for inst in insts] - max_seq_len = max(seq_len) - batch_labels = [inst[1] for inst in insts] - padding_batch_words, padding_lens = padding_data( - max_seq_len, batch_words, if_len=True) - padding_batch_labels = padding_data(max_seq_len, batch_labels) - return [ - padding_batch_words, padding_lens, padding_batch_labels, - padding_batch_labels - ] - else: - batch_words = insts - seq_len = [len(inst) for inst in insts] - max_seq_len = max(seq_len) - padding_batch_words, padding_lens = padding_data( - max_seq_len, batch_words, if_len=True) - return [padding_batch_words, padding_lens] - - -class LacDataLoader(object): - def __init__(self, - args, - place, - phase="train", - shuffle=False, - num_workers=0, - drop_last=False): - assert phase in [ - "train", "test", "predict" - ], 
"phase should be in [train, test, predict], but get %s" % phase - - if phase == "train": - file_name = args.train_file - elif phase == "test": - file_name = args.test_file - elif phase == "predict": - file_name = args.predict_file - - self.dataset = LacDataset(args) - self.dataset.file_reader(file_name, phase=phase) - - if phase == "train": - self.sampler = DistributedBatchSampler( - dataset=self.dataset, - batch_size=args.batch_size, - shuffle=shuffle, - drop_last=drop_last) - else: - self.sampler = BatchSampler( - dataset=self.dataset, - batch_size=args.batch_size, - shuffle=shuffle, - drop_last=drop_last) - - self.dataloader = DataLoader( - dataset=self.dataset, - batch_sampler=self.sampler, - places=place, - collate_fn=partial( - create_lexnet_data_generator, args, phase=phase), - num_workers=num_workers, - return_list=True) diff --git a/hapi/text/sequence_tagging/sequence_tagging.py b/hapi/text/sequence_tagging/sequence_tagging.py deleted file mode 100644 index e4cd3cc363e04c2fc4e92c2b03f52e2896efad01..0000000000000000000000000000000000000000 --- a/hapi/text/sequence_tagging/sequence_tagging.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -SequenceTagging network structure -""" - -from __future__ import division -from __future__ import print_function - -import io -import os -import sys -import math -import argparse -import numpy as np - -from hapi.metrics import Metric -from hapi.model import Model, Input, set_device -from hapi.loss import Loss -from hapi.text.text import SequenceTagging - -from hapi.text.sequence_tagging.utils.check import check_gpu, check_version -from hapi.text.sequence_tagging.utils.configure import PDConfig - -import paddle.fluid as fluid -from paddle.fluid.optimizer import AdamOptimizer - - -class SeqTagging(Model): - def __init__(self, args, vocab_size, num_labels, length=None, - mode="train"): - super(SeqTagging, self).__init__() - """ - define the lexical analysis network structure - word: stores the input of the model - for_infer: a boolean value, indicating if the model to be created is for training or predicting. 
- - return: - for infer: return the prediction - otherwise: return the prediction - """ - self.mode_type = mode - self.word_emb_dim = args.word_emb_dim - self.vocab_size = vocab_size - self.num_labels = num_labels - self.grnn_hidden_dim = args.grnn_hidden_dim - self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir( - args) else 1.0 - self.crf_lr = args.emb_learning_rate if 'crf_learning_rate' in dir( - args) else 1.0 - self.bigru_num = args.bigru_num - self.batch_size = args.batch_size - self.init_bound = 0.1 - self.length = length - - self.sequence_tagging = SequenceTagging( - vocab_size=self.vocab_size, - num_labels=self.num_labels, - batch_size=self.batch_size, - word_emb_dim=self.word_emb_dim, - grnn_hidden_dim=self.grnn_hidden_dim, - emb_learning_rate=self.emb_lr, - crf_learning_rate=self.crf_lr, - bigru_num=self.bigru_num, - init_bound=self.init_bound, - length=self.length) - - def forward(self, *inputs): - """ - Configure the network - """ - word = inputs[0] - lengths = inputs[1] - if self.mode_type == "train" or self.mode_type == "test": - target = inputs[2] - outputs = self.sequence_tagging(word, lengths, target) - else: - outputs = self.sequence_tagging(word, lengths) - return outputs - - -class Chunk_eval(fluid.dygraph.Layer): - def __init__(self, - num_chunk_types, - chunk_scheme, - excluded_chunk_types=None): - super(Chunk_eval, self).__init__() - self.num_chunk_types = num_chunk_types - self.chunk_scheme = chunk_scheme - self.excluded_chunk_types = excluded_chunk_types - - def forward(self, input, label, seq_length=None): - precision = self._helper.create_variable_for_type_inference( - dtype="float32") - recall = self._helper.create_variable_for_type_inference( - dtype="float32") - f1_score = self._helper.create_variable_for_type_inference( - dtype="float32") - num_infer_chunks = self._helper.create_variable_for_type_inference( - dtype="int64") - num_label_chunks = self._helper.create_variable_for_type_inference( - dtype="int64") - num_correct_chunks = self._helper.create_variable_for_type_inference( - dtype="int64") - this_input = {"Inference": input, "Label": label} - if seq_length is not None: - this_input["SeqLength"] = seq_length - self._helper.append_op( - type='chunk_eval', - inputs=this_input, - outputs={ - "Precision": [precision], - "Recall": [recall], - "F1-Score": [f1_score], - "NumInferChunks": [num_infer_chunks], - "NumLabelChunks": [num_label_chunks], - "NumCorrectChunks": [num_correct_chunks] - }, - attrs={ - "num_chunk_types": self.num_chunk_types, - "chunk_scheme": self.chunk_scheme, - "excluded_chunk_types": self.excluded_chunk_types or [] - }) - return (num_infer_chunks, num_label_chunks, num_correct_chunks) - - -class LacLoss(Loss): - def __init__(self): - super(LacLoss, self).__init__() - pass - - def forward(self, outputs, labels): - avg_cost = outputs[1] - return avg_cost - - -class ChunkEval(Metric): - def __init__(self, num_labels, name=None, *args, **kwargs): - super(ChunkEval, self).__init__(*args, **kwargs) - self._init_name(name) - self.chunk_eval = Chunk_eval( - int(math.ceil((num_labels - 1) / 2.0)), "IOB") - self.reset() - - def add_metric_op(self, *args): - crf_decode = args[0] - lengths = args[2] - label = args[3] - (num_infer_chunks, num_label_chunks, - num_correct_chunks) = self.chunk_eval( - input=crf_decode, label=label, seq_length=lengths) - return [num_infer_chunks, num_label_chunks, num_correct_chunks] - - def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks, - *args, **kwargs): - 
self.infer_chunks_total += num_infer_chunks - self.label_chunks_total += num_label_chunks - self.correct_chunks_total += num_correct_chunks - precision = float( - num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0 - recall = float( - num_correct_chunks) / num_label_chunks if num_label_chunks else 0 - f1_score = float(2 * precision * recall) / ( - precision + recall) if num_correct_chunks else 0 - return [precision, recall, f1_score] - - def reset(self): - self.infer_chunks_total = 0 - self.label_chunks_total = 0 - self.correct_chunks_total = 0 - - def accumulate(self): - precision = float( - self.correct_chunks_total - ) / self.infer_chunks_total if self.infer_chunks_total else 0 - recall = float( - self.correct_chunks_total - ) / self.label_chunks_total if self.label_chunks_total else 0 - f1_score = float(2 * precision * recall) / ( - precision + recall) if self.correct_chunks_total else 0 - res = [precision, recall, f1_score] - return res - - def _init_name(self, name): - name = name or 'chunk eval' - self._name = ['precision', 'recall', 'F1'] - - def name(self): - return self._name - diff --git a/hapi/text/sequence_tagging/utils/__init__.py b/hapi/text/sequence_tagging/utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/hapi/text/sequence_tagging/utils/check.py b/hapi/text/sequence_tagging/utils/check.py deleted file mode 100644 index 79ab4862d3c2082c36039b047be08d4a4b5dcedd..0000000000000000000000000000000000000000 --- a/hapi/text/sequence_tagging/utils/check.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys - -import paddle.fluid as fluid - -__all__ = ['check_gpu', 'check_version'] - - -def check_gpu(use_gpu): - """ - Log error and exit when set use_gpu=true in paddlepaddle - cpu version. - """ - err = "Config use_gpu cannot be set as true while you are " \ - "using paddlepaddle cpu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ - "\t2. Set use_gpu as false in config file to run " \ - "model on CPU" - - try: - if use_gpu and not fluid.is_compiled_with_cuda(): - print(err) - sys.exit(1) - except Exception as e: - pass - - -def check_version(): - """ - Log error and exit when the installed version of paddlepaddle is - not satisfied. - """ - err = "PaddlePaddle version 1.6 or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ - "Please make sure the version is good with your code." 
\ - - try: - fluid.require_version('1.7.0') - except Exception as e: - print(err) - sys.exit(1) diff --git a/hapi/text/sequence_tagging/utils/configure.py b/hapi/text/sequence_tagging/utils/configure.py deleted file mode 100644 index 17dfaa53d8b44a68a2847c4bc1a1934384bb5f82..0000000000000000000000000000000000000000 --- a/hapi/text/sequence_tagging/utils/configure.py +++ /dev/null @@ -1,356 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import argparse -import json -import yaml -import six -import logging - -logging_only_message = "%(message)s" -logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s" - - -class JsonConfig(object): - """ - A high-level api for handling json configure file. - """ - - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except: - raise IOError("Error in parsing bert model config file '%s'" % - config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class ArgumentGroup(object): - def __init__(self, parser, title, des): - self._group = parser.add_argument_group(title=title, description=des) - - def add_arg(self, name, type, default, help, **kwargs): - type = str2bool if type == bool else type - self._group.add_argument( - "--" + name, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -class ArgConfig(object): - """ - A high-level api for handling argument configs. - """ - - def __init__(self): - parser = argparse.ArgumentParser() - - train_g = ArgumentGroup(parser, "training", "training options.") - train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") - train_g.add_arg("learning_rate", float, 5e-5, - "Learning rate used to train with warmup.") - train_g.add_arg( - "lr_scheduler", - str, - "linear_warmup_decay", - "scheduler of learning rate.", - choices=['linear_warmup_decay', 'noam_decay']) - train_g.add_arg("weight_decay", float, 0.01, - "Weight decay rate for L2 regularizer.") - train_g.add_arg( - "warmup_proportion", float, 0.1, - "Proportion of training steps to perform linear learning rate warmup for." - ) - train_g.add_arg("save_steps", int, 1000, - "The steps interval to save checkpoints.") - train_g.add_arg("use_fp16", bool, False, - "Whether to use fp16 mixed precision training.") - train_g.add_arg( - "loss_scaling", float, 1.0, - "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled." 
- ) - train_g.add_arg("pred_dir", str, None, - "Path to save the prediction results") - - log_g = ArgumentGroup(parser, "logging", "logging related.") - log_g.add_arg("skip_steps", int, 10, - "The steps interval to print loss.") - log_g.add_arg("verbose", bool, False, "Whether to output verbose log.") - - run_type_g = ArgumentGroup(parser, "run_type", "running type options.") - run_type_g.add_arg("use_cuda", bool, True, - "If set, use GPU for training.") - run_type_g.add_arg( - "use_fast_executor", bool, False, - "If set, use fast parallel executor (in experiment).") - run_type_g.add_arg( - "num_iteration_per_drop_scope", int, 1, - "Ihe iteration intervals to clean up temporary variables.") - run_type_g.add_arg("do_train", bool, True, - "Whether to perform training.") - run_type_g.add_arg("do_predict", bool, True, - "Whether to perform prediction.") - - custom_g = ArgumentGroup(parser, "customize", "customized options.") - - self.custom_g = custom_g - - self.parser = parser - - def add_arg(self, name, dtype, default, descrip): - self.custom_g.add_arg(name, dtype, default, descrip) - - def build_conf(self): - return self.parser.parse_args() - - -def str2bool(v): - # because argparse does not support to parse "true, False" as python - # boolean directly - return v.lower() in ("true", "t", "1") - - -def print_arguments(args, log=None): - if not log: - print('----------- Configuration Arguments -----------') - for arg, value in sorted(six.iteritems(vars(args))): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - else: - log.info('----------- Configuration Arguments -----------') - for arg, value in sorted(six.iteritems(vars(args))): - log.info('%s: %s' % (arg, value)) - log.info('------------------------------------------------') - - -class PDConfig(object): - """ - A high-level API for managing configuration files in PaddlePaddle. - Can jointly work with command-line-arugment, json files and yaml files. - """ - - def __init__(self, json_file="", yaml_file="", fuse_args=True): - """ - Init funciton for PDConfig. - json_file: the path to the json configure file. - yaml_file: the path to the yaml configure file. - fuse_args: if fuse the json/yaml configs with argparse. - """ - assert isinstance(json_file, str) - assert isinstance(yaml_file, str) - - if json_file != "" and yaml_file != "": - raise Warning( - "json_file and yaml_file can not co-exist for now. please only use one configure file type." - ) - return - - self.args = None - self.arg_config = {} - self.json_config = {} - self.yaml_config = {} - - parser = argparse.ArgumentParser() - - self.default_g = ArgumentGroup(parser, "default", "default options.") - self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.") - self.json_g = ArgumentGroup(parser, "json", "options from json.") - self.com_g = ArgumentGroup(parser, "custom", "customized options.") - - self.default_g.add_arg("do_train", bool, False, - "Whether to perform training.") - self.default_g.add_arg("do_predict", bool, False, - "Whether to perform predicting.") - self.default_g.add_arg("do_eval", bool, False, - "Whether to perform evaluating.") - self.default_g.add_arg( - "do_save_inference_model", bool, False, - "Whether to perform model saving for inference.") - - # NOTE: args for profiler - self.default_g.add_arg( - "is_profiler", int, 0, - "the switch of profiler tools. (used for benchmark)") - self.default_g.add_arg( - "profiler_path", str, './', - "the profiler output file path. 
(used for benchmark)") - self.default_g.add_arg("max_iter", int, 0, - "the max train batch num.(used for benchmark)") - - self.parser = parser - - if json_file != "": - self.load_json(json_file, fuse_args=fuse_args) - - if yaml_file: - self.load_yaml(yaml_file, fuse_args=fuse_args) - - def load_json(self, file_path, fuse_args=True): - - if not os.path.exists(file_path): - raise Warning("the json file %s does not exist." % file_path) - return - - with open(file_path, "r") as fin: - self.json_config = json.loads(fin.read()) - fin.close() - - if fuse_args: - for name in self.json_config: - if isinstance(self.json_config[name], list): - self.json_g.add_arg( - name, - type(self.json_config[name][0]), - self.json_config[name], - "This is from %s" % file_path, - nargs=len(self.json_config[name])) - continue - if not isinstance(self.json_config[name], int) \ - and not isinstance(self.json_config[name], float) \ - and not isinstance(self.json_config[name], str) \ - and not isinstance(self.json_config[name], bool): - - continue - - self.json_g.add_arg(name, - type(self.json_config[name]), - self.json_config[name], - "This is from %s" % file_path) - - def load_yaml(self, file_path, fuse_args=True): - - if not os.path.exists(file_path): - raise Warning("the yaml file %s does not exist." % file_path) - return - - with open(file_path, "r") as fin: - self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) - fin.close() - - if fuse_args: - for name in self.yaml_config: - if isinstance(self.yaml_config[name], list): - self.yaml_g.add_arg( - name, - type(self.yaml_config[name][0]), - self.yaml_config[name], - "This is from %s" % file_path, - nargs=len(self.yaml_config[name])) - continue - - if not isinstance(self.yaml_config[name], int) \ - and not isinstance(self.yaml_config[name], float) \ - and not isinstance(self.yaml_config[name], str) \ - and not isinstance(self.yaml_config[name], bool): - - continue - - self.yaml_g.add_arg(name, - type(self.yaml_config[name]), - self.yaml_config[name], - "This is from %s" % file_path) - - def build(self): - self.args = self.parser.parse_args() - self.arg_config = vars(self.args) - - def __add__(self, new_arg): - assert isinstance(new_arg, list) or isinstance(new_arg, tuple) - assert len(new_arg) >= 3 - assert self.args is None - - name = new_arg[0] - dtype = new_arg[1] - dvalue = new_arg[2] - desc = new_arg[3] if len( - new_arg) == 4 else "Description is not provided." - - self.com_g.add_arg(name, dtype, dvalue, desc) - - return self - - def __getattr__(self, name): - if name in self.arg_config: - return self.arg_config[name] - - if name in self.json_config: - return self.json_config[name] - - if name in self.yaml_config: - return self.yaml_config[name] - - raise Warning("The argument %s is not defined." 
% name) - - def Print(self): - - print("-" * 70) - for name in self.arg_config: - print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name]))) - - for name in self.json_config: - if name not in self.arg_config: - print("%s:\t\t\t\t%s" % - (str(name), str(self.json_config[name]))) - - for name in self.yaml_config: - if name not in self.arg_config: - print("%s:\t\t\t\t%s" % - (str(name), str(self.yaml_config[name]))) - - print("-" * 70) - - -if __name__ == "__main__": - """ - pd_config = PDConfig(json_file = "./test/bert_config.json") - pd_config.build() - - print(pd_config.do_train) - print(pd_config.hidden_size) - - pd_config = PDConfig(yaml_file = "./test/bert_config.yaml") - pd_config.build() - - print(pd_config.do_train) - print(pd_config.hidden_size) - """ - - pd_config = PDConfig(yaml_file="./test/bert_config.yaml") - pd_config += ("my_age", int, 18, "I am forever 18.") - pd_config.build() - - print(pd_config.do_train) - print(pd_config.hidden_size) - print(pd_config.my_age) diff --git a/hapi/text/sequence_tagging/utils/metrics.py b/hapi/text/sequence_tagging/utils/metrics.py deleted file mode 100644 index 2b6422388b7729bc2b820bfb55d15f1dee56c006..0000000000000000000000000000000000000000 --- a/hapi/text/sequence_tagging/utils/metrics.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
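The PDConfig helper deleted above fuses YAML/JSON defaults with argparse so that file values become overridable command-line flags and can then be read back as attributes. As a rough, framework-free sketch of that pattern (not the hapi implementation; the MiniConfig name, the demo.yaml path and the flag handling are assumptions made for illustration), the idea looks roughly like this:

import argparse
import yaml  # PyYAML, the same dependency the original loader uses


def _str2bool(v):
    # argparse cannot parse "True"/"False" into bool directly; this mirrors
    # the str2bool helper used by the original ArgConfig.
    return str(v).lower() in ("true", "t", "1")


class MiniConfig(object):
    """Load defaults from a YAML file, expose scalars as argparse flags, and
    let command-line values override the file."""

    def __init__(self, yaml_file):
        with open(yaml_file, "r") as fin:
            self._file_cfg = yaml.load(fin, Loader=yaml.SafeLoader) or {}
        self._parser = argparse.ArgumentParser()
        for name, value in self._file_cfg.items():
            # Only scalar options become flags, mirroring the
            # int/float/str/bool filter in PDConfig.load_yaml.
            if isinstance(value, (bool, int, float, str)):
                arg_type = _str2bool if isinstance(value, bool) else type(value)
                self._parser.add_argument(
                    "--" + name, type=arg_type, default=value,
                    help="default taken from %s" % yaml_file)
        self._args = {}

    def build(self, argv=None):
        self._args = vars(self._parser.parse_args(argv))
        return self

    def __getattr__(self, name):
        # Attribute-style access; fall back to the raw file config for
        # non-scalar entries (lists, nested dicts).
        if name in self._args:
            return self._args[name]
        if name in self._file_cfg:
            return self._file_cfg[name]
        raise AttributeError("The argument %s is not defined." % name)


# Usage, assuming demo.yaml contains e.g. `hidden_size: 768`:
#   cfg = MiniConfig("demo.yaml").build(["--hidden_size", "1024"])
#   print(cfg.hidden_size)   # 1024, the command line wins over the file default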
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys - -import paddle.fluid as fluid - -__all__ = ['chunk_count', "build_chunk"] - - -def build_chunk(data_list, id2label_dict): - """ - Assembly entity - """ - tag_list = [id2label_dict.get(str(id)) for id in data_list] - ner_dict = {} - ner_str = "" - ner_start = 0 - for i in range(len(tag_list)): - tag = tag_list[i] - if tag == u"O": - if i != 0: - key = "%d_%d" % (ner_start, i - 1) - ner_dict[key] = ner_str - ner_start = i - ner_str = tag - elif tag.endswith(u"B"): - if i != 0: - key = "%d_%d" % (ner_start, i - 1) - ner_dict[key] = ner_str - ner_start = i - ner_str = tag.split('-')[0] - elif tag.endswith(u"I"): - if tag.split('-')[0] != ner_str: - if i != 0: - key = "%d_%d" % (ner_start, i - 1) - ner_dict[key] = ner_str - ner_start = i - ner_str = tag.split('-')[0] - return ner_dict - - -def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict): - """ - calculate num_correct_chunks num_error_chunks total_num for metrics - """ - num_infer_chunks, num_label_chunks, num_correct_chunks = 0, 0, 0 - assert infer_numpy.shape[0] == label_numpy.shape[0] - - for i in range(infer_numpy.shape[0]): - infer_list = infer_numpy[i][:seq_len[i]] - label_list = label_numpy[i][:seq_len[i]] - infer_dict = build_chunk(infer_list, id2label_dict) - num_infer_chunks += len(infer_dict) - label_dict = build_chunk(label_list, id2label_dict) - num_label_chunks += len(label_dict) - for key in infer_dict: - if key in label_dict and label_dict[key] == infer_dict[key]: - num_correct_chunks += 1 - return num_infer_chunks, num_label_chunks, num_correct_chunks diff --git a/hapi/text/text.py b/hapi/text/text.py deleted file mode 100644 index ed803ae08eb16eed596e7097cb1c5fb6e1de2dbe..0000000000000000000000000000000000000000 --- a/hapi/text/text.py +++ /dev/null @@ -1,1899 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
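The chunk_count helper above only accumulates three raw counts (predicted chunks, gold chunks, and exact matches); the caller is expected to fold them into chunk-level precision, recall and F1. A minimal sketch of that final step, with an illustrative helper name and toy numbers, might look like this:

def chunk_f1(num_infer_chunks, num_label_chunks, num_correct_chunks):
    """Compute chunk-level precision/recall/F1 from accumulated counts."""
    precision = (float(num_correct_chunks) / num_infer_chunks
                 if num_infer_chunks else 0.0)
    recall = (float(num_correct_chunks) / num_label_chunks
              if num_label_chunks else 0.0)
    f1 = (2 * precision * recall / (precision + recall)
          if num_correct_chunks else 0.0)
    return precision, recall, f1


if __name__ == "__main__":
    # Suppose the model proposed 90 entity chunks over a dataset, the gold
    # annotation contains 100, and 80 proposals match a gold chunk exactly.
    p, r, f1 = chunk_f1(90, 100, 80)
    print("precision=%.3f recall=%.3f f1=%.3f" % (p, r, f1))
    # precision=0.889 recall=0.800 f1=0.842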
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import six -import sys -if six.PY2: - reload(sys) - sys.setdefaultencoding('utf8') - -import ast -import time -import argparse as argparse -import numpy as np -import multiprocessing - -import collections -import copy -from functools import partial, reduce - -import paddle -import paddle.fluid as fluid -import paddle.fluid.layers.utils as utils -from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as -from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit -from paddle.fluid.data_feeder import convert_dtype - -from paddle.fluid import layers -from paddle.fluid.dygraph import Layer -from paddle.fluid.layers import BeamSearchDecoder - -__all__ = [ - 'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode', - 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', - 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', - 'TransformerDecoder', 'TransformerBeamSearchDecoder', 'Linear_chain_crf', - 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer' -] - - -class RNNCell(Layer): - def get_initial_states(self, - batch_ref, - shape=None, - dtype=None, - init_value=0, - batch_dim_idx=0): - """ - Generate initialized states according to provided shape, data type and - value. - - Parameters: - batch_ref: A (possibly nested structure of) tensor variable[s]. - The first dimension of the tensor will be used as batch size to - initialize states. - shape: A (possiblely nested structure of) shape[s], where a shape is - represented as a list/tuple of integer). -1(for batch size) will - beautomatically inserted if shape is not started with it. If None, - property `state_shape` will be used. The default value is None. - dtype: A (possiblely nested structure of) data type[s]. The structure - must be same as that of `shape`, except when all tensors' in states - has the same data type, a single data type can be used. If None and - property `cell.state_shape` is not available, float32 will be used - as the data type. The default value is None. - init_value: A float value used to initialize states. - - Returns: - Variable: tensor variable[s] packed in the same structure provided \ - by shape, representing the initialized states. 
- """ - # TODO: use inputs and batch_size - batch_ref = flatten(batch_ref)[0] - - def _is_shape_sequence(seq): - if sys.version_info < (3, ): - integer_types = ( - int, - long, ) - else: - integer_types = (int, ) - """For shape, list/tuple of integer is the finest-grained objection""" - if (isinstance(seq, list) or isinstance(seq, tuple)): - if reduce( - lambda flag, x: isinstance(x, integer_types) and flag, - seq, True): - return False - # TODO: Add check for the illegal - if isinstance(seq, dict): - return True - return (isinstance(seq, collections.Sequence) and - not isinstance(seq, six.string_types)) - - class Shape(object): - def __init__(self, shape): - self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) - - # nested structure of shapes - states_shapes = self.state_shape if shape is None else shape - is_sequence_ori = utils.is_sequence - utils.is_sequence = _is_shape_sequence - states_shapes = map_structure(lambda shape: Shape(shape), - states_shapes) - utils.is_sequence = is_sequence_ori - - # nested structure of dtypes - try: - states_dtypes = self.state_dtype if dtype is None else dtype - except NotImplementedError: # use fp32 as default - states_dtypes = "float32" - if len(flatten(states_dtypes)) == 1: - dtype = flatten(states_dtypes)[0] - states_dtypes = map_structure(lambda shape: dtype, states_shapes) - - init_states = map_structure( - lambda shape, dtype: fluid.layers.fill_constant_batch_size_like( - input=batch_ref, - shape=shape.shape, - dtype=dtype, - value=init_value, - input_dim_idx=batch_dim_idx), states_shapes, states_dtypes) - return init_states - - @property - def state_shape(self): - """ - Abstract method (property). - Used to initialize states. - A (possiblely nested structure of) shape[s], where a shape is represented - as a list/tuple of integers (-1 for batch size would be automatically - inserted into a shape if shape is not started with it). - Not necessary to be implemented if states are not initialized by - `get_initial_states` or the `shape` argument is provided when using - `get_initial_states`. - """ - raise NotImplementedError( - "Please add implementaion for `state_shape` in the used cell.") - - @property - def state_dtype(self): - """ - Abstract method (property). - Used to initialize states. - A (possiblely nested structure of) data types[s]. The structure must be - same as that of `shape`, except when all tensors' in states has the same - data type, a signle data type can be used. - Not necessary to be implemented if states are not initialized - by `get_initial_states` or the `dtype` argument is provided when using - `get_initial_states`. - """ - raise NotImplementedError( - "Please add implementaion for `state_dtype` in the used cell.") - - -class BasicLSTMCell(RNNCell): - """ - **** - BasicLSTMUnit class, Using basic operator to build LSTM - The algorithm can be described as the code below. - .. math:: - i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i) - f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias ) - o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o) - \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c) - c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} - h_t &= o_t \odot tanh(c_t) - - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix - of weights from the input gate to the input) - - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). - - sigmoid is the logistic sigmoid function. 
- - $i, f, o$ and $c$ are the input gate, forget gate, output gate, - and cell activation vectors, respectively, all of which have the same size as - the cell output activation vector $h$. - - The :math:`\odot` is the element-wise product of the vectors. - - :math:`tanh` is the activation functions. - - :math:`\\tilde{c_t}` is also called candidate hidden state, - which is computed based on the current input and the previous hidden state. - Args: - name_scope(string) : The name scope used to identify parameter and bias name - hidden_size (integer): The hidden size used in the Unit. - param_attr(ParamAttr|None): The parameter attribute for the learnable - weight matrix. Note: - If it is set to None or one attribute of ParamAttr, lstm_unit will - create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The parameter attribute for the bias - of LSTM unit. - If it is set to None or one attribute of ParamAttr, lstm_unit will - create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized as zero. Default: None. - gate_activation (function|None): The activation function for gates (actGate). - Default: 'fluid.layers.sigmoid' - activation (function|None): The activation function for cells (actNode). - Default: 'fluid.layers.tanh' - forget_bias(float|1.0): forget bias used when computing forget gate - dtype(string): data type used in this unit - """ - - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation=None, - activation=None, - forget_bias=1.0, - dtype='float32', - forget_gate_weights={"w": None, - "h": None, - "b": None}, - input_gate_weights={"w": None, - "h": None, - "b": None}, - output_gate_weights={"w": None, - "h": None, - "b": None}, - cell_weights={"w": None, - "h": None, - "b": None}): - super(BasicLSTMCell, self).__init__() - - self._hidden_size = hidden_size - self._param_attr = param_attr - self._bias_attr = bias_attr - self._gate_activation = gate_activation or layers.sigmoid - self._activation = activation or layers.tanh - # TODO(guosheng): find better way to resolve constants in __init__ - self._forget_bias = layers.create_global_var( - shape=[1], dtype=dtype, value=forget_bias, persistable=True) - self._forget_bias.stop_gradient = False - self._dtype = dtype - self._input_size = input_size - - self.use_customized_weight = False - for _weights in [ - forget_gate_weights, input_gate_weights, output_gate_weights, - cell_weights - ]: - for _key in _weights: - if _weights[_key] is not None: - self.use_customized_weight = True - break - if self.use_customized_weight: - break - - if not self.use_customized_weight: - - self._weight = self.create_parameter( - attr=self._param_attr, - shape=[ - self._input_size + self._hidden_size, 4 * self._hidden_size - ], - dtype=self._dtype) - - self._bias = self.create_parameter( - attr=self._bias_attr, - shape=[4 * self._hidden_size], - dtype=self._dtype, - is_bias=True) - else: - if "w" in forget_gate_weights and forget_gate_weights[ - "w"] is not None: - self.fg_w = forget_gate_weights["w"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_forget_gate_w" - else: - tmp_param_attr = self._param_attr - self.fg_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in 
forget_gate_weights and forget_gate_weights[ - "h"] is not None: - self.fg_h = forget_gate_weights["h"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_forget_gate_h" - else: - tmp_param_attr = self._param_attr - self.fg_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in forget_gate_weights and forget_gate_weights[ - "b"] is not None: - self.fg_b = forget_gate_weights["b"] - else: - if self._bias_attr is not None and self._bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._bias_attr) - tmp_param_attr.name += "_forget_gate_b" - else: - tmp_param_attr = self._bias_attr - self.fg_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - if "w" in input_gate_weights and input_gate_weights[ - "w"] is not None: - self.ig_w = input_gate_weights["w"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_input_gate_w" - else: - tmp_param_attr = self._param_attr - - self.ig_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in input_gate_weights and input_gate_weights[ - "h"] is not None: - self.ig_h = input_gate_weights["h"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_input_gate_h" - else: - tmp_param_attr = self._param_attr - - self.ig_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in input_gate_weights and input_gate_weights[ - "b"] is not None: - self.ig_b = input_gate_weights["b"] - else: - if self._bias_attr is not None and self._bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._bias_attr) - tmp_param_attr.name += "_input_gate_b" - else: - tmp_param_attr = self._bias_attr - self.ig_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - if "w" in output_gate_weights and output_gate_weights[ - "w"] is not None: - self.og_w = output_gate_weights["w"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_output_gate_w" - else: - tmp_param_attr = self._param_attr - self.og_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in output_gate_weights and output_gate_weights[ - "h"] is not None: - self.og_h = output_gate_weights["h"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_output_gate_h" - else: - tmp_param_attr = self._param_attr - - self.og_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in output_gate_weights and output_gate_weights[ - "b"] is not None: - self.og_b = output_gate_weights["b"] - else: - if self._bias_attr is not None and self._bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._bias_attr) - tmp_param_attr.name += "_output_gate_b" - else: - tmp_param_attr = self._bias_attr - 
self.og_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - if "w" in cell_weights and cell_weights["w"] is not None: - self.c_w = cell_weights["w"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_cell_w" - else: - tmp_param_attr = self._param_attr - - self.c_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in cell_weights and cell_weights["h"] is not None: - self.c_h = cell_weights["h"] - else: - if self._param_attr is not None and self._param_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._param_attr) - tmp_param_attr.name += "_cell_h" - else: - tmp_param_attr = self._param_attr - self.c_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in cell_weights and cell_weights["b"] is not None: - self.c_b = cell_weights["b"] - else: - if self._bias_attr is not None and self._bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(self._bias_attr) - tmp_param_attr.name += "_cell_b" - else: - tmp_param_attr = self._bias_attr - self.c_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - def forward(self, input, state): - - if self.use_customized_weight: - weight_w = fluid.layers.concat( - [self.ig_w, self.c_w, self.fg_w, self.og_w], axis=-1) - weight_h = fluid.layers.concat( - [self.ig_h, self.c_h, self.fg_h, self.og_h], axis=-1) - _weight = fluid.layers.concat([weight_w, weight_h], axis=0) - _bias = fluid.layers.concat( - [self.ig_b, self.c_b, self.fg_b, self.og_b]) - else: - _weight = self._weight - _bias = self._bias - - pre_hidden, pre_cell = state - concat_input_hidden = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=concat_input_hidden, y=_weight) - - gate_input = layers.elementwise_add(gate_input, _bias) - i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) - new_cell = layers.elementwise_add( - layers.elementwise_mul( - pre_cell, - layers.sigmoid(layers.elementwise_add(f, self._forget_bias))), - layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j))) - new_hidden = layers.tanh(new_cell) * layers.sigmoid(o) - - return new_hidden, [new_hidden, new_cell] - - @property - def state_shape(self): - return [[self._hidden_size], [self._hidden_size]] - - -class BasicGRUCell(RNNCell): - """ - **** - BasicGRUUnit class, using basic operators to build GRU - The algorithm can be described as the equations below. - - .. math:: - u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u) - - r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r) - - m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m) - - h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) - - Args: - hidden_size (integer): The hidden size used in the Unit. - param_attr(ParamAttr|None): The parameter attribute for the learnable - weight matrix. Note: - If it is set to None or one attribute of ParamAttr, gru_unit will - create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The parameter attribute for the bias - of GRU unit. - If it is set to None or one attribute of ParamAttr, gru_unit will - create ParamAttr as bias_attr. 
If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - gate_activation (function|None): The activation function for gates (actGate). - Default: 'fluid.layers.sigmoid' - activation (function|None): The activation function for cell (actNode). - Default: 'fluid.layers.tanh' - dtype(string): data type used in this unit - """ - - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation=None, - activation=None, - dtype='float32', - update_gate_weights={"w": None, - "h": None, - "b": None}, - reset_gate_weights={"w": None, - "h": None, - "b": None}, - cell_weights={"w": None, - "h": None, - "b": None}): - super(BasicGRUCell, self).__init__() - self._input_size = input_size - self._hidden_size = hidden_size - self._param_attr = param_attr - self._bias_attr = bias_attr - self._gate_activation = gate_activation or layers.sigmoid - self._activation = activation or layers.tanh - self._dtype = dtype - - assert isinstance(update_gate_weights, dict) - assert isinstance(reset_gate_weights, dict) - assert isinstance(cell_weights, dict) - - self.use_customized_weight = False - for _weights in [ - update_gate_weights, reset_gate_weights, cell_weights - ]: - for _key in _weights: - if _weights[_key] is not None: - self.use_customized_weight = True - if self.use_customized_weight: - break - - if self._param_attr is not None and self._param_attr.name is not None: - gate_param_attr = copy.deepcopy(self._param_attr) - candidate_param_attr = copy.deepcopy(self._param_attr) - gate_param_attr.name += "_gate" - candidate_param_attr.name += "_candidate" - else: - gate_param_attr = self._param_attr - candidate_param_attr = self._param_attr - - if not self.use_customized_weight: - self._gate_weight = self.create_parameter( - attr=gate_param_attr, - shape=[ - self._input_size + self._hidden_size, 2 * self._hidden_size - ], - dtype=self._dtype) - - self._candidate_weight = self.create_parameter( - attr=candidate_param_attr, - shape=[ - self._input_size + self._hidden_size, self._hidden_size - ], - dtype=self._dtype) - - if self._bias_attr is not None and self._bias_attr.name is not None: - gate_bias_attr = copy.deepcopy(self._bias_attr) - candidate_bias_attr = copy.deepcopy(self._bias_attr) - gate_bias_attr.name += "_gate" - candidate_bias_attr.name += "_candidate" - else: - gate_bias_attr = self._bias_attr - candidate_bias_attr = self._bias_attr - - self._gate_bias = self.create_parameter( - attr=gate_bias_attr, - shape=[2 * self._hidden_size], - dtype=self._dtype, - is_bias=True) - self._candidate_bias = self.create_parameter( - attr=candidate_bias_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - else: - - # create the parameters of gates in gru - if "w" in update_gate_weights and update_gate_weights[ - "w"] is not None: - self.ug_w = update_gate_weights["w"] - else: - if gate_param_attr is not None and gate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_param_attr) - tmp_param_attr.name += "_update_gate_w" - else: - tmp_param_attr = gate_param_attr - self.ug_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in update_gate_weights and update_gate_weights[ - "h"] is not None: - self.ug_h = update_gate_weights["h"] - else: - if gate_param_attr is not None and gate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_param_attr) - tmp_param_attr.name += "_update_gate_h" - else: - tmp_param_attr = 
gate_param_attr - self.ug_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in update_gate_weights and update_gate_weights[ - "b"] is not None: - self.ug_b = update_gate_weights["b"] - else: - if gate_bias_attr is not None and gate_bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_bias_attr) - tmp_param_attr.name += "_update_gate_b" - else: - tmp_param_attr = gate_bias_attr - self.ug_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - # reset gate parameters - if "w" in reset_gate_weights and reset_gate_weights[ - "w"] is not None: - self.rg_w = reset_gate_weights["w"] - else: - if gate_param_attr is not None and gate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_param_attr) - tmp_param_attr.name += "_reset_gate_w" - else: - tmp_param_attr = gate_param_attr - self.rg_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in reset_gate_weights and reset_gate_weights[ - "h"] is not None: - self.rg_h = reset_gate_weights["h"] - else: - if gate_param_attr is not None and gate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_param_attr) - tmp_param_attr.name += "_reset_gate_h" - else: - tmp_param_attr = gate_param_attr - self.rg_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in reset_gate_weights and reset_gate_weights[ - "b"] is not None: - self.rg_b = reused_params["b"] - else: - if gate_bias_attr is not None and gate_bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(gate_bias_attr) - tmp_param_attr.name += "_reset_gate_b" - else: - tmp_param_attr = gate_bias_attr - self.rg_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - # cell parameters - if "w" in cell_weights and cell_weights["w"] is not None: - self.c_w = cell_weights["w"] - else: - if candidate_param_attr is not None and candidate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(candidate_param_attr) - tmp_param_attr.name += "_cell_w" - else: - tmp_param_attr = gate_param_attr - - self.c_w = self.create_parameter( - attr=tmp_param_attr, - shape=[self._input_size, self._hidden_size], - dtype=self._dtype) - - if "h" in cell_weights and cell_weights["h"] is not None: - self.c_h = cell_weights["h"] - else: - if candidate_param_attr is not None and candidate_param_attr.name is not None: - tmp_param_attr = copy.deepcopy(candidate_param_attr) - tmp_param_attr.name += "_cell_h" - else: - tmp_param_attr = gate_param_attr - self.c_h = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if "b" in cell_weights and cell_weights["b"] is not None: - self.c_b = cell_weights["b"] - else: - if candidate_bias_attr is not None and candidate_bias_attr.name is not None: - tmp_param_attr = copy.deepcopy(candidate_bias_attr) - tmp_param_attr.name += "_cell_b" - else: - tmp_param_attr = gate_bias_attr - self.c_b = self.create_parameter( - attr=tmp_param_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - def forward(self, input, state): - - if self.use_customized_weight: - rg_weights = layers.concat([self.rg_w, self.rg_h], axis=0) - ug_weights = layers.concat([self.ug_w, self.ug_h], axis=0) - _gate_weight = 
layers.concat([rg_weights, ug_weights], axis=-1) - _candidate_weight = layers.concat([self.c_w, self.c_h], axis=0) - _gate_bias = layers.concat([self.rg_b, self.ug_b], axis=0) - _candidate_bias = self.c_b - else: - _gate_weight = self._gate_weight - _gate_bias = self._gate_bias - _candidate_weight = self._candidate_weight - _candidate_bias = self._candidate_bias - - pre_hidden = state - concat_input_hidden = layers.concat([input, pre_hidden], axis=1) - - gate_input = layers.matmul(x=concat_input_hidden, y=_gate_weight) - - gate_input = layers.elementwise_add(gate_input, _gate_bias) - - gate_input = self._gate_activation(gate_input) - r, u = layers.split(gate_input, num_or_sections=2, dim=1) - - r_hidden = r * pre_hidden - - candidate = layers.matmul( - layers.concat([input, r_hidden], 1), _candidate_weight) - candidate = layers.elementwise_add(candidate, _candidate_bias) - - c = self._activation(candidate) - new_hidden = u * pre_hidden + (1 - u) * c - - return new_hidden, new_hidden - - @property - def state_shape(self): - return [self._hidden_size] - - -class RNN(fluid.dygraph.Layer): - def __init__(self, cell, is_reverse=False, time_major=False): - super(RNN, self).__init__() - self.cell = cell - if not hasattr(self.cell, "call"): - self.cell.call = self.cell.forward - self.is_reverse = is_reverse - self.time_major = time_major - self.batch_index, self.time_step_index = (1, 0) if time_major else (0, - 1) - - def forward(self, - inputs, - initial_states=None, - sequence_length=None, - **kwargs): - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - new_state = fluid.layers.elementwise_mul( - new_state, step_mask, - axis=0) - fluid.layers.elementwise_mul( - state, (step_mask - 1), axis=0) - return new_state - - flat_inputs = flatten(inputs) - batch_size, time_steps = ( - flat_inputs[0].shape[self.batch_index], - flat_inputs[0].shape[self.time_step_index]) - - if initial_states is None: - initial_states = self.cell.get_initial_states( - batch_ref=inputs, batch_dim_idx=self.batch_index) - - if not self.time_major: - inputs = map_structure( - lambda x: fluid.layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), inputs) - - if sequence_length is not None: - mask = fluid.layers.sequence_mask( - sequence_length, - maxlen=time_steps, - dtype=flatten(initial_states)[0].dtype) - mask = fluid.layers.transpose(mask, [1, 0]) - - if self.is_reverse: - inputs = map_structure( - lambda x: fluid.layers.reverse(x, axis=[0]), inputs) - mask = fluid.layers.reverse( - mask, axis=[0]) if sequence_length is not None else None - - states = initial_states - outputs = [] - for i in range(time_steps): - step_inputs = map_structure(lambda x: x[i], inputs) - step_outputs, new_states = self.cell(step_inputs, states, - **kwargs) - if sequence_length is not None: - new_states = map_structure( - partial( - _maybe_copy, step_mask=mask[i]), - states, - new_states) - states = new_states - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if i == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) - - final_outputs = map_structure( - lambda x: fluid.layers.stack(x.array, - axis=self.time_step_index), - outputs) - - if self.is_reverse: - final_outputs = map_structure( - lambda x: fluid.layers.reverse(x, - axis=self.time_step_index), - final_outputs) - - final_states = 
new_states - else: - final_outputs, final_states = fluid.layers.rnn( - self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse, - **kwargs) - return final_outputs, final_states - - -class DynamicDecode(Layer): - def __init__(self, - decoder, - max_step_num=None, - output_time_major=False, - impute_finished=False, - is_test=False, - return_length=False): - super(DynamicDecode, self).__init__() - self.decoder = decoder - self.max_step_num = max_step_num - self.output_time_major = output_time_major - self.impute_finished = impute_finished - self.is_test = is_test - self.return_length = return_length - - def forward(self, inits=None, **kwargs): - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def __getitem__(self, item): - return self.array.__getitem__(item) - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - state_dtype = state.dtype - if convert_dtype(state_dtype) in ["bool"]: - state = layers.cast(state, dtype="float32") - new_state = layers.cast(new_state, dtype="float32") - if step_mask.dtype != state.dtype: - step_mask = layers.cast(step_mask, dtype=state.dtype) - # otherwise, renamed bool gradients of would be summed up leading - # to sum(bool) error. - step_mask.stop_gradient = True - new_state = layers.elementwise_mul( - state, step_mask, axis=0) - layers.elementwise_mul( - new_state, (step_mask - 1), axis=0) - if convert_dtype(state_dtype) in ["bool"]: - new_state = layers.cast(new_state, dtype=state_dtype) - return new_state - - initial_inputs, initial_states, initial_finished = self.decoder.initialize( - inits) - inputs, states, finished = (initial_inputs, initial_states, - initial_finished) - cond = layers.logical_not((layers.reduce_all(initial_finished))) - sequence_lengths = layers.cast( - layers.zeros_like(initial_finished), "int64") - outputs = None - - step_idx = 0 - step_idx_tensor = layers.fill_constant( - shape=[1], dtype="int64", value=step_idx) - while cond.numpy(): - (step_outputs, next_states, next_inputs, - next_finished) = self.decoder.step(step_idx_tensor, inputs, - states, **kwargs) - if not self.decoder.tracks_own_finished: - # BeamSearchDecoder would track it own finished, since - # beams would be reordered and the finished status of each - # entry might change. Otherwise, perform logical OR which - # would not change the already finished. - next_finished = layers.logical_or(next_finished, finished) - # To confirm states.finished/finished be consistent with - # next_finished. - layers.assign(next_finished, finished) - next_sequence_lengths = layers.elementwise_add( - sequence_lengths, - layers.cast( - layers.logical_not(finished), sequence_lengths.dtype)) - - if self.impute_finished: # rectify the states for the finished. 
- next_states = map_structure( - lambda x, y: _maybe_copy(x, y, finished), states, - next_states) - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if step_idx == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) - inputs, states, finished, sequence_lengths = ( - next_inputs, next_states, next_finished, - next_sequence_lengths) - - layers.increment(x=step_idx_tensor, value=1.0, in_place=True) - step_idx += 1 - - layers.logical_not(layers.reduce_all(finished), cond) - if self.max_step_num is not None and step_idx > self.max_step_num: - break - - final_outputs = map_structure( - lambda x: fluid.layers.stack(x.array, axis=0), outputs) - final_states = states - - try: - final_outputs, final_states = self.decoder.finalize( - final_outputs, final_states, sequence_lengths) - except NotImplementedError: - pass - - if not self.output_time_major: - final_outputs = map_structure( - lambda x: layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), final_outputs) - - return (final_outputs, final_states, - sequence_lengths) if self.return_length else ( - final_outputs, final_states) - else: - return fluid.layers.dynamic_decode( - self.decoder, - inits, - max_step_num=self.max_step_num, - output_time_major=self.output_time_major, - impute_finished=self.impute_finished, - is_test=self.is_test, - return_length=self.return_length, - **kwargs) - - -class TransfomerCell(object): - """ - Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be - used as RNNCell - """ - - def __init__(self, decoder): - self.decoder = decoder - - def __call__(self, inputs, states, trg_src_attn_bias, enc_output, - static_caches): - trg_word, trg_pos = inputs - for cache, static_cache in zip(states, static_caches): - cache.update(static_cache) - logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias, - enc_output, states) - new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] - return logits, new_states - - -class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): - def __init__(self, cell, start_token, end_token, beam_size, - var_dim_in_state): - super(TransformerBeamSearchDecoder, - self).__init__(cell, start_token, end_token, beam_size) - self.cell = cell - self.var_dim_in_state = var_dim_in_state - - def _merge_batch_beams_with_var_dim(self, x): - # init length of cache is 0, and it increases with decoding carrying on, - # thus need to reshape elaborately - var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim - x = layers.transpose(x, - list(range(var_dim_in_state, len(x.shape))) + - list(range(0, var_dim_in_state))) - x = layers.reshape( - x, [0] * (len(x.shape) - var_dim_in_state - ) + [self.batch_size * self.beam_size] + - [int(size) for size in x.shape[-var_dim_in_state + 2:]]) - x = layers.transpose( - x, - list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + - list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) - return x - - def _split_batch_beams_with_var_dim(self, x): - var_dim_size = layers.shape(x)[self.var_dim_in_state] - x = layers.reshape( - x, [-1, self.beam_size] + - [int(size) - for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + - [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) - return x - - def step(self, time, inputs, states, **kwargs): - # compared to RNN, Transformer has 3D data at every decoding step - inputs = layers.reshape(inputs, [-1, 1]) # token - pos = layers.ones_like(inputs) * time # pos - cell_states = 
map_structure(self._merge_batch_beams_with_var_dim, - states.cell_states) - - cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, - **kwargs) - cell_outputs = map_structure(self._split_batch_beams, cell_outputs) - next_cell_states = map_structure(self._split_batch_beams_with_var_dim, - next_cell_states) - - beam_search_output, beam_search_state = self._beam_search_step( - time=time, - logits=cell_outputs, - next_cell_states=next_cell_states, - beam_state=states) - next_inputs, finished = (beam_search_output.predicted_ids, - beam_search_state.finished) - - return (beam_search_output, beam_search_state, next_inputs, finished) - - -### Transformer Modules ### -class PrePostProcessLayer(Layer): - """ - PrePostProcessLayer - """ - - def __init__(self, - process_cmd, - d_model, - dropout_rate, - reused_layer_norm=None): - super(PrePostProcessLayer, self).__init__() - self.process_cmd = process_cmd - self.functors = [] - for cmd in self.process_cmd: - if cmd == "a": # add residual connection - self.functors.append( - lambda x, y: x + y if y is not None else x) - elif cmd == "n": # add layer normalization - if reused_layer_norm is not None: - layer_norm = reused_layer_norm - else: - layer_norm = LayerNorm( - normalized_shape=d_model, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.))) - - self.functors.append( - self.add_sublayer( - "layer_norm_%d" % len( - self.sublayers(include_sublayers=False)), - layer_norm)) - elif cmd == "d": # add dropout - self.functors.append(lambda x: layers.dropout( - x, dropout_prob=dropout_rate, is_test=False) - if dropout_rate else x) - - def forward(self, x, residual=None): - for i, cmd in enumerate(self.process_cmd): - if cmd == "a": - x = self.functors[i](x, residual) - else: - x = self.functors[i](x) - return x - - -class MultiHeadAttention(Layer): - """ - Multi-Head Attention - """ - - def __init__(self, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0.0, - reused_query_fc=None, - reused_key_fc=None, - reused_value_fc=None, - reused_proj_fc=None): - - super(MultiHeadAttention, self).__init__() - self.n_head = n_head - self.d_key = d_key - self.d_value = d_value - self.d_model = d_model - self.dropout_rate = dropout_rate - - if reused_query_fc is not None: - self.q_fc = reused_query_fc - else: - self.q_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - if reused_key_fc is not None: - self.k_fc = reused_key_fc - else: - self.k_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - if reused_value_fc is not None: - self.v_fc = reused_value_fc - else: - self.v_fc = Linear( - input_dim=d_model, - output_dim=d_value * n_head, - bias_attr=False) - if reused_proj_fc is not None: - self.proj_fc = reused_proj_fc - else: - self.proj_fc = Linear( - input_dim=d_value * n_head, - output_dim=d_model, - bias_attr=False) - - def _prepare_qkv(self, queries, keys, values, cache=None): - if keys is None: # self-attention - keys, values = queries, queries - static_kv = False - else: # cross-attention - static_kv = True - - q = self.q_fc(queries) - q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) - q = layers.transpose(x=q, perm=[0, 2, 1, 3]) - - if cache is not None and static_kv and "static_k" in cache: - # for encoder-decoder attention in inference and has cached - k = cache["static_k"] - v = cache["static_v"] - else: - k = self.k_fc(keys) - v = self.v_fc(values) - k = 
layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - - if cache is not None: - if static_kv and not "static_k" in cache: - # for encoder-decoder attention in inference and has not cached - cache["static_k"], cache["static_v"] = k, v - elif not static_kv: - # for decoder self-attention in inference - cache_k, cache_v = cache["k"], cache["v"] - k = layers.concat([cache_k, k], axis=2) - v = layers.concat([cache_v, v], axis=2) - cache["k"], cache["v"] = k, v - - return q, k, v - - def forward(self, queries, keys, values, attn_bias, cache=None): - # compute q ,k ,v - q, k, v = self._prepare_qkv(queries, keys, values, cache) - - # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) - if attn_bias is not None: - product += attn_bias - weights = layers.softmax(product) - if self.dropout_rate: - weights = layers.dropout( - weights, dropout_prob=self.dropout_rate, is_test=False) - - out = layers.matmul(weights, v) - - # combine heads - out = layers.transpose(out, perm=[0, 2, 1, 3]) - out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - - # project to output - out = self.proj_fc(out) - return out - - def cal_kv(self, keys, values): - k = self.k_fc(keys) - v = self.v_fc(values) - k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - return k, v - - -class FFN(Layer): - """ - Feed-Forward Network - """ - - def __init__(self, - d_inner_hid, - d_model, - dropout_rate, - fc1_act="relu", - reused_fc1=None, - reused_fc2=None): - super(FFN, self).__init__() - self.dropout_rate = dropout_rate - if reused_fc1 is not None: - self.fc1 = reused_fc1 - else: - self.fc1 = Linear( - input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) - if reused_fc2 is not None: - self.fc2 = reused_fc2 - else: - self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) - - def forward(self, x): - hidden = self.fc1(x) - if self.dropout_rate: - hidden = layers.dropout( - hidden, dropout_prob=self.dropout_rate, is_test=False) - out = self.fc2(hidden) - return out - - -class TransformerEncoderLayer(Layer): - """ - EncoderLayer - """ - - def __init__(self, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu", - reused_pre_selatt_layernorm=None, - reused_multihead_att_weights={ - "reused_query_fc": None, - "reused_key_fc": None, - "reused_value_fc": None, - "reused_proj_fc": None - }, - reused_post_selfatt_layernorm=None, - reused_pre_ffn_layernorm=None, - reused_ffn_weights={"reused_fc1": None, - "reused_fc2": None}, - reused_post_ffn_layernorm=None): - - super(TransformerEncoderLayer, self).__init__() - - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_selatt_layernorm) - self.self_attn = MultiHeadAttention( - d_key, - d_value, - d_model, - n_head, - attention_dropout, - reused_query_fc=reused_multihead_att_weights["reused_query_fc"], - reused_key_fc=reused_multihead_att_weights["reused_key_fc"], - reused_value_fc=reused_multihead_att_weights["reused_value_fc"], - reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"]) - 
self.postprocesser1 = PrePostProcessLayer( - postprocess_cmd, d_model, prepostprocess_dropout, - reused_post_selfatt_layernorm) - - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_ffn_layernorm) - self.ffn = FFN(d_inner_hid, - d_model, - relu_dropout, - fc1_act=ffn_fc1_act, - reused_fc1=reused_ffn_weights["reused_fc1"], - reused_fc2=reused_ffn_weights["reused_fc2"]) - self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout, - reused_post_ffn_layernorm) - - def forward(self, enc_input, attn_bias): - attn_output = self.self_attn( - self.preprocesser1(enc_input), None, None, attn_bias) - attn_output = self.postprocesser1(attn_output, enc_input) - - ffn_output = self.ffn(self.preprocesser2(attn_output)) - ffn_output = self.postprocesser2(ffn_output, attn_output) - return ffn_output - - -class TransformerEncoder(Layer): - """ - encoder - """ - - def __init__(self, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - - super(TransformerEncoder, self).__init__() - - self.encoder_layers = list() - for i in range(n_layer): - self.encoder_layers.append( - self.add_sublayer( - "layer_%d" % i, - TransformerEncoderLayer( - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - ffn_fc1_act=ffn_fc1_act))) - self.processer = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, enc_input, attn_bias): - for encoder_layer in self.encoder_layers: - enc_output = encoder_layer(enc_input, attn_bias) - enc_input = enc_output - - return self.processer(enc_output) - - -class TransformerDecoderLayer(Layer): - """ - decoder - """ - - def __init__(self, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da", - reused_pre_selfatt_layernorm=None, - reused_self_multihead_att_weights={ - "reused_query_fc": None, - "reused_key_fc": None, - "reused_value_fc": None, - "reused_proj_fc": None - }, - reused_post_selfatt_layernorm=None, - reused_pre_crossatt_layernorm=None, - reused_cross_multihead_att_weights={ - "reused_query_fc": None, - "reused_key_fc": None, - "reused_value_fc": None, - "reused_proj_fc": None - }, - reused_post_crossatt_layernorm=None, - reused_pre_ffn_layernorm=None, - reused_ffn_weights={"reused_fc1": None, - "reused_fc2": None}, - reused_post_ffn_layernorm=None): - super(TransformerDecoderLayer, self).__init__() - - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_selfatt_layernorm) - self.self_attn = MultiHeadAttention( - d_key, - d_value, - d_model, - n_head, - attention_dropout, - reused_query_fc=reused_self_multihead_att_weights[ - "reused_query_fc"], - reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"], - reused_value_fc=reused_self_multihead_att_weights[ - "reused_value_fc"], - reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"]) - self.postprocesser1 = PrePostProcessLayer( - postprocess_cmd, d_model, prepostprocess_dropout, - reused_post_selfatt_layernorm) - - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_crossatt_layernorm) - self.cross_attn = MultiHeadAttention( - d_key, 
- d_value, - d_model, - n_head, - attention_dropout, - reused_query_fc=reused_cross_multihead_att_weights[ - "reused_query_fc"], - reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"], - reused_value_fc=reused_cross_multihead_att_weights[ - "reused_value_fc"], - reused_proj_fc=reused_cross_multihead_att_weights[ - "reused_proj_fc"]) - self.postprocesser2 = PrePostProcessLayer( - postprocess_cmd, d_model, prepostprocess_dropout, - reused_post_crossatt_layernorm) - - self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout, - reused_pre_ffn_layernorm) - self.ffn = FFN(d_inner_hid, - d_model, - relu_dropout, - reused_fc1=reused_ffn_weights["reused_fc1"], - reused_fc2=reused_ffn_weights["reused_fc2"]) - self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout, - reused_post_ffn_layernorm) - - def forward(self, - dec_input, - enc_output, - self_attn_bias, - cross_attn_bias, - cache=None): - self_attn_output = self.self_attn( - self.preprocesser1(dec_input), None, None, self_attn_bias, cache) - self_attn_output = self.postprocesser1(self_attn_output, dec_input) - - cross_attn_output = self.cross_attn( - self.preprocesser2(self_attn_output), enc_output, enc_output, - cross_attn_bias, cache) - cross_attn_output = self.postprocesser2(cross_attn_output, - self_attn_output) - - ffn_output = self.ffn(self.preprocesser3(cross_attn_output)) - ffn_output = self.postprocesser3(ffn_output, cross_attn_output) - - return ffn_output - - -class TransformerDecoder(Layer): - """ - decoder - """ - - def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd): - super(TransformerDecoder, self).__init__() - - self.decoder_layers = list() - for i in range(n_layer): - self.decoder_layers.append( - self.add_sublayer( - "layer_%d" % i, - TransformerDecoderLayer( - n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, - relu_dropout, preprocess_cmd, postprocess_cmd))) - self.processer = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, - dec_input, - enc_output, - self_attn_bias, - cross_attn_bias, - caches=None): - for i, decoder_layer in enumerate(self.decoder_layers): - dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, - cross_attn_bias, None - if caches is None else caches[i]) - dec_input = dec_output - - return self.processer(dec_output) - - def prepare_static_cache(self, enc_output): - return [ - dict( - zip(("static_k", "static_v"), - decoder_layer.cross_attn.cal_kv(enc_output, enc_output))) - for decoder_layer in self.decoder_layers - ] - - -#TODO: we should merge GRUCell with BasicGRUCell -class GRUCell(RNNCell): - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation='sigmoid', - candidate_activation='tanh', - origin_mode=False): - super(GRUCell, self).__init__() - self.hidden_size = hidden_size - self.fc_layer = Linear( - input_size, hidden_size * 3, param_attr=param_attr) - - self.gru_unit = GRUUnit( - hidden_size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) - - def forward(self, inputs, states): - # for GRUCell, `step_outputs` and `new_states` both are hidden - x = self.fc_layer(inputs) - hidden, _, _ = self.gru_unit(x, states) - return hidden, hidden - - @property - def 
state_shape(self): - return [self.hidden_size] - - -#TODO: we should merge GRUCell with BasicGRUCell -class GRUEncoderCell(RNNCell): - def __init__(self, - num_layers, - input_size, - hidden_size, - dropout_prob=0., - init_scale=0.1): - super(GRUEncoderCell, self).__init__() - self.dropout_prob = dropout_prob - # use add_sublayer to add multi-layers - self.gru_cells = [] - for i in range(num_layers): - self.gru_cells.append( - self.add_sublayer( - "gru_%d" % i, - #BasicGRUCell( - GRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale))))) - - def forward(self, step_input, states): - new_states = [] - for i, gru_cell in enumerate(self.gru_cells): - out, state = gru_cell(step_input, states[i]) - step_input = layers.dropout( - out, - self.dropout_prob, - dropout_implementation='upscale_in_train' - ) if self.dropout_prob > 0 else out - new_states.append(step_input) - return step_input, new_states - - @property - def state_shape(self): - return [cell.state_shape for cell in self.gru_cells] - - -class BiGRU(fluid.dygraph.Layer): - def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): - super(BiGRU, self).__init__() - self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, - init_bound), - is_reverse=False, - time_major=False) - - self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0, - init_bound), - is_reverse=True, - time_major=False) - - def forward(self, input_feature): - pre_gru, pre_state = self.gru(input_feature) - gru_r, r_state = self.gru_r(input_feature) - bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) - return bi_merge - - -class Linear_chain_crf(fluid.dygraph.Layer): - def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(Linear_chain_crf, self).__init__() - - self._param_attr = param_attr - self._dtype = dtype - self._size = size - self._is_test = is_test - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) - - @property - def weight(self): - return self._transition - - @weight.setter - def weight(self, value): - self._transition = value - - def forward(self, input, label, length=None): - - alpha = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - emission_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - transition_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - log_likelihood = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": [label] - } - if length is not None: - this_inputs['Length'] = [length] - self._helper.append_op( - type='linear_chain_crf', - inputs=this_inputs, - outputs={ - "Alpha": [alpha], - "EmissionExps": [emission_exps], - "TransitionExps": transition_exps, - "LogLikelihood": log_likelihood - }, - attrs={"is_test": self._is_test, }) - return log_likelihood - - -class Crf_decoding(fluid.dygraph.Layer): - def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(Crf_decoding, self).__init__() - - self._dtype = dtype - self._size = size - self._is_test = is_test - self._param_attr = param_attr - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) - - @property - def weight(self): - 
return self._transition - - @weight.setter - def weight(self, value): - self._transition = value - - def forward(self, input, label=None, length=None): - - viterbi_path = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": label - } - if length is not None: - this_inputs['Length'] = [length] - self._helper.append_op( - type='crf_decoding', - inputs=this_inputs, - outputs={"ViterbiPath": [viterbi_path]}, - attrs={"is_test": self._is_test, }) - return viterbi_path - - -class GRUEncoderLayer(Layer): - def __init__(self, - input_dim, - grnn_hidden_dim, - init_bound, - num_layers=1, - h_0=None, - is_bidirection=False): - super(GRUEncoderLayer, self).__init__() - self.h_0 = h_0 - self.num_layers = num_layers - self.is_bidirection = is_bidirection - self.gru_list = [] - self.gru_r_list = [] - for i in range(num_layers): - self.basic_gru_cell = BasicGRUCell( - input_size=input_dim if i == 0 else input_dim * 2, - hidden_size=grnn_hidden_dim, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - self.gru_list.append( - self.add_sublayer( - "gru_%d" % i, - RNN(self.basic_gru_cell, - is_reverse=False, - time_major=False))) - if self.is_bidirection: - for i in range(num_layers): - self.basic_gru_cell_r = BasicGRUCell( - input_size=input_dim if i == 0 else input_dim * 2, - hidden_size=grnn_hidden_dim, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - self.gru_r_list.append( - self.add_sublayer( - "gru_r_%d" % i, - RNN(self.basic_gru_cell_r, - is_reverse=True, - time_major=False))) - - def forward(self, input_feature): - for i in range(self.num_layers): - pre_gru, pre_state = self.gru_list[i](input_feature) - if self.is_bidirection: - gru_r, r_state = self.gru_r_list[i](input_feature) - out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) - else: - out = pre_gru - input_feature = out - return out - - -class SequenceTagging(fluid.dygraph.Layer): - def __init__(self, - vocab_size, - num_labels, - batch_size, - word_emb_dim=128, - grnn_hidden_dim=128, - emb_learning_rate=0.1, - crf_learning_rate=0.1, - bigru_num=2, - init_bound=0.1, - length=None): - super(SequenceTagging, self).__init__() - """ - define the sequence tagging network structure - word: stores the input of the model - for_infer: a boolean value, indicating if the model to be created is for training or predicting. 
- - return: - for infer: return the prediction - otherwise: return the prediction - """ - self.word_emb_dim = word_emb_dim - self.vocab_size = vocab_size - self.num_labels = num_labels - self.grnn_hidden_dim = grnn_hidden_dim - self.emb_lr = emb_learning_rate - self.crf_lr = crf_learning_rate - self.bigru_num = bigru_num - self.batch_size = batch_size - self.init_bound = 0.1 - - self.word_embedding = Embedding( - size=[self.vocab_size, self.word_emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr( - learning_rate=self.emb_lr, - name="word_emb", - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound))) - - h_0 = fluid.layers.create_global_var( - shape=[self.batch_size, self.grnn_hidden_dim], - value=0.0, - dtype='float32', - persistable=True, - force_cpu=True, - name='h_0') - - self.gru_encoder = GRUEncoderLayer( - input_dim=self.grnn_hidden_dim, - grnn_hidden_dim=self.grnn_hidden_dim, - init_bound=self.init_bound, - num_layers=self.bigru_num, - h_0=h_0, - is_bidirection=True) - - self.fc = Linear( - input_dim=self.grnn_hidden_dim * 2, - output_dim=self.num_labels, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - - self.linear_chain_crf = Linear_chain_crf( - param_attr=fluid.ParamAttr( - name='linear_chain_crfw', learning_rate=self.crf_lr), - size=self.num_labels) - - self.crf_decoding = Crf_decoding( - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=self.crf_lr), - size=self.num_labels) - - def forward(self, word, lengths, target=None): - """ - Configure the network - """ - word_embed = self.word_embedding(word) - input_feature = word_embed - - bigru_output = self.gru_encoder(input_feature) - emission = self.fc(bigru_output) - - if target is not None: - crf_cost = self.linear_chain_crf( - input=emission, label=target, length=lengths) - avg_cost = fluid.layers.mean(x=crf_cost) - self.crf_decoding.weight = self.linear_chain_crf.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, avg_cost, lengths - else: - self.linear_chain_crf.weight = self.crf_decoding.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, lengths diff --git a/hapi/text/tokenizer/__init__.py b/hapi/text/tokenizer/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/hapi/text/tokenizer/tokenization.py b/hapi/text/tokenizer/tokenization.py deleted file mode 100644 index 08570f30fe9e6a8036a15095e67e6e8dd8686c14..0000000000000000000000000000000000000000 --- a/hapi/text/tokenizer/tokenization.py +++ /dev/null @@ -1,371 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
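For context on the SequenceTagging network defined above, a minimal usage sketch with fake data. The padded [batch, seq_len] word/label ids and the [batch] length vector are assumptions about the expected input shapes, and the class is assumed to be importable from the module shown in this diff; the two calls mirror the two return signatures of forward() (with and without target).

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    model = SequenceTagging(vocab_size=100, num_labels=5, batch_size=2)

    word = fluid.dygraph.to_variable(
        np.random.randint(0, 100, size=(2, 8)).astype('int64'))   # padded token ids
    lengths = fluid.dygraph.to_variable(np.array([8, 6], dtype='int64'))  # real lengths
    target = fluid.dygraph.to_variable(
        np.random.randint(0, 5, size=(2, 8)).astype('int64'))     # padded label ids

    # with target: returns the CRF decode path, the averaged CRF cost and lengths
    crf_decode, avg_cost, _ = model(word, lengths, target)
    # without target: copies the CRF transition weights and returns only decode + lengths
    crf_decode, _ = model(word, lengths)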
-"""Tokenization classes.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import unicodedata -import six -import io - - -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - elif isinstance(text, unicode): - return text - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def printable_text(text): - """Returns text encoded in a way suitable for print or `tf.logging`.""" - - # These functions want `str` for both Python2 and Python3, but in one case - # it's a Unicode string and in the other it's a byte string. - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text - elif isinstance(text, unicode): - return text.encode("utf-8") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - fin = io.open(vocab_file, encoding="utf8") - for num, line in enumerate(fin): - items = convert_to_unicode(line.strip()).split("\t") - if len(items) > 2: - break - token = items[0] - index = items[1] if len(items) == 2 else num - token = token.strip() - vocab[token] = int(index) - return vocab - - -def convert_by_vocab(vocab, items): - """Converts a sequence of [tokens|ids] using the vocab.""" - output = [] - for item in items: - output.append(vocab[item]) - return output - - -def convert_tokens_to_ids(vocab, tokens): - return convert_by_vocab(vocab, tokens) - - -def convert_ids_to_tokens(inv_vocab, ids): - return convert_by_vocab(inv_vocab, ids) - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a peice of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class FullTokenizer(object): - """Runs end-to-end tokenziation.""" - - def __init__(self, vocab_file, do_lower_case=True): - self.vocab = load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - - return split_tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab(self.vocab, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab(self.inv_vocab, ids) - - -class CharTokenizer(object): - """Runs end-to-end tokenziation.""" - - def __init__(self, vocab_file, do_lower_case=True): - self.vocab = load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - - def tokenize(self, 
text): - split_tokens = [] - for token in text.lower().split(" "): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - - return split_tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab(self.vocab, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab(self.inv_vocab, ids) - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, do_lower_case=True): - """Constructs a BasicTokenizer. - - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = convert_to_unicode(text) - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - text = self._tokenize_chinese_chars(text) - - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. 
- if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenziation.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer. - - Returns: - A list of wordpiece tokens. - """ - - text = convert_to_unicode(text) - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. 
- if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False diff --git a/hapi/utils.py b/hapi/utils.py deleted file mode 100644 index de928945dc68a1800c3cd9b14aaa0659e50c9945..0000000000000000000000000000000000000000 --- a/hapi/utils.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import numpy as np - -from paddle import fluid -from paddle.fluid.framework import Variable -from paddle.fluid.executor import global_scope - - -def to_list(value): - if value is None: - return value - if isinstance(value, (list, tuple)): - return list(value) - return [value] - - -def to_numpy(var): - assert isinstance(var, (Variable, fluid.core.VarBase)), "not a variable" - if isinstance(var, fluid.core.VarBase): - return var.numpy() - t = global_scope().find_var(var.name).get_tensor() - return np.array(t) - - -def flatten_list(l): - assert isinstance(l, list), "not a list" - outl = [] - splits = [] - for sl in l: - assert isinstance(sl, list), "sub content not a list" - splits.append(len(sl)) - outl += sl - return outl, splits - - -def restore_flatten_list(l, splits): - outl = [] - for split in splits: - assert len(l) >= split, "list length invalid" - sl, l = l[:split], l[split:] - outl.append(sl) - return outl - - -def extract_args(func): - if hasattr(inspect, 'getfullargspec'): - return inspect.getfullargspec(func)[0] - else: - return inspect.getargspec(func)[0] \ No newline at end of file diff --git a/hapi/vision/__init__.py b/hapi/vision/__init__.py deleted file mode 100644 index d2be76375599071e4b5016f1e4d1cb3f679050e8..0000000000000000000000000000000000000000 --- a/hapi/vision/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import models -from . import transforms - -__all__ = ["models", "transforms"] diff --git a/hapi/vision/models/__init__.py b/hapi/vision/models/__init__.py deleted file mode 100644 index 70ff0df8462068bfdf6c8c156175b7b0a548dba5..0000000000000000000000000000000000000000 --- a/hapi/vision/models/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
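For reference, a small self-contained sketch of the FullTokenizer from the tokenization module removed above (hapi/text/tokenizer/tokenization.py). The toy vocab file is hypothetical and only illustrates the format load_vocab() accepts (one token per line, or token<TAB>id); real BERT checkpoints ship their own vocab.txt.

import io
# import path as it existed before this diff
from hapi.text.tokenizer.tokenization import FullTokenizer

vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "un", "##aff", "##able", "runs", "!"]
with io.open("toy_vocab.txt", "w", encoding="utf8") as f:
    f.write(u"\n".join(vocab_tokens))

tokenizer = FullTokenizer(vocab_file="toy_vocab.txt", do_lower_case=True)
tokens = tokenizer.tokenize("Runs unaffable!")
# greedy longest-match wordpieces: ['runs', 'un', '##aff', '##able', '!']
ids = tokenizer.convert_tokens_to_ids(tokens)   # ids follow the vocab line order above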
-# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. - -from . import resnet -from . import vgg -from . import mobilenetv1 -from . import mobilenetv2 -from . import lenet - -from .resnet import * -from .mobilenetv1 import * -from .mobilenetv2 import * -from .vgg import * -from .lenet import * - -__all__ = resnet.__all__ \ - + vgg.__all__ \ - + mobilenetv1.__all__ \ - + mobilenetv2.__all__ \ - + lenet.__all__ diff --git a/hapi/vision/models/lenet.py b/hapi/vision/models/lenet.py deleted file mode 100644 index 568165490e5daaafc921f32553000b4eda558f85..0000000000000000000000000000000000000000 --- a/hapi/vision/models/lenet.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. - -import paddle.fluid as fluid - -from paddle.fluid.dygraph.nn import Conv2D, BatchNorm, Pool2D, Linear -from paddle.fluid.dygraph.container import Sequential -from hapi.model import Model - -__all__ = ['LeNet'] - - -class LeNet(Model): - """LeNet model from - `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_ - - Args: - num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer - will not be defined. Default: 10. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. - - Examples: - .. code-block:: python - - from hapi.vision.models import LeNet - - model = LeNet() - """ - - def __init__(self, num_classes=10, classifier_activation='softmax'): - super(LeNet, self).__init__() - self.num_classes = num_classes - self.features = Sequential( - Conv2D( - 1, 6, 3, stride=1, padding=1), - Pool2D(2, 'max', 2), - Conv2D( - 6, 16, 5, stride=1, padding=0), - Pool2D(2, 'max', 2)) - - if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), - Linear(120, 84), - Linear( - 84, 10, act=classifier_activation)) - - def forward(self, inputs): - x = self.features(inputs) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x diff --git a/hapi/vision/models/mobilenetv1.py b/hapi/vision/models/mobilenetv1.py deleted file mode 100644 index 8afd53a2e2dce9d7e93a6bee9bce8c71668dc3fb..0000000000000000000000000000000000000000 --- a/hapi/vision/models/mobilenetv1.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
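A minimal forward-pass sketch for the LeNet model above (import path as given in its own docstring). The 28x28 input size is implied rather than stated: after the two conv/pool stages the flattened feature size is 16 * 5 * 5 = 400, which is what the first Linear(400, 120) expects. Running the layer directly under a dygraph guard is an assumption about typical usage.

import numpy as np
import paddle.fluid as fluid
from hapi.vision.models import LeNet

with fluid.dygraph.guard():
    model = LeNet()                      # default: 10 classes, softmax head
    fake_img = fluid.dygraph.to_variable(
        np.random.rand(4, 1, 28, 28).astype('float32'))   # NCHW, single channel
    out = model(fake_img)                # expected shape: [4, 10]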
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -import paddle.fluid as fluid -from paddle.fluid.initializer import MSRA -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear - -from hapi.model import Model -from hapi.download import get_weights_path_from_url - -__all__ = ['MobileNetV1', 'mobilenet_v1'] - -model_urls = { - 'mobilenetv1_1.0': - ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams', - 'bf0d25cb0bed1114d9dac9384ce2b4a6') -} - - -class ConvBNLayer(fluid.dygraph.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - padding, - channels=None, - num_groups=1, - act='relu', - use_cudnn=True, - name=None): - super(ConvBNLayer, self).__init__() - - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "_weights"), - bias_attr=False) - - self._batch_norm = BatchNorm( - num_filters, - act=act, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') - - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - return y - - -class DepthwiseSeparable(fluid.dygraph.Layer): - def __init__(self, - num_channels, - num_filters1, - num_filters2, - num_groups, - stride, - scale, - name=None): - super(DepthwiseSeparable, self).__init__() - - self._depthwise_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=int(num_filters1 * scale), - filter_size=3, - stride=stride, - padding=1, - num_groups=int(num_groups * scale), - use_cudnn=False) - - self._pointwise_conv = ConvBNLayer( - num_channels=int(num_filters1 * scale), - filter_size=1, - num_filters=int(num_filters2 * scale), - stride=1, - padding=0) - - def forward(self, inputs): - y = self._depthwise_conv(inputs) - y = self._pointwise_conv(y) - return y - - -class MobileNetV1(Model): - """MobileNetV1 model from - `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" `_. - - Args: - scale (float): scale of channels in each layer. Default: 1.0. - num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer - will not be defined. Default: 1000. - with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. - - Examples: - .. 
code-block:: python - - from hapi.vision.models import MobileNetV1 - - model = MobileNetV1() - """ - - def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): - super(MobileNetV1, self).__init__() - self.scale = scale - self.dwsl = [] - self.num_classes = num_classes - self.with_pool = with_pool - - self.conv1 = ConvBNLayer( - num_channels=3, - filter_size=3, - channels=3, - num_filters=int(32 * scale), - stride=2, - padding=1) - - dws21 = self.add_sublayer( - sublayer=DepthwiseSeparable( - num_channels=int(32 * scale), - num_filters1=32, - num_filters2=64, - num_groups=32, - stride=1, - scale=scale), - name="conv2_1") - self.dwsl.append(dws21) - - dws22 = self.add_sublayer( - sublayer=DepthwiseSeparable( - num_channels=int(64 * scale), - num_filters1=64, - num_filters2=128, - num_groups=64, - stride=2, - scale=scale), - name="conv2_2") - self.dwsl.append(dws22) - - dws31 = self.add_sublayer( - sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=128, - num_groups=128, - stride=1, - scale=scale), - name="conv3_1") - self.dwsl.append(dws31) - - dws32 = self.add_sublayer( - sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=256, - num_groups=128, - stride=2, - scale=scale), - name="conv3_2") - self.dwsl.append(dws32) - - dws41 = self.add_sublayer( - sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=256, - num_groups=256, - stride=1, - scale=scale), - name="conv4_1") - self.dwsl.append(dws41) - - dws42 = self.add_sublayer( - sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=512, - num_groups=256, - stride=2, - scale=scale), - name="conv4_2") - self.dwsl.append(dws42) - - for i in range(5): - tmp = self.add_sublayer( - sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=512, - num_groups=512, - stride=1, - scale=scale), - name="conv5_" + str(i + 1)) - self.dwsl.append(tmp) - - dws56 = self.add_sublayer( - sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=1024, - num_groups=512, - stride=2, - scale=scale), - name="conv5_6") - self.dwsl.append(dws56) - - dws6 = self.add_sublayer( - sublayer=DepthwiseSeparable( - num_channels=int(1024 * scale), - num_filters1=1024, - num_filters2=1024, - num_groups=1024, - stride=1, - scale=scale), - name="conv6") - self.dwsl.append(dws6) - - if with_pool: - self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > -1: - self.out = Linear( - int(1024 * scale), - num_classes, - act=classifier_activation, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "fc7_weights"), - bias_attr=ParamAttr(name="fc7_offset")) - - def forward(self, inputs): - y = self.conv1(inputs) - for dws in self.dwsl: - y = dws(y) - - if self.with_pool: - y = self.pool2d_avg(y) - - if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, 1024]) - y = self.out(y) - return y - - -def _mobilenet(arch, pretrained=False, **kwargs): - model = MobileNetV1(**kwargs) - if pretrained: - assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( - arch) - weight_path = get_weights_path_from_url(model_urls[arch][0], - model_urls[arch][1]) - assert weight_path.endswith( - '.pdparams'), "suffix of weight must be .pdparams" - model.load(weight_path) - - return model - - -def 
mobilenet_v1(pretrained=False, scale=1.0, **kwargs): - """MobileNetV1 - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. - scale: (float): scale of channels in each layer. Default: 1.0. - - Examples: - .. code-block:: python - - from hapi.vision.models import mobilenet_v1 - - # build model - model = mobilenet_v1() - - #build model and load imagenet pretrained weight - model = mobilenet_v1(pretrained=True) - - #build mobilenet v1 with scale=0.5 - model = mobilenet_v1(scale=0.5) - """ - model = _mobilenet( - 'mobilenetv1_' + str(scale), pretrained, scale=scale, **kwargs) - return model diff --git a/hapi/vision/models/mobilenetv2.py b/hapi/vision/models/mobilenetv2.py deleted file mode 100644 index 0b0179334a48fe67dd2e7f9cca3625932ffb33b2..0000000000000000000000000000000000000000 --- a/hapi/vision/models/mobilenetv2.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -import paddle.fluid as fluid -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear - -from hapi.model import Model -from hapi.download import get_weights_path_from_url - -__all__ = ['MobileNetV2', 'mobilenet_v2'] - -model_urls = { - 'mobilenetv2_1.0': - ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams', - '8ff74f291f72533f2a7956a4efff9d88') -} - - -class ConvBNLayer(fluid.dygraph.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - padding, - channels=None, - num_groups=1, - use_cudnn=True): - super(ConvBNLayer, self).__init__() - - tmp_param = ParamAttr(name=self.full_name() + "_weights") - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=tmp_param, - bias_attr=False) - - self._batch_norm = BatchNorm( - num_filters, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') - - def forward(self, inputs, if_act=True): - y = self._conv(inputs) - y = self._batch_norm(y) - if if_act: - y = fluid.layers.relu6(y) - return y - - -class InvertedResidualUnit(fluid.dygraph.Layer): - def __init__( - self, - num_channels, - num_in_filter, - num_filters, - stride, - filter_size, - padding, - expansion_factor, ): - super(InvertedResidualUnit, self).__init__() - num_expfilter = int(round(num_in_filter * expansion_factor)) - self._expand_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=num_expfilter, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - self._bottleneck_conv = ConvBNLayer( - num_channels=num_expfilter, - num_filters=num_expfilter, - filter_size=filter_size, - 
stride=stride, - padding=padding, - num_groups=num_expfilter, - use_cudnn=False) - - self._linear_conv = ConvBNLayer( - num_channels=num_expfilter, - num_filters=num_filters, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - def forward(self, inputs, ifshortcut): - y = self._expand_conv(inputs, if_act=True) - y = self._bottleneck_conv(y, if_act=True) - y = self._linear_conv(y, if_act=False) - if ifshortcut: - y = fluid.layers.elementwise_add(inputs, y) - return y - - -class InvresiBlocks(fluid.dygraph.Layer): - def __init__(self, in_c, t, c, n, s): - super(InvresiBlocks, self).__init__() - - self._first_block = InvertedResidualUnit( - num_channels=in_c, - num_in_filter=in_c, - num_filters=c, - stride=s, - filter_size=3, - padding=1, - expansion_factor=t) - - self._inv_blocks = [] - for i in range(1, n): - tmp = self.add_sublayer( - sublayer=InvertedResidualUnit( - num_channels=c, - num_in_filter=c, - num_filters=c, - stride=1, - filter_size=3, - padding=1, - expansion_factor=t), - name=self.full_name() + "_" + str(i + 1)) - self._inv_blocks.append(tmp) - - def forward(self, inputs): - y = self._first_block(inputs, ifshortcut=False) - for inv_block in self._inv_blocks: - y = inv_block(y, ifshortcut=True) - return y - - -class MobileNetV2(Model): - """MobileNetV2 model from - `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. - - Args: - scale (float): scale of channels in each layer. Default: 1.0. - num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer - will not be defined. Default: 1000. - with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. - - Examples: - .. code-block:: python - - from hapi.vision.models import MobileNetV2 - - model = MobileNetV2() - """ - - def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): - super(MobileNetV2, self).__init__() - self.scale = scale - self.num_classes = num_classes - self.with_pool = with_pool - - bottleneck_params_list = [ - (1, 16, 1, 1), - (6, 24, 2, 2), - (6, 32, 3, 2), - (6, 64, 4, 2), - (6, 96, 3, 1), - (6, 160, 3, 2), - (6, 320, 1, 1), - ] - - self._conv1 = ConvBNLayer( - num_channels=3, - num_filters=int(32 * scale), - filter_size=3, - stride=2, - padding=1) - - self._invl = [] - i = 1 - in_c = int(32 * scale) - for layer_setting in bottleneck_params_list: - t, c, n, s = layer_setting - i += 1 - tmp = self.add_sublayer( - sublayer=InvresiBlocks( - in_c=in_c, t=t, c=int(c * scale), n=n, s=s), - name='conv' + str(i)) - self._invl.append(tmp) - in_c = int(c * scale) - - self._out_c = int(1280 * scale) if scale > 1.0 else 1280 - self._conv9 = ConvBNLayer( - num_channels=in_c, - num_filters=self._out_c, - filter_size=1, - stride=1, - padding=0) - - if with_pool: - self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > 0: - tmp_param = ParamAttr(name=self.full_name() + "fc10_weights") - self._fc = Linear( - self._out_c, - num_classes, - act=classifier_activation, - param_attr=tmp_param, - bias_attr=ParamAttr(name="fc10_offset")) - - def forward(self, inputs): - y = self._conv1(inputs, if_act=True) - for inv in self._invl: - y = inv(y) - y = self._conv9(y, if_act=True) - - if self.with_pool: - y = self._pool2d_avg(y) - if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, self._out_c]) - y = self._fc(y) - return y - - -def _mobilenet(arch, pretrained=False, **kwargs): - model = MobileNetV2(**kwargs) - 
if pretrained: - assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( - arch) - weight_path = get_weights_path_from_url(model_urls[arch][0], - model_urls[arch][1]) - assert weight_path.endswith( - '.pdparams'), "suffix of weight must be .pdparams" - model.load(weight_path) - - return model - - -def mobilenet_v2(pretrained=False, scale=1.0, **kwargs): - """MobileNetV2 - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. - scale: (float): scale of channels in each layer. Default: 1.0. - - Examples: - .. code-block:: python - - from hapi.vision.models import mobilenet_v2 - - # build model - model = mobilenet_v2() - - #build model and load imagenet pretrained weight - model = mobilenet_v2(pretrained=True) - - #build mobilenet v2 with scale=0.5 - model = mobilenet_v2(scale=0.5) - """ - model = _mobilenet( - 'mobilenetv2_' + str(scale), pretrained, scale=scale, **kwargs) - return model diff --git a/hapi/vision/models/resnet.py b/hapi/vision/models/resnet.py deleted file mode 100644 index 2cabe4bdfdd0bfe597963330ffb8a00698455621..0000000000000000000000000000000000000000 --- a/hapi/vision/models/resnet.py +++ /dev/null @@ -1,389 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
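A short sketch of the mobilenet_v2 helper above. Note that model_urls only contains an entry for 'mobilenetv2_1.0' and _mobilenet looks up 'mobilenetv2_' + str(scale), so pretrained weights can only be loaded for scale=1.0; other widths have to be trained from scratch.

import paddle.fluid as fluid
from hapi.vision.models import mobilenet_v2

with fluid.dygraph.guard():
    half_width = mobilenet_v2(scale=0.5)          # randomly initialized, no pretrained entry
    # full_width = mobilenet_v2(pretrained=True)  # scale=1.0, downloads ImageNet weights
    # both expect NCHW float32 images, e.g. [N, 3, 224, 224], and emit [N, 1000] by default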
- -from __future__ import division -from __future__ import print_function - -import math -import paddle.fluid as fluid - -from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from paddle.fluid.dygraph.container import Sequential - -from hapi.model import Model -from hapi.download import get_weights_path_from_url - -__all__ = [ - 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', - 'BottleneckBlock', 'BasicBlock' -] - -model_urls = { - 'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams', - '0ba53eea9bc970962d0ef96f7b94057e'), - 'resnet34': ('https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams', - '46bc9f7c3dd2e55b7866285bee91eff3'), - 'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams', - '5ce890a9ad386df17cf7fe2313dca0a1'), - 'resnet101': - ('https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams', - 'fb07a451df331e4b0bb861ed97c3a9b9'), - 'resnet152': - ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams', - 'f9c700f26d3644bb76ad2226ed5f5713'), -} - - -class ConvBNLayer(fluid.dygraph.Layer): - def __init__(self, - num_channels, - num_filters, - filter_size, - stride=1, - groups=1, - act=None): - super(ConvBNLayer, self).__init__() - - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - act=None, - bias_attr=False) - - self._batch_norm = BatchNorm(num_filters, act=act) - - def forward(self, inputs): - x = self._conv(inputs) - x = self._batch_norm(x) - - return x - - -class BasicBlock(fluid.dygraph.Layer): - """residual block of resnet18 and resnet34 - """ - expansion = 1 - - def __init__(self, num_channels, num_filters, stride, shortcut=True): - super(BasicBlock, self).__init__() - - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=3, - act='relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - stride=stride, - act='relu') - - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - stride=stride) - - self.shortcut = shortcut - - def forward(self, inputs): - y = self.conv0(inputs) - conv1 = self.conv1(y) - - if self.shortcut: - short = inputs - else: - short = self.short(inputs) - - y = short + conv1 - - return fluid.layers.relu(y) - - -class BottleneckBlock(fluid.dygraph.Layer): - """residual block of resnet50, resnet101 amd resnet152 - """ - - expansion = 4 - - def __init__(self, num_channels, num_filters, stride, shortcut=True): - super(BottleneckBlock, self).__init__() - - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - act='relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - stride=stride, - act='relu') - self.conv2 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters * self.expansion, - filter_size=1, - act=None) - - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters * self.expansion, - filter_size=1, - stride=stride) - - self.shortcut = shortcut - - self._num_channels_out = num_filters * self.expansion - - def forward(self, inputs): - x = self.conv0(inputs) - conv1 = self.conv1(x) - conv2 = self.conv2(conv1) - - if self.shortcut: - short = inputs - else: - short = 
self.short(inputs) - - x = fluid.layers.elementwise_add(x=short, y=conv2) - - return fluid.layers.relu(x) - - -class ResNet(Model): - """ResNet model from - `"Deep Residual Learning for Image Recognition" `_ - - Args: - Block (BasicBlock|BottleneckBlock): block module of model. - depth (int): layers of resnet, default: 50. - num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer - will not be defined. Default: 1000. - with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. - - Examples: - .. code-block:: python - - from hapi.vision.models import ResNet, BottleneckBlock, BasicBlock - - resnet50 = ResNet(BottleneckBlock, 50) - - resnet18 = ResNet(BasicBlock, 18) - - """ - - def __init__(self, - Block, - depth=50, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): - super(ResNet, self).__init__() - - self.num_classes = num_classes - self.with_pool = with_pool - - layer_config = { - 18: [2, 2, 2, 2], - 34: [3, 4, 6, 3], - 50: [3, 4, 6, 3], - 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3], - } - assert depth in layer_config.keys(), \ - "supported depth are {} but input layer is {}".format( - layer_config.keys(), depth) - - layers = layer_config[depth] - - in_channels = 64 - out_channels = [64, 128, 256, 512] - - self.conv = ConvBNLayer( - num_channels=3, - num_filters=64, - filter_size=7, - stride=2, - act='relu') - self.pool = Pool2D( - pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') - - self.layers = [] - for idx, num_blocks in enumerate(layers): - blocks = [] - shortcut = False - for b in range(num_blocks): - if b == 1: - in_channels = out_channels[idx] * Block.expansion - block = Block( - num_channels=in_channels, - num_filters=out_channels[idx], - stride=2 if b == 0 and idx != 0 else 1, - shortcut=shortcut) - blocks.append(block) - shortcut = True - layer = self.add_sublayer("layer_{}".format(idx), - Sequential(*blocks)) - self.layers.append(layer) - - if with_pool: - self.global_pool = Pool2D( - pool_size=7, pool_type='avg', global_pooling=True) - - if num_classes > 0: - stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0) - self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1 - self.fc = Linear( - self.fc_input_dim, - num_classes, - act=classifier_activation, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) - - def forward(self, inputs): - x = self.conv(inputs) - x = self.pool(x) - for layer in self.layers: - x = layer(x) - - if self.with_pool: - x = self.global_pool(x) - - if self.num_classes > -1: - x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim]) - x = self.fc(x) - return x - - -def _resnet(arch, Block, depth, pretrained, **kwargs): - model = ResNet(Block, depth, **kwargs) - if pretrained: - assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( - arch) - weight_path = get_weights_path_from_url(model_urls[arch][0], - model_urls[arch][1]) - assert weight_path.endswith( - '.pdparams'), "suffix of weight must be .pdparams" - model.load(weight_path) - return model - - -def resnet18(pretrained=False, **kwargs): - """ResNet 18-layer model - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - - Examples: - .. 
code-block:: python - - from hapi.vision.models import resnet18 - - # build model - model = resnet18() - - #build model and load imagenet pretrained weight - model = resnet18(pretrained=True) - """ - return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs) - - -def resnet34(pretrained=False, **kwargs): - """ResNet 34-layer model - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - - Examples: - .. code-block:: python - - from hapi.vision.models import resnet34 - - # build model - model = resnet34() - - #build model and load imagenet pretrained weight - model = resnet34(pretrained=True) - """ - return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs) - - -def resnet50(pretrained=False, **kwargs): - """ResNet 50-layer model - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - - Examples: - .. code-block:: python - - from hapi.vision.models import resnet50 - - # build model - model = resnet50() - - #build model and load imagenet pretrained weight - model = resnet50(pretrained=True) - """ - return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs) - - -def resnet101(pretrained=False, **kwargs): - """ResNet 101-layer model - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - - Examples: - .. code-block:: python - - from hapi.vision.models import resnet101 - - # build model - model = resnet101() - - #build model and load imagenet pretrained weight - model = resnet101(pretrained=True) - """ - return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs) - - -def resnet152(pretrained=False, **kwargs): - """ResNet 152-layer model - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - - Examples: - .. code-block:: python - - from hapi.vision.models import resnet152 - - # build model - model = resnet152() - - #build model and load imagenet pretrained weight - model = resnet152(pretrained=True) - """ - return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs) diff --git a/hapi/vision/models/vgg.py b/hapi/vision/models/vgg.py deleted file mode 100644 index 3a8c59737ab4d42f72de27d20846bb13d727a6ac..0000000000000000000000000000000000000000 --- a/hapi/vision/models/vgg.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
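A sketch of the two ways the resnet module above builds a network: the generic ResNet(Block, depth) constructor and the named helpers, which are thin wrappers around it. num_classes=10 is an arbitrary value chosen here to show the configurable classification head.

import paddle.fluid as fluid
from hapi.vision.models import ResNet, BasicBlock, BottleneckBlock, resnet50

with fluid.dygraph.guard():
    # BasicBlock backs the 18/34-layer variants, BottleneckBlock the 50/101/152 ones
    small_head = ResNet(BasicBlock, depth=34, num_classes=10)
    plain_50 = ResNet(BottleneckBlock, depth=50)      # equivalent to resnet50()
    # pretrained_50 = resnet50(pretrained=True)       # downloads ImageNet weights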
- -import paddle -import paddle.fluid as fluid -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from paddle.fluid.dygraph.container import Sequential - -from hapi.model import Model -from hapi.download import get_weights_path_from_url - -__all__ = [ - 'VGG', - 'vgg11', - 'vgg13', - 'vgg16', - 'vgg19', -] - -model_urls = { - 'vgg16': ('https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams', - 'c788f453a3b999063e8da043456281ee') -} - - -class Classifier(fluid.dygraph.Layer): - def __init__(self, num_classes, classifier_activation='softmax'): - super(Classifier, self).__init__() - self.linear1 = Linear(512 * 7 * 7, 4096) - self.linear2 = Linear(4096, 4096) - self.linear3 = Linear(4096, num_classes, act=classifier_activation) - - def forward(self, x): - x = self.linear1(x) - x = fluid.layers.relu(x) - x = fluid.layers.dropout(x, 0.5) - x = self.linear2(x) - x = fluid.layers.relu(x) - x = fluid.layers.dropout(x, 0.5) - out = self.linear3(x) - return out - - -class VGG(Model): - """VGG model from - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ - - Args: - features (fluid.dygraph.Layer): vgg features create by function make_layers. - num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer - will not be defined. Default: 1000. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. - """ - - def __init__(self, - features, - num_classes=1000, - classifier_activation='softmax'): - super(VGG, self).__init__() - self.features = features - self.num_classes = num_classes - - if num_classes > 0: - classifier = Classifier(num_classes, classifier_activation) - self.classifier = self.add_sublayer("classifier", - Sequential(classifier)) - - def forward(self, x): - x = self.features(x) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.classifier(x) - return x - - -def make_layers(cfg, batch_norm=False): - layers = [] - in_channels = 3 - - for v in cfg: - if v == 'M': - layers += [Pool2D(pool_size=2, pool_stride=2)] - else: - if batch_norm: - conv2d = Conv2D(in_channels, v, filter_size=3, padding=1) - layers += [conv2d, BatchNorm(v, act='relu')] - else: - conv2d = Conv2D( - in_channels, v, filter_size=3, padding=1, act='relu') - layers += [conv2d] - in_channels = v - return Sequential(*layers) - - -cfgs = { - 'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'B': - [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'D': [ - 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', - 512, 512, 512, 'M' - ], - 'E': [ - 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, - 512, 'M', 512, 512, 512, 512, 'M' - ], -} - - -def _vgg(arch, cfg, batch_norm, pretrained, **kwargs): - model = VGG(make_layers( - cfgs[cfg], batch_norm=batch_norm), - num_classes=1000, - **kwargs) - - if pretrained: - assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( - arch) - weight_path = get_weights_path_from_url(model_urls[arch][0], - model_urls[arch][1]) - assert weight_path.endswith( - '.pdparams'), "suffix of weight must be .pdparams" - model.load(weight_path) - - return model - - -def vgg11(pretrained=False, batch_norm=False, **kwargs): - """VGG 11-layer model - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. - batch_norm (bool): If True, returns a model with batch_norm layer. Default: False. - - Examples: - .. 
code-block:: python - - from hapi.vision.models import vgg11 - - # build model - model = vgg11() - - #build vgg11 model with batch_norm - model = vgg11(batch_norm=True) - """ - model_name = 'vgg11' - if batch_norm: - model_name += ('_bn') - return _vgg(model_name, 'A', batch_norm, pretrained, **kwargs) - - -def vgg13(pretrained=False, batch_norm=False, **kwargs): - """VGG 13-layer model - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. - batch_norm (bool): If True, returns a model with batch_norm layer. Default: False. - - Examples: - .. code-block:: python - - from hapi.vision.models import vgg13 - - # build model - model = vgg13() - - #build vgg13 model with batch_norm - model = vgg13(batch_norm=True) - """ - model_name = 'vgg13' - if batch_norm: - model_name += ('_bn') - return _vgg(model_name, 'B', batch_norm, pretrained, **kwargs) - - -def vgg16(pretrained=False, batch_norm=False, **kwargs): - """VGG 16-layer model - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. - batch_norm (bool): If True, returns a model with batch_norm layer. Default: False. - - Examples: - .. code-block:: python - - from hapi.vision.models import vgg16 - - # build model - model = vgg16() - - #build vgg16 model with batch_norm - model = vgg16(batch_norm=True) - """ - model_name = 'vgg16' - if batch_norm: - model_name += ('_bn') - return _vgg(model_name, 'D', batch_norm, pretrained, **kwargs) - - -def vgg19(pretrained=False, batch_norm=False, **kwargs): - """VGG 19-layer model - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. - batch_norm (bool): If True, returns a model with batch_norm layer. Default: False. - - Examples: - .. code-block:: python - - from hapi.vision.models import vgg19 - - # build model - model = vgg19() - - #build vgg19 model with batch_norm - model = vgg19(batch_norm=True) - """ - model_name = 'vgg19' - if batch_norm: - model_name += ('_bn') - return _vgg(model_name, 'E', batch_norm, pretrained, **kwargs) diff --git a/hapi/vision/transforms/__init__.py b/hapi/vision/transforms/__init__.py deleted file mode 100644 index f7c5b63b19ed081ee6887850c1aa3ef918715222..0000000000000000000000000000000000000000 --- a/hapi/vision/transforms/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import transforms -from . import functional - -from .transforms import * -from .functional import * - -__all__ = transforms.__all__ \ - + functional.__all__ diff --git a/hapi/vision/transforms/functional.py b/hapi/vision/transforms/functional.py deleted file mode 100644 index 6af4af9dadc5dbb7395977d94c76967da928b573..0000000000000000000000000000000000000000 --- a/hapi/vision/transforms/functional.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
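A sketch showing how the vgg helpers above are assembled: vgg16(batch_norm=True) is just VGG wrapped around make_layers(cfgs['D'], batch_norm=True), with configuration keys 'A'/'B'/'D'/'E' mapping to the 11/13/16/19-layer variants.

import paddle.fluid as fluid
from hapi.vision.models import vgg16
from hapi.vision.models.vgg import VGG, make_layers, cfgs

with fluid.dygraph.guard():
    model = vgg16(batch_norm=True)
    # equivalent construction through the building blocks:
    same_arch = VGG(make_layers(cfgs['D'], batch_norm=True), num_classes=1000)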
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import collections -import random - -import cv2 -import numpy as np - -if sys.version_info < (3, 3): - Sequence = collections.Sequence - Iterable = collections.Iterable -else: - Sequence = collections.abc.Sequence - Iterable = collections.abc.Iterable - -__all__ = ['flip', 'resize'] - - -def flip(image, code): - """ - Accordding to the code (the type of flip), flip the input image - - Args: - image: Input image, with (H, W, C) shape - code: Code that indicates the type of flip. - -1 : Flip horizontally and vertically - 0 : Flip vertically - 1 : Flip horizontally - - Examples: - .. code-block:: python - - import numpy as np - from hapi.vision.transforms import functional as F - - fake_img = np.random.rand(224, 224, 3) - - # flip horizontally and vertically - F.flip(fake_img, -1) - - # flip vertically - F.flip(fake_img, 0) - - # flip horizontally - F.flip(fake_img, 1) - """ - return cv2.flip(image, flipCode=code) - - -def resize(img, size, interpolation=cv2.INTER_LINEAR): - """ - resize the input data to given size - - Args: - input: Input data, could be image or masks, with (H, W, C) shape - size: Target size of input data, with (height, width) shape. - interpolation: Interpolation method. - - Examples: - .. code-block:: python - - import numpy as np - from hapi.vision.transforms import functional as F - - fake_img = np.random.rand(256, 256, 3) - - F.resize(fake_img, 224) - - F.resize(fake_img, (200, 150)) - """ - - if isinstance(interpolation, Sequence): - interpolation = random.choice(interpolation) - - if isinstance(size, int): - h, w = img.shape[:2] - if (w <= h and w == size) or (h <= w and h == size): - return img - if w < h: - ow = size - oh = int(size * h / w) - return cv2.resize(img, (ow, oh), interpolation=interpolation) - else: - oh = size - ow = int(size * w / h) - return cv2.resize(img, (ow, oh), interpolation=interpolation) - else: - return cv2.resize(img, size[::-1], interpolation=interpolation) diff --git a/hapi/vision/transforms/transforms.py b/hapi/vision/transforms/transforms.py deleted file mode 100644 index 09e1cf16607a04a0d9097720e76c4ca0001290c7..0000000000000000000000000000000000000000 --- a/hapi/vision/transforms/transforms.py +++ /dev/null @@ -1,802 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
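A small numpy sketch of the functional helpers above: images are HWC arrays, flip() takes a cv2-style flip code, and resize() with an int rescales the shorter edge while a (height, width) tuple gives an exact output size. The shapes in the comments follow from the resize() logic shown in the diff.

import numpy as np
from hapi.vision.transforms import functional as F

fake_img = np.random.rand(300, 400, 3).astype('float32')      # H=300, W=400

flipped = F.flip(fake_img, 1)                 # 1: horizontal, 0: vertical, -1: both
short_side = F.resize(fake_img, 224)          # shorter edge -> 224, result (224, 298, 3)
exact_size = F.resize(fake_img, (200, 150))   # (height, width) -> result (200, 150, 3)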
- -from __future__ import division - -import math -import sys -import random -import cv2 - -import numpy as np -import numbers -import types -import collections -import warnings -import traceback - -from . import functional as F - -if sys.version_info < (3, 3): - Sequence = collections.Sequence - Iterable = collections.Iterable -else: - Sequence = collections.abc.Sequence - Iterable = collections.abc.Iterable - -__all__ = [ - "Compose", - "BatchCompose", - "Resize", - "RandomResizedCrop", - "CenterCropResize", - "CenterCrop", - "RandomHorizontalFlip", - "RandomVerticalFlip", - "Permute", - "Normalize", - "GaussianNoise", - "BrightnessTransform", - "SaturationTransform", - "ContrastTransform", - "HueTransform", - "ColorJitter", -] - - -class Compose(object): - """ - Composes several transforms together use for composing list of transforms - together for a dataset transform. - - Args: - transforms (list): List of transforms to compose. - - Returns: - A compose object which is callable, __call__ for this Compose - object will call each given :attr:`transforms` sequencely. - - Examples: - - .. code-block:: python - - from hapi.datasets import Flowers - from hapi.vision.transforms import Compose, ColorJitter, Resize - - transform = Compose([ColorJitter(), Resize(size=608)]) - flowers = Flowers(mode='test', transform=transform) - - for i in range(10): - sample = flowers[i] - print(sample[0].shape, sample[1]) - - """ - - def __init__(self, transforms): - self.transforms = transforms - - def __call__(self, *data): - for f in self.transforms: - try: - # multi-fileds in a sample - if isinstance(data, Sequence): - data = f(*data) - # single field in a sample, call transform directly - else: - data = f(data) - except Exception as e: - stack_info = traceback.format_exc() - print("fail to perform transform [{}] with error: " - "{} and stack:\n{}".format(f, e, str(stack_info))) - raise e - return data - - def __repr__(self): - format_string = self.__class__.__name__ + '(' - for t in self.transforms: - format_string += '\n' - format_string += ' {0}'.format(t) - format_string += '\n)' - return format_string - - -class BatchCompose(object): - """Composes several batch transforms together - - Args: - transforms (list): List of transforms to compose. - these transforms perform on batch data. - - Examples: - - .. 
code-block:: python - - import numpy as np - from paddle.io import DataLoader - - from hapi.model import set_device - from hapi.datasets import Flowers - from hapi.vision.transforms import Compose, BatchCompose, Resize - - class NormalizeBatch(object): - def __init__(self, - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225], - scale=True, - channel_first=True): - - self.mean = mean - self.std = std - self.scale = scale - self.channel_first = channel_first - if not (isinstance(self.mean, list) and isinstance(self.std, list) and - isinstance(self.scale, bool)): - raise TypeError("{}: input type is invalid.".format(self)) - from functools import reduce - if reduce(lambda x, y: x * y, self.std) == 0: - raise ValueError('{}: std is invalid!'.format(self)) - - def __call__(self, samples): - for i in range(len(samples)): - samples[i] = list(samples[i]) - im = samples[i][0] - im = im.astype(np.float32, copy=False) - mean = np.array(self.mean)[np.newaxis, np.newaxis, :] - std = np.array(self.std)[np.newaxis, np.newaxis, :] - if self.scale: - im = im / 255.0 - im -= mean - im /= std - if self.channel_first: - im = im.transpose((2, 0, 1)) - samples[i][0] = im - return samples - - transform = Compose([Resize((500, 500))]) - flowers_dataset = Flowers(mode='test', transform=transform) - - device = set_device('cpu') - - collate_fn = BatchCompose([NormalizeBatch()]) - loader = DataLoader( - flowers_dataset, - batch_size=4, - places=device, - return_list=True, - collate_fn=collate_fn) - - for data in loader: - # do something - break - """ - - def __init__(self, transforms=[]): - self.transforms = transforms - - def __call__(self, data): - for f in self.transforms: - try: - data = f(data) - except Exception as e: - stack_info = traceback.format_exc() - print("fail to perform batch transform [{}] with error: " - "{} and stack:\n{}".format(f, e, str(stack_info))) - raise e - - # sample list to batch data - batch = list(zip(*data)) - - return batch - - -class Resize(object): - """Resize the input Image to the given size. - - Args: - size (int|list|tuple): Desired output size. If size is a sequence like - (h, w), output size will be matched to this. If size is an int, - smaller edge of the image will be matched to this number. - i.e, if height > width, then image will be rescaled to - (size * height / width, size) - interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR. - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import Resize - - transform = Resize(size=224) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, size, interpolation=cv2.INTER_LINEAR): - assert isinstance(size, int) or (isinstance(size, Iterable) and - len(size) == 2) - self.size = size - self.interpolation = interpolation - - def __call__(self, img): - return F.resize(img, self.size, self.interpolation) - - -class RandomResizedCrop(object): - """Crop the input data to random size and aspect ratio. - A crop of random size (default: of 0.08 to 1.0) of the original size and a random - aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. - After applying crop transfrom, the input data will be resized to given size. - - Args: - output_size (int|list|tuple): Target size of output image, with (height, width) shape. - scale (list|tuple): Range of size of the origin size cropped. 
Default: (0.08, 1.0) - ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import RandomResizedCrop - - transform = RandomResizedCrop(224) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, - output_size, - scale=(0.08, 1.0), - ratio=(3. / 4, 4. / 3), - interpolation=cv2.INTER_LINEAR): - if isinstance(output_size, int): - self.output_size = (output_size, output_size) - else: - self.output_size = output_size - assert (scale[0] <= scale[1]), "scale should be of kind (min, max)" - assert (ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" - self.scale = scale - self.ratio = ratio - self.interpolation = interpolation - - def _get_params(self, image, attempts=10): - height, width, _ = image.shape - area = height * width - - for _ in range(attempts): - target_area = np.random.uniform(*self.scale) * area - log_ratio = tuple(math.log(x) for x in self.ratio) - aspect_ratio = math.exp(np.random.uniform(*log_ratio)) - - w = int(round(math.sqrt(target_area * aspect_ratio))) - h = int(round(math.sqrt(target_area / aspect_ratio))) - - if 0 < w <= width and 0 < h <= height: - x = np.random.randint(0, width - w + 1) - y = np.random.randint(0, height - h + 1) - return x, y, w, h - - # Fallback to central crop - in_ratio = float(width) / float(height) - if in_ratio < min(self.ratio): - w = width - h = int(round(w / min(self.ratio))) - elif in_ratio > max(self.ratio): - h = height - w = int(round(h * max(self.ratio))) - else: # whole image - w = width - h = height - x = (width - w) // 2 - y = (height - h) // 2 - return x, y, w, h - - def __call__(self, img): - x, y, w, h = self._get_params(img) - cropped_img = img[y:y + h, x:x + w] - return F.resize(cropped_img, self.output_size, self.interpolation) - - -class CenterCropResize(object): - """Crops to center of image with padding then scales size. - - Args: - size (int|list|tuple): Target size of output image, with (height, width) shape. - crop_padding (int): Center crop with the padding. Default: 32. - interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR. - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import CenterCropResize - - transform = CenterCropResize(224) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, size, crop_padding=32, interpolation=cv2.INTER_LINEAR): - if isinstance(size, int): - self.size = (size, size) - else: - self.size = size - self.crop_padding = crop_padding - self.interpolation = interpolation - - def _get_params(self, img): - h, w = img.shape[:2] - size = min(self.size) - c = int(size / (size + self.crop_padding) * min((h, w))) - x = (h + 1 - c) // 2 - y = (w + 1 - c) // 2 - return c, x, y - - def __call__(self, img): - c, x, y = self._get_params(img) - cropped_img = img[x:x + c, y:y + c, :] - return F.resize(cropped_img, self.size, self.interpolation) - - -class CenterCrop(object): - """Crops the given the input data at the center. - - Args: - output_size: Target size of output image, with (height, width) shape. - - Examples: - - .. 
code-block:: python - - import numpy as np - - from hapi.vision.transforms import CenterCrop - - transform = CenterCrop(224) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, output_size): - if isinstance(output_size, int): - self.output_size = (output_size, output_size) - else: - self.output_size = output_size - - def _get_params(self, img): - th, tw = self.output_size - h, w, _ = img.shape - assert th <= h and tw <= w, "output size is bigger than image size" - x = int(round((w - tw) / 2.0)) - y = int(round((h - th) / 2.0)) - return x, y - - def __call__(self, img): - x, y = self._get_params(img) - th, tw = self.output_size - return img[y:y + th, x:x + tw] - - -class RandomHorizontalFlip(object): - """Horizontally flip the input data randomly with a given probability. - - Args: - prob (float): Probability of the input data being flipped. Default: 0.5 - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import RandomHorizontalFlip - - transform = RandomHorizontalFlip(224) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, prob=0.5): - self.prob = prob - - def __call__(self, img): - if np.random.random() < self.prob: - return F.flip(img, code=1) - return img - - -class RandomVerticalFlip(object): - """Vertically flip the input data randomly with a given probability. - - Args: - prob (float): Probability of the input data being flipped. Default: 0.5 - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import RandomVerticalFlip - - transform = RandomVerticalFlip(224) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, prob=0.5): - self.prob = prob - - def __call__(self, img): - if np.random.random() < self.prob: - return F.flip(img, code=0) - return img - - -class Normalize(object): - """Normalize the input data with mean and standard deviation. - Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, - this transform will normalize each channel of the input data. - ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` - - Args: - mean (int|float|list): Sequence of means for each channel. - std (int|float|list): Sequence of standard deviations for each channel. - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import Normalize - - normalize = Normalize(mean=[0.5, 0.5, 0.5], - std=[0.5, 0.5, 0.5]) - - fake_img = np.random.rand(3, 500, 500).astype('float32') - - fake_img = normalize(fake_img) - print(fake_img.shape) - - """ - - def __init__(self, mean=0.0, std=1.0): - if isinstance(mean, numbers.Number): - mean = [mean, mean, mean] - - if isinstance(std, numbers.Number): - mean = [std, std, std] - - self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1) - self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1) - - def __call__(self, img): - return (img - self.mean) / self.std - - -class Permute(object): - """Change input data to a target mode. - For example, most transforms use HWC mode image, - while the Neural Network might use CHW mode input tensor. - Input image should be HWC mode and an instance of numpy.ndarray. - - Args: - mode (str): Output mode of input. Default: "CHW". 
- to_rgb (bool): Convert 'bgr' image to 'rgb'. Default: True. - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import Permute - - transform = Permute() - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, mode="CHW", to_rgb=True): - assert mode in [ - "CHW" - ], "Only support 'CHW' mode, but received mode: {}".format(mode) - self.mode = mode - self.to_rgb = to_rgb - - def __call__(self, img): - if self.to_rgb: - img = img[..., ::-1] - if self.mode == "CHW": - return img.transpose((2, 0, 1)) - return img - - -class GaussianNoise(object): - """Add random gaussian noise to the input data. - Gaussian noise is generated with given mean and std. - - Args: - mean (float): Gaussian mean used to generate noise. - std (float): Gaussian standard deviation used to generate noise. - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import GaussianNoise - - transform = GaussianNoise() - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, mean=0.0, std=1.0): - self.mean = np.array(mean, dtype=np.float32) - self.std = np.array(std, dtype=np.float32) - - def __call__(self, img): - dtype = img.dtype - noise = np.random.normal(self.mean, self.std, img.shape) * 255 - img = img + noise.astype(np.float32) - return np.clip(img, 0, 255).astype(dtype) - - -class BrightnessTransform(object): - """Adjust brightness of the image. - - Args: - value (float): How much to adjust the brightness. Can be any - non negative number. 0 gives the original image - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import BrightnessTransform - - transform = BrightnessTransform(0.4) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, value): - if value < 0: - raise ValueError("brightness value should be non-negative") - self.value = value - - def __call__(self, img): - if self.value == 0: - return img - - dtype = img.dtype - img = img.astype(np.float32) - alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) - img = img * alpha - return img.clip(0, 255).astype(dtype) - - -class ContrastTransform(object): - """Adjust contrast of the image. - - Args: - value (float): How much to adjust the contrast. Can be any - non negative number. 0 gives the original image - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import ContrastTransform - - transform = ContrastTransform(0.4) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, value): - if value < 0: - raise ValueError("contrast value should be non-negative") - self.value = value - - def __call__(self, img): - if self.value == 0: - return img - - dtype = img.dtype - img = img.astype(np.float32) - alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) - img = img * alpha + cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).mean() * ( - 1 - alpha) - return img.clip(0, 255).astype(dtype) - - -class SaturationTransform(object): - """Adjust saturation of the image. - - Args: - value (float): How much to adjust the saturation. Can be any - non negative number. 0 gives the original image - - Examples: - - .. 
code-block:: python - - import numpy as np - - from hapi.vision.transforms import SaturationTransform - - transform = SaturationTransform(0.4) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, value): - if value < 0: - raise ValueError("saturation value should be non-negative") - self.value = value - - def __call__(self, img): - if self.value == 0: - return img - - dtype = img.dtype - img = img.astype(np.float32) - alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) - gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - gray_img = gray_img[..., np.newaxis] - img = img * alpha + gray_img * (1 - alpha) - return img.clip(0, 255).astype(dtype) - - -class HueTransform(object): - """Adjust hue of the image. - - Args: - value (float): How much to adjust the hue. Can be any number - between 0 and 0.5, 0 gives the original image - - Examples: - - .. code-block:: python - - import numpy as np - - from hapi.vision.transforms import HueTransform - - transform = HueTransform(0.4) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, value): - if value < 0 or value > 0.5: - raise ValueError("hue value should be in [0.0, 0.5]") - self.value = value - - def __call__(self, img): - if self.value == 0: - return img - - dtype = img.dtype - img = img.astype(np.uint8) - hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL) - h, s, v = cv2.split(hsv_img) - - alpha = np.random.uniform(-self.value, self.value) - h = h.astype(np.uint8) - # uint8 addition take cares of rotation across boundaries - with np.errstate(over="ignore"): - h += np.uint8(alpha * 255) - hsv_img = cv2.merge([h, s, v]) - return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) - - -class ColorJitter(object): - """Randomly change the brightness, contrast, saturation and hue of an image. - - Args: - brightness: How much to jitter brightness. - Chosen uniformly from [max(0, 1 - brightness), 1 + brightness] - or the given [min, max]. Should be non negative numbers. - contrast: How much to jitter contrast. - Chosen uniformly from [max(0, 1 - contrast), 1 + contrast] - or the given [min, max]. Should be non negative numbers. - saturation: How much to jitter saturation. - Chosen uniformly from [max(0, 1 - saturation), 1 + saturation] - or the given [min, max]. Should be non negative numbers. - hue: How much to jitter hue. - Chosen uniformly from [-hue, hue] or the given [min, max]. - Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. - - Examples: - - .. 
code-block:: python - - import numpy as np - - from hapi.vision.transforms import ColorJitter - - transform = ColorJitter(0.4) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): - transforms = [] - if brightness != 0: - transforms.append(BrightnessTransform(brightness)) - if contrast != 0: - transforms.append(ContrastTransform(contrast)) - if saturation != 0: - transforms.append(SaturationTransform(saturation)) - if hue != 0: - transforms.append(HueTransform(hue)) - - random.shuffle(transforms) - self.transforms = Compose(transforms) - - def __call__(self, img): - return self.transforms(img) diff --git a/examples/image_classification/README.MD b/image_classification/README.MD similarity index 100% rename from examples/image_classification/README.MD rename to image_classification/README.MD diff --git a/examples/image_classification/imagenet_dataset.py b/image_classification/imagenet_dataset.py similarity index 100% rename from examples/image_classification/imagenet_dataset.py rename to image_classification/imagenet_dataset.py diff --git a/examples/image_classification/main.py b/image_classification/main.py similarity index 100% rename from examples/image_classification/main.py rename to image_classification/main.py diff --git a/examples/image_classification/scripts/mobilenet_v1_x1.0.sh b/image_classification/scripts/mobilenet_v1_x1.0.sh similarity index 100% rename from examples/image_classification/scripts/mobilenet_v1_x1.0.sh rename to image_classification/scripts/mobilenet_v1_x1.0.sh diff --git a/examples/image_classification/scripts/mobilenet_v2_x1.0.sh b/image_classification/scripts/mobilenet_v2_x1.0.sh similarity index 100% rename from examples/image_classification/scripts/mobilenet_v2_x1.0.sh rename to image_classification/scripts/mobilenet_v2_x1.0.sh diff --git a/examples/image_classification/scripts/resnet101.sh b/image_classification/scripts/resnet101.sh similarity index 100% rename from examples/image_classification/scripts/resnet101.sh rename to image_classification/scripts/resnet101.sh diff --git a/examples/image_classification/scripts/resnet152.sh b/image_classification/scripts/resnet152.sh similarity index 100% rename from examples/image_classification/scripts/resnet152.sh rename to image_classification/scripts/resnet152.sh diff --git a/examples/image_classification/scripts/resnet18.sh b/image_classification/scripts/resnet18.sh similarity index 100% rename from examples/image_classification/scripts/resnet18.sh rename to image_classification/scripts/resnet18.sh diff --git a/examples/image_classification/scripts/resnet34.sh b/image_classification/scripts/resnet34.sh similarity index 100% rename from examples/image_classification/scripts/resnet34.sh rename to image_classification/scripts/resnet34.sh diff --git a/examples/image_classification/scripts/resnet50.sh b/image_classification/scripts/resnet50.sh similarity index 100% rename from examples/image_classification/scripts/resnet50.sh rename to image_classification/scripts/resnet50.sh diff --git a/examples/image_classification/scripts/vgg16.sh b/image_classification/scripts/vgg16.sh similarity index 100% rename from examples/image_classification/scripts/vgg16.sh rename to image_classification/scripts/vgg16.sh diff --git a/examples/ocr/README.md b/ocr/README.md similarity index 100% rename from examples/ocr/README.md rename to ocr/README.md diff --git a/examples/ocr/data.py 
b/ocr/data.py similarity index 100% rename from examples/ocr/data.py rename to ocr/data.py diff --git a/examples/ocr/eval.py b/ocr/eval.py similarity index 100% rename from examples/ocr/eval.py rename to ocr/eval.py diff --git a/examples/ocr/images/112_chubbiness_13557.jpg b/ocr/images/112_chubbiness_13557.jpg similarity index 100% rename from examples/ocr/images/112_chubbiness_13557.jpg rename to ocr/images/112_chubbiness_13557.jpg diff --git a/examples/ocr/images/177_Interfiled_40185.jpg b/ocr/images/177_Interfiled_40185.jpg similarity index 100% rename from examples/ocr/images/177_Interfiled_40185.jpg rename to ocr/images/177_Interfiled_40185.jpg diff --git a/examples/ocr/images/325_dame_19109.jpg b/ocr/images/325_dame_19109.jpg similarity index 100% rename from examples/ocr/images/325_dame_19109.jpg rename to ocr/images/325_dame_19109.jpg diff --git a/examples/ocr/images/368_fixtures_29232.jpg b/ocr/images/368_fixtures_29232.jpg similarity index 100% rename from examples/ocr/images/368_fixtures_29232.jpg rename to ocr/images/368_fixtures_29232.jpg diff --git a/examples/ocr/predict.py b/ocr/predict.py similarity index 100% rename from examples/ocr/predict.py rename to ocr/predict.py diff --git a/examples/ocr/seq2seq_attn.py b/ocr/seq2seq_attn.py similarity index 100% rename from examples/ocr/seq2seq_attn.py rename to ocr/seq2seq_attn.py diff --git a/examples/ocr/train.py b/ocr/train.py similarity index 100% rename from examples/ocr/train.py rename to ocr/train.py diff --git a/examples/ocr/utility.py b/ocr/utility.py similarity index 100% rename from examples/ocr/utility.py rename to ocr/utility.py diff --git a/examples/sentiment_classification/README.md b/sentiment_classification/README.md similarity index 100% rename from examples/sentiment_classification/README.md rename to sentiment_classification/README.md diff --git a/examples/sentiment_classification/models.py b/sentiment_classification/models.py similarity index 100% rename from examples/sentiment_classification/models.py rename to sentiment_classification/models.py diff --git a/examples/sentiment_classification/senta.yaml b/sentiment_classification/senta.yaml similarity index 100% rename from examples/sentiment_classification/senta.yaml rename to sentiment_classification/senta.yaml diff --git a/examples/sentiment_classification/sentiment_classifier.py b/sentiment_classification/sentiment_classifier.py similarity index 100% rename from examples/sentiment_classification/sentiment_classifier.py rename to sentiment_classification/sentiment_classifier.py diff --git a/examples/seq2seq/README.md b/seq2seq/README.md similarity index 100% rename from examples/seq2seq/README.md rename to seq2seq/README.md diff --git a/examples/seq2seq/args.py b/seq2seq/args.py similarity index 100% rename from examples/seq2seq/args.py rename to seq2seq/args.py diff --git a/examples/seq2seq/download.py b/seq2seq/download.py similarity index 100% rename from examples/seq2seq/download.py rename to seq2seq/download.py diff --git a/examples/seq2seq/predict.py b/seq2seq/predict.py similarity index 100% rename from examples/seq2seq/predict.py rename to seq2seq/predict.py diff --git a/examples/seq2seq/reader.py b/seq2seq/reader.py similarity index 100% rename from examples/seq2seq/reader.py rename to seq2seq/reader.py diff --git a/examples/seq2seq/seq2seq_attn.py b/seq2seq/seq2seq_attn.py similarity index 100% rename from examples/seq2seq/seq2seq_attn.py rename to seq2seq/seq2seq_attn.py diff --git a/examples/seq2seq/seq2seq_base.py b/seq2seq/seq2seq_base.py 
similarity index 100% rename from examples/seq2seq/seq2seq_base.py rename to seq2seq/seq2seq_base.py diff --git a/examples/seq2seq/train.py b/seq2seq/train.py similarity index 100% rename from examples/seq2seq/train.py rename to seq2seq/train.py diff --git a/examples/seq2seq/utility.py b/seq2seq/utility.py similarity index 100% rename from examples/seq2seq/utility.py rename to seq2seq/utility.py diff --git a/examples/sequence_tagging/README.md b/sequence_tagging/README.md similarity index 100% rename from examples/sequence_tagging/README.md rename to sequence_tagging/README.md diff --git a/examples/sequence_tagging/conf/q2b.dic b/sequence_tagging/conf/q2b.dic similarity index 100% rename from examples/sequence_tagging/conf/q2b.dic rename to sequence_tagging/conf/q2b.dic diff --git a/examples/sequence_tagging/conf/tag.dic b/sequence_tagging/conf/tag.dic similarity index 100% rename from examples/sequence_tagging/conf/tag.dic rename to sequence_tagging/conf/tag.dic diff --git a/examples/sequence_tagging/conf/word.dic b/sequence_tagging/conf/word.dic similarity index 100% rename from examples/sequence_tagging/conf/word.dic rename to sequence_tagging/conf/word.dic diff --git a/examples/sequence_tagging/downloads.py b/sequence_tagging/downloads.py similarity index 100% rename from examples/sequence_tagging/downloads.py rename to sequence_tagging/downloads.py diff --git a/examples/sequence_tagging/downloads.sh b/sequence_tagging/downloads.sh similarity index 100% rename from examples/sequence_tagging/downloads.sh rename to sequence_tagging/downloads.sh diff --git a/examples/sequence_tagging/eval.py b/sequence_tagging/eval.py similarity index 100% rename from examples/sequence_tagging/eval.py rename to sequence_tagging/eval.py diff --git a/examples/sequence_tagging/images/gru-crf-model.png b/sequence_tagging/images/gru-crf-model.png similarity index 100% rename from examples/sequence_tagging/images/gru-crf-model.png rename to sequence_tagging/images/gru-crf-model.png diff --git a/examples/sequence_tagging/predict.py b/sequence_tagging/predict.py similarity index 100% rename from examples/sequence_tagging/predict.py rename to sequence_tagging/predict.py diff --git a/examples/sequence_tagging/reader.py b/sequence_tagging/reader.py similarity index 100% rename from examples/sequence_tagging/reader.py rename to sequence_tagging/reader.py diff --git a/examples/sequence_tagging/sequence_tagging.py b/sequence_tagging/sequence_tagging.py similarity index 100% rename from examples/sequence_tagging/sequence_tagging.py rename to sequence_tagging/sequence_tagging.py diff --git a/examples/sequence_tagging/sequence_tagging.yaml b/sequence_tagging/sequence_tagging.yaml similarity index 100% rename from examples/sequence_tagging/sequence_tagging.yaml rename to sequence_tagging/sequence_tagging.yaml diff --git a/examples/sequence_tagging/train.py b/sequence_tagging/train.py similarity index 100% rename from examples/sequence_tagging/train.py rename to sequence_tagging/train.py diff --git a/examples/sequence_tagging/utils/__init__.py b/sequence_tagging/utils/__init__.py similarity index 100% rename from examples/sequence_tagging/utils/__init__.py rename to sequence_tagging/utils/__init__.py diff --git a/examples/sequence_tagging/utils/check.py b/sequence_tagging/utils/check.py similarity index 100% rename from examples/sequence_tagging/utils/check.py rename to sequence_tagging/utils/check.py diff --git a/examples/sequence_tagging/utils/configure.py b/sequence_tagging/utils/configure.py similarity index 100% rename 
from examples/sequence_tagging/utils/configure.py rename to sequence_tagging/utils/configure.py diff --git a/examples/sequence_tagging/utils/metrics.py b/sequence_tagging/utils/metrics.py similarity index 100% rename from examples/sequence_tagging/utils/metrics.py rename to sequence_tagging/utils/metrics.py diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 0b6f6058e6d048df673e242c452b196bd87ac1f6..0000000000000000000000000000000000000000 --- a/setup.cfg +++ /dev/null @@ -1,52 +0,0 @@ -[metadata] - -name = hapi - -author = zhouxiangyang -author_email = zhouxiangyang@baidu.com - -version = 0.0.1 - -description = HAPI -long_description = file: README.md -long_description_content_type = text/markdown - -home_page = https://github.com/PaddlePaddle/hapi -license = Apache 2.0 - -classifier = - Private :: Do Not Upload - Programming Language :: Python - Programming Language :: Python :: 2 - Programming Language :: Python :: 2.7 - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.5 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 - -keywords = - paddlepaddle - paddle - high-level-api - -[options] - -packages = find: - -#install_requires = -# paddlepaddle-gpu >= 1.5.2 - -include_package_data = True -zip_safe = False - -[sdist] -dist_dir = output/dist - -[bdist_wheel] -dist_dir = output/dist - -[easy_install] -index_url = http://pip.baidu.com/root/baidu/+simple/ - - - diff --git a/setup.py b/setup.py deleted file mode 100644 index 4a8c246d75e43a4182190326f5996dd3afb8ba02..0000000000000000000000000000000000000000 --- a/setup.py +++ /dev/null @@ -1,66 +0,0 @@ -# -*- coding: UTF-8 -*- -################################################################################ -# -# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ -""" -Setup script. 
-Authors: zhouxiangyang(zhouxiangyang@baidu.com) -Date: 2020/2/4 00:00:01 -""" -import setuptools -with open("README.md", "r") as fh: - long_description = fh.read() -setuptools.setup( - name="hapi", - version="0.0.1", - author="PaddlePaddle", - author_email="zhouxiangyang@baidu.com", - description="A Paddle High-level API that supports both static and dynamic execution modes (still under development)", - url="https://github.com/PaddlePaddle/hapi", - packages=[ - 'hapi', - 'hapi.datasets', - 'hapi.text', - 'hapi.text.tokenizer', - 'hapi.text.bert', - 'hapi.text.bert.utils', - 'hapi.vision', - 'hapi.vision.models', - 'hapi.vision.transforms', - ], - package_dir={ - 'hapi': './hapi', - 'hapi.datasets': './hapi/datasets', - 'hapi.text': './hapi/text', - 'hapi.text.tokenizer': './hapi/text/tokenizer', - 'hapi.text.bert': './hapi/text/bert', - 'hapi.text.bert.utils': './hapi/text/bert/utils', - 'hapi.vision': './hapi/vision', - 'hapi.vision.models': './hapi/vision/models', - 'hapi.vision.transforms': './hapi/vision/transforms', - }, - platforms="any", - license='Apache 2.0', - classifiers=[ - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - ], ) diff --git a/examples/style-transfer/README.md b/style-transfer/README.md similarity index 100% rename from examples/style-transfer/README.md rename to style-transfer/README.md diff --git a/examples/style-transfer/images/Starry-Night-by-Vincent-Van-Gogh-painting.jpg b/style-transfer/images/Starry-Night-by-Vincent-Van-Gogh-painting.jpg similarity index 100% rename from examples/style-transfer/images/Starry-Night-by-Vincent-Van-Gogh-painting.jpg rename to style-transfer/images/Starry-Night-by-Vincent-Van-Gogh-painting.jpg diff --git a/examples/style-transfer/images/chicago_cropped.jpg b/style-transfer/images/chicago_cropped.jpg similarity index 100% rename from examples/style-transfer/images/chicago_cropped.jpg rename to style-transfer/images/chicago_cropped.jpg diff --git a/examples/style-transfer/images/janelle.png b/style-transfer/images/janelle.png similarity index 100% rename from examples/style-transfer/images/janelle.png rename to style-transfer/images/janelle.png diff --git a/examples/style-transfer/images/markdown/img1.png b/style-transfer/images/markdown/img1.png similarity index 100% rename from examples/style-transfer/images/markdown/img1.png rename to style-transfer/images/markdown/img1.png diff --git a/examples/style-transfer/images/markdown/img2.png b/style-transfer/images/markdown/img2.png similarity index 100% rename from examples/style-transfer/images/markdown/img2.png rename to style-transfer/images/markdown/img2.png diff --git a/examples/style-transfer/images/markdown/output_10_1.png b/style-transfer/images/markdown/output_10_1.png similarity index 100% rename from examples/style-transfer/images/markdown/output_10_1.png rename to style-transfer/images/markdown/output_10_1.png diff --git a/examples/style-transfer/images/markdown/output_20_1.png b/style-transfer/images/markdown/output_20_1.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_1.png rename to style-transfer/images/markdown/output_20_1.png diff --git a/examples/style-transfer/images/markdown/output_20_11.png 
b/style-transfer/images/markdown/output_20_11.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_11.png rename to style-transfer/images/markdown/output_20_11.png diff --git a/examples/style-transfer/images/markdown/output_20_13.png b/style-transfer/images/markdown/output_20_13.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_13.png rename to style-transfer/images/markdown/output_20_13.png diff --git a/examples/style-transfer/images/markdown/output_20_15.png b/style-transfer/images/markdown/output_20_15.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_15.png rename to style-transfer/images/markdown/output_20_15.png diff --git a/examples/style-transfer/images/markdown/output_20_17.png b/style-transfer/images/markdown/output_20_17.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_17.png rename to style-transfer/images/markdown/output_20_17.png diff --git a/examples/style-transfer/images/markdown/output_20_19.png b/style-transfer/images/markdown/output_20_19.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_19.png rename to style-transfer/images/markdown/output_20_19.png diff --git a/examples/style-transfer/images/markdown/output_20_3.png b/style-transfer/images/markdown/output_20_3.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_3.png rename to style-transfer/images/markdown/output_20_3.png diff --git a/examples/style-transfer/images/markdown/output_20_5.png b/style-transfer/images/markdown/output_20_5.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_5.png rename to style-transfer/images/markdown/output_20_5.png diff --git a/examples/style-transfer/images/markdown/output_20_7.png b/style-transfer/images/markdown/output_20_7.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_7.png rename to style-transfer/images/markdown/output_20_7.png diff --git a/examples/style-transfer/images/markdown/output_20_9.png b/style-transfer/images/markdown/output_20_9.png similarity index 100% rename from examples/style-transfer/images/markdown/output_20_9.png rename to style-transfer/images/markdown/output_20_9.png diff --git a/examples/style-transfer/images/markdown/output_21_1.png b/style-transfer/images/markdown/output_21_1.png similarity index 100% rename from examples/style-transfer/images/markdown/output_21_1.png rename to style-transfer/images/markdown/output_21_1.png diff --git a/examples/style-transfer/style-transfer.ipynb b/style-transfer/style-transfer.ipynb similarity index 100% rename from examples/style-transfer/style-transfer.ipynb rename to style-transfer/style-transfer.ipynb diff --git a/examples/style-transfer/style_transfer.py b/style-transfer/style_transfer.py similarity index 100% rename from examples/style-transfer/style_transfer.py rename to style-transfer/style_transfer.py diff --git a/examples/transformer/README.md b/transformer/README.md similarity index 100% rename from examples/transformer/README.md rename to transformer/README.md diff --git a/examples/transformer/gen_data.sh b/transformer/gen_data.sh similarity index 100% rename from examples/transformer/gen_data.sh rename to transformer/gen_data.sh diff --git a/examples/transformer/images/multi_head_attention.png b/transformer/images/multi_head_attention.png similarity index 100% rename from 
examples/transformer/images/multi_head_attention.png rename to transformer/images/multi_head_attention.png diff --git a/examples/transformer/images/transformer_network.png b/transformer/images/transformer_network.png similarity index 100% rename from examples/transformer/images/transformer_network.png rename to transformer/images/transformer_network.png diff --git a/examples/transformer/predict.py b/transformer/predict.py similarity index 100% rename from examples/transformer/predict.py rename to transformer/predict.py diff --git a/examples/transformer/reader.py b/transformer/reader.py similarity index 100% rename from examples/transformer/reader.py rename to transformer/reader.py diff --git a/examples/transformer/train.py b/transformer/train.py similarity index 100% rename from examples/transformer/train.py rename to transformer/train.py diff --git a/examples/transformer/transformer.py b/transformer/transformer.py similarity index 100% rename from examples/transformer/transformer.py rename to transformer/transformer.py diff --git a/examples/transformer/transformer.yaml b/transformer/transformer.yaml similarity index 100% rename from examples/transformer/transformer.yaml rename to transformer/transformer.yaml diff --git a/examples/transformer/utils/__init__.py b/transformer/utils/__init__.py similarity index 100% rename from examples/transformer/utils/__init__.py rename to transformer/utils/__init__.py diff --git a/examples/transformer/utils/check.py b/transformer/utils/check.py similarity index 100% rename from examples/transformer/utils/check.py rename to transformer/utils/check.py diff --git a/examples/transformer/utils/configure.py b/transformer/utils/configure.py similarity index 100% rename from examples/transformer/utils/configure.py rename to transformer/utils/configure.py diff --git a/examples/tsm/README.md b/tsm/README.md similarity index 99% rename from examples/tsm/README.md rename to tsm/README.md index f36cc456257a66c701c15218043ceb5e2e344fd6..1f8323736c82f52378cfd0ab612e0dc077df7319 100644 --- a/examples/tsm/README.md +++ b/tsm/README.md @@ -146,4 +146,3 @@ python infer.py --data= --label_list= --inf ## 参考论文 - [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/abs/1811.08383v1), Ji Lin, Chuang Gan, Song Han - diff --git a/examples/tsm/check.py b/tsm/check.py similarity index 100% rename from examples/tsm/check.py rename to tsm/check.py diff --git a/examples/tsm/dataset/README.md b/tsm/dataset/README.md similarity index 100% rename from examples/tsm/dataset/README.md rename to tsm/dataset/README.md diff --git a/examples/tsm/dataset/kinetics/generate_label.py b/tsm/dataset/kinetics/generate_label.py similarity index 100% rename from examples/tsm/dataset/kinetics/generate_label.py rename to tsm/dataset/kinetics/generate_label.py diff --git a/examples/tsm/dataset/kinetics/video2pkl.py b/tsm/dataset/kinetics/video2pkl.py similarity index 100% rename from examples/tsm/dataset/kinetics/video2pkl.py rename to tsm/dataset/kinetics/video2pkl.py diff --git a/examples/tsm/images/temporal_shift.png b/tsm/images/temporal_shift.png similarity index 100% rename from examples/tsm/images/temporal_shift.png rename to tsm/images/temporal_shift.png diff --git a/examples/tsm/infer.py b/tsm/infer.py similarity index 100% rename from examples/tsm/infer.py rename to tsm/infer.py diff --git a/examples/tsm/kinetics_dataset.py b/tsm/kinetics_dataset.py similarity index 99% rename from examples/tsm/kinetics_dataset.py rename to tsm/kinetics_dataset.py index 
6cfd0d1c6be615c8cd862a59ca3254a47f336c99..e08f588e126af6966b01d35dc420ba0a4da1e0a4 100644 --- a/examples/tsm/kinetics_dataset.py +++ b/tsm/kinetics_dataset.py @@ -123,7 +123,7 @@ class KineticsDataset(Dataset): def _video_loader(self, frames): videolen = len(frames) average_dur = int(videolen / self.seg_num) - + imgs = [] for i in range(self.seg_num): idx = 0 @@ -143,12 +143,12 @@ class KineticsDataset(Dataset): idx += i * average_dur else: idx = i - + for jj in range(idx, idx + self.seg_len): imgbuf = frames[int(jj % videolen)] img = self._imageloader(imgbuf) imgs.append(img) - + return imgs def _imageloader(self, buf): @@ -156,6 +156,5 @@ class KineticsDataset(Dataset): img = Image.open(StringIO(buf)) else: img = Image.open(BytesIO(buf)) - - return img.convert('RGB') + return img.convert('RGB') diff --git a/examples/tsm/main.py b/tsm/main.py similarity index 100% rename from examples/tsm/main.py rename to tsm/main.py diff --git a/examples/tsm/modeling.py b/tsm/modeling.py similarity index 100% rename from examples/tsm/modeling.py rename to tsm/modeling.py diff --git a/examples/tsm/transforms.py b/tsm/transforms.py similarity index 91% rename from examples/tsm/transforms.py rename to tsm/transforms.py index 1fa9f1197c7ab482f78ccd9f12a6fb216593b946..766da7658a60c3aeed0e7f9b40aa0d886800cb27 100644 --- a/examples/tsm/transforms.py +++ b/tsm/transforms.py @@ -20,8 +20,10 @@ from PIL import Image import logging logger = logging.getLogger(__name__) -__all__ = ['GroupScale', 'GroupMultiScaleCrop', 'GroupRandomCrop', - 'GroupRandomFlip', 'GroupCenterCrop', 'NormalizeImage'] +__all__ = [ + 'GroupScale', 'GroupMultiScaleCrop', 'GroupRandomCrop', 'GroupRandomFlip', + 'GroupCenterCrop', 'NormalizeImage' +] class GroupScale(object): @@ -31,6 +33,7 @@ class GroupScale(object): Args: target_size (int): image resize target size """ + def __init__(self, target_size=224): self.target_size = target_size @@ -52,7 +55,7 @@ class GroupScale(object): oh = self.target_size ow = int(self.target_size * 4.0 / 3.0) resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) - + return resized_imgs, label @@ -60,6 +63,7 @@ class GroupMultiScaleCrop(object): """ FIXME: add comments """ + def __init__(self, short_size=256, scales=None, @@ -76,11 +80,11 @@ class GroupMultiScaleCrop(object): def __call__(self, imgs, label): input_size = [self.short_size, self.short_size] im_size = imgs[0].size - + # get random crop offset def _sample_crop_size(im_size): image_w, image_h = im_size[0], im_size[1] - + base_size = min(image_w, image_h) crop_sizes = [int(base_size * x) for x in self.scales] crop_h = [ @@ -91,7 +95,7 @@ class GroupMultiScaleCrop(object): input_size[0] if abs(x - input_size[0]) < 3 else x for x in crop_sizes ] - + pairs = [] for i, h in enumerate(crop_h): for j, w in enumerate(crop_w): @@ -104,7 +108,7 @@ class GroupMultiScaleCrop(object): else: w_step = (image_w - crop_pair[0]) / 4 h_step = (image_h - crop_pair[1]) / 4 - + ret = list() ret.append((0, 0)) # upper left if w_step != 0: @@ -115,25 +119,26 @@ class GroupMultiScaleCrop(object): ret.append((4 * w_step, 4 * h_step)) # lower right if h_step != 0 or w_step != 0: ret.append((2 * w_step, 2 * h_step)) # center - + if self.more_fix_crop: ret.append((0, 2 * h_step)) # center left ret.append((4 * w_step, 2 * h_step)) # center right ret.append((2 * w_step, 4 * h_step)) # lower center ret.append((2 * w_step, 0 * h_step)) # upper center - + ret.append((1 * w_step, 1 * h_step)) # upper left quarter ret.append((3 * w_step, 1 * h_step)) # upper right quarter 
ret.append((1 * w_step, 3 * h_step)) # lower left quarter ret.append((3 * w_step, 3 * h_step)) # lower righ quarter - + w_offset, h_offset = random.choice(ret) - + return crop_pair[0], crop_pair[1], w_offset, h_offset - + crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size) crop_imgs = [ - img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) + img.crop( + (offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) for img in imgs ] ret_imgs = [ @@ -151,21 +156,21 @@ class GroupRandomCrop(object): def __call__(self, imgs, label): w, h = imgs[0].size th, tw = self.target_size, self.target_size - + assert (w >= self.target_size) and (h >= self.target_size), \ "image width({}) and height({}) should be larger than " \ "crop size".format(w, h, self.target_size) - + out_images = [] x1 = np.random.randint(0, w - tw) y1 = np.random.randint(0, h - th) - + for img in imgs: if w == tw and h == th: out_images.append(img) else: out_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) - + return out_images, label @@ -176,7 +181,7 @@ class GroupRandomFlip(object): ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs] return ret, label else: - return imgs, label + return imgs, label class GroupCenterCrop(object): @@ -194,8 +199,8 @@ class GroupCenterCrop(object): x1 = int(round((w - tw) / 2.)) y1 = int(round((h - th) / 2.)) crop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th))) - - return crop_imgs, label + + return crop_imgs, label class NormalizeImage(object): @@ -213,17 +218,16 @@ class NormalizeImage(object): def __call__(self, imgs, label): np_imgs = (np.array(imgs[0]).astype('float32').transpose( - (2, 0, 1))).reshape(1, 3, self.target_size, - self.target_size) / 255 + (2, 0, 1))).reshape(1, 3, self.target_size, self.target_size) / 255 for i in range(len(imgs) - 1): img = (np.array(imgs[i + 1]).astype('float32').transpose( (2, 0, 1))).reshape(1, 3, self.target_size, - self.target_size) / 255 + self.target_size) / 255 np_imgs = np.concatenate((np_imgs, img)) - + np_imgs -= self.img_mean np_imgs /= self.img_std np_imgs = np.reshape(np_imgs, (self.seg_num, self.seg_len * 3, - self.target_size, self.target_size)) - + self.target_size, self.target_size)) + return np_imgs, label diff --git a/examples/tsm/utils.py b/tsm/utils.py similarity index 100% rename from examples/tsm/utils.py rename to tsm/utils.py diff --git a/examples/yolov3/.gitignore b/yolov3/.gitignore similarity index 100% rename from examples/yolov3/.gitignore rename to yolov3/.gitignore diff --git a/examples/yolov3/README.md b/yolov3/README.md similarity index 98% rename from examples/yolov3/README.md rename to yolov3/README.md index 59c806ef2fa15fec7105c60c1acfcfbee2d737b0..8ec75099a3da7afb1c0294ac805da18d57b9aef4 100644 --- a/examples/yolov3/README.md +++ b/yolov3/README.md @@ -59,7 +59,7 @@ YOLOv3 的网络结构由基础特征提取网络、multi-scale特征融合层 #### 安装COCO-API - 训练前需要首先下载[COCO-API](https://github.com/cocodataset/cocoapi): + 训练前需要首先下载[COCO-API](https://github.com/cocodataset/cocoapi): ```bash git clone https://github.com/cocodataset/cocoapi.git @@ -199,4 +199,3 @@ python infer.py --label_list=dataset/voc/label_list.txt --infer_image=image/dog. - [You Only Look Once: Unified, Real-Time Object Detection](https://arxiv.org/abs/1506.02640v5), Joseph Redmon, Santosh Divvala, Ross Girshick, Ali Farhadi. - [YOLOv3: An Incremental Improvement](https://arxiv.org/abs/1804.02767v1), Joseph Redmon, Ali Farhadi. 
- [Bag of Freebies for Training Object Detection Neural Networks](https://arxiv.org/abs/1902.04103v3), Zhi Zhang, Tong He, Hang Zhang, Zhongyue Zhang, Junyuan Xie, Mu Li. - diff --git a/examples/yolov3/coco.py b/yolov3/coco.py similarity index 99% rename from examples/yolov3/coco.py rename to yolov3/coco.py index 50d31cff06692e30fb153983023d4c8ed7476f2c..7191a6d78f755e8e5ef6c43ff533760109fdc577 100644 --- a/examples/yolov3/coco.py +++ b/yolov3/coco.py @@ -84,7 +84,7 @@ class COCODataset(Dataset): self._transform = transform self._mixup = mixup self._alpha = alpha - self._beta = beta + self._beta = beta # load in dataset roidbs self._load_roidb_and_cname2cid() @@ -169,7 +169,8 @@ class COCODataset(Dataset): ct += 1 if self._sample_num > 0 and ct >= self._sample_num: break - assert len(records) > 0, 'not found any coco record in %s' % (self._anno_path) + assert len(records) > 0, 'not found any coco record in %s' % ( + self._anno_path) logger.info('{} samples in file {}'.format(ct, self._anno_path)) self._roidbs, self._cname2cid = records, cname2cid @@ -194,14 +195,15 @@ class COCODataset(Dataset): return im_id, im_shape, im, gt_bbox, gt_class, gt_score def __getitem__(self, idx): - im_id, im_shape, im, gt_bbox, gt_class, gt_score = self._getitem_by_index(idx) + im_id, im_shape, im, gt_bbox, gt_class, gt_score = self._getitem_by_index( + idx) if self._mixup: mixup_idx = idx + np.random.randint(1, self.__len__()) mixup_idx %= self.__len__() _, _, mixup_im, mixup_bbox, mixup_class, _ = \ self._getitem_by_index(mixup_idx) - + im_shape, im, gt_bbox, gt_class, gt_score = \ self._mixup_image(im, gt_bbox, gt_class, mixup_im, mixup_bbox, mixup_class) @@ -238,7 +240,7 @@ class COCODataset(Dataset): im_shape = np.array([h, w], dtype='int32') return im_shape, img, gt_bbox, gt_class, gt_score - + @property def mixup(self): return self._mixup @@ -250,6 +252,7 @@ class COCODataset(Dataset): logger.info("{} set mixup to {}".format(self, value)) self._mixup = value + def pascalvoc_label(with_background=True): labels_map = { 'aeroplane': 1, diff --git a/examples/yolov3/coco_metric.py b/yolov3/coco_metric.py similarity index 95% rename from examples/yolov3/coco_metric.py rename to yolov3/coco_metric.py index 2f2f9825b1f90c08afa7b6089641d5a4b28be51d..7db994de0461edd4a95fb040acb8d989019eba99 100644 --- a/examples/yolov3/coco_metric.py +++ b/yolov3/coco_metric.py @@ -48,9 +48,10 @@ class COCOMetric(): self.coco_gt = COCO(anno_path) cat_ids = self.coco_gt.getCatIds() - self.clsid2catid = dict( - {i + int(with_background): catid - for i, catid in enumerate(cat_ids)}) + self.clsid2catid = dict({ + i + int(with_background): catid + for i, catid in enumerate(cat_ids) + }) def update(self, img_id, bboxes): assert img_id.shape[0] == 1, \ @@ -96,14 +97,13 @@ class COCOMetric(): def cocoapi_eval(self, jsonfile, style, coco_gt=None, anno_file=None): assert coco_gt != None or anno_file != None - + if coco_gt == None: coco_gt = COCO(anno_file) logger.info("Start evaluate...") - coco_dt = coco_gt.loadRes(jsonfile) + coco_dt = coco_gt.loadRes(jsonfile) coco_eval = COCOeval(coco_gt, coco_dt, style) coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() return coco_eval.stats - diff --git a/examples/yolov3/darknet.py b/yolov3/darknet.py similarity index 83% rename from examples/yolov3/darknet.py rename to yolov3/darknet.py index ca7bd7dc3c59b4fd5a82e6d83eb47795aabbf883..412457b25eb9d3a2d22df44de389ee144f831652 100644 --- a/examples/yolov3/darknet.py +++ b/yolov3/darknet.py @@ -25,8 +25,8 @@ __all__ = ['DarkNet', 
'darknet53'] # {num_layers: (url, md5)} pretrain_infos = { - 53: ('https://paddlemodels.bj.bcebos.com/hapi/darknet53.pdparams', - '2506357a5c31e865785112fc614a487d') + 53: ('https://paddlemodels.bj.bcebos.com/hapi/darknet53.pdparams', + '2506357a5c31e865785112fc614a487d') } @@ -70,13 +70,9 @@ class ConvBNLayer(fluid.dygraph.Layer): out = fluid.layers.leaky_relu(x=out, alpha=0.1) return out + class DownSample(fluid.dygraph.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size=3, - stride=2, - padding=1): + def __init__(self, ch_in, ch_out, filter_size=3, stride=2, padding=1): super(DownSample, self).__init__() @@ -87,46 +83,45 @@ class DownSample(fluid.dygraph.Layer): stride=stride, padding=padding) self.ch_out = ch_out + def forward(self, inputs): out = self.conv_bn_layer(inputs) return out + class BasicBlock(fluid.dygraph.Layer): def __init__(self, ch_in, ch_out): super(BasicBlock, self).__init__() self.conv1 = ConvBNLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=1, - padding=0) + ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=1, padding=0) self.conv2 = ConvBNLayer( ch_in=ch_out, - ch_out=ch_out*2, + ch_out=ch_out * 2, filter_size=3, stride=1, padding=1) + def forward(self, inputs): conv1 = self.conv1(inputs) conv2 = self.conv2(conv1) out = fluid.layers.elementwise_add(x=inputs, y=conv2, act=None) return out + class LayerWarp(fluid.dygraph.Layer): def __init__(self, ch_in, ch_out, count): - super(LayerWarp,self).__init__() + super(LayerWarp, self).__init__() self.basicblock0 = BasicBlock(ch_in, ch_out) self.res_out_list = [] - for i in range(1,count): + for i in range(1, count): res_out = self.add_sublayer("basic_block_%d" % (i), - BasicBlock( - ch_out*2, - ch_out)) + BasicBlock(ch_out * 2, ch_out)) self.res_out_list.append(res_out) self.ch_out = ch_out - def forward(self,inputs): + + def forward(self, inputs): y = self.basicblock0(inputs) for basic_block_i in self.res_out_list: y = basic_block_i(y) @@ -154,36 +149,27 @@ class DarkNet(Model): self.stages = self.stages[0:5] self.conv0 = ConvBNLayer( - ch_in=ch_in, - ch_out=32, - filter_size=3, - stride=1, - padding=1) + ch_in=ch_in, ch_out=32, filter_size=3, stride=1, padding=1) - self.downsample0 = DownSample( - ch_in=32, - ch_out=32 * 2) + self.downsample0 = DownSample(ch_in=32, ch_out=32 * 2) self.darknet53_conv_block_list = [] self.downsample_list = [] - ch_in = [64,128,256,512,1024] + ch_in = [64, 128, 256, 512, 1024] for i, stage in enumerate(self.stages): - conv_block = self.add_sublayer( - "stage_%d" % (i), - LayerWarp( - int(ch_in[i]), - 32*(2**i), - stage)) + conv_block = self.add_sublayer("stage_%d" % (i), + LayerWarp( + int(ch_in[i]), 32 * (2**i), + stage)) self.darknet53_conv_block_list.append(conv_block) for i in range(len(self.stages) - 1): downsample = self.add_sublayer( "stage_%d_downsample" % i, DownSample( - ch_in = 32*(2**(i+1)), - ch_out = 32*(2**(i+2)))) + ch_in=32 * (2**(i + 1)), ch_out=32 * (2**(i + 2)))) self.downsample_list.append(downsample) - def forward(self,inputs): - + def forward(self, inputs): + out = self.conv0(inputs) out = self.downsample0(out) blocks = [] diff --git a/examples/yolov3/dataset/download_voc.py b/yolov3/dataset/download_voc.py similarity index 90% rename from examples/yolov3/dataset/download_voc.py rename to yolov3/dataset/download_voc.py index 9877d7cd6b4946c01f58476b6fe81c328005e711..c3292b85ce62100113ee4acc5efd795899633510 100644 --- a/examples/yolov3/dataset/download_voc.py +++ b/yolov3/dataset/download_voc.py @@ -23,12 +23,12 @@ import logging logger = 
logging.getLogger(__name__) DATASETS = { - 'voc': [ - ('https://paddlemodels.bj.bcebos.com/hapi/voc.tar', - '9faeb7fd997aeea843092fd608d5bcb4', ), - ], + 'voc': [( + 'https://paddlemodels.bj.bcebos.com/hapi/voc.tar', + '9faeb7fd997aeea843092fd608d5bcb4', ), ], } + def download_decompress_file(data_dir, url, md5): logger.info("Downloading from {}".format(url)) tar_file = _download(url, data_dir, md5) @@ -43,4 +43,3 @@ if __name__ == "__main__": for name, infos in DATASETS.items(): for info in infos: download_decompress_file(data_dir, *info) - diff --git a/examples/yolov3/image/YOLOv3.jpg b/yolov3/image/YOLOv3.jpg similarity index 100% rename from examples/yolov3/image/YOLOv3.jpg rename to yolov3/image/YOLOv3.jpg diff --git a/examples/yolov3/image/YOLOv3_structure.jpg b/yolov3/image/YOLOv3_structure.jpg similarity index 100% rename from examples/yolov3/image/YOLOv3_structure.jpg rename to yolov3/image/YOLOv3_structure.jpg diff --git a/examples/yolov3/image/dog.jpg b/yolov3/image/dog.jpg similarity index 100% rename from examples/yolov3/image/dog.jpg rename to yolov3/image/dog.jpg diff --git a/examples/yolov3/infer.py b/yolov3/infer.py similarity index 100% rename from examples/yolov3/infer.py rename to yolov3/infer.py diff --git a/examples/yolov3/main.py b/yolov3/main.py similarity index 100% rename from examples/yolov3/main.py rename to yolov3/main.py diff --git a/examples/yolov3/modeling.py b/yolov3/modeling.py similarity index 100% rename from examples/yolov3/modeling.py rename to yolov3/modeling.py diff --git a/examples/yolov3/transforms.py b/yolov3/transforms.py similarity index 100% rename from examples/yolov3/transforms.py rename to yolov3/transforms.py diff --git a/examples/yolov3/utils.py b/yolov3/utils.py similarity index 100% rename from examples/yolov3/utils.py rename to yolov3/utils.py diff --git a/examples/yolov3/visualizer.py b/yolov3/visualizer.py similarity index 94% rename from examples/yolov3/visualizer.py rename to yolov3/visualizer.py index 4433df8606ec140fe08f197e445aea6df89bf445..d8fdd272aa738cb42abf3e073d1dead1bd69080b 100644 --- a/examples/yolov3/visualizer.py +++ b/yolov3/visualizer.py @@ -69,8 +69,8 @@ def draw_bbox(image, catid2name, bboxes, threshold): (xmin, ymin)], width=2, fill=color) - logger.info("detect {} at {} score: {:.2f}".format( - catid2name[int(catid)], [xmin, ymin, xmax, ymax], score)) + logger.info("detect {} at {} score: {:.2f}".format(catid2name[int( + catid)], [xmin, ymin, xmax, ymax], score)) # draw label text = "{} {:.2f}".format(catid2name[catid], score)
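The reformatted `cocoapi_eval` in the `yolov3/coco_metric.py` hunk above follows the standard pycocotools evaluation flow; the hunk only touches formatting (comprehension layout, trailing whitespace), not behaviour. For reference, a minimal standalone sketch of that flow; the annotation and result file paths below are placeholders, not files shipped in this patch.

```python
# Minimal sketch of the evaluation flow used by cocoapi_eval in yolov3/coco_metric.py.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

anno_file = "annotations/instances_val2017.json"  # ground-truth annotations (placeholder path)
result_file = "bbox_results.json"                 # detections in COCO result format (placeholder path)

coco_gt = COCO(anno_file)
coco_dt = coco_gt.loadRes(result_file)
coco_eval = COCOeval(coco_gt, coco_dt, "bbox")    # 'bbox' matches the detection style used here
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()                             # prints the standard AP/AR table
print(coco_eval.stats[0])                         # mAP at IoU=0.50:0.95
```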