remove python files outside

98f6b52f · dengkaipeng · df61a05c · df61a05c · df61a05c · df61a05c
Showing with 0 addition and 3355 deletion

callbacks.py callbacks.py +0 -279

distributed.py distributed.py +0 -222

metrics.py metrics.py +0 -127

model.py model.py +0 -1268

progressbar.py progressbar.py +0 -163

text.py text.py +0 -1296

未找到文件。
--- a/callbacks.py
+++ b/callbacks.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import six
-import copy
-
-from progressbar import ProgressBar
-from paddle.fluid.dygraph.parallel import ParallelEnv
-
-
-def config_callbacks(callbacks=None,
-                     model=None,
-                     batch_size=None,
-                     epochs=None,
-                     steps=None,
-                     log_freq=2,
-                     verbose=2,
-                     save_freq=1,
-                     save_dir=None,
-                     metrics=None,
-                     mode='train'):
-    cbks = callbacks or []
-    cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks]
-    if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose:
-        cbks = cbks + [ProgBarLogger(log_freq, verbose=verbose)]
-
-    if not any(isinstance(k, ModelCheckpoint) for k in cbks):
-        cbks = cbks + [ModelCheckpoint(save_freq, save_dir)]
-
-    cbk_list = CallbackList(cbks)
-    cbk_list.set_model(model)
-    metrics = metrics or [] if mode != 'test' else []
-    params = {
-        'batch_size': batch_size,
-        'epochs': epochs,
-        'steps': steps,
-        'verbose': verbose,
-        'metrics': metrics,
-    }
-    cbk_list.set_params(params)
-    return cbk_list
-
-
-class CallbackList(object):
-    def __init__(self, callbacks=None):
-        # copy
-        self.callbacks = [c for c in callbacks]
-        self.params = {}
-        self.model = None
-
-    def append(self, callback):
-        self.callbacks.append(callback)
-
-    def __iter__(self):
-        return iter(self.callbacks)
-
-    def set_params(self, params):
-        for c in self.callbacks:
-            c.set_params(params)
-
-    def set_model(self, model):
-        for c in self.callbacks:
-            c.set_model(model)
-
-    def _call(self, name, *args):
-        for c in self.callbacks:
-            func = getattr(c, name)
-            func(*args)
-
-    def _check_mode(self, mode):
-        assert mode in ['train', 'eval', 'test'], \
-            'mode should be train, eval or test'
-
-    def on_begin(self, mode, logs=None):
-        self._check_mode(mode)
-        name = 'on_{}_begin'.format(mode)
-        self._call(name, logs)
-
-    def on_end(self, mode, logs=None):
-        self._check_mode(mode)
-        name = 'on_{}_end'.format(mode)
-        self._call(name, logs)
-
-    def on_epoch_begin(self, epoch=None, logs=None):
-        self._call('on_epoch_begin', epoch, logs)
-
-    def on_epoch_end(self, epoch=None, logs=None):
-        self._call('on_epoch_end', epoch, logs)
-
-    def on_batch_begin(self, mode, step=None, logs=None):
-        self._check_mode(mode)
-        name = 'on_{}_batch_begin'.format(mode)
-        self._call(name, step, logs)
-
-    def on_batch_end(self, mode, step=None, logs=None):
-        self._check_mode(mode)
-        name = 'on_{}_batch_end'.format(mode)
-        self._call(name, step, logs)
-
-
-class Callback(object):
-    def __init__(self):
-        self.model = None
-        self.params = {}
-
-    def set_params(self, params):
-        self.params = params
-
-    def set_model(self, model):
-        self.model = model
-
-    def on_train_begin(self, logs=None):
-        """
-        """
-
-    def on_train_end(self, logs=None):
-        """
-        """
-
-    def on_eval_begin(self, logs=None):
-        """
-        """
-
-    def on_eval_end(self, logs=None):
-        """
-        """
-
-    def on_test_begin(self, logs=None):
-        """
-        """
-
-    def on_test_end(self, logs=None):
-        """
-        """
-
-    def on_epoch_begin(self, epoch, logs=None):
-        """
-        """
-
-    def on_epoch_end(self, epoch, logs=None):
-        """
-        """
-
-    def on_train_batch_begin(self, step, logs=None):
-        """
-        """
-
-    def on_train_batch_end(self, step, logs=None):
-        """
-        """
-
-    def on_eval_batch_begin(self, step, logs=None):
-        """
-        """
-
-    def on_eval_batch_end(self, step, logs=None):
-        """
-        """
-
-    def on_eval_batch_begin(self, step, logs=None):
-        """
-        """
-
-    def on_eval_batch_end(self, step, logs=None):
-        """
-        """
-
-
-class ProgBarLogger(Callback):
-    def __init__(self, log_freq=1, verbose=2):
-        self.epochs = None
-        self.steps = None
-        self.progbar = None
-        self.verbose = verbose
-        self.log_freq = log_freq
-
-    def on_train_begin(self, logs=None):
-        self.epochs = self.params['epochs']
-        assert self.epochs
-        self.train_metrics = self.params['metrics']
-        assert self.train_metrics
-
-    def on_epoch_begin(self, epoch=None, logs=None):
-        self.steps = self.params['steps']
-        self.epoch = epoch
-        self.train_step = 0
-        if self.verbose and self.epochs and ParallelEnv().local_rank == 0:
-            print('Epoch %d/%d' % (epoch + 1, self.epochs))
-        self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose)
-
-    def _updates(self, logs, mode):
-        values = []
-        metrics = getattr(self, '%s_metrics' % (mode))
-        progbar = getattr(self, '%s_progbar' % (mode))
-        steps = getattr(self, '%s_step' % (mode))
-        for k in metrics:
-            if k in logs:
-                values.append((k, logs[k]))
-        progbar.update(steps, values)
-
-    def on_train_batch_end(self, step, logs=None):
-        logs = logs or {}
-        self.train_step += 1
-
-        if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
-        ).local_rank == 0:
-            # if steps is not None, last step will update in on_epoch_end
-            if self.steps and self.train_step < self.steps:
-                self._updates(logs, 'train')
-            else:
-                self._updates(logs, 'train')
-
-    def on_epoch_end(self, epoch, logs=None):
-        logs = logs or {}
-        if self.verbose and ParallelEnv().local_rank == 0:
-            self._updates(logs, 'train')
-
-    def on_eval_begin(self, logs=None):
-        self.eval_steps = logs.get('steps', None)
-        self.eval_metrics = logs.get('metrics_name', [])
-        self.eval_step = 0
-        self.evaled_samples = 0
-        self.eval_progbar = ProgressBar(
-            num=self.eval_steps, verbose=self.verbose)
-        if ParallelEnv().local_rank == 0:
-            print('Eval begin...')
-
-    def on_eval_batch_end(self, step, logs=None):
-        logs = logs or {}
-        self.eval_step = step
-        samples = logs.get('batch_size', 1)
-        self.evaled_samples += samples
-
-        if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
-        ).local_rank == 0:
-            # if steps is not None, last step will update in on_epoch_end
-            if self.eval_steps and self.eval_step < self.eval_steps:
-                self._updates(logs, 'eval')
-
-    def on_eval_end(self, logs=None):
-        logs = logs or {}
-        if self.verbose and ParallelEnv().local_rank == 0:
-            self._updates(logs, 'eval')
-            print('Eval samples: %d' % (self.evaled_samples))
-
-
-class ModelCheckpoint(Callback):
-    def __init__(self, save_freq=1, save_dir=None):
-        self.save_freq = save_freq
-        self.save_dir = save_dir
-
-    def on_epoch_begin(self, epoch=None, logs=None):
-        self.epoch = epoch
-
-    def _is_save(self):
-        return self.model and self.save_dir and ParallelEnv().local_rank == 0
-
-    def on_epoch_end(self, epoch, logs=None):
-        if self._is_save() and self.epoch % self.save_freq == 0:
-            path = '{}/{}'.format(self.save_dir, epoch)
-            print('save checkpoint at {}'.format(path))
-            self.model.save(path)
-
-    def on_train_end(self, logs=None):
-        if self._is_save():
-            path = '{}/final'.format(self.save_dir)
-            print('save checkpoint at {}'.format(path))
-            self.model.save(path)
--- a/distributed.py
+++ b/distributed.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-import six
-import time
-import math
-import socket
-import contextlib
-import numpy as np
-
-from paddle import fluid
-from paddle.fluid.layers import collective
-from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
-from paddle.io import BatchSampler
-
-_parallel_context_initialized = False
-
-
-class DistributedBatchSampler(BatchSampler):
-    """Sampler that restricts data loading to a subset of the dataset.
-
-    In such case, each process can pass a DistributedBatchSampler instance 
-    as a DataLoader sampler, and load a subset of the original dataset that 
-    is exclusive to it.
-
-    .. note::
-        Dataset is assumed to be of constant size.
-        
-    Args:
-        data_source: this could be a `paddle.io.Dataset` implement
-                     or other python object which implemented
-                     `__len__` for BatchSampler to get sample
-                     number of data source.
-        batch_size(int): sample indice number in a mini-batch indices.
-        shuffle(bool): whther to shuffle indices order before genrating
-            batch indices. Default False.
-        drop_last(bool): whether drop the last incomplete batch dataset size
-            is not divisible by the batch size. Default False
-    """
-
-    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
-        self.dataset = dataset
-
-        assert isinstance(batch_size, int) and batch_size > 0, \
-                "batch_size should be a positive integer"
-        self.batch_size = batch_size
-        assert isinstance(shuffle, bool), \
-                "shuffle should be a boolean value"
-        self.shuffle = shuffle
-        assert isinstance(drop_last, bool), \
-                "drop_last should be a boolean number"
-
-        self.drop_last = drop_last
-        self.nranks = ParallelEnv().nranks
-        self.local_rank = ParallelEnv().local_rank
-        self.epoch = 0
-        self.num_samples = int(
-            math.ceil(len(self.dataset) * 1.0 / self.nranks))
-        self.total_size = self.num_samples * self.nranks
-
-    def __iter__(self):
-        num_samples = len(self.dataset)
-        indices = np.arange(num_samples).tolist()
-        indices += indices[:(self.total_size - len(indices))]
-        assert len(indices) == self.total_size
-        if self.shuffle:
-            np.random.RandomState(self.epoch).shuffle(indices)
-            self.epoch += 1
-
-        # subsample
-        def _get_indices_by_batch_size(indices):
-            subsampled_indices = []
-            last_batch_size = self.total_size % (self.batch_size * self.nranks)
-            assert last_batch_size % self.nranks == 0
-            last_local_batch_size = last_batch_size // self.nranks
-
-            for i in range(self.local_rank * self.batch_size,
-                           len(indices) - last_batch_size,
-                           self.batch_size * self.nranks):
-                subsampled_indices.extend(indices[i:i + self.batch_size])
-
-            indices = indices[len(indices) - last_batch_size:]
-            subsampled_indices.extend(indices[
-                self.local_rank * last_local_batch_size:(
-                    self.local_rank + 1) * last_local_batch_size])
-            return subsampled_indices
-
-        if self.nranks > 1:
-            indices = _get_indices_by_batch_size(indices)
-
-        assert len(indices) == self.num_samples
-        _sample_iter = iter(indices)
-
-        batch_indices = []
-        for idx in _sample_iter:
-            batch_indices.append(idx)
-            if len(batch_indices) == self.batch_size:
-                yield batch_indices
-                batch_indices = []
-        if not self.drop_last and len(batch_indices) > 0:
-            yield batch_indices
-
-    def __len__(self):
-        num_samples = self.num_samples
-        num_samples += int(not self.drop_last) * (self.batch_size - 1)
-        return num_samples // self.batch_size
-
-    def set_epoch(self, epoch):
-        self.epoch = epoch
-
-
-def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
-    return collective._c_allgather(
-        x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)
-
-
-def wait_server_ready(endpoints):
-    assert not isinstance(endpoints, six.string_types)
-    while True:
-        all_ok = True
-        not_ready_endpoints = []
-        for ep in endpoints:
-            ip_port = ep.split(":")
-            with contextlib.closing(
-                    socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-                sock.settimeout(2)
-                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
-                if result != 0:
-                    all_ok = False
-                    not_ready_endpoints.append(ep)
-        if not all_ok:
-            time.sleep(3)
-        else:
-            break
-
-
-def init_communicator(program, rank, nranks, wait_port, current_endpoint,
-                      endpoints):
-    if nranks < 2:
-        return
-    other_endpoints = endpoints[:]
-    other_endpoints.remove(current_endpoint)
-    if rank == 0 and wait_port:
-        wait_server_ready(other_endpoints)
-    block = program.global_block()
-    nccl_id_var = block.create_var(
-        name=fluid.unique_name.generate('nccl_id'),
-        persistable=True,
-        type=fluid.core.VarDesc.VarType.RAW)
-
-    block.append_op(
-        type='c_gen_nccl_id',
-        inputs={},
-        outputs={'Out': nccl_id_var},
-        attrs={
-            'rank': rank,
-            'endpoint': current_endpoint,
-            'other_endpoints': other_endpoints
-        })
-
-    block.append_op(
-        type='c_comm_init',
-        inputs={'X': nccl_id_var},
-        outputs={},
-        attrs={
-            'nranks': nranks,
-            'rank': rank,
-            'ring_id': 0,
-        })
-
-
-def prepare_distributed_context(place=None):
-    if place is None:
-        place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \
-            else fluid.CUDAPlace(0)
-
-    strategy = ParallelStrategy()
-    strategy.nranks = ParallelEnv().nranks
-    strategy.local_rank = ParallelEnv().local_rank
-    strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
-    strategy.current_endpoint = ParallelEnv().current_endpoint
-
-    if strategy.nranks < 2:
-        return
-
-    global _parallel_context_initialized
-
-    if not _parallel_context_initialized and isinstance(place,
-                                                        fluid.CUDAPlace):
-
-        def _init_context():
-            communicator_prog = fluid.Program()
-            init_communicator(communicator_prog, strategy.local_rank,
-                              strategy.nranks, True, strategy.current_endpoint,
-                              strategy.trainer_endpoints)
-            exe = fluid.Executor(place)
-            exe.run(communicator_prog)
-
-        if fluid.in_dygraph_mode():
-            fluid.disable_dygraph()
-            _init_context()
-            fluid.enable_dygraph(place)
-        else:
-            _init_context()
-
-    else:
-        assert ("Only support CUDAPlace for now.")
-
-    _parallel_context_initialized = True
-    return strategy
--- a/metrics.py
+++ b/metrics.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-import six
-import abc
-import numpy as np
-import paddle.fluid as fluid
-
-import logging
-FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
-logging.basicConfig(level=logging.INFO, format=FORMAT)
-logger = logging.getLogger(__name__)
-
-__all__ = ['Metric', 'Accuracy']
-
-
-@six.add_metaclass(abc.ABCMeta)
-class Metric(object):
-    """
-    Base class for metric, encapsulates metric logic and APIs
-
-    Usage:
-    m = SomeMetric()
-    for prediction, label in ...:
-        m.update(prediction, label)
-    m.accumulate()
-    """
-
-    @abc.abstractmethod
-    def reset(self):
-        """
-        Reset states and result
-        """
-        raise NotImplementedError("function 'reset' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    @abc.abstractmethod
-    def update(self, *args, **kwargs):
-        """
-        Update states for metric
-        """
-        raise NotImplementedError("function 'update' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    @abc.abstractmethod
-    def accumulate(self):
-        """
-        Accumulates statistics, computes and returns the metric value
-        """
-        raise NotImplementedError(
-            "function 'accumulate' not implemented in {}.".format(
-                self.__class__.__name__))
-
-    @abc.abstractmethod
-    def name(self):
-        """
-        Returns metric name
-        """
-        raise NotImplementedError("function 'name' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    def add_metric_op(self, pred, label):
-        """
-        Add process op for metric in program
-        """
-        return pred, label
-
-
-class Accuracy(Metric):
-    """
-    Encapsulates accuracy metric logic
-    """
-
-    def __init__(self, topk=(1, ), name=None, *args, **kwargs):
-        super(Accuracy, self).__init__(*args, **kwargs)
-        self.topk = topk
-        self.maxk = max(topk)
-        self._init_name(name)
-        self.reset()
-
-    def add_metric_op(self, pred, label, *args, **kwargs):
-        pred = fluid.layers.argsort(pred[0], descending=True)[1][:, :self.maxk]
-        correct = pred == label[0]
-        return correct
-
-    def update(self, correct, *args, **kwargs):
-        accs = []
-        for i, k in enumerate(self.topk):
-            num_corrects = correct[:, :k].sum()
-            num_samples = len(correct)
-            accs.append(float(num_corrects) / num_samples)
-            self.total[i] += num_corrects
-            self.count[i] += num_samples
-        return accs
-
-    def reset(self):
-        self.total = [0.] * len(self.topk)
-        self.count = [0] * len(self.topk)
-
-    def accumulate(self):
-        res = []
-        for t, c in zip(self.total, self.count):
-            res.append(float(t) / c)
-        return res
-
-    def _init_name(self, name):
-        name = name or 'acc'
-        if self.maxk != 1:
-            self._name = ['{}_top{}'.format(name, k) for k in self.topk]
-        else:
-            self._name = ['acc']
-
-    def name(self):
-        return self._name
--- a/model.py
+++ b/model.py
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-import inspect
-import os
-import pickle
-import numpy as np
-import six
-import warnings
-import tqdm
-
-from collections import Iterable
-from paddle import fluid
-from paddle.fluid.framework import in_dygraph_mode, Variable
-from paddle.fluid.executor import global_scope
-from paddle.fluid.io import is_belong_to_optimizer
-from paddle.fluid.dygraph.base import to_variable
-from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.layers.utils import flatten
-from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
-from paddle.fluid.incubate.fleet.base import role_maker
-from paddle.io import DataLoader, Dataset
-
-from distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
-from metrics import Metric
-from callbacks import config_callbacks
-
-__all__ = ['Model', 'Loss', 'CrossEntropy', 'Input', 'set_device']
-
-
-def set_device(device):
-    """
-    Args:
-        device (str): specify device type, 'cpu' or 'gpu'.
-        
-    Returns:
-        fluid.CUDAPlace or fluid.CPUPlace: Created GPU or CPU place.
-    """
-
-    assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \
-    "Expected device in ['cpu', 'gpu'], but got {}".format(device)
-
-    place = fluid.CUDAPlace(ParallelEnv().dev_id) \
-            if device.lower() == 'gpu' and fluid.is_compiled_with_cuda() \
-                else fluid.CPUPlace()
-
-    return place
-
-
-def to_list(value):
-    if value is None:
-        return value
-    if isinstance(value, (list, tuple)):
-        return value
-    return [value]
-
-
-def to_numpy(var):
-    assert isinstance(var, (Variable, fluid.core.VarBase)), "not a variable"
-    if isinstance(var, fluid.core.VarBase):
-        return var.numpy()
-    t = global_scope().find_var(var.name).get_tensor()
-    return np.array(t)
-
-
-def flatten_list(l):
-    assert isinstance(l, list), "not a list"
-    outl = []
-    splits = []
-    for sl in l:
-        assert isinstance(sl, list), "sub content not a list"
-        splits.append(len(sl))
-        outl += sl
-    return outl, splits
-
-
-def restore_flatten_list(l, splits):
-    outl = []
-    for split in splits:
-        assert len(l) >= split, "list length invalid"
-        sl, l = l[:split], l[split:]
-        outl.append(sl)
-    return outl
-
-
-def extract_args(func):
-    if hasattr(inspect, 'getfullargspec'):
-        return inspect.getfullargspec(func)[0]
-    else:
-        return inspect.getargspec(func)[0]
-
-
-class Input(fluid.dygraph.Layer):
-    def __init__(self, shape=None, dtype=None, name=None):
-        super(Input, self).__init__()
-        self.shape = shape
-        self.dtype = dtype
-        self.name = name
-
-    def forward(self):
-        return fluid.data(self.name, shape=self.shape, dtype=self.dtype)
-
-
-class Loss(object):
-    def __init__(self, average=True):
-        super(Loss, self).__init__()
-        self.average = average
-
-    def forward(self, outputs, labels):
-        raise NotImplementedError()
-
-    def __call__(self, outputs, labels=None):
-        labels = to_list(labels)
-        if in_dygraph_mode() and labels:
-            labels = [to_variable(l) for l in labels]
-        losses = to_list(self.forward(to_list(outputs), labels))
-        if self.average:
-            losses = [fluid.layers.reduce_mean(l) for l in losses]
-        else:
-            losses = [fluid.layers.reduce_sum(l) for l in losses]
-        return losses
-
-
-class CrossEntropy(Loss):
-    def __init__(self, average=True):
-        super(CrossEntropy, self).__init__()
-
-    def forward(self, outputs, labels):
-        return [
-            fluid.layers.cross_entropy(o, l) for o, l in zip(outputs, labels)
-        ]
-
-
-class StaticGraphAdapter(object):
-    def __init__(self, model):
-        super(StaticGraphAdapter, self).__init__()
-        self.model = model
-        # with `_build_once` gone, parameters are now created in `__init__`
-        # so we need to keep track of the parameters already created
-        self._startup_prog = fluid.default_startup_program()
-        self._orig_prog = fluid.default_main_program()
-
-        self._label_vars = {}  # label variables
-        self._input_vars = {}  # label variables
-        self._endpoints = {}
-        self._loss_endpoint = None
-        self._executor = None
-        self._progs = {}
-        self._compiled_progs = {}
-
-        self._merge_count = {
-            'eval_total': 0,
-            'test_total': 0,
-            'eval_batch': 0,
-            'test_batch': 0
-        }
-
-        self._nranks = ParallelEnv().nranks
-        self._local_rank = ParallelEnv().local_rank
-
-    @property
-    def mode(self):
-        return self.model.mode
-
-    @mode.setter
-    def mode(self, value):
-        self.model.mode = value
-
-    def train(self, inputs, labels=None):
-        assert self.model._optimizer, \
-            "model not ready, please call `model.prepare()` first"
-        self.mode = 'train'
-        return self._run(inputs, labels)
-
-    def eval(self, inputs, labels=None):
-        self.mode = 'eval'
-        return self._run(inputs, labels)
-
-    def test(self, inputs):
-        self.mode = 'test'
-        return self._run(inputs, None)
-
-    def parameters(self, *args, **kwargs):
-        return super(Model, self.model).parameters(*args, **kwargs)
-
-    def save(self, path):
-        def _save(state, path):
-            if not state:
-                return
-            state = {
-                k: to_numpy(v) if isinstance(v, Variable) else v
-                for k, v in state.items()
-            }
-            with open(path, 'wb') as f:
-                pickle.dump(state, f)
-
-        base = os.path.basename(path)
-        assert base != "", "path should be of 'dirname/filename' format"
-        dir_name = os.path.dirname(path)
-        if dir_name and not os.path.exists(dir_name):
-            os.makedirs(dir_name)
-        param_path = path + ".pdparams"
-        _save(self.model.state_dict(), param_path)
-        prog = self._progs.get('train', None)
-        if prog is None or self.model._optimizer is None:
-            return
-        # XXX `optimizer.state_dict()` only work in dygraph mode
-        optim_path = path + ".pdopt"
-        optim = {
-            p.name: p
-            for p in filter(is_belong_to_optimizer, prog.list_vars())
-        }
-        if not optim:
-            return
-
-        _save(optim, optim_path)
-
-    def load(self, param_state_pairs, optim_state):
-        if self._executor is None:
-            executor = fluid.Executor(fluid.CPUPlace())._default_executor
-        else:
-            executor = self._executor._default_executor
-
-        # restore parameter states
-        fluid.core._create_loaded_parameter(
-            [param for param, state in param_state_pairs],
-            global_scope(), executor)
-        for param, state in param_state_pairs:
-            self._set_var(param, state)
-
-        # restore optimizer states
-        # FIXME what if a different optimizer is used?
-        if not self.model._optimizer or not optim_state:
-            return
-        self._load_optimizer(optim_state, executor)
-
-    def _load_optimizer(self, state, executor):
-        prog = self._progs.get('train', None)
-        optim = list(filter(is_belong_to_optimizer, prog.list_vars()))
-        if not optim:
-            return
-
-        fluid.core._create_loaded_parameter(optim, global_scope(), executor)
-
-        converted_state = dict(state)
-        for var in optim:
-            if var.name in ["@LR_DECAY_COUNTER@", "global_step"]:
-                # When using learning rate scheduler, dygraph would name the
-                # global step var as "global_step" to save, while static-graph
-                # would has a state var named as "@LR_DECAY_COUNTER@".
-                # NOTE: dygraph saved global_step is 1 larger than that in
-                # static-graph, since the time of global_step to increase is
-                # different.
-                state_val = (
-                    np.array(converted_state.pop("global_step")) - 1
-                ) if "global_step" in converted_state else converted_state.pop(
-                    "@LR_DECAY_COUNTER@", None)
-                if state_val is not None:
-                    converted_state[var.name] = state_val
-            elif var.name.startswith("learning_rate_"):
-                # When using static learning rate, static-graph would make it
-                # a persistable var named 'unique_name.generate("learning_rate")',
-                # However, dygraph wouldn't save it.
-                if var.name not in state:
-                    continue
-            else:
-                # moment and other accumulators
-                if var.name not in converted_state:
-                    # try to convert from dygraph name
-                    opt_name = self.model._optimizer._name
-                    opt_cls_name = self.model._optimizer.__class__.__name__
-                    opt_unq_name = None
-                    for name in self.model._optimizer._accumulators.keys():
-                        accum_name = name if opt_name is None else name[len(
-                            opt_name) + 1:]
-                        for param_name, state_var in self.model._optimizer._accumulators[
-                                name].items():
-                            if opt_unq_name is None:
-                                # can not infer out the exact unique(opt_name),
-                                # thus try to extract rather than generate
-                                for state_key in sorted(
-                                        state.keys(),
-                                        key=lambda x: len(x),
-                                        reverse=True):
-                                    prefix = param_name + "_" + (
-                                        opt_cls_name if opt_name is None else
-                                        opt_name) + "_"
-                                    if state_key.startswith(prefix):
-                                        prefix_offset = state_key[len(
-                                            prefix):].find("_") + len(prefix)
-                                        opt_unq_name = state_key[len(
-                                            param_name + "_"):prefix_offset]
-                                        # TODO: assert
-                                        # assert opt_unq_name is None
-                                    # gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
-                                    # always end with "_0" since the unique optimizer._name
-                            dy_state_name = (param_name + "_" + opt_unq_name +
-                                             "_" + accum_name + "_0")
-                            converted_state[
-                                state_var.name] = converted_state.pop(
-                                    dy_state_name)
-
-            assert var.name in converted_state, \
-                "variable [{}] is not in optimizer state file".format(var.name)
-            self._set_var(var, converted_state[var.name])
-
-    def _set_var(self, var, ndarray):
-        t = global_scope().find_var(var.name).get_tensor()
-        p = t._place()
-        if p.is_cpu_place():
-            place = fluid.CPUPlace()
-        elif p.is_cuda_pinned_place():
-            place = fluid.CUDAPinnedPlace()
-        else:
-            p = fluid.core.Place()
-            p.set_place(t._place())
-            place = fluid.CUDAPlace(p.gpu_device_id())
-
-        t.set(ndarray, place)
-
-    def _run(self, inputs, labels=None):
-        compiled_prog = self._compiled_progs.get(self.mode, None)
-        assert compiled_prog, \
-            "Model is not ready, please call `model.prepare()` first"
-
-        inputs = to_list(inputs)
-        if labels is not None:
-            labels = to_list(labels)
-        assert len(inputs) == len(self._input_vars[self.mode]), \
-            "number of inputs" \
-            + " does not match number of arguments of `forward` method"
-
-        feed = {}
-        input_names = [v.name for v in self._input_vars[self.mode]]
-        for idx, n in enumerate(input_names):
-            # train and test may take different arguments
-            if inputs[idx] is not None:
-                feed[n] = inputs[idx]
-        if labels is not None:
-            for idx, v in enumerate(self._label_vars[self.mode]):
-                feed[v.name] = labels[idx]
-
-        endpoints = self._endpoints[self.mode]
-        if self.mode == 'test':
-            fetch_list = endpoints['output']
-        else:
-            metric_list, metric_splits = flatten_list(endpoints['metric'])
-            fetch_list = endpoints['loss'] + metric_list
-            num_loss = len(endpoints['loss'])
-
-        # if fetch Variable is same as input Variable, do not fetch
-        # from program, get it from input directly
-        pruned_fetch_list = []
-        pruned_fetch_idx_name_map = [""] * len(fetch_list)
-        for i, fetch_var in enumerate(fetch_list):
-            if fetch_var.name in feed.keys():
-                pruned_fetch_idx_name_map[i] = fetch_var.name
-            else:
-                pruned_fetch_list.append(fetch_var)
-
-        rets = self._executor.run(compiled_prog,
-                                  feed=feed,
-                                  fetch_list=pruned_fetch_list,
-                                  return_numpy=False)
-
-        # restore pruned fetch_list Variable from feeds
-        for i, name in enumerate(pruned_fetch_idx_name_map):
-            if len(name) > 0:
-                rets.insert(i, feed[name])
-
-        # LoDTensor cannot be fetch as numpy directly
-        rets = [np.array(v) for v in rets]
-        if self.mode == 'test':
-            return rets[:]
-        losses = rets[:num_loss]
-        metric_states = restore_flatten_list(rets[num_loss:], metric_splits)
-        metrics = []
-        for metric, state in zip(self.model._metrics, metric_states):
-            # cut off padding size
-            if self.mode != 'train' and self.model._test_dataloader is not None \
-                    and isinstance(self.model._test_dataloader, DataLoader) \
-                    and self._nranks > 1:
-                total_size = len(self.model._test_dataloader.dataset)
-                # TODO: fixme if have better way to get batch size
-                samples = state[0].shape[0]
-                current_count = self._merge_count.get(self.mode + '_total', 0)
-                if current_count + samples >= total_size:
-                    state = [
-                        s[:total_size - current_count, ...] for s in state
-                    ]
-                    self._merge_count[self.mode + '_total'] = 0
-                    self._merge_count[self.mode +
-                                      '_batch'] = total_size - current_count
-                else:
-                    self._merge_count[self.mode + '_total'] += samples
-                    self._merge_count[self.mode + '_batch'] = samples
-
-            metrics.append(metric.update(*state))
-        return (losses, metrics) if len(metrics) > 0 else losses
-
-    def prepare(self):
-        modes = ['train', 'eval', 'test']
-        for mode in modes:
-            self._make_program(mode)
-            self._compile_and_initialize(self._progs[mode], mode)
-
-    def _make_program(self, mode):
-        prog = self._progs.get(mode, None)
-        if prog is not None:
-            return
-
-        prog = self._orig_prog.clone()
-        # NOTE: When defining learning rate scheduling in static-graph, ops to
-        # increase the global step var and calculate learning rate would be
-        # prepended into _orig_prog. test program maked by `_orig_prog.clone`
-        # also would include these ops. Thus must prune these ops in test
-        # program, otherwise the global step would be changed in test.
-        if mode != 'train':
-            for op in list(prog.global_block().ops):
-                prog.global_block()._remove_op(0)
-        if mode == 'train' and self.model._optimizer \
-                and self.model._optimizer._learning_rate_map:
-            # HACK workaround learning rate map issue
-            lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
-            new_lr_var = prog.global_block().vars[lr_var.name]
-            self.model._optimizer._learning_rate_map[prog] = new_lr_var
-
-        losses = []
-        metrics = []
-        with fluid.program_guard(prog, self._startup_prog):
-            ins = self.model._inputs
-            lbls = self.model._labels if self.model._labels else []
-            inputs = [k.forward() for k in to_list(ins)]
-            labels = [k.forward() for k in to_list(lbls)]
-            self._label_vars[mode] = labels
-            outputs = to_list(self.model.forward(*inputs))
-
-            if mode != 'test' and self.model._loss_function:
-                losses = self.model._loss_function(outputs, labels)
-
-            if self._nranks > 1 and mode != 'train':
-                outputs = [_all_gather(o, self._nranks) for o in outputs]
-                if mode != 'test':
-                    labels = [_all_gather(l, self._nranks) for l in labels]
-
-            if mode != 'test':
-                for metric in self.model._metrics:
-                    metrics.append(
-                        to_list(metric.add_metric_op(outputs, labels)))
-
-            if mode == 'train' and self.model._optimizer:
-                self._loss_endpoint = fluid.layers.sum(losses)
-                if self._nranks > 1:
-                    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-                    fleet.init(role)
-                    dist_strategy = DistributedStrategy()
-                    dist_strategy.mode = "collective"
-                    dist_strategy.collective_mode = "grad_allreduce"
-                    self.model._optimizer = fleet.distributed_optimizer(
-                        self.model._optimizer, strategy=dist_strategy)
-
-                self.model._optimizer.minimize(self._loss_endpoint)
-
-        if mode != 'train':  # clone again to put it in test mode
-            prog = prog.clone(for_test=True)
-
-        self._input_vars[mode] = inputs
-
-        self._progs[mode] = prog
-        self._endpoints[mode] = {
-            "output": outputs,
-            "loss": losses,
-            "metric": metrics
-        }
-
-    def _compile_and_initialize(self, prog, mode):
-        compiled_prog = self._compiled_progs.get(mode, None)
-        if compiled_prog is not None:
-            return compiled_prog
-
-        assert self.model._place is not None, \
-            "device is not set, please call `model.prepare()` first"
-
-        place = self.model._place
-
-        # XXX *ALL WEIGHTS* should be initialized upon model construction
-        # even if `forward()` may run different code path for different mode
-        # therefore startup program only needs to run once
-        if self._executor is None:
-            self._executor = fluid.Executor(place)
-            # XXX incremental initialization
-            uninitialized = []
-            for var_py in self._startup_prog.list_vars():
-                var = fluid.global_scope().find_var(var_py.name)
-                if not var_py.name.startswith('nccl_id') and var and \
-                        var.get_tensor()._is_initialized():
-                    continue
-
-                uninitialized.append(var_py)
-            if uninitialized:
-                startup_prog = self._startup_prog._prune(uninitialized)
-                self._executor.run(startup_prog)
-
-        if self._nranks < 2:
-            compiled_prog = fluid.CompiledProgram(prog)
-        else:
-            compiled_prog = prog
-
-        self._compiled_progs[mode] = compiled_prog
-
-
-class DynamicGraphAdapter(object):
-    def __init__(self, model):
-        super(DynamicGraphAdapter, self).__init__()
-        self.model = model
-        self._nranks = ParallelEnv().nranks
-        self._local_rank = ParallelEnv().local_rank
-        self._merge_count = {
-            'eval_total': 0,
-            'test_total': 0,
-            'eval_batch': 0,
-            'test_batch': 0
-        }
-
-        if self._nranks > 1:
-            stradegy = fluid.dygraph.parallel.ParallelStrategy()
-            stradegy.nranks = ParallelEnv().nranks
-            stradegy.local_rank = ParallelEnv().local_rank
-            stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints
-            stradegy.current_endpoint = ParallelEnv().current_endpoint
-            self.ddp_model = fluid.dygraph.parallel.DataParallel(self.model,
-                                                                 stradegy)
-
-    @property
-    def mode(self):
-        return self.model.mode
-
-    @mode.setter
-    def mode(self, value):
-        self.model.mode = value
-
-    # TODO multi device in dygraph mode not implemented at present time
-    def train(self, inputs, labels=None):
-        assert self.model._optimizer, \
-            "model not ready, please call `model.prepare()` first"
-        super(Model, self.model).train()
-        self.mode = 'train'
-        inputs = to_list(inputs)
-        if labels is not None:
-            labels = [to_variable(l) for l in to_list(labels)]
-        if self._nranks > 1:
-            outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs])
-            losses = self.model._loss_function(outputs, labels)
-            final_loss = fluid.layers.sum(losses)
-            final_loss = self.ddp_model.scale_loss(final_loss)
-            final_loss.backward()
-            self.ddp_model.apply_collective_grads()
-        else:
-            outputs = self.model.forward(*[to_variable(x) for x in inputs])
-            losses = self.model._loss_function(outputs, labels)
-            final_loss = fluid.layers.sum(losses)
-            final_loss.backward()
-
-        self.model._optimizer.minimize(final_loss)
-        self.model.clear_gradients()
-        metrics = []
-        for metric in self.model._metrics:
-            metric_outs = metric.add_metric_op(
-                to_list(outputs), to_list(labels))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
-            metrics.append(m)
-
-        return ([to_numpy(l) for l in losses], metrics) \
-            if len(metrics) > 0 else [to_numpy(l) for l in losses]
-
-    def eval(self, inputs, labels=None):
-        super(Model, self.model).eval()
-        self.mode = 'eval'
-        inputs = to_list(inputs)
-        if labels is not None:
-            labels = [to_variable(l) for l in to_list(labels)]
-        outputs = self.model.forward(*[to_variable(x) for x in inputs])
-        if self.model._loss_function:
-            losses = self.model._loss_function(outputs, labels)
-        else:
-            losses = []
-        if self._nranks > 1:
-            outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)]
-            labels = [_all_gather(l, self._nranks) for l in labels]
-        metrics = []
-        for metric in self.model._metrics:
-            # cut off padding value.
-            if self.model._test_dataloader is not None and self._nranks > 1 \
-                    and isinstance(self.model._test_dataloader, DataLoader):
-                total_size = len(self.model._test_dataloader.dataset)
-                samples = outputs[0].shape[0]
-                current_count = self._merge_count.get(self.mode + '_total', 0)
-                if current_count + samples >= total_size:
-                    outputs = [o[:total_size - current_count] for o in outputs]
-                    labels = [l[:total_size - current_count] for l in labels]
-                    self._merge_count[self.mode + '_total'] = 0
-                    self._merge_count[self.mode +
-                                      '_batch'] = total_size - current_count
-                else:
-                    self._merge_count[self.mode + '_total'] += samples
-                    self._merge_count[self.mode + '_batch'] = samples
-
-            metric_outs = metric.add_metric_op(to_list(outputs), labels)
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
-            metrics.append(m)
-
-        # To be consistent with static graph
-        # return empty loss if loss_function is None
-        return ([to_numpy(l) for l in losses], metrics) \
-            if len(metrics) > 0 else [to_numpy(l) for l in losses]
-
-    def test(self, inputs):
-        super(Model, self.model).eval()
-        self.mode = 'test'
-        inputs = [to_variable(x) for x in to_list(inputs)]
-        outputs = self.model.forward(*inputs)
-        if self._nranks > 1 and isinstance(self.model._place, fluid.CUDAPlace):
-            outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)]
-
-        return [to_numpy(o) for o in to_list(outputs)]
-
-    def parameters(self, *args, **kwargs):
-        return super(Model, self.model).parameters(*args, **kwargs)
-
-    def save(self, path):
-        params = self.model.state_dict()
-        fluid.save_dygraph(params, path)
-        if self.model._optimizer is None:
-            return
-        if self.model._optimizer.state_dict():
-            optim = self.model._optimizer.state_dict()
-            fluid.save_dygraph(optim, path)
-
-    def load(self, param_state_pairs, optim_state):
-        # restore parameter states
-        for param, state in param_state_pairs:
-            param.set_value(state)
-
-        # resotre optimizer states
-        if not self.model._optimizer or not optim_state:
-            return
-
-        # If optimizer performs set_dict when state vars haven't been created,
-        # which would happen when set_dict before minimize, the state would be
-        # stored in optimizer._accumulators_holder and loaded lazily.
-        # To contrive this when loading from static-graph saved states, extend
-        # state dict to include keys named accoring to dygraph naming rules.
-        # TODO: if len(self.model._optimizer._accumulators) > 0
-        converted_state = dict(optim_state)
-        opt_unq_name = self.model._optimizer._name
-        opt_cls_name = self.model._optimizer.__class__.__name__
-        opt_name = opt_unq_name[:opt_unq_name.rfind("_")]  # remove suffix idx
-        param_names = [param.name for param in self.model.parameters()]
-        for var_name, state_var in sorted(
-                optim_state.items(), key=lambda x: len(x[0]), reverse=True):
-            if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
-                # NOTE: dygraph saved global_step is 1 larger than that in
-                # static-graph, since the time of global_step to increase is
-                # different.
-                if var_name == "@LR_DECAY_COUNTER@":
-                    converted_state["global_step"] = np.array(
-                        converted_state.pop("@LR_DECAY_COUNTER@")) + 1
-            else:
-                # moment and other accumulators
-                # extend state dict to include promising dygraph names
-                for param_name in param_names:
-                    if var_name.startswith(param_name + "_" + opt_name):
-                        # when init optimizer with name
-                        accum_name = var_name[len(param_name + "_" + opt_name +
-                                                  "_"):]
-                    elif var_name.startswith(param_name +
-                                             "_") and opt_name == opt_cls_name:
-                        # when init optimizer without name
-                        accum_name = var_name[len(param_name + "_"):]
-                    else:
-                        continue
-                    # remove suffix idx
-                    accum_name = accum_name[:accum_name.rfind("_")]
-                    # state names always end with "_0" in dygraph because of the
-                    # unique optimizer._name
-                    dy_state_name = (param_name + "_" + opt_unq_name + "_" +
-                                     accum_name + "_0")
-                    converted_state[dy_state_name] = state_var
-
-        self.model._optimizer.set_dict(converted_state)
-
-
-class Model(fluid.dygraph.Layer):
-    """
-    FIXME: add more comments and usage
-    """
-
-    def __init__(self):
-        super(Model, self).__init__(self.__class__.__name__)
-        self.mode = 'train'
-        self._inputs = None
-        self._labels = None
-        self._loss_function = None
-        self._loss_weights = None
-        self._optimizer = None
-        self._device = None
-        self._optimizer = None
-        self._test_dataloader = None
-
-        # init backend
-        if fluid.in_dygraph_mode():
-            self._adapter = DynamicGraphAdapter(self)
-        else:
-            self._adapter = StaticGraphAdapter(self)
-
-    def train(self, *args, **kwargs):
-        return self._adapter.train(*args, **kwargs)
-
-    def eval(self, *args, **kwargs):
-        return self._adapter.eval(*args, **kwargs)
-
-    def test(self, *args, **kwargs):
-        return self._adapter.test(*args, **kwargs)
-
-    def save(self, *args, **kwargs):
-        if ParallelEnv().local_rank == 0:
-            return self._adapter.save(*args, **kwargs)
-
-    def load(self, path, skip_mismatch=False, reset_optimizer=False):
-        """
-        Load from files storing the model states and optimizer states. The file
-        for optimizer states is not necessary if no need to restore the optimizer.
-
-        NOTE: parameters are retrieved out from the file storing model states
-        accoring to their structured names.
-
-        For fine-tuning or transfer-learning models where some of the layers have
-        changed, keep parameters needed to restore have same structured names in
-        the pre-trained model and fine-tuning model.
-
-        Args:
-            path (str): The prefix of files storing the model states and
-                optimizer states. The files would be `path.pdparams` and
-                `path.pdopt` separately, and the latter is not necessary
-                when no need to restore.
-            skip_mismatch (bool): Whether to skip the loading of mismatch
-                parameter or raise an error when mismatch happens (not found
-                the parameter in file storing model states of or receives a
-                mismatch shape).
-            reset_optimizer (bool): If True, ignore the providing file storing
-                optimizer states and initialize optimizer states from scratch.
-                Otherwise, restore optimizer states from `path.pdopt` if
-                a optimizer has been set to the model. Default False.
-        """
-
-        def _load_state_from_path(path):
-            if not os.path.exists(path):
-                return
-            with open(path, 'rb') as f:
-                return pickle.load(f) if six.PY2 else pickle.load(
-                    f, encoding='latin1')
-
-        def _check_match(key, param):
-            state = param_state.get(key, None)
-            if state is None:
-                raise ValueError(
-                    "{} is not found in the providing file.".format(key))
-            if list(state.shape) != list(param.shape):
-                raise ValueError(
-                    "{} receives a shape {}, but the expected shape is {}.".
-                    format(key, list(state.shape), list(param.shape)))
-            return param, state
-
-        param_state = _load_state_from_path(path + ".pdparams")
-        assert param_state, "Failed to load parameters, please check path."
-
-        matched_param_state = []
-        for key, param in self.state_dict().items():
-            try:
-                match_res = _check_match(key, param)
-            except ValueError as err:
-                if skip_mismatch:
-                    warnings.warn(
-                        ("Skip loading for {}. ".format(key) + err.message))
-                    # reset optimizer when mismatch happens
-                    reset_optimizer = True
-                else:
-                    raise err
-            matched_param_state.append(match_res)
-
-        optim_state = None if reset_optimizer else _load_state_from_path(
-            path + ".pdopt")
-        return self._adapter.load(matched_param_state, optim_state)
-
-    def parameters(self, *args, **kwargs):
-        return self._adapter.parameters(*args, **kwargs)
-
-    def prepare(self,
-                optimizer=None,
-                loss_function=None,
-                metrics=None,
-                inputs=None,
-                labels=None,
-                device=None):
-        """
-        FIXME: add comments
-        Args:
-            optimizer (Optimizer|None): optimizer must be set in training
-                and should be a Optimizer instance. It can be None in eval
-                and test mode.
-            loss_function (Loss|None): loss function must be set in training
-                and should be a Loss instance. It can be None when there is
-                no loss.
-            metrics (Metric|list of Metric|None): if metrics is set, all
-                metric will be calculate and output in train/eval mode.
-            inputs (Input|list|dict|None): inputs, entry points of network,
-                could be a Input layer, or lits of Input layers,
-                or dict (name: Input), or None. For static graph,
-                inputs must be set. For dynamic graph, it could be None.
-            labels (Input|list|None): labels, entry points of network,
-                could be a Input layer or lits of Input layers, or None.
-                For static graph, if set loss_function in Model.prepare(), it
-                must be set. Otherwise, it could be None.
-            device (str|None): specify device type, 'CPU' or 'GPU'.
-                If None, automatically select device according to
-                installation package version.
-        """
-
-        if isinstance(device, fluid.CUDAPlace) or \
-            (isinstance(device, six.string_types) and device.lower() == 'gpu') \
-            or (device is None and fluid.is_compiled_with_cuda()):
-            if isinstance(device, fluid.CUDAPlace):
-                self._place = device
-            else:
-                self._place = fluid.CUDAPlace(ParallelEnv().dev_id) \
-                    if ParallelEnv().nranks > 1 else fluid.CUDAPlace(0)
-
-            global _parallel_context_initialized
-            if ParallelEnv().nranks > 1 and not _parallel_context_initialized:
-                if fluid.in_dygraph_mode():
-                    fluid.disable_dygraph()
-                    fluid.enable_dygraph(self._place)
-                    fluid.dygraph.parallel.prepare_context()
-                else:
-                    prepare_distributed_context(self._place)
-
-                _parallel_context_initialized = True
-        elif isinstance(device, fluid.CPUPlace):
-            self._place = device
-        elif (isinstance(device, six.string_types) and device.lower() == 'cpu') \
-            or (device is None):
-            self._place = fluid.CPUPlace()
-        else:
-            raise ValueError(
-                "Expected device in ('gpu', 'cpu', fluid.CUDAPlace, fluid.CPUPlace, None), \
-                but got {}".format(device))
-
-        self._optimizer = optimizer
-        if loss_function:
-            if not isinstance(loss_function, Loss):
-                raise TypeError(
-                    "'loss_function' must be sub classes of 'Loss'")
-        self._loss_function = loss_function
-        if not in_dygraph_mode():
-            if not isinstance(inputs, (list, dict, Input)):
-                raise TypeError(
-                    "'inputs' must be list or dict in static graph mode")
-
-        metrics = metrics or []
-        for metric in to_list(metrics):
-            assert isinstance(metric, Metric), \
-                "{} is not sub class of Metric".format(
-                    metric.__class__.__name__)
-        self._metrics = to_list(metrics)
-
-        self._inputs = to_list(inputs) if not isinstance(inputs, dict) else [
-            inputs[n] for n in extract_args(self.forward) if n != 'self'
-        ]
-        self._labels = to_list(labels)
-
-        if not in_dygraph_mode():
-            self._adapter.prepare()
-
-    def fit(
-            self,
-            train_data=None,
-            eval_data=None,
-            batch_size=1,
-            epochs=1,
-            eval_freq=1,
-            log_freq=10,
-            save_dir=None,
-            save_freq=1,
-            verbose=2,
-            drop_last=False,
-            shuffle=True,
-            num_workers=0,
-            callbacks=None, ):
-        """
-        FIXME: add more comments and usage
-        Args:
-            train_data (Dataset|DataLoader): An iterable data loader is used for 
-                train. An instance of paddle paddle.io.Dataset or 
-                paddle.io.Dataloader is recomended.
-            eval_data (Dataset|DataLoader): An iterable data loader is used for
-                evaluation at the end of epoch. If None, will not do evaluation. 
-                An instance of paddle.io.Dataset or paddle.io.Dataloader 
-                is recomended.
-            batch_size (int): Integer number. The batch size of train_data and eval_data. 
-                When train_data and eval_data are both the instance of Dataloader, this 
-                parameter will be ignored.
-            epochs (int): Integer number. The number of epochs to train the model.
-            eval_freq (int): The frequency, in number of epochs, an evalutation
-                is performed.
-            log_freq (int): The frequency, in number of steps, the training logs
-                are printed.
-            save_dir(str|None): The directory to save checkpoint during training.
-                If None, will not save checkpoint.
-            save_freq (int): The frequency, in number of epochs, to save checkpoint.
-            verbose (int): The verbosity mode, should be 0, 1, or 2.
-                0 = silent, 1 = progress bar, 2 = one line per epoch.
-            drop_last (bool): whether drop the last incomplete batch of train_data 
-                when dataset size is not divisible by the batch size. When train_data 
-                is an instance of Dataloader, this parameter will be ignored.
-            shuffle (bool): whther to shuffle train_data. When train_data is an instance 
-                of Dataloader, this parameter will be ignored.
-            num_workers (int): the number of subprocess to load data, 0 for no subprocess 
-                used and loading data in main process. When train_data and eval_data are
-                both the instance of Dataloader, this parameter will be ignored.
-            callbacks (Callback|None): A list of `Callback` instances to apply
-                during training. If None, `ProgBarLogger` and `ModelCheckpoint`
-                are automatically inserted.
-        """
-
-        assert train_data is not None, \
-                "train_data must be given!"
-
-        if fluid.in_dygraph_mode():
-            feed_list = None
-        else:
-            feed_list = [x.forward() for x in self._inputs + self._labels]
-
-        if isinstance(train_data, Dataset):
-            train_sampler = DistributedBatchSampler(
-                train_data,
-                batch_size=batch_size,
-                shuffle=shuffle,
-                drop_last=drop_last)
-            train_loader = DataLoader(
-                train_data,
-                batch_sampler=train_sampler,
-                places=self._place,
-                feed_list=feed_list,
-                num_workers=num_workers,
-                return_list=True)
-        else:
-            train_loader = train_data
-
-        if eval_data is not None and isinstance(eval_data, Dataset):
-            eval_sampler = DistributedBatchSampler(
-                eval_data, batch_size=batch_size)
-            eval_loader = DataLoader(
-                eval_data,
-                batch_sampler=eval_sampler,
-                places=self._place,
-                feed_list=feed_list,
-                num_workers=num_workers,
-                return_list=True)
-        elif eval_data is not None:
-            eval_loader = eval_data
-        else:
-            eval_loader = None
-
-        do_eval = eval_loader is not None
-        self._test_dataloader = eval_loader
-        metrics_name = self._metrics_name()
-        steps = len(train_loader) if hasattr(train_loader, '__len__') else None
-        cbks = config_callbacks(
-            callbacks,
-            model=self,
-            epochs=epochs,
-            steps=steps,
-            log_freq=log_freq,
-            save_freq=save_freq,
-            save_dir=save_dir,
-            verbose=verbose,
-            metrics=self._metrics_name(), )
-
-        cbks.on_begin('train')
-        for epoch in range(epochs):
-
-            # FIXME: adapt to DataLoader
-            loader = train_loader
-            if not isinstance(train_loader, Iterable):
-                loader = train_loader()
-            logs = self._run_one_epoch(
-                loader, cbks, 'train', metrics_name, epoch=epoch)
-
-            if do_eval and epoch % eval_freq == 0:
-                # FIXME: adapt to DataLoader
-                loader = eval_loader
-                if not isinstance(eval_loader, Iterable):
-                    loader = eval_loader()
-
-                eval_steps = len(loader) if hasattr(loader,
-                                                    '__len__') else None
-                cbks.on_begin('eval', {
-                    'steps': eval_steps,
-                    'metrics_name': metrics_name
-                })
-
-                logs = self._run_one_epoch(loader, cbks, 'eval', metrics_name)
-
-                cbks.on_end('eval', logs)
-
-        cbks.on_end('train', logs)
-        self._test_dataloader = None
-
-    def evaluate(
-            self,
-            eval_data,
-            batch_size=1,
-            log_freq=10,
-            verbose=2,
-            num_workers=0,
-            callbacks=None, ):
-        """
-        FIXME: add more comments and usage
-        Args:
-            eval_data (Dataset|DataLoader): An iterable data loader is used for
-                evaluation. An instance of paddle.io.Dataset or 
-                paddle.io.Dataloader is recomended.
-            batch_size (int): Integer number. The batch size of train_data and eval_data. 
-                When train_data and eval_data are both the instance of Dataloader, this 
-                parameter will be ignored.
-            log_freq (int): The frequency, in number of steps, the eval logs
-                are printed.
-            verbose (int): The verbosity mode, should be 0, 1, or 2.
-                0 = silent, 1 = progress bar, 2 = one line per epoch.
-            num_workers (int): The number of subprocess to load data, 0 for no subprocess 
-                used and loading data in main process. When train_data and eval_data are
-                both the instance of Dataloader, this parameter will be ignored.
-            callbacks (Callback|None): A list of `Callback` instances to apply
-                during training. If None, `ProgBarLogger` and `ModelCheckpoint`
-                are automatically inserted.
-        """
-
-        if fluid.in_dygraph_mode():
-            feed_list = None
-        else:
-            feed_list = [x.forward() for x in self._inputs + self._labels]
-
-        if eval_data is not None and isinstance(eval_data, Dataset):
-            eval_sampler = DistributedBatchSampler(
-                eval_data, batch_size=batch_size)
-            eval_loader = DataLoader(
-                eval_data,
-                batch_sampler=eval_sampler,
-                places=self._place,
-                feed_list=feed_list,
-                num_workers=num_workers,
-                return_list=True)
-        else:
-            eval_loader = eval_data
-
-        self._test_dataloader = eval_loader
-        metrics_name = self._metrics_name()
-
-        cbks = config_callbacks(
-            callbacks,
-            model=self,
-            log_freq=log_freq,
-            verbose=verbose,
-            metrics=self._metrics_name(), )
-
-        loader = eval_loader
-        if not isinstance(eval_loader, Iterable):
-            loader = eval_loader()
-
-        eval_steps = len(loader) if hasattr(loader, '__len__') else None
-        cbks.on_begin('eval',
-                      {'steps': eval_steps,
-                       'metrics_name': metrics_name})
-
-        logs = self._run_one_epoch(loader, cbks, 'eval', metrics_name)
-
-        cbks.on_end('eval', logs)
-
-        self._test_dataloader = None
-
-        eval_result = {}
-        for k in self._metrics_name():
-            eval_result[k] = logs[k]
-
-        return eval_result
-
-    def predict(self,
-                test_data,
-                batch_size=1,
-                num_workers=0,
-                stack_outputs=True):
-        """
-        FIXME: add more comments and usage
-        Args:
-            test_data (Dataset|DataLoader): An iterable data loader is used for
-                predict. An instance of paddle.io.Dataset or paddle.io.Dataloader 
-                is recomended.
-            batch_size (int): Integer number. The batch size of train_data and eval_data. 
-                When train_data and eval_data are both the instance of Dataloader, this 
-                parameter will be ignored.
-            num_workers (int): the number of subprocess to load data, 0 for no subprocess 
-                used and loading data in main process. When train_data and eval_data are
-                both the instance of Dataloader, this parameter will be ignored.
-            stack_output (bool): whether stack output field like a batch, as for an output
-                filed of a sample is in shape [X, Y], test_data contains N samples, predict
-                output field will be in shape [N, X, Y] if stack_output is True, and will
-                be a length N list in shape [[X, Y], [X, Y], ....[X, Y]] if stack_outputs
-                is False. stack_outputs as False is used for LoDTensor output situation,
-                it is recommended set as True if outputs contains no LoDTensor. Default False
-        """
-
-        if fluid.in_dygraph_mode():
-            feed_list = None
-        else:
-            feed_list = [x.forward() for x in self._inputs + self._labels]
-
-        if test_data is not None and isinstance(test_data, Dataset):
-            test_sampler = DistributedBatchSampler(
-                test_data, batch_size=batch_size)
-            test_loader = DataLoader(
-                test_data,
-                batch_sampler=test_sampler,
-                places=self._place,
-                feed_list=feed_list,
-                num_workers=num_workers,
-                return_list=True)
-        else:
-            test_loader = test_data
-
-        self._test_dataloader = test_loader
-
-        loader = test_loader
-        if not isinstance(test_loader, Iterable):
-            loader = test_loader()
-
-        outputs = []
-        for data in tqdm.tqdm(loader):
-            data = flatten(data)
-            outputs.append(self.test(data[:len(self._inputs)]))
-
-        # NOTE: for lod tensor output, we should not stack outputs
-        # for stacking may loss its detail info
-        outputs = list(zip(*outputs))
-        if stack_outputs:
-            outputs = [np.stack(outs, axis=0) for outs in outputs]
-
-        self._test_dataloader = None
-        if test_loader is not None and self._adapter._nranks > 1 \
-                    and isinstance(test_loader, DataLoader):
-            outputs = [o[:len(test_loader.dataset)] for o in outputs]
-        return outputs
-
-    def set_eval_data(self, eval_data):
-        """
-        Args:
-            eval_data (Dataset|DataLoader|None): An iterable data loader is used for 
-                eval. An instance of paddle.io.Dataset or 
-                paddle.io.Dataloader is recomended. 
-        """
-        assert isinstance(
-            eval_data,
-            DataLoader), "eval_data must be a instance of Dataloader!"
-        self._test_dataloader = eval_data
-
-    def _run_one_epoch(self,
-                       data_loader,
-                       callbacks,
-                       mode,
-                       metrics_name,
-                       epoch=None):
-        size = len(data_loader) if hasattr(data_loader, '__len__') else None
-        logs = {
-            'steps': size,
-            'metrics_name': metrics_name,
-        }
-
-        if mode == 'train':
-            assert epoch is not None, 'when mode is train, epoch must be given'
-            callbacks.on_epoch_begin(epoch)
-
-        for step, data in enumerate(data_loader):
-            # data might come from different types of data_loader and have
-            # different format, as following:
-            # 1. DataLoader in static graph:
-            #    [[input1, input2, ..., label1, lable2, ...]]
-            # 2. DataLoader in dygraph
-            #    [input1, input2, ..., label1, lable2, ...]
-            # 3. custumed iterator yield concated inputs and labels:
-            #   [input1, input2, ..., label1, lable2, ...]
-            # 4. custumed iterator yield seperated inputs and labels:
-            #   ([input1, input2, ...], [label1, lable2, ...])
-            # To handle all of these, flatten (nested) list to list.
-            data = flatten(data)
-            # LoDTensor.shape is callable, where LoDTensor comes from
-            # DataLoader in static graph
-            batch_size = data[0].shape()[0] if callable(data[
-                0].shape) else data[0].shape[0]
-
-            callbacks.on_batch_begin(mode, step, logs)
-            if mode == 'train':
-                outs = self.train(data[:len(self._inputs)],
-                                  data[len(self._inputs):])
-            else:
-                outs = self.eval(data[:len(self._inputs)],
-                                 data[len(self._inputs):])
-
-            # losses
-            loss = outs[0] if self._metrics else outs
-            metrics = [[l[0] for l in loss]]
-
-            # metrics
-            for metric in self._metrics:
-                res = metric.accumulate()
-                metrics.extend(to_list(res))
-
-            assert len(metrics_name) == len(metrics)
-            for k, v in zip(metrics_name, metrics):
-                logs[k] = v
-
-            logs['step'] = step
-            if mode == 'train' or self._adapter._merge_count.get(
-                    mode + '_batch', 0) <= 0:
-                logs['batch_size'] = batch_size * ParallelEnv().nranks
-            else:
-                logs['batch_size'] = self._adapter._merge_count[mode +
-                                                                '_batch']
-
-            callbacks.on_batch_end(mode, step, logs)
-        self._reset_metrics()
-
-        if mode == 'train':
-            assert epoch is not None, 'when mode is train, epoch must be given'
-            callbacks.on_epoch_end(epoch)
-
-        return logs
-
-    def _reset_metrics(self):
-        for metric in self._metrics:
-            metric.reset()
-
-    def _metrics_name(self):
-        metrics_name = ['loss']
-        for m in self._metrics:
-            metrics_name.extend(to_list(m.name()))
-        return metrics_name
--- a/progressbar.py
+++ b/progressbar.py
-import sys
-import time
-import numpy as np
-
-
-class ProgressBar(object):
-    """progress bar """
-
-    def __init__(self,
-                 num=None,
-                 width=30,
-                 verbose=1,
-                 start=True,
-                 file=sys.stdout):
-        self._num = num
-        if isinstance(num, int) and num <= 0:
-            raise TypeError('num should be None or integer (> 0)')
-        max_width = self._get_max_width()
-        self._width = width if width <= max_width else max_width
-        self._total_width = 0
-        self._verbose = verbose
-        self.file = file
-        self._values = {}
-        self._values_order = []
-        if start:
-            self._start = time.time()
-        self._last_update = 0
-
-        self._dynamic_display = (
-            (hasattr(self.file, 'isatty') and
-             self.file.isatty()) or 'ipykernel' in sys.modules or
-            'posix' in sys.modules or 'PYCHARM_HOSTED' in os.environ)
-
-    def _get_max_width(self):
-        if sys.version_info > (3, 3):
-            from shutil import get_terminal_size
-        else:
-            from backports.shutil_get_terminal_size import get_terminal_size
-        terminal_width, _ = get_terminal_size()
-        max_width = min(int(terminal_width * 0.6), terminal_width - 50)
-        return max_width
-
-    def start(self):
-        self.file.flush()
-        self._start = time.time()
-
-    def update(self, current_num, values=None):
-        now = time.time()
-
-        if current_num:
-            time_per_unit = (now - self._start) / current_num
-        else:
-            time_per_unit = 0
-
-        if time_per_unit >= 1 or time_per_unit == 0:
-            fps = ' - %.0fs/%s' % (time_per_unit, 'step')
-        elif time_per_unit >= 1e-3:
-            fps = ' - %.0fms/%s' % (time_per_unit * 1e3, 'step')
-        else:
-            fps = ' - %.0fus/%s' % (time_per_unit * 1e6, 'step')
-
-        info = ''
-        if self._verbose == 1:
-            prev_total_width = self._total_width
-
-            if self._dynamic_display:
-                sys.stdout.write('\b' * prev_total_width)
-                sys.stdout.write('\r')
-            else:
-                sys.stdout.write('\n')
-
-            if self._num is not None:
-                numdigits = int(np.log10(self._num)) + 1
-
-                bar_chars = ('step %' + str(numdigits) + 'd/%d [') % (
-                    current_num, self._num)
-                prog = float(current_num) / self._num
-                prog_width = int(self._width * prog)
-
-                if prog_width > 0:
-                    bar_chars += ('=' * (prog_width - 1))
-                    if current_num < self._num:
-                        bar_chars += '>'
-                    else:
-                        bar_chars += '='
-                bar_chars += ('.' * (self._width - prog_width))
-                bar_chars += ']'
-            else:
-                bar_chars = 'step %3d' % current_num
-
-            self._total_width = len(bar_chars)
-            sys.stdout.write(bar_chars)
-
-            for k, val in values:
-                info += ' - %s:' % k
-                val = val if isinstance(val, list) else [val]
-                for i, v in enumerate(val):
-                    if isinstance(v, (float, np.float32, np.float64)):
-                        if abs(v) > 1e-3:
-                            info += ' %.4f' % v
-                        else:
-                            info += ' %.4e' % v
-                    else:
-                        info += ' %s' % v
-
-            if self._num is not None and current_num < self._num:
-                eta = time_per_unit * (self._num - current_num)
-                if eta > 3600:
-                    eta_format = '%d:%02d:%02d' % (eta // 3600, (eta % 3600) //
-                                                   60, eta % 60)
-                elif eta > 60:
-                    eta_format = '%d:%02d' % (eta // 60, eta % 60)
-                else:
-                    eta_format = '%ds' % eta
-
-                info += ' - ETA: %s' % eta_format
-
-            info += fps
-            self._total_width += len(info)
-            if prev_total_width > self._total_width:
-                info += (' ' * (prev_total_width - self._total_width))
-
-            # newline for another epoch
-            if self._num is not None and current_num >= self._num:
-                info += '\n'
-            if self._num is None:
-                info += '\n'
-
-            sys.stdout.write(info)
-            sys.stdout.flush()
-            self._last_update = now
-        elif self._verbose == 2:
-            if self._num:
-                numdigits = int(np.log10(self._num)) + 1
-                count = ('step %' + str(numdigits) + 'd/%d') % (current_num,
-                                                                self._num)
-            else:
-                count = 'step %3d' % current_num
-            info = count + info
-
-            for k, val in values:
-                info += ' - %s:' % k
-                val = val if isinstance(val, list) else [val]
-                for v in val:
-                    if isinstance(v, (float, np.float32, np.float64)):
-                        if abs(v) > 1e-3:
-                            info += ' %.4f' % v
-                        else:
-                            info += ' %.4e' % v
-                    elif isinstance(v, np.ndarray) and \
-                        v.size == 1 and \
-                        isinstance(v.dtype, (np.float32, np.float64)):
-                        if abs(v[0]) > 1e-3:
-                            info += ' %.4f' % v[0]
-                        else:
-                            info += ' %.4e' % v[0]
-                    else:
-                        info += ' %s' % v
-
-            info += fps
-            info += '\n'
-            sys.stdout.write(info)
-            sys.stdout.flush()
--- a/text.py
+++ b/text.py
-import collections
-import copy
-import six
-import sys
-from functools import partial, reduce
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.layers.utils as utils
-from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
-from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit
-from paddle.fluid.data_feeder import convert_dtype
-
-from paddle.fluid import layers
-from paddle.fluid.dygraph import Layer
-from paddle.fluid.layers import BeamSearchDecoder
-
-__all__ = [
-    'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
-    'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
-    'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
-    'TransformerDecoder', 'TransformerBeamSearchDecoder', 'DynamicGRU', 'BiGRU',
-    'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging']
-
-
-class RNNCell(Layer):
-    def get_initial_states(self,
-                           batch_ref,
-                           shape=None,
-                           dtype=None,
-                           init_value=0,
-                           batch_dim_idx=0):
-        """
-        Generate initialized states according to provided shape, data type and
-        value.
-
-        Parameters:
-            batch_ref: A (possibly nested structure of) tensor variable[s].
-                The first dimension of the tensor will be used as batch size to
-                initialize states.
-            shape: A (possiblely nested structure of) shape[s], where a shape is
-                represented as a list/tuple of integer). -1(for batch size) will
-                beautomatically inserted if shape is not started with it. If None,
-                property `state_shape` will be used. The default value is None.
-            dtype: A (possiblely nested structure of) data type[s]. The structure
-                must be same as that of `shape`, except when all tensors' in states
-                has the same data type, a single data type can be used. If None and
-                property `cell.state_shape` is not available, float32 will be used
-                as the data type. The default value is None.
-            init_value: A float value used to initialize states.
-
-        Returns:
-            Variable: tensor variable[s] packed in the same structure provided \
-                by shape, representing the initialized states.
-        """
-        # TODO: use inputs and batch_size
-        batch_ref = flatten(batch_ref)[0]
-
-        def _is_shape_sequence(seq):
-            if sys.version_info < (3, ):
-                integer_types = (
-                    int,
-                    long, )
-            else:
-                integer_types = (int, )
-            """For shape, list/tuple of integer is the finest-grained objection"""
-            if (isinstance(seq, list) or isinstance(seq, tuple)):
-                if reduce(
-                        lambda flag, x: isinstance(x, integer_types) and flag,
-                        seq, True):
-                    return False
-            # TODO: Add check for the illegal
-            if isinstance(seq, dict):
-                return True
-            return (isinstance(seq, collections.Sequence) and
-                    not isinstance(seq, six.string_types))
-
-        class Shape(object):
-            def __init__(self, shape):
-                self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
-
-        # nested structure of shapes
-        states_shapes = self.state_shape if shape is None else shape
-        is_sequence_ori = utils.is_sequence
-        utils.is_sequence = _is_shape_sequence
-        states_shapes = map_structure(lambda shape: Shape(shape),
-                                      states_shapes)
-        utils.is_sequence = is_sequence_ori
-
-        # nested structure of dtypes
-        try:
-            states_dtypes = self.state_dtype if dtype is None else dtype
-        except NotImplementedError:  # use fp32 as default
-            states_dtypes = "float32"
-        if len(flatten(states_dtypes)) == 1:
-            dtype = flatten(states_dtypes)[0]
-            states_dtypes = map_structure(lambda shape: dtype, states_shapes)
-
-        init_states = map_structure(
-            lambda shape, dtype: fluid.layers.fill_constant_batch_size_like(
-                input=batch_ref,
-                shape=shape.shape,
-                dtype=dtype,
-                value=init_value,
-                input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
-        return init_states
-
-    @property
-    def state_shape(self):
-        """
-        Abstract method (property).
-        Used to initialize states.
-        A (possiblely nested structure of) shape[s], where a shape is represented
-        as a list/tuple of integers (-1 for batch size would be automatically
-        inserted into a shape if shape is not started with it).
-        Not necessary to be implemented if states are not initialized by
-        `get_initial_states` or the `shape` argument is provided when using
-        `get_initial_states`.
-        """
-        raise NotImplementedError(
-            "Please add implementaion for `state_shape` in the used cell.")
-
-    @property
-    def state_dtype(self):
-        """
-        Abstract method (property).
-        Used to initialize states.
-        A (possiblely nested structure of) data types[s]. The structure must be
-        same as that of `shape`, except when all tensors' in states has the same
-        data type, a signle data type can be used.
-        Not necessary to be implemented if states are not initialized
-        by `get_initial_states` or the `dtype` argument is provided when using
-        `get_initial_states`.
-        """
-        raise NotImplementedError(
-            "Please add implementaion for `state_dtype` in the used cell.")
-
-
-class BasicLSTMCell(RNNCell):
-    """
-    ****
-    BasicLSTMUnit class, Using basic operator to build LSTM
-    The algorithm can be described as the code below.
-        .. math::
-           i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
-           f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
-           o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
-           \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
-           c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-           h_t &= o_t \odot tanh(c_t)
-        - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
-          of weights from the input gate to the input)
-        - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
-        - sigmoid is the logistic sigmoid function.
-        - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-          and cell activation vectors, respectively, all of which have the same size as
-          the cell output activation vector $h$.
-        - The :math:`\odot` is the element-wise product of the vectors.
-        - :math:`tanh` is the activation functions.
-        - :math:`\\tilde{c_t}` is also called candidate hidden state,
-          which is computed based on the current input and the previous hidden state.
-    Args:
-        name_scope(string) : The name scope used to identify parameter and bias name
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of LSTM unit.
-            If it is set to None or one attribute of ParamAttr, lstm_unit will
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized as zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cells (actNode).
-                             Default: 'fluid.layers.tanh'
-        forget_bias(float|1.0): forget bias used when computing forget gate
-        dtype(string): data type used in this unit
-    """
-
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 param_attr=None,
-                 bias_attr=None,
-                 gate_activation=None,
-                 activation=None,
-                 forget_bias=1.0,
-                 dtype='float32'):
-        super(BasicLSTMCell, self).__init__()
-
-        self._hidden_size = hidden_size
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._gate_activation = gate_activation or layers.sigmoid
-        self._activation = activation or layers.tanh
-        self._forget_bias = layers.fill_constant(
-            [1], dtype=dtype, value=forget_bias)
-        self._forget_bias.stop_gradient = False
-        self._dtype = dtype
-        self._input_size = input_size
-
-        self._weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=[
-                self._input_size + self._hidden_size, 4 * self._hidden_size
-            ],
-            dtype=self._dtype)
-
-        self._bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[4 * self._hidden_size],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input, state):
-        pre_hidden, pre_cell = state
-        concat_input_hidden = layers.concat([input, pre_hidden], 1)
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
-
-        gate_input = layers.elementwise_add(gate_input, self._bias)
-        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-        new_cell = layers.elementwise_add(
-            layers.elementwise_mul(
-                pre_cell,
-                layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
-            layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
-        new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)
-
-        return new_hidden, [new_hidden, new_cell]
-
-    @property
-    def state_shape(self):
-        return [[self._hidden_size], [self._hidden_size]]
-
-
-class BasicGRUCell(RNNCell):
-    """
-    ****
-    BasicGRUUnit class, using basic operators to build GRU
-    The algorithm can be described as the equations below.
-
-        .. math::
-            u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)
-
-            r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)
-
-            m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)
-
-            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
-
-    Args:
-        hidden_size (integer): The hidden size used in the Unit.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-            weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, gru_unit will
-            create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of GRU unit.
-            If it is set to None or one attribute of ParamAttr, gru_unit will 
-            create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        gate_activation (function|None): The activation function for gates (actGate).
-                                  Default: 'fluid.layers.sigmoid'
-        activation (function|None): The activation function for cell (actNode).
-                             Default: 'fluid.layers.tanh'
-        dtype(string): data type used in this unit
-    """
-
-    def __init__(self,
-                 input_size,
-                 hidden_size,
-                 param_attr=None,
-                 bias_attr=None,
-                 gate_activation=None,
-                 activation=None,
-                 dtype='float32'):
-        super(BasicGRUCell, self).__init__()
-        self._input_size = input_size
-        self._hiden_size = hidden_size
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._gate_activation = gate_activation or layers.sigmoid
-        self._activation = activation or layers.tanh
-        self._dtype = dtype
-
-        if self._param_attr is not None and self._param_attr.name is not None:
-            gate_param_attr = copy.deepcopy(self._param_attr)
-            candidate_param_attr = copy.deepcopy(self._param_attr)
-            gate_param_attr.name += "_gate"
-            candidate_param_attr.name += "_candidate"
-        else:
-            gate_param_attr = self._param_attr
-            candidate_param_attr = self._param_attr
-
-        self._gate_weight = self.create_parameter(
-            attr=gate_param_attr,
-            shape=[self._input_size + self._hiden_size, 2 * self._hiden_size],
-            dtype=self._dtype)
-
-        self._candidate_weight = self.create_parameter(
-            attr=candidate_param_attr,
-            shape=[self._input_size + self._hiden_size, self._hiden_size],
-            dtype=self._dtype)
-
-        if self._bias_attr is not None and self._bias_attr.name is not None:
-            gate_bias_attr = copy.deepcopy(self._bias_attr)
-            candidate_bias_attr = copy.deepcopy(self._bias_attr)
-            gate_bias_attr.name += "_gate"
-            candidate_bias_attr.name += "_candidate"
-        else:
-            gate_bias_attr = self._bias_attr
-            candidate_bias_attr = self._bias_attr
-
-        self._gate_bias = self.create_parameter(
-            attr=gate_bias_attr,
-            shape=[2 * self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
-        self._candidate_bias = self.create_parameter(
-            attr=candidate_bias_attr,
-            shape=[self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input, state):
-        pre_hidden = state
-        concat_input_hidden = layers.concat([input, pre_hidden], axis=1)
-
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
-
-        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
-
-        gate_input = self._gate_activation(gate_input)
-        r, u = layers.split(gate_input, num_or_sections=2, dim=1)
-
-        r_hidden = r * pre_hidden
-
-        candidate = layers.matmul(
-            layers.concat([input, r_hidden], 1), self._candidate_weight)
-        candidate = layers.elementwise_add(candidate, self._candidate_bias)
-
-        c = self._activation(candidate)
-        new_hidden = u * pre_hidden + (1 - u) * c
-
-        return new_hidden
-
-    @property
-    def state_shape(self):
-        return [self._hidden_size]
-
-
-class RNN(fluid.dygraph.Layer):
-    def __init__(self, cell, is_reverse=False, time_major=False):
-        super(RNN, self).__init__()
-        self.cell = cell
-        if not hasattr(self.cell, "call"):
-            self.cell.call = self.cell.forward
-        self.is_reverse = is_reverse
-        self.time_major = time_major
-        self.batch_index, self.time_step_index = (1, 0) if time_major else (0,
-                                                                            1)
-
-    def forward(self,
-                inputs,
-                initial_states=None,
-                sequence_length=None,
-                **kwargs):
-        if fluid.in_dygraph_mode():
-
-            class ArrayWrapper(object):
-                def __init__(self, x):
-                    self.array = [x]
-
-                def append(self, x):
-                    self.array.append(x)
-                    return self
-
-            def _maybe_copy(state, new_state, step_mask):
-                # TODO: use where_op
-                new_state = fluid.layers.elementwise_mul(
-                    new_state, step_mask,
-                    axis=0) - fluid.layers.elementwise_mul(
-                        state, (step_mask - 1), axis=0)
-                return new_state
-
-            flat_inputs = flatten(inputs)
-            batch_size, time_steps = (
-                flat_inputs[0].shape[self.batch_index],
-                flat_inputs[0].shape[self.time_step_index])
-
-            if initial_states is None:
-                initial_states = self.cell.get_initial_states(
-                    batch_ref=inputs, batch_dim_idx=self.batch_index)
-
-            if not self.time_major:
-                inputs = map_structure(
-                    lambda x: fluid.layers.transpose(x, [1, 0] + list(
-                        range(2, len(x.shape)))), inputs)
-
-            if sequence_length:
-                mask = fluid.layers.sequence_mask(
-                    sequence_length,
-                    maxlen=time_steps,
-                    dtype=flatten(initial_states)[0].dtype)
-                mask = fluid.layers.transpose(mask, [1, 0])
-
-            if self.is_reverse:
-                inputs = map_structure(
-                    lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
-                mask = fluid.layers.reverse(
-                    mask, axis=[0]) if sequence_length else None
-
-            states = initial_states
-            outputs = []
-            for i in range(time_steps):
-                step_inputs = map_structure(lambda x: x[i], inputs)
-                step_outputs, new_states = self.cell(step_inputs, states,
-                                                     **kwargs)
-                if sequence_length:
-                    new_states = map_structure(
-                        partial(
-                            _maybe_copy, step_mask=mask[i]),
-                        states,
-                        new_states)
-                states = new_states
-                outputs = map_structure(
-                    lambda x: ArrayWrapper(x),
-                    step_outputs) if i == 0 else map_structure(
-                        lambda x, x_array: x_array.append(x), step_outputs,
-                        outputs)
-
-            final_outputs = map_structure(
-                lambda x: fluid.layers.stack(x.array,
-                                             axis=self.time_step_index),
-                outputs)
-
-            if self.is_reverse:
-                final_outputs = map_structure(
-                    lambda x: fluid.layers.reverse(x,
-                                                   axis=self.time_step_index),
-                    final_outputs)
-
-            final_states = new_states
-        else:
-            final_outputs, final_states = fluid.layers.rnn(
-                self.cell,
-                inputs,
-                initial_states=initial_states,
-                sequence_length=sequence_length,
-                time_major=self.time_major,
-                is_reverse=self.is_reverse,
-                **kwargs)
-        return final_outputs, final_states
-
-
-class DynamicDecode(Layer):
-    def __init__(self,
-                 decoder,
-                 max_step_num=None,
-                 output_time_major=False,
-                 impute_finished=False,
-                 is_test=False,
-                 return_length=False):
-        super(DynamicDecode, self).__init__()
-        self.decoder = decoder
-        self.max_step_num = max_step_num
-        self.output_time_major = output_time_major
-        self.impute_finished = impute_finished
-        self.is_test = is_test
-        self.return_length = return_length
-
-    def forward(self, inits=None, **kwargs):
-        if fluid.in_dygraph_mode():
-
-            class ArrayWrapper(object):
-                def __init__(self, x):
-                    self.array = [x]
-
-                def append(self, x):
-                    self.array.append(x)
-                    return self
-
-                def __getitem__(self, item):
-                    return self.array.__getitem__(item)
-
-            def _maybe_copy(state, new_state, step_mask):
-                # TODO: use where_op
-                state_dtype = state.dtype
-                if convert_dtype(state_dtype) in ["bool"]:
-                    state = layers.cast(state, dtype="float32")
-                    new_state = layers.cast(new_state, dtype="float32")
-                if step_mask.dtype != state.dtype:
-                    step_mask = layers.cast(step_mask, dtype=state.dtype)
-                    # otherwise, renamed bool gradients of would be summed up leading
-                    # to sum(bool) error.
-                    step_mask.stop_gradient = True
-                new_state = layers.elementwise_mul(
-                    state, step_mask, axis=0) - layers.elementwise_mul(
-                        new_state, (step_mask - 1), axis=0)
-                if convert_dtype(state_dtype) in ["bool"]:
-                    new_state = layers.cast(new_state, dtype=state_dtype)
-                return new_state
-
-            initial_inputs, initial_states, initial_finished = self.decoder.initialize(
-                inits)
-            inputs, states, finished = (initial_inputs, initial_states,
-                                        initial_finished)
-            cond = layers.logical_not((layers.reduce_all(initial_finished)))
-            sequence_lengths = layers.cast(
-                layers.zeros_like(initial_finished), "int64")
-            outputs = None
-
-            step_idx = 0
-            step_idx_tensor = layers.fill_constant(
-                shape=[1], dtype="int64", value=step_idx)
-            while cond.numpy():
-                (step_outputs, next_states, next_inputs,
-                 next_finished) = self.decoder.step(step_idx_tensor, inputs,
-                                                    states, **kwargs)
-                if not self.decoder.tracks_own_finished:
-                    # BeamSearchDecoder would track it own finished, since
-                    # beams would be reordered and the finished status of each
-                    # entry might change. Otherwise, perform logical OR which
-                    # would not change the already finished.
-                    next_finished = layers.logical_or(next_finished, finished)
-                    # To confirm states.finished/finished be consistent with
-                    # next_finished.
-                    layers.assign(next_finished, finished)
-                next_sequence_lengths = layers.elementwise_add(
-                    sequence_lengths,
-                    layers.cast(
-                        layers.logical_not(finished), sequence_lengths.dtype))
-
-                if self.impute_finished:  # rectify the states for the finished.
-                    next_states = map_structure(
-                        lambda x, y: _maybe_copy(x, y, finished), states,
-                        next_states)
-                outputs = map_structure(
-                    lambda x: ArrayWrapper(x),
-                    step_outputs) if step_idx == 0 else map_structure(
-                        lambda x, x_array: x_array.append(x), step_outputs,
-                        outputs)
-                inputs, states, finished, sequence_lengths = (
-                    next_inputs, next_states, next_finished,
-                    next_sequence_lengths)
-
-                layers.increment(x=step_idx_tensor, value=1.0, in_place=True)
-                step_idx += 1
-
-                layers.logical_not(layers.reduce_all(finished), cond)
-                if self.max_step_num is not None and step_idx > self.max_step_num:
-                    break
-
-            final_outputs = map_structure(
-                lambda x: fluid.layers.stack(x.array, axis=0), outputs)
-            final_states = states
-
-            try:
-                final_outputs, final_states = self.decoder.finalize(
-                    final_outputs, final_states, sequence_lengths)
-            except NotImplementedError:
-                pass
-
-            if not self.output_time_major:
-                final_outputs = map_structure(
-                    lambda x: layers.transpose(x, [1, 0] + list(
-                        range(2, len(x.shape)))), final_outputs)
-
-            return (final_outputs, final_states,
-                    sequence_lengths) if self.return_length else (
-                        final_outputs, final_states)
-        else:
-            return fluid.layers.dynamic_decode(
-                self.decoder,
-                inits,
-                max_step_num=self.max_step_num,
-                output_time_major=self.output_time_major,
-                impute_finished=self.impute_finished,
-                is_test=self.is_test,
-                return_length=self.return_length,
-                **kwargs)
-
-
-class TransfomerCell(object):
-    """
-    Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be
-    used as RNNCell
-    """
-
-    def __init__(self, decoder):
-        self.decoder = decoder
-
-    def __call__(self, inputs, states, trg_src_attn_bias, enc_output,
-                 static_caches):
-        trg_word, trg_pos = inputs
-        for cache, static_cache in zip(states, static_caches):
-            cache.update(static_cache)
-        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
-                              enc_output, states)
-        new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
-        return logits, new_states
-
-
-class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
-    def __init__(self, cell, start_token, end_token, beam_size,
-                 var_dim_in_state):
-        super(TransformerBeamSearchDecoder,
-              self).__init__(cell, start_token, end_token, beam_size)
-        self.cell = cell
-        self.var_dim_in_state = var_dim_in_state
-
-    def _merge_batch_beams_with_var_dim(self, x):
-        # init length of cache is 0, and it increases with decoding carrying on,
-        # thus need to reshape elaborately
-        var_dim_in_state = self.var_dim_in_state + 1  # count in beam dim
-        x = layers.transpose(x,
-                             list(range(var_dim_in_state, len(x.shape))) +
-                             list(range(0, var_dim_in_state)))
-        x = layers.reshape(
-            x, [0] * (len(x.shape) - var_dim_in_state
-                      ) + [self.batch_size * self.beam_size] +
-            [int(size) for size in x.shape[-var_dim_in_state + 2:]])
-        x = layers.transpose(
-            x,
-            list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) +
-            list(range(0, (len(x.shape) + 1 - var_dim_in_state))))
-        return x
-
-    def _split_batch_beams_with_var_dim(self, x):
-        var_dim_size = layers.shape(x)[self.var_dim_in_state]
-        x = layers.reshape(
-            x, [-1, self.beam_size] +
-            [int(size)
-             for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] +
-            [int(size) for size in x.shape[self.var_dim_in_state + 1:]])
-        return x
-
-    def step(self, time, inputs, states, **kwargs):
-        # compared to RNN, Transformer has 3D data at every decoding step
-        inputs = layers.reshape(inputs, [-1, 1])  # token
-        pos = layers.ones_like(inputs) * time  # pos
-        cell_states = map_structure(self._merge_batch_beams_with_var_dim,
-                                    states.cell_states)
-
-        cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states,
-                                                   **kwargs)
-        cell_outputs = map_structure(self._split_batch_beams, cell_outputs)
-        next_cell_states = map_structure(self._split_batch_beams_with_var_dim,
-                                         next_cell_states)
-
-        beam_search_output, beam_search_state = self._beam_search_step(
-            time=time,
-            logits=cell_outputs,
-            next_cell_states=next_cell_states,
-            beam_state=states)
-        next_inputs, finished = (beam_search_output.predicted_ids,
-                                 beam_search_state.finished)
-
-        return (beam_search_output, beam_search_state, next_inputs, finished)
-
-
-### Transformer Modules ###
-class PrePostProcessLayer(Layer):
-    """
-    PrePostProcessLayer
-    """
-
-    def __init__(self, process_cmd, d_model, dropout_rate):
-        super(PrePostProcessLayer, self).__init__()
-        self.process_cmd = process_cmd
-        self.functors = []
-        for cmd in self.process_cmd:
-            if cmd == "a":  # add residual connection
-                self.functors.append(lambda x, y: x + y if y else x)
-            elif cmd == "n":  # add layer normalization
-                self.functors.append(
-                    self.add_sublayer(
-                        "layer_norm_%d" % len(
-                            self.sublayers(include_sublayers=False)),
-                        LayerNorm(
-                            normalized_shape=d_model,
-                            param_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(1.)),
-                            bias_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(0.)))))
-            elif cmd == "d":  # add dropout
-                self.functors.append(lambda x: layers.dropout(
-                    x, dropout_prob=dropout_rate, is_test=False)
-                                     if dropout_rate else x)
-
-    def forward(self, x, residual=None):
-        for i, cmd in enumerate(self.process_cmd):
-            if cmd == "a":
-                x = self.functors[i](x, residual)
-            else:
-                x = self.functors[i](x)
-        return x
-
-
-class MultiHeadAttention(Layer):
-    """
-    Multi-Head Attention
-    """
-
-    def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
-        super(MultiHeadAttention, self).__init__()
-        self.n_head = n_head
-        self.d_key = d_key
-        self.d_value = d_value
-        self.d_model = d_model
-        self.dropout_rate = dropout_rate
-        self.q_fc = Linear(
-            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
-        self.k_fc = Linear(
-            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
-        self.v_fc = Linear(
-            input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
-        self.proj_fc = Linear(
-            input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
-
-    def _prepare_qkv(self, queries, keys, values, cache=None):
-        if keys is None:  # self-attention
-            keys, values = queries, queries
-            static_kv = False
-        else:  # cross-attention
-            static_kv = True
-
-        q = self.q_fc(queries)
-        q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
-        q = layers.transpose(x=q, perm=[0, 2, 1, 3])
-
-        if cache is not None and static_kv and "static_k" in cache:
-            # for encoder-decoder attention in inference and has cached
-            k = cache["static_k"]
-            v = cache["static_v"]
-        else:
-            k = self.k_fc(keys)
-            v = self.v_fc(values)
-            k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
-            k = layers.transpose(x=k, perm=[0, 2, 1, 3])
-            v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
-            v = layers.transpose(x=v, perm=[0, 2, 1, 3])
-
-        if cache is not None:
-            if static_kv and not "static_k" in cache:
-                # for encoder-decoder attention in inference and has not cached
-                cache["static_k"], cache["static_v"] = k, v
-            elif not static_kv:
-                # for decoder self-attention in inference
-                cache_k, cache_v = cache["k"], cache["v"]
-                k = layers.concat([cache_k, k], axis=2)
-                v = layers.concat([cache_v, v], axis=2)
-                cache["k"], cache["v"] = k, v
-
-        return q, k, v
-
-    def forward(self, queries, keys, values, attn_bias, cache=None):
-        # compute q ,k ,v
-        q, k, v = self._prepare_qkv(queries, keys, values, cache)
-
-        # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
-        if attn_bias:
-            product += attn_bias
-        weights = layers.softmax(product)
-        if self.dropout_rate:
-            weights = layers.dropout(
-                weights, dropout_prob=self.dropout_rate, is_test=False)
-
-        out = layers.matmul(weights, v)
-
-        # combine heads
-        out = layers.transpose(out, perm=[0, 2, 1, 3])
-        out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
-
-        # project to output
-        out = self.proj_fc(out)
-        return out
-
-    def cal_kv(self, keys, values):
-        k = self.k_fc(keys)
-        v = self.v_fc(values)
-        k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
-        k = layers.transpose(x=k, perm=[0, 2, 1, 3])
-        v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
-        v = layers.transpose(x=v, perm=[0, 2, 1, 3])
-        return k, v
-
-
-class FFN(Layer):
-    """
-    Feed-Forward Network
-    """
-
-    def __init__(self, d_inner_hid, d_model, dropout_rate):
-        super(FFN, self).__init__()
-        self.dropout_rate = dropout_rate
-        self.fc1 = Linear(
-            input_dim=d_model, output_dim=d_inner_hid, act="relu")
-        self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
-
-    def forward(self, x):
-        hidden = self.fc1(x)
-        if self.dropout_rate:
-            hidden = layers.dropout(
-                hidden, dropout_prob=self.dropout_rate, is_test=False)
-        out = self.fc2(hidden)
-        return out
-
-
-class TransformerEncoderLayer(Layer):
-    """
-    EncoderLayer
-    """
-
-    def __init__(self,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
-
-        super(TransformerEncoderLayer, self).__init__()
-
-        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
-                                            attention_dropout)
-        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
-        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-    def forward(self, enc_input, attn_bias):
-        attn_output = self.self_attn(
-            self.preprocesser1(enc_input), None, None, attn_bias)
-        attn_output = self.postprocesser1(attn_output, enc_input)
-
-        ffn_output = self.ffn(self.preprocesser2(attn_output))
-        ffn_output = self.postprocesser2(ffn_output, attn_output)
-        return ffn_output
-
-
-class TransformerEncoder(Layer):
-    """
-    encoder
-    """
-
-    def __init__(self,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
-
-        super(TransformerEncoder, self).__init__()
-
-        self.encoder_layers = list()
-        for i in range(n_layer):
-            self.encoder_layers.append(
-                self.add_sublayer(
-                    "layer_%d" % i,
-                    TransformerEncoderLayer(
-                        n_head, d_key, d_value, d_model, d_inner_hid,
-                        prepostprocess_dropout, attention_dropout,
-                        relu_dropout, preprocess_cmd, postprocess_cmd)))
-        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
-                                             prepostprocess_dropout)
-
-    def forward(self, enc_input, attn_bias):
-        for encoder_layer in self.encoder_layers:
-            enc_output = encoder_layer(enc_input, attn_bias)
-            enc_input = enc_output
-
-        return self.processer(enc_output)
-
-
-class TransformerDecoderLayer(Layer):
-    """
-    decoder
-    """
-
-    def __init__(self,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
-        super(TransformerDecoderLayer, self).__init__()
-
-        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
-                                            attention_dropout)
-        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
-                                             attention_dropout)
-        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-        self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
-        self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
-        self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
-
-    def forward(self,
-                dec_input,
-                enc_output,
-                self_attn_bias,
-                cross_attn_bias,
-                cache=None):
-        self_attn_output = self.self_attn(
-            self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
-        self_attn_output = self.postprocesser1(self_attn_output, dec_input)
-
-        cross_attn_output = self.cross_attn(
-            self.preprocesser2(self_attn_output), enc_output, enc_output,
-            cross_attn_bias, cache)
-        cross_attn_output = self.postprocesser2(cross_attn_output,
-                                                self_attn_output)
-
-        ffn_output = self.ffn(self.preprocesser3(cross_attn_output))
-        ffn_output = self.postprocesser3(ffn_output, cross_attn_output)
-
-        return ffn_output
-
-
-class TransformerDecoder(Layer):
-    """
-    decoder
-    """
-
-    def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
-                 prepostprocess_dropout, attention_dropout, relu_dropout,
-                 preprocess_cmd, postprocess_cmd):
-        super(TransformerDecoder, self).__init__()
-
-        self.decoder_layers = list()
-        for i in range(n_layer):
-            self.decoder_layers.append(
-                self.add_sublayer(
-                    "layer_%d" % i,
-                    TransformerDecoderLayer(
-                        n_head, d_key, d_value, d_model, d_inner_hid,
-                        prepostprocess_dropout, attention_dropout,
-                        relu_dropout, preprocess_cmd, postprocess_cmd)))
-        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
-                                             prepostprocess_dropout)
-
-    def forward(self,
-                dec_input,
-                enc_output,
-                self_attn_bias,
-                cross_attn_bias,
-                caches=None):
-        for i, decoder_layer in enumerate(self.decoder_layers):
-            dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
-                                       cross_attn_bias, None
-                                       if caches is None else caches[i])
-            dec_input = dec_output
-
-        return self.processer(dec_output)
-
-    def prepare_static_cache(self, enc_output):
-        return [
-            dict(
-                zip(("static_k", "static_v"),
-                    decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
-            for decoder_layer in self.decoder_layers
-        ]
-
-
-class DynamicGRU(fluid.dygraph.Layer):
-    def __init__(self,
-                 size,
-                 h_0=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 is_reverse=False,
-                 gate_activation='sigmoid',
-                 candidate_activation='tanh',
-                 origin_mode=False,
-                 init_size=None):
-        super(DynamicGRU, self).__init__()
-
-        self.gru_unit = GRUUnit(
-            size * 3,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            activation=candidate_activation,
-            gate_activation=gate_activation,
-            origin_mode=origin_mode)
-
-        self.size = size
-        self.h_0 = h_0
-        self.is_reverse = is_reverse
-
-    def forward(self, inputs):
-        hidden = self.h_0
-        res = []
-
-        for i in range(inputs.shape[1]):
-            if self.is_reverse:
-                i = inputs.shape[1] - 1 - i
-            input_ = inputs[:, i:i + 1, :]
-            input_ = fluid.layers.reshape(
-                input_, [-1, input_.shape[2]], inplace=False)
-            hidden, reset, gate = self.gru_unit(input_, hidden)
-            hidden_ = fluid.layers.reshape(
-                hidden, [-1, 1, hidden.shape[1]], inplace=False)
-            res.append(hidden_)
-        if self.is_reverse:
-            res = res[::-1]
-        res = fluid.layers.concat(res, axis=1)
-        return res
-
-
-class BiGRU(fluid.dygraph.Layer):
-    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
-        super(BiGRU, self).__init__()
-
-        self.pre_gru = Linear(
-            input_dim=input_dim,
-            output_dim=grnn_hidden_dim * 3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.gru = DynamicGRU(
-            size=grnn_hidden_dim,
-            h_0=h_0,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.pre_gru_r = Linear(
-            input_dim=input_dim,
-            output_dim=grnn_hidden_dim * 3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.gru_r = DynamicGRU(
-            size=grnn_hidden_dim,
-            is_reverse=True,
-            h_0=h_0,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-    def forward(self, input_feature):
-        res_pre_gru = self.pre_gru(input_feature)
-        res_gru = self.gru(res_pre_gru)
-        res_pre_gru_r = self.pre_gru_r(input_feature)
-        res_gru_r = self.gru_r(res_pre_gru_r)
-        bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1)
-        return bi_merge
-
-
-class Linear_chain_crf(fluid.dygraph.Layer):
-    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Linear_chain_crf, self).__init__()
-
-        self._param_attr = param_attr
-        self._dtype = dtype
-        self._size = size
-        self._is_test = is_test
-        self._transition = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._size + 2, self._size],
-            dtype=self._dtype)
-
-    @property
-    def weight(self):
-        return self._transition
-
-    @weight.setter
-    def weight(self, value):
-        self._transition = value
-
-    def forward(self, input, label, length=None):
-
-        alpha = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        emission_exps = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        transition_exps = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        log_likelihood = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        this_inputs = {
-            "Emission": [input],
-            "Transition": self._transition,
-            "Label": [label]
-        }
-        if length:
-            this_inputs['Length'] = [length]
-        self._helper.append_op(
-            type='linear_chain_crf',
-            inputs=this_inputs,
-            outputs={
-                "Alpha": [alpha],
-                "EmissionExps": [emission_exps],
-                "TransitionExps": transition_exps,
-                "LogLikelihood": log_likelihood
-            },
-            attrs={"is_test": self._is_test, })
-        return log_likelihood
-
-
-class Crf_decoding(fluid.dygraph.Layer):
-    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Crf_decoding, self).__init__()
-
-        self._dtype = dtype
-        self._size = size
-        self._is_test = is_test
-        self._param_attr = param_attr
-        self._transition = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._size + 2, self._size],
-            dtype=self._dtype)
-
-    @property
-    def weight(self):
-        return self._transition
-
-    @weight.setter
-    def weight(self, value):
-        self._transition = value
-
-    def forward(self, input, label=None, length=None):
-
-        viterbi_path = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        this_inputs = {
-            "Emission": [input],
-            "Transition": self._transition,
-            "Label": label
-        }
-        if length:
-            this_inputs['Length'] = [length]
-        self._helper.append_op(
-            type='crf_decoding',
-            inputs=this_inputs,
-            outputs={"ViterbiPath": [viterbi_path]},
-            attrs={"is_test": self._is_test, })
-        return viterbi_path
-
-
-class SequenceTagging(fluid.dygraph.Layer):
-    def __init__(self, 
-             vocab_size,
-             num_labels,
-             batch_size, 
-             word_emb_dim=128,
-             grnn_hidden_dim=128,
-             emb_learning_rate=0.1,
-             crf_learning_rate=0.1,
-             bigru_num=2,
-             init_bound=0.1,
-             length=None):
-        super(SequenceTagging, self).__init__()
-        """
-        define the sequence tagging network structure
-        word: stores the input of the model
-        for_infer: a boolean value, indicating if the model to be created is for training or predicting.
-
-        return:
-            for infer: return the prediction
-            otherwise: return the prediction
-        """
-        self.word_emb_dim = word_emb_dim
-        self.vocab_size = vocab_size
-        self.num_labels = num_labels
-        self.grnn_hidden_dim = grnn_hidden_dim
-        self.emb_lr = emb_learning_rate
-        self.crf_lr = crf_learning_rate
-        self.bigru_num = bigru_num
-        self.batch_size = batch_size
-        self.init_bound = 0.1
-
-        self.word_embedding = Embedding(
-            size=[self.vocab_size, self.word_emb_dim],
-            dtype='float32',
-            param_attr=fluid.ParamAttr(
-                learning_rate=self.emb_lr,
-                name="word_emb",
-                initializer=fluid.initializer.Uniform(
-                    low=-self.init_bound, high=self.init_bound)))
-
-        h_0 = fluid.layers.create_global_var(
-            shape=[self.batch_size, self.grnn_hidden_dim],
-            value=0.0,
-            dtype='float32',
-            persistable=True,
-            force_cpu=True,
-            name='h_0')
-
-        self.bigru_units = []
-        for i in range(self.bigru_num):
-            if i == 0:
-                self.bigru_units.append(
-                    self.add_sublayer(
-                        "bigru_units%d" % i,
-                        BiGRU(
-                            self.grnn_hidden_dim,
-                            self.grnn_hidden_dim,
-                            self.init_bound,
-                            h_0=h_0)))
-            else:
-                self.bigru_units.append(
-                    self.add_sublayer(
-                        "bigru_units%d" % i,
-                        BiGRU(
-                            self.grnn_hidden_dim * 2,
-                            self.grnn_hidden_dim,
-                            self.init_bound,
-                            h_0=h_0)))
-
-        self.fc = Linear(
-            input_dim=self.grnn_hidden_dim * 2,
-            output_dim=self.num_labels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-self.init_bound, high=self.init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.linear_chain_crf = Linear_chain_crf(
-            param_attr=fluid.ParamAttr(
-                name='linear_chain_crfw', learning_rate=self.crf_lr),
-            size=self.num_labels)
-
-        self.crf_decoding = Crf_decoding(
-            param_attr=fluid.ParamAttr(
-                name='crfw', learning_rate=self.crf_lr),
-            size=self.num_labels)
-
-    def forward(self, word, target, lengths):
-        """
-        Configure the network
-        """
-        word_embed = self.word_embedding(word)
-        input_feature = word_embed
-
-        for i in range(self.bigru_num):
-            bigru_output = self.bigru_units[i](input_feature)
-            input_feature = bigru_output
-
-        emission = self.fc(bigru_output)
-
-        crf_cost = self.linear_chain_crf(
-            input=emission, label=target, length=lengths)
-        avg_cost = fluid.layers.mean(x=crf_cost)
-        self.crf_decoding.weight = self.linear_chain_crf.weight
-        crf_decode = self.crf_decoding(input=emission, length=lengths)
-        return crf_decode, avg_cost, lengths