Unverified commit c9818511, authored by zhouzj, committed by GitHub

delete Pantheon module (#813)

Parent 5e363043
# Distillation example: Chinese lexical analysis
We demonstrate how to use the Pantheon framework for online distillation of a Chinese lexical analysis model on a sample dataset. The results of large-scale online distillation are shown below:
| model | Precision | Recall | F1-score|
| ------ | ------ | ------ | ------ |
| BiGRU | 89.2 | 89.4 | 89.3 |
| BERT fine-tuned | 90.2 | 90.4 | 90.3 |
| ERNIE fine-tuned | 91.7 | 91.7 | 91.7 |
| DistillBiGRU | 90.20 | 90.52 | 90.36 |
BiGRU trains a BiGRU-based LAC model from scratch; BERT fine-tuned fine-tunes the LAC task on the BERT base model; ERNIE fine-tuned fine-tunes the LAC task on the ERNIE base model; DistillBiGRU is trained through large-scale online distillation with the ERNIE fine-tuned model as the teacher.
## Introduction
Lexical Analysis of Chinese, or LAC for short, is a joint lexical analysis model that performs Chinese word segmentation, part-of-speech tagging, and named entity recognition in a single model. We evaluate word segmentation, part-of-speech tagging, and named entity recognition jointly on a self-built dataset. We use the fine-tuned ERNIE model as the Teacher model and a GRU-based network as the Student model, as required by the Pantheon framework for online distillation.
#### 1. Download the training dataset
Download the dataset file; after decompression, a `./data/` folder will be created.
```bash
python downloads.py dataset
```
#### 2. Download the Teacher model
```bash
# download ERNIE finetuned model
python downloads.py finetuned
python downloads.py conf
```
#### 3. Distill the Student model
```bash
# start teacher service
bash run_teacher.sh
# start student service
bash run_student.sh
```
> If you want to learn more about LAC, you can refer to this repo: https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/lexical_analysis
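For orientation, `run_teacher.sh` and `run_student.sh` boil down to wiring a Pantheon `Teacher` and `Student` together. The following is a minimal, illustrative sketch assuming the `paddleslim.pantheon` API; the port, address, and batch size are placeholders, not the actual contents of the two scripts:
```python
from paddleslim.pantheon import Teacher, Student

# Teacher side: serve knowledge (e.g. the fine-tuned ERNIE model's CRF
# emissions) over a TCP port. The port number is a placeholder.
teacher = Teacher(out_path=None, out_port=5002)
teacher.start()
# teacher.start_knowledge_service(feed_list=..., schema=..., program=...,
#                                 reader_config=..., exe=..., times=1)

# Student side: subscribe to the teacher and consume the knowledge stream
# during distillation training.
student = Student()
student.register_teacher(in_address="127.0.0.1:5002")
student.start()
knowledge_generator = student.get_knowledge_generator(
    batch_size=32, drop_last=False)
```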
# Distillation example: Chinese lexical analysis
We demonstrate how to use the Pantheon framework for online distillation of a Chinese lexical analysis model on a sample dataset. The results of large-scale online distillation are shown below:
| model | Precision | Recall | F1-score|
| ------ | ------ | ------ | ------ |
| BiGRU | 89.2 | 89.4 | 89.3 |
| BERT fine-tuned | 90.2 | 90.4 | 90.3 |
| ERNIE fine-tuned | 91.7 | 91.7 | 91.7 |
| DistillBiGRU | 90.20 | 90.52 | 90.36 |
BiGRU trains the LAC task from scratch with a bidirectional GRU network; BERT fine-tuned fine-tunes the LAC task on the BERT base model; ERNIE fine-tuned fine-tunes the LAC task on the ERNIE base model; DistillBiGRU trains the LAC task through large-scale distillation with the ERNIE fine-tuned model as the teacher.
## Introduction
Lexical Analysis of Chinese, or LAC for short, is a joint lexical analysis model that performs Chinese word segmentation, part-of-speech tagging, and named entity recognition in a single model. We evaluate word segmentation, part-of-speech tagging, and named entity recognition jointly on a self-built dataset. We use the fine-tuned ERNIE model as the Teacher model, a GRU as the Student model, and the Pantheon framework for online distillation.
#### 1. Download the training dataset
Download the dataset file; after decompression, a `./data/` folder is created.
```bash
python downloads.py dataset
```
#### 2. Download the Teacher model
```bash
# download ERNIE finetuned model
python downloads.py finetuned
python downloads.py conf
```
#### 3. Distill the Student model
```bash
# start teacher service
bash run_teacher.sh
# start student service
bash run_student.sh
```
> If you want to learn more about how LAC works, refer to this repo: https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/lexical_analysis
from . import teacher
from . import student
from .teacher import Teacher
from .student import Student

__all__ = teacher.__all__ + student.__all__
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Define the function to create lexical analysis model and model's data reader
"""
import sys
import os
import math
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer
from reader import Dataset
from ernie_reader import SequenceLabelReader
from models.sequence_labeling import nets
from models.representation.ernie import ernie_encoder, ernie_pyreader
def create_model(args, vocab_size, num_labels, mode='train'):
"""create lac model"""
# model's input data
words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1)
targets = fluid.data(
name='targets', shape=[-1, 1], dtype='int64', lod_level=1)
if mode == "train":
print("create model mode: ", mode)
teacher_crf_decode = fluid.data(
name='teacher_crf_decode', shape=[-1, 1], dtype='float32', lod_level=1)
else:
print("create model mode: ", mode)
teacher_crf_decode = None
feed_list = [words, targets]
    if teacher_crf_decode is not None:
feed_list.append(teacher_crf_decode)
pyreader = fluid.io.DataLoader.from_generator(
feed_list=feed_list,
capacity=200,
use_double_buffer=True,
iterable=False)
# for test or train process
    avg_cost, crf_avg_cost, teacher_cost, crf_decode = nets.lex_net(
        words, args, vocab_size, num_labels, teacher_crf_decode,
        for_infer=False, target=targets)
(precision, recall, f1_score, num_infer_chunks, num_label_chunks,
num_correct_chunks) = fluid.layers.chunk_eval(
input=crf_decode,
label=targets,
chunk_scheme="IOB",
num_chunk_types=int(math.ceil((num_labels - 1) / 2.0)))
chunk_evaluator = fluid.metrics.ChunkEvaluator()
chunk_evaluator.reset()
    ret = {
        "feed_list": feed_list,
        "pyreader": pyreader,
"words": words,
"targets": targets,
"avg_cost": avg_cost,
"crf_avg_cost": crf_avg_cost,
"teacher_cost": teacher_cost,
"crf_decode": crf_decode,
"precision": precision,
"recall": recall,
"f1_score": f1_score,
"chunk_evaluator": chunk_evaluator,
"num_infer_chunks": num_infer_chunks,
"num_label_chunks": num_label_chunks,
"num_correct_chunks": num_correct_chunks
}
return ret
def create_lexnet_data_generator(args,
reader,
file_name,
place,
mode='train'):
if mode == 'train':
def wrapper():
batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, []
emi_lens = []
for epoch in range(args.epoch):
print("data epoch: {}".format(epoch))
for instance in reader.file_reader(file_name, mode="train")():
words, labels, emission = instance
if len(seq_lens) < args.batch_size:
batch_words.append(words)
batch_labels.append(labels)
if batch_emissions is not None:
batch_emissions = np.concatenate((batch_emissions, emission))
else:
batch_emissions = emission
seq_lens.append(len(words))
emi_lens.append(emission.shape[0])
if len(seq_lens) == args.batch_size:
#print("batch words len", [len(seq) for seq in batch_words])
#print("batch labels len", [len(seq) for seq in batch_labels])
#print("emi lens:", emi_lens)
#print("emission first dim:", batch_emissions.shape[0])
#print("reduced seq_lens:", sum(seq_lens))
t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place)
t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place)
t_emissions = fluid.create_lod_tensor(batch_emissions, [seq_lens], place)
yield t_words, t_labels, t_emissions
batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, []
emi_lens = []
if len(seq_lens) > 0:
t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place)
t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place)
t_emissions = fluid.create_lod_tensor(batch_emissions, [seq_lens], place)
yield t_words, t_labels, t_emissions
batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, []
else:
def wrapper():
batch_words, batch_labels, seq_lens = [], [], []
for instance in reader.file_reader(file_name, mode="test")():
words, labels = instance
if len(seq_lens) < args.batch_size:
batch_words.append(words)
batch_labels.append(labels)
seq_lens.append(len(words))
if len(seq_lens) == args.batch_size:
t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place)
t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place)
yield t_words, t_labels
batch_words, batch_labels, seq_lens = [], [], []
if len(seq_lens) > 0:
t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place)
t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place)
yield t_words, t_labels
batch_words, batch_labels, seq_lens = [], [], []
return wrapper
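# Hypothetical glue code showing how the two helpers above fit together;
# `args` and `dataset` (a reader.Dataset) are assumptions from the
# surrounding example:
#
#   place = fluid.CPUPlace()
#   train_ret = create_model(args, dataset.vocab_size, dataset.num_labels,
#                            mode='train')
#   batch_gen = create_lexnet_data_generator(
#       args, reader=dataset, file_name='./data/train.tsv', place=place,
#       mode='train')
#   train_ret['pyreader'].set_batch_generator(batch_gen)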
def create_pyreader(args,
file_name,
feed_list,
place,
model='lac',
reader=None,
return_reader=False,
mode='train'):
    # Note: feed_list, model, reader and return_reader are currently unused;
    # this helper always builds a fresh ERNIE sequence-labeling reader.
    reader = SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        random_seed=args.random_seed)
    return reader.data_generator(
        file_name, args.batch_size, args.epoch, shuffle=False, phase=mode)
def create_ernie_model(args, ernie_config):
"""
Create Model for LAC based on ERNIE encoder
"""
# ERNIE's input data
src_ids = fluid.data(
name='src_ids', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.data(
name='sent_ids', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.data(
name='pos_ids', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.data(
name='input_mask', shape=[-1, args.max_seq_len, 1], dtype='float32')
padded_labels = fluid.data(
name='padded_labels', shape=[-1, args.max_seq_len, 1], dtype='int64')
seq_lens = fluid.data(
name='seq_lens', shape=[-1], dtype='int64', lod_level=0)
squeeze_labels = fluid.layers.squeeze(padded_labels, axes=[-1])
# ernie_pyreader
ernie_inputs = {
"src_ids": src_ids,
"sent_ids": sent_ids,
"pos_ids": pos_ids,
"input_mask": input_mask,
"seq_lens": seq_lens
}
embeddings = ernie_encoder(ernie_inputs, ernie_config=ernie_config)
padded_token_embeddings = embeddings["padded_token_embeddings"]
emission = fluid.layers.fc(
size=args.num_labels,
input=padded_token_embeddings,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-args.init_bound, high=args.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
num_flatten_dims=2)
crf_cost = fluid.layers.linear_chain_crf(
input=emission,
label=padded_labels,
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=args.crf_learning_rate),
length=seq_lens)
avg_cost = fluid.layers.mean(x=crf_cost)
crf_decode = fluid.layers.crf_decoding(
input=emission,
param_attr=fluid.ParamAttr(name='crfw'),
length=seq_lens)
(precision, recall, f1_score, num_infer_chunks, num_label_chunks,
num_correct_chunks) = fluid.layers.chunk_eval(
input=crf_decode,
label=squeeze_labels,
chunk_scheme="IOB",
num_chunk_types=int(math.ceil((args.num_labels - 1) / 2.0)),
seq_length=seq_lens)
chunk_evaluator = fluid.metrics.ChunkEvaluator()
chunk_evaluator.reset()
ret = {
"feed_list":
[src_ids, sent_ids, pos_ids, input_mask, padded_labels, seq_lens],
"words": src_ids,
"pos_ids":pos_ids,
"sent_ids":sent_ids,
"input_mask":input_mask,
"labels": padded_labels,
"seq_lens": seq_lens,
"avg_cost": avg_cost,
"crf_decode": crf_decode,
"precision": precision,
"recall": recall,
"f1_score": f1_score,
"chunk_evaluator": chunk_evaluator,
"num_infer_chunks": num_infer_chunks,
"num_label_chunks": num_label_chunks,
"num_correct_chunks": num_correct_chunks,
"emission":emission,
"alpha": None
}
return ret
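# Hypothetical teacher-side usage of create_ernie_model (the config path is
# an assumption; ErnieConfig comes from models.representation.ernie):
#
#   ernie_config = ErnieConfig("./pretrained/ernie_config.json")
#   startup_prog, teacher_prog = fluid.Program(), fluid.Program()
#   with fluid.program_guard(teacher_prog, startup_prog):
#       teacher_ret = create_ernie_model(args, ernie_config)
#   # teacher_ret["emission"] and teacher_ret["crf_decode"] hold the
#   # knowledge tensors a Pantheon teacher would serve to students.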
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Download script, download dataset and pretrain models.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import io
import os
import sys
import time
import hashlib
import tarfile
import requests
FILE_INFO = {
'BASE_URL': 'https://baidu-nlp.bj.bcebos.com/',
'DATA': {
'name': 'lexical_analysis-dataset-2.0.0.tar.gz',
'md5': '71e4a9a36d0f0177929a1bccedca7dba'
},
    'FINETUNED_MODEL': {
'name': 'lexical_analysis_finetuned-1.0.0.tar.gz',
'md5': "ee2c7614b06dcfd89561fbbdaac34342"
},
'CONF': {
'name': 'conf.tar.gz',
'md5': "7a0fe28db46db496fff4361eebaa6515",
'url': 'https://paddlemodels.bj.bcebos.com/PaddleSlim/pantheon/lexical_analysis/',
}
}
def usage():
desc = ("\nDownload datasets and pretrained models for LAC.\n"
"Usage:\n"
" 1. python download.py all\n"
" 2. python download.py dataset\n"
" 3. python download.py finetuned\n"
" 4. python download.py conf\n")
print(desc)
def md5file(fname):
hash_md5 = hashlib.md5()
with io.open(fname, "rb") as fin:
for chunk in iter(lambda: fin.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
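# Illustrative use: verify a downloaded archive against FILE_INFO, e.g.
#   assert md5file(FILE_INFO['CONF']['name']) == FILE_INFO['CONF']['md5']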
def extract(fname, dir_path):
    """
    Extract a tar.gz archive into dir_path
    """
    with tarfile.open(fname, "r") as tar:
        for file_name in tar.getnames():
            tar.extract(file_name, dir_path)
            print(file_name)
def _download(url, filename, md5sum):
"""
Download file and check md5
"""
retry = 0
retry_limit = 3
chunk_size = 4096
while not (os.path.exists(filename) and md5file(filename) == md5sum):
if retry < retry_limit:
retry += 1
else:
raise RuntimeError(
"Cannot download dataset ({0}) with retry {1} times.".format(
url, retry_limit))
try:
start = time.time()
size = 0
res = requests.get(url, stream=True)
filesize = int(res.headers['content-length'])
if res.status_code == 200:
print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024))
# save by chunk
with io.open(filename, "wb") as fout:
for chunk in res.iter_content(chunk_size=chunk_size):
if chunk:
fout.write(chunk)
size += len(chunk)
pr = '>' * int(size * 50 / filesize)
print(
'\r[Process ]: %s%.2f%%' %
(pr, float(size / filesize * 100)),
end='')
end = time.time()
print("\n[CostTime]: %.2f s" % (end - start))
except Exception as e:
print(e)
def download(name, dir_path):
if name == 'CONF':
url = FILE_INFO[name]['url'] + FILE_INFO[name]['name']
else:
url = FILE_INFO['BASE_URL'] + FILE_INFO[name]['name']
file_path = os.path.join(dir_path, FILE_INFO[name]['name'])
if not os.path.exists(dir_path):
os.makedirs(dir_path)
# download data
print("Downloading : %s" % name)
_download(url, file_path, FILE_INFO[name]['md5'])
# extract data
print("Extracting : %s" % file_path)
extract(file_path, dir_path)
os.remove(file_path)
if __name__ == '__main__':
if len(sys.argv) != 2:
usage()
sys.exit(1)
pwd = os.path.join(os.path.dirname(__file__), './')
ernie_dir = os.path.join(os.path.dirname(__file__), './pretrained')
    if sys.argv[1] == 'all':
        download('DATA', pwd)
        download('FINETUNED_MODEL', pwd)
        download('CONF', pwd)
    elif sys.argv[1] == "dataset":
        download('DATA', pwd)
    elif sys.argv[1] == "finetuned":
        download('FINETUNED_MODEL', pwd)
    elif sys.argv[1] == "conf":
        download('CONF', pwd)
    else:
        usage()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides reader for ernie model
"""
import sys
from collections import namedtuple
import numpy as np
sys.path.append("..")
from preprocess.ernie.task_reader import BaseReader, tokenization
def pad_batch_data(insts,
pad_idx=0,
max_len=128,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False,
return_seq_lens=False):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
"""
return_list = []
    # pad to the fixed max_len passed in, rather than the longest instance in
    # the batch (max(len(inst) for inst in insts)), so all batches share one shape
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array(
[inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
if return_seq_lens:
seq_lens = np.array([len(inst) for inst in insts])
return_list += [seq_lens.astype("int64").reshape([-1])]
return return_list if len(return_list) > 1 else return_list[0]
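# A worked example (illustrative values) of what pad_batch_data returns:
#   insts = [[5, 8, 9], [3, 1]]
#   ids, mask, lens = pad_batch_data(insts, pad_idx=0, max_len=6,
#                                    return_input_mask=True,
#                                    return_seq_lens=True)
#   ids.shape == (2, 6, 1); mask is 1.0 on real tokens and 0.0 on padding;
#   lens == array([3, 2])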
class SequenceLabelReader(BaseReader):
"""SequenceLabelReader"""
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
batch_label_ids = [record.label_ids for record in batch_records]
# padding
padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
batch_token_ids,
max_len=self.max_seq_len,
pad_idx=self.pad_id,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
padded_label_ids = pad_batch_data(
batch_label_ids,
max_len=self.max_seq_len,
pad_idx=len(self.label_map) - 1)
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
input_mask, padded_label_ids, batch_seq_lens
]
return return_list
def _reseg_token_label(self, tokens, labels, tokenizer):
assert len(tokens) == len(labels)
ret_tokens = []
ret_labels = []
for token, label in zip(tokens, labels):
sub_token = tokenizer.tokenize(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
ret_labels.append(label)
if len(sub_token) < 2:
continue
sub_label = label
if label.startswith("B-"):
sub_label = "I-" + label[2:]
ret_labels.extend([sub_label] * (len(sub_token) - 1))
assert len(ret_tokens) == len(ret_labels)
return ret_tokens, ret_labels
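    # A hypothetical example of the re-segmentation (the tokenizer output
    # here is an assumption):
    #   input : tokens = ["北京", "是"],       labels = ["B-LOC", "O"]
    #   if tokenizer.tokenize("北京") == ["北", "##京"], then
    #   output: tokens = ["北", "##京", "是"], labels = ["B-LOC", "I-LOC", "O"]
    # The first sub-token keeps its label; later sub-tokens of a "B-" token
    # get the matching "I-" tag.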
def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        # tokens/labels are joined by the "\2" (0x02) control character in
        # the LAC sequence-labeling data format
        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
if len(tokens) > max_seq_length - 2:
tokens = tokens[0:(max_seq_length - 2)]
labels = labels[0:(max_seq_length - 2)]
tokens = ["[CLS]"] + tokens + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids)
no_entity_id = len(self.label_map) - 1
labels = [
label if label in self.label_map else u"O" for label in labels
]
label_ids = [no_entity_id] + [
self.label_map[label] for label in labels
] + [no_entity_id]
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_ids=label_ids)
return record
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
import sys
import paddle.fluid as fluid
import paddle
import model_utils
import reader
import creator
sys.path.append('models/')
from model_check import check_cuda
from model_check import check_version
parser = argparse.ArgumentParser(__doc__)
# 1. model parameters
model_g = model_utils.ArgumentGroup(parser, "model", "model configuration")
model_g.add_arg("word_emb_dim", int, 128,
"The dimension in which a word is embedded.")
model_g.add_arg("grnn_hidden_dim", int, 128,
"The number of hidden nodes in the GRNN layer.")
model_g.add_arg("bigru_num", int, 2,
"The number of bi_gru layers in the network.")
model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
# 2. data parameters
data_g = model_utils.ArgumentGroup(parser, "data", "data paths")
data_g.add_arg("word_dict_path", str, "./conf/word.dic",
"The path of the word dictionary.")
data_g.add_arg("label_dict_path", str, "./conf/tag.dic",
"The path of the label dictionary.")
data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic",
"The path of the word replacement Dictionary.")
data_g.add_arg("test_data", str, "./data/test.tsv",
"The folder where the training data is located.")
data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model")
data_g.add_arg(
"batch_size", int, 200,
"The number of sequences contained in a mini-batch, "
"or the maximum number of tokens (include paddings) contained in a mini-batch."
)
def do_eval(args):
print('do_eval...........')
dataset = reader.Dataset(args)
test_program = fluid.Program()
with fluid.program_guard(test_program, fluid.default_startup_program()):
with fluid.unique_name.guard():
test_ret = creator.create_model(
args, dataset.vocab_size, dataset.num_labels, mode='test')
test_program = test_program.clone(for_test=True)
# init executor
if args.use_cuda:
place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
else:
place = fluid.CPUPlace()
pyreader = creator.create_pyreader(
args,
file_name=args.test_data,
feed_list=test_ret['feed_list'],
place=place,
model='lac',
reader=dataset,
mode='test')
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
# load model
model_utils.init_checkpoint(exe, args.init_checkpoint, test_program)
test_process(
exe=exe, program=test_program, reader=pyreader, test_ret=test_ret)
def test_process(exe, program, reader, test_ret):
    """
    Execute the evaluation process.
    :param exe: the fluid Executor
    :param program: the test program to run
    :param reader: the data loader feeding the test data
    :param test_ret: dict holding the fetch targets and the chunk evaluator
    :return: None; prints precision, recall and F1
    """
print('test_process...........')
test_ret["chunk_evaluator"].reset()
start_time = time.time()
reader.start()
while True:
try:
nums_infer, nums_label, nums_correct = exe.run(
program,
fetch_list=[
test_ret["num_infer_chunks"],
test_ret["num_label_chunks"],
test_ret["num_correct_chunks"],
])
test_ret["chunk_evaluator"].update(nums_infer, nums_label, nums_correct)
except fluid.core.EOFException:
reader.reset()
break
precision, recall, f1 = test_ret["chunk_evaluator"].eval()
end_time = time.time()
print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s" %
(precision, recall, f1, end_time - start_time))
if __name__ == '__main__':
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
do_eval(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
util tools
"""
from __future__ import print_function
import os
import sys
import numpy as np
import paddle.fluid as fluid
import yaml
import io
def str2bool(v):
    """
    argparse does not natively parse booleans from strings; treat
    "true"/"t"/"1" (case-insensitive) as True
    """
    return v.lower() in ("true", "t", "1")
class ArgumentGroup(object):
"""
Put arguments to one group
"""
def __init__(self, parser, title, des):
"""none"""
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
""" Add argument """
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def load_yaml(parser, file_name, **kwargs):
    with io.open(file_name, 'r', encoding='utf8') as f:
        args = yaml.load(f, Loader=yaml.FullLoader)
for title in args:
group = parser.add_argument_group(title=title, description='')
for name in args[title]:
_type = type(args[title][name]['val'])
_type = str2bool if _type == bool else _type
group.add_argument(
"--" + name,
default=args[title][name]['val'],
type=_type,
help=args[title][name]['meaning'] +
' Default: %(default)s.',
**kwargs)
def print_arguments(args):
"""none"""
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).items()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def to_str(string, encoding="utf-8"):
"""convert to str for print"""
if sys.version_info.major == 3:
if isinstance(string, bytes):
return string.decode(encoding)
elif sys.version_info.major == 2:
if isinstance(string, unicode):
if os.name == 'nt':
return string
else:
return string.encode(encoding)
return string
def to_lodtensor(data, place):
"""
Convert data in list into lodtensor.
"""
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.Tensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
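# Illustration: to_lodtensor([[1, 2, 3], [4, 5]], place) yields a tensor
# whose lod is [[0, 3, 5]] and whose flattened int64 data has shape (5, 1);
# the offsets mark where each sequence starts and ends.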
def parse_result(words, crf_decode, dataset):
""" parse result """
offset_list = (crf_decode.lod())[0]
words = np.array(words)
crf_decode = np.array(crf_decode)
batch_size = len(offset_list) - 1
batch_out = []
for sent_index in range(batch_size):
begin, end = offset_list[sent_index], offset_list[sent_index + 1]
sent = [dataset.id2word_dict[str(id[0])] for id in words[begin:end]]
tags = [
dataset.id2label_dict[str(id[0])] for id in crf_decode[begin:end]
]
        sent_out = []
        tags_out = []
        partial_word = ""
        for ind, tag in enumerate(tags):
            # the first word
            if partial_word == "":
                partial_word = sent[ind]
                tags_out.append(tag.split('-')[0])
                continue
            # the beginning of a new word
            if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
                sent_out.append(partial_word)
                tags_out.append(tag.split('-')[0])
                partial_word = sent[ind]
                continue
            partial_word += sent[ind]
        # append the last word, unless len(tags) == 0
        if len(sent_out) < len(tags_out):
            sent_out.append(partial_word)
batch_out.append([sent_out, tags_out])
return batch_out
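# A hypothetical illustration of the LAC-style tag scheme handled above:
#   sent = ["百", "度", "是"], tags = ["ORG-B", "ORG-I", "v-B"]
# yields
#   sent_out = ["百度", "是"], tags_out = ["ORG", "v"]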
def parse_padding_result(words, crf_decode, seq_lens, dataset):
""" parse padding result """
words = np.squeeze(words)
batch_size = len(seq_lens)
batch_out = []
for sent_index in range(batch_size):
sent = [
dataset.id2word_dict[str(id)]
for id in words[sent_index][1:seq_lens[sent_index] - 1]
]
tags = [
dataset.id2label_dict[str(id)]
for id in crf_decode[sent_index][1:seq_lens[sent_index] - 1]
]
        sent_out = []
        tags_out = []
        partial_word = ""
        for ind, tag in enumerate(tags):
            # the first word
            if partial_word == "":
                partial_word = sent[ind]
                tags_out.append(tag.split('-')[0])
                continue
            # the beginning of a new word
            if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
                sent_out.append(partial_word)
                tags_out.append(tag.split('-')[0])
                partial_word = sent[ind]
                continue
            partial_word += sent[ind]
        # append the last word, unless len(tags) == 0
        if len(sent_out) < len(tags_out):
            sent_out.append(partial_word)
batch_out.append([sent_out, tags_out])
return batch_out
def init_checkpoint(exe, init_checkpoint_path, main_program):
"""
Init CheckPoint
"""
    assert os.path.exists(
        init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path
    def existed_persistables(var):
        """
        Whether var is a persistable variable that exists in the checkpoint
        """
        if not fluid.io.is_persistable(var):
            return False
        if os.path.exists(os.path.join(init_checkpoint_path, var.name)):
            print("INIT {}".format(var.name))
            return True
        else:
            print("SKIP {}".format(var.name))
            return False

    fluid.io.load_vars(
        exe,
        init_checkpoint_path,
        main_program=main_program,
        predicate=existed_persistables)
print("Load model from {}".format(init_checkpoint_path))
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
"""load params of pretrained model, NOT including moment, learning_rate"""
    assert os.path.exists(pretraining_params_path
                          ), "[%s] cannot be found." % pretraining_params_path
def _existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
if os.path.exists(os.path.join(pretraining_params_path, var.name)):
print("INIT {}".format(var.name))
return True
else:
print("SKIP {}".format(var.name))
return False
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=_existed_params)
print("Load pretraining parameters from {}.".format(
pretraining_params_path))
#encoding=utf8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
import paddle.fluid as fluid
def check_cuda(use_cuda, err = \
"\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
):
"""
Log error and exit when set use_gpu=true in paddlepaddle
cpu version.
"""
try:
if use_cuda == True and fluid.is_compiled_with_cuda() == False:
print(err)
sys.exit(1)
except Exception as e:
pass
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 1.6 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.6.0')
except Exception as e:
print(err)
sys.exit(1)
if __name__ == "__main__":
check_cuda(True)
check_cuda(False)
check_cuda(True, "This is only for testing.")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides ErnieModel and ErnieConfig
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import six
import paddle.fluid as fluid
from models.transformer_encoder import encoder, pre_process_layer
def ernie_pyreader(args, pyreader_name):
"""define standard ernie pyreader"""
src_ids = fluid.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='float32')
labels = fluid.data(name='5', shape=[-1, 1], dtype='int64')
seq_lens = fluid.data(name='6', shape=[-1], dtype='int64')
pyreader = fluid.io.DataLoader.from_generator(
feed_list=[src_ids, sent_ids, pos_ids, input_mask, labels, seq_lens],
capacity=50,
iterable=False,
use_double_buffer=True)
ernie_inputs = {
"src_ids": src_ids,
"sent_ids": sent_ids,
"pos_ids": pos_ids,
"input_mask": input_mask,
"seq_lens": seq_lens
}
return pyreader, ernie_inputs, labels
def ernie_encoder_with_paddle_hub(ernie_inputs, max_seq_len):
    # lazy import: paddlehub is only needed for this PaddleHub-based path
    import paddlehub as hub
    ernie = hub.Module(name="ernie")
inputs, outputs, program = ernie.context(
trainable=True, max_seq_len=max_seq_len, learning_rate=1)
main_program = fluid.default_main_program()
input_dict = {
inputs["input_ids"].name: ernie_inputs["src_ids"],
inputs["segment_ids"].name: ernie_inputs["sent_ids"],
inputs["position_ids"].name: ernie_inputs["pos_ids"],
inputs["input_mask"].name: ernie_inputs["input_mask"]
}
hub.connect_program(
pre_program=main_program,
next_program=program,
input_dict=input_dict,
inplace=True)
enc_out = outputs["sequence_output"]
unpad_enc_out = fluid.layers.sequence_unpad(
enc_out, length=ernie_inputs["seq_lens"])
cls_feats = outputs["pooled_output"]
embeddings = {
"sentence_embeddings": cls_feats,
"token_embeddings": unpad_enc_out,
"padded_token_embeddings": enc_out
}
for k, v in embeddings.items():
v.persistable = True
return embeddings
def ernie_encoder(ernie_inputs, ernie_config):
"""return sentence embedding and token embeddings"""
ernie = ErnieModel(
src_ids=ernie_inputs["src_ids"],
position_ids=ernie_inputs["pos_ids"],
sentence_ids=ernie_inputs["sent_ids"],
input_mask=ernie_inputs["input_mask"],
config=ernie_config)
enc_out = ernie.get_sequence_output()
unpad_enc_out = fluid.layers.sequence_unpad(
enc_out, length=ernie_inputs["seq_lens"])
cls_feats = ernie.get_pooled_output()
embeddings = {
"sentence_embeddings": cls_feats,
"token_embeddings": unpad_enc_out,
"padded_token_embeddings": enc_out
}
for k, v in embeddings.items():
v.persistable = True
return embeddings
class ErnieConfig(object):
"""ErnieConfig"""
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing Ernie model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
"""print config"""
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
class ErnieModel(object):
"""ErnieModel"""
def __init__(self,
src_ids,
position_ids,
sentence_ids,
input_mask,
config,
weight_sharing=True,
use_fp16=False):
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
self._sent_types = config['type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._weight_sharing = weight_sharing
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._dtype = "float16" if use_fp16 else "float32"
# Initialize all weigths by truncated normal initializer, and all biases
# will be initialized by constant zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._build_model(src_ids, position_ids, sentence_ids, input_mask)
def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._dtype,
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
position_emb_out = fluid.layers.embedding(
input=position_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._dtype,
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sentence_ids,
size=[self._sent_types, self._emb_size],
dtype=self._dtype,
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
if self._dtype == "float16":
input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
self._enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name='encoder')
    def get_sequence_output(self):
        """Get the embedding of each token for sequence labeling"""
return self._enc_out
def get_pooled_output(self):
"""Get the first feature of each sequence for classification"""
next_sent_feat = fluid.layers.slice(
input=self._enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0")
return next_sent_feat
def get_pretraining_output(self, mask_label, mask_pos, labels):
"""Get the loss & accuracy for pretraining"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
# extract the first token feature in each sentence
next_sent_feat = self.get_pooled_output()
reshaped_emb_out = fluid.layers.reshape(
x=self._enc_out, shape=[-1, self._emb_size])
# extract masked tokens' feature
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
# transform: fc
mask_trans_feat = fluid.layers.fc(
input=mask_feat,
size=self._emb_size,
act=self._hidden_act,
param_attr=fluid.ParamAttr(
name='mask_lm_trans_fc.w_0',
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
# transform: layer norm
mask_trans_feat = pre_process_layer(
mask_trans_feat, 'n', name='mask_lm_trans')
mask_lm_out_bias_attr = fluid.ParamAttr(
name="mask_lm_out_fc.b_0",
initializer=fluid.initializer.Constant(value=0.0))
if self._weight_sharing:
fc_out = fluid.layers.matmul(
x=mask_trans_feat,
y=fluid.default_main_program().global_block().var(
self._word_emb_name),
transpose_y=True)
fc_out += fluid.layers.create_parameter(
shape=[self._voc_size],
dtype=self._dtype,
attr=mask_lm_out_bias_attr,
is_bias=True)
else:
fc_out = fluid.layers.fc(input=mask_trans_feat,
size=self._voc_size,
param_attr=fluid.ParamAttr(
name="mask_lm_out_fc.w_0",
initializer=self._param_initializer),
bias_attr=mask_lm_out_bias_attr)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
logits=fc_out, label=mask_label)
mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
next_sent_fc_out = fluid.layers.fc(
input=next_sent_feat,
size=2,
param_attr=fluid.ParamAttr(
name="next_sent_fc.w_0", initializer=self._param_initializer),
bias_attr="next_sent_fc.b_0")
next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
logits=next_sent_fc_out, label=labels, return_softmax=True)
next_sent_acc = fluid.layers.accuracy(
input=next_sent_softmax, label=labels)
mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
loss = mean_next_sent_loss + mean_mask_lm_loss
return next_sent_acc, mean_mask_lm_loss, loss
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The function lex_net(args) define the lexical analysis network structure
"""
import sys
import os
import math
import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer
def lex_net(word, args, vocab_size, num_labels, teacher_crf_decode=None,
            for_infer=True, target=None):
"""
define the lexical analysis network structure
word: stores the input of the model
for_infer: a boolean value, indicating if the model to be created is for training or predicting.
return:
for infer: return the prediction
otherwise: return the prediction
"""
word_emb_dim = args.word_emb_dim
grnn_hidden_dim = args.grnn_hidden_dim
emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(args) else 1.0
    crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(args) else 1.0
bigru_num = args.bigru_num
init_bound = 0.1
IS_SPARSE = True
def _bigru_layer(input_feature):
"""
define the bidirectional gru layer
"""
pre_gru = fluid.layers.fc(
input=input_feature,
size=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
gru = fluid.layers.dynamic_gru(
input=pre_gru,
size=grnn_hidden_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
pre_gru_r = fluid.layers.fc(
input=input_feature,
size=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
gru_r = fluid.layers.dynamic_gru(
input=pre_gru_r,
size=grnn_hidden_dim,
is_reverse=True,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
return bi_merge
    def log_softmax(logits, axis=-1):
        logsoftmax = logits - fluid.layers.log(
            fluid.layers.reduce_sum(
                fluid.layers.exp(logits), dim=axis, keep_dim=True))
        return logsoftmax
    def cross_entropy(student, teacher):
        ce_loss = -1.0 * fluid.layers.reduce_sum(
            teacher * fluid.layers.log(student), dim=1)
        ce_loss = fluid.layers.sequence_pool(ce_loss, "sum")
        return ce_loss

    def kl_div(student, teacher):
        kl_loss = fluid.layers.reduce_sum(
            teacher * (fluid.layers.log(teacher) - fluid.layers.log(student)),
            dim=1)
        kl_loss = fluid.layers.sequence_pool(kl_loss, "sum")
        return kl_loss
    def pred(student, teacher, t=1.0):
        return fluid.layers.reduce_mean(
            -1.0 * fluid.layers.softmax(teacher) * log_softmax(student / t))
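    # pred() is a soft-label distillation loss: softmax(teacher) turns the
    # teacher's emission scores into a target distribution, which is scored
    # against the student's log_softmax(student / t); t is the distillation
    # temperature (kept at 1.0 below).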
def normalize(alpha):
""" alpha shape (-1, 57)
"""
tag_num = alpha.shape[1]
sum_alpha = fluid.layers.reduce_sum(alpha, dim=1)
sum_alpha = fluid.layers.unsqueeze(sum_alpha, axes=[1])
sum_alpha = fluid.layers.expand(sum_alpha, [1, tag_num])
norm_alpha = alpha / sum_alpha
return norm_alpha
def _net_conf(word, target=None):
"""
Configure the network
"""
word_embedding = fluid.embedding(
input=word,
size=[vocab_size, word_emb_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(
learning_rate=emb_lr,
name="word_emb",
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound)))
input_feature = word_embedding
for i in range(bigru_num):
bigru_output = _bigru_layer(input_feature)
input_feature = bigru_output
emission = fluid.layers.fc(
size=num_labels,
input=bigru_output,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
if target is not None:
crf_cost = fluid.layers.linear_chain_crf(
input=emission,
label=target,
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=crf_lr))
if teacher_crf_decode is not None:
                teacher_cost = pred(
                    student=emission, teacher=teacher_crf_decode, t=1.0)
else:
teacher_cost = 0
print('no teacher emission')
crf_avg_cost = fluid.layers.mean(x=crf_cost)
alpha, beta = 0.5, 0.5
print("alpha * crf_avg_cost + beta * teacher_cost: ", alpha, beta)
            avg_cost = alpha * crf_avg_cost + beta * teacher_cost
crf_decode = fluid.layers.crf_decoding(
input=emission, param_attr=fluid.ParamAttr(name='crfw'))
return avg_cost, crf_avg_cost, teacher_cost, crf_decode
else:
size = emission.shape[1]
fluid.layers.create_parameter(
shape=[size + 2, size], dtype=emission.dtype, name='crfw')
crf_decode = fluid.layers.crf_decoding(
input=emission, param_attr=fluid.ParamAttr(name='crfw'))
return crf_decode
if for_infer:
return _net_conf(word)
else:
# assert target != None, "target is necessary for training"
return _net_conf(word, target)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from functools import partial
import paddle.fluid as fluid
import paddle.fluid.layers as layers
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
param_initializer=None,
name='multi_head_att'):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activiation to mask certain selected positions so that
they will not considered in attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs: quries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of inpunt tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of inpunt tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3:
return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
return proj_out
def positionwise_feed_forward(x,
d_inner_hid,
d_hid,
dropout_rate,
hidden_act,
param_initializer=None,
name='ffn'):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=2,
act=hidden_act,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_0.b_0')
if dropout_rate:
hidden = layers.dropout(
hidden,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0', initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
name=''):
"""
Add residual connection, layer normalization and droput to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layers.layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + '_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
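# The process_cmd string is interpreted character by character: "a" adds the
# residual connection, "n" applies layer normalization, and "d" applies
# dropout. For example, postprocess_cmd="dan" runs dropout, then the residual
# add, then layer norm; pre_process_layer binds prev_out=None, so "a" is a
# no-op in the pre-processing position.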
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""The encoder layers that can be stacked to form a deep encoder.
This module consits of a multi-head (self) attention followed by
position-wise feed-forward networks and both the two components companied
with the post_process_layer to add residual connection, layer normalization
and droput.
"""
attn_output = multi_head_attention(
pre_process_layer(
enc_input,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_att'),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + '_multi_head_att')
attn_output = post_process_layer(
enc_input,
attn_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_att')
ffd_output = positionwise_feed_forward(
pre_process_layer(
attn_output,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_ffn'),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + '_ffn')
return post_process_layer(
attn_output,
ffd_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_ffn')
def encoder(enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
for i in range(n_layer):
enc_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
enc_input = enc_output
enc_output = pre_process_layer(
enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
return enc_output
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides reader for classification and sequence labing
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import csv
import json
import numpy as np
from preprocess.ernie import tokenization
from preprocess.padding import pad_batch_data
import io
def csv_reader(fd, delimiter='\t'):
def gen():
for i in fd:
slots = i.rstrip('\n').split(delimiter)
if len(slots) == 1:
yield slots,
else:
yield slots
return gen()
class BaseReader(object):
"""BaseReader for classify and sequence labeling task"""
def __init__(self,
vocab_path,
label_map_config=None,
max_seq_len=512,
do_lower_case=True,
in_tokens=False,
random_seed=None):
self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self.vocab = self.tokenizer.vocab
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.in_tokens = in_tokens
np.random.seed(random_seed)
self.current_example = 0
self.current_epoch = 0
self.num_examples = 0
if label_map_config:
with open(label_map_config) as f:
self.label_map = json.load(f)
else:
self.label_map = None
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_example, self.current_epoch
def _read_tsv(self, input_file, quotechar=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv_reader(f, delimiter="\t")
headers = next(reader)
Example = namedtuple('Example', headers)
examples = []
for line in reader:
example = Example(*line)
examples.append(example)
return examples
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def _convert_example_to_record(self, example, max_seq_length, tokenizer):
"""Converts a single `Example` into a single `Record`."""
text_a = tokenization.convert_to_unicode(example.text_a)
tokens_a = tokenizer.tokenize(text_a)
tokens_b = None
if "text_b" in example._fields:
text_b = tokenization.convert_to_unicode(example.text_b)
tokens_b = tokenizer.tokenize(text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT/ERNIE is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
text_type_ids = []
tokens.append("[CLS]")
text_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
text_type_ids.append(0)
tokens.append("[SEP]")
text_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
text_type_ids.append(1)
tokens.append("[SEP]")
text_type_ids.append(1)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
if self.label_map:
label_id = self.label_map[example.label]
else:
label_id = example.label
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
qid = None
if "qid" in example._fields:
qid = example.qid
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_id=label_id,
qid=qid)
return record
def _prepare_batch_data(self, examples, batch_size, phase=None):
"""generate batch records"""
batch_records, max_len = [], 0
for index, example in enumerate(examples):
if phase == "train":
self.current_example = index
record = self._convert_example_to_record(example, self.max_seq_len,
self.tokenizer)
max_len = max(max_len, len(record.token_ids))
if self.in_tokens:
to_append = (len(batch_records) + 1) * max_len <= batch_size
else:
to_append = len(batch_records) < batch_size
if to_append:
batch_records.append(record)
else:
yield self._pad_batch_records(batch_records)
batch_records, max_len = [record], len(record.token_ids)
if batch_records:
yield self._pad_batch_records(batch_records)
def get_num_examples(self, input_file):
"""return total number of examples"""
examples = self._read_tsv(input_file)
return len(examples)
def data_generator(self,
input_file,
batch_size,
epoch,
shuffle=True,
phase=None):
"""return generator which yields batch data for pyreader"""
examples = self._read_tsv(input_file)
def _wrapper():
for epoch_index in range(epoch):
if phase == "train":
self.current_example = 0
self.current_epoch = epoch_index
if shuffle:
np.random.shuffle(examples)
for batch_data in self._prepare_batch_data(
examples, batch_size, phase=phase):
yield batch_data
return _wrapper
class ClassifyReader(BaseReader):
"""ClassifyReader"""
def _read_tsv(self, input_file, quotechar=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv_reader(f, delimiter="\t")
headers = next(reader)
text_indices = [
index for index, h in enumerate(headers) if h != "label"
]
Example = namedtuple('Example', headers)
examples = []
for line in reader:
for index, text in enumerate(line):
if index in text_indices:
line[index] = text.replace(' ', '')
example = Example(*line)
examples.append(example)
return examples
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
batch_labels = [record.label_id for record in batch_records]
batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])
# padding
padded_token_ids, input_mask, seq_lens = pad_batch_data(
batch_token_ids,
pad_idx=self.pad_id,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids, pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
input_mask, batch_labels, seq_lens
]
return return_list
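# --- Editor's usage sketch (hedged): file paths and arguments are
# illustrative. A ClassifyReader yields padded batches in the order
# [token_ids, text_type_ids, position_ids, input_mask, labels, seq_lens].
#
# reader = ClassifyReader(vocab_path="conf/vocab.txt",
#                         label_map_config="conf/label_map.json")
# for batch in reader.data_generator("data/train.tsv", batch_size=32,
#                                    epoch=1, shuffle=False, phase="test")():
#     token_ids, type_ids, pos_ids, input_mask, labels, seq_lens = batch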
class SequenceLabelReader(BaseReader):
"""SequenceLabelReader"""
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
batch_label_ids = [record.label_ids for record in batch_records]
# padding
padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
batch_token_ids,
pad_idx=self.pad_id,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids, pad_idx=self.pad_id)
padded_label_ids = pad_batch_data(
batch_label_ids, pad_idx=len(self.label_map) - 1)
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
input_mask, padded_label_ids, batch_seq_lens
]
return return_list
def _reseg_token_label(self, tokens, labels, tokenizer):
assert len(tokens) == len(labels)
ret_tokens = []
ret_labels = []
for token, label in zip(tokens, labels):
sub_token = tokenizer.tokenize(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
ret_labels.append(label)
if len(sub_token) < 2:
continue
sub_label = label
if label.startswith("B-"):
sub_label = "I-" + label[2:]
ret_labels.extend([sub_label] * (len(sub_token) - 1))
assert len(ret_tokens) == len(ret_labels)
return ret_tokens, ret_labels
def _convert_example_to_record(self, example, max_seq_length, tokenizer):
tokens = tokenization.convert_to_unicode(example.text_a).split(u"")
labels = tokenization.convert_to_unicode(example.label).split(u"")
tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
if len(tokens) > max_seq_length - 2:
tokens = tokens[0:(max_seq_length - 2)]
labels = labels[0:(max_seq_length - 2)]
tokens = ["[CLS]"] + tokens + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids)
no_entity_id = len(self.label_map) - 1
labels = [
label if label in self.label_map else u"O" for label in labels
]
label_ids = [no_entity_id] + [
self.label_map[label] for label in labels
] + [no_entity_id]
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_ids=label_ids)
return record
class ExtractEmbeddingReader(BaseReader):
"""ExtractEmbeddingReader"""
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
# padding
padded_token_ids, input_mask, seq_lens = pad_batch_data(
batch_token_ids,
pad_idx=self.pad_id,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids, pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
input_mask, seq_lens
]
return return_list
if __name__ == '__main__':
pass
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import io
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = io.open(vocab_file, encoding="utf8")
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
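# --- Editor's usage sketch (hedged): "conf/vocab.txt" refers to the vocab
# file shipped with the Teacher model; the sub-tokens shown depend on the
# actual vocabulary content.
#
# tokenizer = FullTokenizer(vocab_file="conf/vocab.txt", do_lower_case=True)
# tokens = tokenizer.tokenize(u"unaffable")  # e.g. ["un", "##aff", "##able"]
# ids = tokenizer.convert_tokens_to_ids(tokens)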
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Mask, padding and batching.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False,
return_seq_lens=False):
"""
Pad the instances to the max sequence length in the batch, and generate the
corresponding position data and input mask.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
# Any token included in the dict can be used to pad, since the paddings' loss
# will be masked out by weights and have no effect on parameter gradients.
inst_data = np.array(
[inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
if return_seq_lens:
seq_lens = np.array([len(inst) for inst in insts])
return_list += [seq_lens.astype("int64").reshape([-1])]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
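# Editor's demo (grounded in the code above): a two-instance batch padded
# to the max length 3; shapes follow from the reshape calls in pad_batch_data.
if __name__ == "__main__":
    demo_insts = [[3, 7, 2], [5, 1]]
    padded, mask, lens = pad_batch_data(
        demo_insts, pad_idx=0, return_input_mask=True, return_seq_lens=True)
    print(padded.shape, mask.shape, lens)  # (2, 3, 1) (2, 3, 1) [3 2]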
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The file_reader converts the raw corpus into model input.
"""
import os
import argparse
import __future__
import io
import glob
from paddleslim.pantheon import Student
import random
import numpy as np
import six
def load_kv_dict(dict_path,
reverse=False,
delimiter="\t",
key_func=None,
value_func=None):
"""
Load key-value dict from file
"""
result_dict = {}
for line in io.open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split(delimiter)
if len(terms) != 2:
continue
if reverse:
value, key = terms
else:
key, value = terms
if key in result_dict:
raise KeyError("key duplicated with [%s]" % (key))
if key_func:
key = key_func(key)
if value_func:
value = value_func(value)
result_dict[key] = value
return result_dict
class Dataset(object):
"""data reader"""
def __init__(self, args, mode="train"):
# read dict
self.word2id_dict = load_kv_dict(
args.word_dict_path, reverse=True, value_func=int)
self.id2word_dict = load_kv_dict(args.word_dict_path)
self.label2id_dict = load_kv_dict(
args.label_dict_path, reverse=True, value_func=int)
self.id2label_dict = load_kv_dict(args.label_dict_path)
self.word_replace_dict = load_kv_dict(args.word_rep_dict_path)
self._student = Student()
self._student.register_teacher(in_address=args.in_address)
self._student.start()
self._know_desc = self._student.get_knowledge_desc()
self._know_data_generator = self._student.get_knowledge_generator(batch_size=1, drop_last=False)()
self._train_shuffle_buf_size = args.traindata_shuffle_buffer
@property
def vocab_size(self):
"""vocabuary size"""
return max(self.word2id_dict.values()) + 1
@property
def num_labels(self):
"""num_labels"""
return max(self.label2id_dict.values()) + 1
def get_num_examples(self, filename):
"""num of line of file"""
return sum(1 for line in io.open(filename, "r", encoding='utf8'))
def word_to_ids(self, words):
"""convert word to word index"""
word_ids = []
for word in words:
word = self.word_replace_dict.get(word, word)
if word not in self.word2id_dict:
word = "OOV"
word_id = self.word2id_dict[word]
word_ids.append(word_id)
return word_ids
def label_to_ids(self, labels):
"""convert label to label index"""
label_ids = []
for label in labels:
if label not in self.label2id_dict:
label = "O"
label_id = self.label2id_dict[label]
label_ids.append(label_id)
return label_ids
def file_reader(self, filename, max_seq_len=126, mode="train"):
"""
yield (word_idx, target_idx, teacher_emission) one by one from file,
or yield (word_idx, ) in `infer` mode
"""
def wrapper():
invalid_samples = 0
fread = io.open(filename, "r", encoding="utf-8")
if mode == "infer":
for line in fread:
words = line.strip()
word_ids = self.word_to_ids(words)
yield (word_ids[0:max_seq_len], )
elif mode == "test":
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[
0] == "text_a" and headline[1] == "label"
for line in fread:
words, labels = line.strip("\n").split("\t")
if len(words) < 1:
continue
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids)
yield word_ids[0:max_seq_len], label_ids[0:max_seq_len]
else:
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[
0] == "text_a" and headline[1] == "label"
buf = []
for line in fread:
words, labels = line.strip("\n").split("\t")
if len(words) < 1:
continue
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
if six.PY2:
know_data = self._know_data_generator.next()
else:
know_data = self._know_data_generator.__next__()
teacher_crf_decode = know_data["crf_decode"]
if len(teacher_crf_decode.shape) == 1:
teacher_crf_decode = np.reshape(teacher_crf_decode, [-1, 1])
teacher_seq_len = know_data["seq_lens"]
assert len(word_ids) == len(label_ids)
real_len = len(word_ids) if len(word_ids) < max_seq_len else max_seq_len
if real_len == teacher_seq_len[0] - 2:
teacher_crf_decode_range = teacher_crf_decode[0][1:teacher_seq_len[0]-1]
teacher_crf_decode_range = np.reshape(teacher_crf_decode_range, [-1, 1])
buf.append([word_ids[0:max_seq_len], label_ids[0:max_seq_len], teacher_crf_decode_range])
#buf.append([word_ids[0:max_seq_len], label_ids[0:max_seq_len], teacher_crf_decode[0][1:teacher_seq_len[0]-1]])
if len(buf) > self._train_shuffle_buf_size:
buf_ids = list(range(len(buf)))  # list() so random.shuffle works on Python 3
random.shuffle(buf_ids)
for idx in buf_ids:
yield buf[idx]
buf = []
else:
invalid_samples += 1
if len(buf) > 0:
buf_ids = list(range(len(buf)))
random.shuffle(buf_ids)
for idx in buf_ids:
yield buf[idx]
print("invalid samples in one epoch: {}".format(invalid_samples))
fread.close()
return wrapper
if __name__ == "__main__":
parser = argparse.ArgumentParser(__doc__)
parser.add_argument(
"--word_dict_path",
type=str,
default="./conf/word.dic",
help="word dict")
parser.add_argument(
"--label_dict_path",
type=str,
default="./conf/tag.dic",
help="label dict")
parser.add_argument(
"--word_rep_dict_path",
type=str,
default="./conf/q2b.dic",
help="word replace dict")
args = parser.parse_args()
dataset = Dataset(args)
# data_generator = dataset.file_reader("data/train.tsv")
#for word_idx, target_idx in data_generator():
# print(word_idx, target_idx)
# print(len(word_idx), len(target_idx))
# break
#!/bin/bash
export CUDA_VISIBLE_DEVICES=5,6
python -u train_student.py \
--train_data ./data/train.tsv \
--test_data ./data/test.tsv \
--model_save_dir ./teacher_ernie_init_lac_1gru_emb128 \
--validation_steps 1000 \
--save_steps 1000 \
--print_steps 100 \
--batch_size 32 \
--epoch 10 \
--traindata_shuffle_buffer 20000 \
--word_emb_dim 128 \
--grnn_hidden_dim 128 \
--bigru_num 1 \
--base_learning_rate 1e-3 \
--emb_learning_rate 2 \
--crf_learning_rate 0.2 \
--word_dict_path ./conf/word.dic \
--label_dict_path ./conf/tag.dic \
--word_rep_dict_path ./conf/q2b.dic \
--enable_ce false \
--use_cuda true \
--in_address "127.0.0.1:5002"
#!/bin/bash
export FLAGS_sync_nccl_allreduce=0
export FLAGS_eager_delete_tensor_gb=1
export FLAGS_fraction_of_gpu_memory_to_use=0.99
export CUDA_VISIBLE_DEVICES=5,6 # which GPU to use
ERNIE_FINETUNED_MODEL_PATH=./model_finetuned
DATA_PATH=./data/
python -u teacher_ernie.py \
--ernie_config_path "conf/ernie_config.json" \
--init_checkpoint "${ERNIE_FINETUNED_MODEL_PATH}" \
--init_bound 0.1 \
--vocab_path "conf/vocab.txt" \
--batch_size 32 \
--random_seed 0 \
--num_labels 57 \
--max_seq_len 128 \
--test_data "${DATA_PATH}/train.tsv" \
--label_map_config "./conf/label_map.json" \
--do_lower_case true \
--use_cuda true \
--out_port=5002
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Baidu's open-source Lexical Analysis tool for Chinese, including:
1. Word Segmentation
2. Part-of-Speech Tagging
3. Named Entity Recognition
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import argparse
import numpy as np
import multiprocessing
import sys
from collections import namedtuple
from paddleslim.pantheon import Teacher
import paddle.fluid as fluid
import creator
import model_utils
print('model representation')
from models.representation.ernie import ErnieConfig
print('model check')
from models.model_check import check_cuda
from models.model_check import check_version
def do_eval(args):
# init executor
if args.use_cuda:
place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
else:
place = fluid.CPUPlace()
print('ernie config')
ernie_config = ErnieConfig(args.ernie_config_path)
ernie_config.print_config()
test_program = fluid.Program()
print('test program')
with fluid.program_guard(test_program, fluid.default_startup_program()):
with fluid.unique_name.guard():
test_ret = creator.create_ernie_model(args, ernie_config)
test_program = test_program.clone(for_test=True)
#print('create pyreader')
pyreader = creator.create_pyreader(
args,
file_name=args.test_data,
feed_list=[ret.name for ret in test_ret['feed_list']],
model="ernie",
place=place,
return_reader=True,
mode='test')
#data_inter = reader.data_generator(args.test_data, args.batch_size, 1, shuffle=False, phase="train")
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
# load model
if not args.init_checkpoint:
raise ValueError(
"args 'init_checkpoint' should be set if only doing test or infer!")
model_utils.init_checkpoint(exe, args.init_checkpoint, test_program)
teacher = Teacher(out_path=None, out_port=int(args.out_port))
teacher.start()
print('run teacher......')
test_ret["chunk_evaluator"].reset()
reader_config = {"batch_generator": pyreader}
teacher.start_knowledge_service(
feed_list=[test_ret["words"].name, test_ret["sent_ids"].name, test_ret["pos_ids"].name, test_ret["input_mask"].name, test_ret["labels"].name, test_ret["seq_lens"].name],
schema={"crf_decode":test_ret["crf_decode"],"seq_lens":test_ret["seq_lens"]},
program=test_program,
reader_config=reader_config,
exe=exe,
times=10)
if __name__ == "__main__":
parser = argparse.ArgumentParser(__doc__)
model_utils.load_yaml(parser, './conf/ernie_args.yaml')
# config for pantheon teacher
parser.add_argument('--out_path', type=str, default=None, help="The path to dump knowledge for offline mode.")
parser.add_argument('--out_port', type=str, default=None, help="The IP port number to send out knowledge for \
online mode, should be unique when launching multiple teachers in \
the same node.")
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
model_utils.print_arguments(args)
do_eval(args)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import math
import time
import random
import argparse
import multiprocessing
import numpy as np
import paddle
import paddle.fluid as fluid
import reader
import model_utils
import creator
from eval import test_process
from models.model_check import check_cuda
from models.model_check import check_version
# the function to train model
def do_train(args):
# init executor
if args.use_cuda:
place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
dev_count = fluid.core.get_cuda_device_count()
else:
dev_count = min(multiprocessing.cpu_count(), args.cpu_num)
if (dev_count < args.cpu_num):
print(
"WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. "
"Change the cpu_num from %d to %d" %
(dev_count, args.cpu_num, dev_count))
os.environ['CPU_NUM'] = str(dev_count)
place = fluid.CPUPlace()
train_program = fluid.Program()
test_program = fluid.Program()
startup_program = fluid.Program()
dataset = reader.Dataset(args)
with fluid.program_guard(train_program, startup_program):
#train_program.random_seed = args.random_seed
startup_program.random_seed = args.random_seed
with fluid.unique_name.guard():
train_ret = creator.create_model(
args, dataset.vocab_size, dataset.num_labels, mode='train')
optimizer = fluid.optimizer.Adam(
learning_rate=args.base_learning_rate)
optimizer.minimize(train_ret["avg_cost"])
with fluid.program_guard(test_program, startup_program):
with fluid.unique_name.guard():
test_ret = creator.create_model(
args, dataset.vocab_size, dataset.num_labels, mode='test')
test_program = test_program.clone(for_test=True)
exe = fluid.Executor(place)
exe.run(startup_program)
if args.init_checkpoint:
model_utils.init_checkpoint(exe, args.init_checkpoint, train_program)
if dev_count > 1:
device = "GPU" if args.use_cuda else "CPU"
print("%d %s are used to train model" % (dev_count, device))
# multi cpu/gpu config
exec_strategy = fluid.ExecutionStrategy()
build_strategy = fluid.compiler.BuildStrategy()
compiled_prog = fluid.compiler.CompiledProgram(
train_program).with_data_parallel(
loss_name=train_ret['avg_cost'].name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
else:
compiled_prog = fluid.compiler.CompiledProgram(train_program)
# start training
num_train_examples = dataset.get_num_examples(args.train_data)
max_train_steps = args.epoch * num_train_examples // args.batch_size
print("Num train examples: %d" % num_train_examples)
print("Max train steps: %d" % max_train_steps)
train_generator = creator.create_lexnet_data_generator(args,
reader=dataset,
file_name=args.train_data,
place=place,
mode='train')
test_generator = creator.create_lexnet_data_generator(args,
reader=dataset,
file_name=args.test_data,
place=place,
mode='test')
train_reader, test_reader = train_ret['pyreader'], test_ret['pyreader']
train_reader.set_batch_generator(train_generator, places=place)
test_reader.set_batch_generator(test_generator, places=place)
ce_info = []
step = 0
ce_time = 0
train_reader.start()
while True:
try:
# this minimizes the fetch ops in order not to slow down training.
if step % args.print_steps == 0:
fetch_list = [
train_ret["avg_cost"], train_ret["precision"],
train_ret["recall"], train_ret["f1_score"],
train_ret["crf_avg_cost"], train_ret["teacher_cost"]
]
else:
fetch_list = []
start_time = time.time()
outputs = exe.run(
program=compiled_prog,
fetch_list=fetch_list)
end_time = time.time()
if step % args.print_steps == 0:
avg_cost, precision, recall, f1_score, crf_avg_cost, teacher_cost = [
np.mean(x) for x in outputs
]
print("Data loader queue size: %d " % train_reader.queue.size())
print(
"[train] step = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, crf_avg_cost: %.5f, teacher_cost: %.5f, elapsed time %.5f"
% (step, avg_cost, precision, recall, f1_score, crf_avg_cost, teacher_cost,
end_time - start_time))
if step % args.validation_steps == 0:
test_process(exe, test_program, test_reader, test_ret)
ce_time += end_time - start_time
ce_info.append([ce_time, avg_cost, precision, recall, f1_score])
# save checkpoints
if step % args.save_steps == 0 and step != 0:
save_path = os.path.join(args.model_save_dir,
"step_" + str(step))
fluid.io.save_persistables(exe, save_path, train_program)
step += 1
except fluid.core.EOFException:
train_reader.reset()
break
if args.enable_ce:
card_num = get_cards()
ce_cost = 0
ce_f1 = 0
ce_p = 0
ce_r = 0
ce_time = 0
try:
ce_time = ce_info[-2][0]
ce_cost = ce_info[-2][1]
ce_p = ce_info[-2][2]
ce_r = ce_info[-2][3]
ce_f1 = ce_info[-2][4]
except:
print("ce info error")
print("kpis\teach_step_duration_card%s\t%s" % (card_num, ce_time))
print("kpis\ttrain_cost_card%s\t%f" % (card_num, ce_cost))
print("kpis\ttrain_precision_card%s\t%f" % (card_num, ce_p))
print("kpis\ttrain_recall_card%s\t%f" % (card_num, ce_r))
print("kpis\ttrain_f1_card%s\t%f" % (card_num, ce_f1))
def get_cards():
num = 0
cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cards != '':
num = len(cards.split(","))
return num
if __name__ == "__main__":
parser = argparse.ArgumentParser(__doc__)
model_utils.load_yaml(parser, 'conf/args.yaml')
# config for pantheon student
parser.add_argument('--in_path', type=str, default=None, help="The path of dumped knowledge from teacher for offline mode.")
parser.add_argument('--in_address', type=str, default=None, help="The IP port number to receive knowledge from teacher for \
online mode")
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
do_train(args)
## Toy example for Pantheon
See more details about Pantheon in [PaddleSlim/Pantheon](../../../paddleslim/pantheon).
This example implements two teacher models (not trainable, just for demo): teacher1 takes an integer **x** as input and predicts **2x-1**, see [run_teacher1.py](run_teacher1.py); teacher2 also takes **x** as input and predicts **2x+1**, see [run_teacher2.py](run_teacher2.py). The two teachers share a data reader that reads a sequence of increasing natural numbers, from zero up to some positive integer **max_n**, as input, and they generate different knowledge from it. The schema keys for the knowledge of teacher1 are [**"x", "2x-1", "result"**], and those of teacher2 are [**"2x+1", "result"**], where **"result"** is the common schema key and holds a copy of each teacher's prediction. On instantiating the **Student** object, the merging strategy for the common schema key **"result"** should be specified, and the schema keys for the merged knowledge will be [**"x", "2x-1", "2x+1", "result"**], with the merged **"result"** equal to **2x** when the merging strategy is **"mean"** and **4x** when it is **"sum"**, as the sketch below illustrates. The student model receives the merged knowledge from the teachers and prints it out, see [run_student.py](run_student.py).
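A minimal sketch of the merge arithmetic for the common **"result"** key (plain Python, independent of the Pantheon API):
```python
# How the two merge strategies combine the teachers' "result" values.
def merge(result1, result2, strategy):
    if strategy == "mean":
        return (result1 + result2) / 2  # ((2x-1) + (2x+1)) / 2 == 2x
    elif strategy == "sum":
        return result1 + result2        # (2x-1) + (2x+1) == 4x
    raise ValueError("unknown strategy: %s" % strategy)

x = 5
print(merge(2 * x - 1, 2 * x + 1, "mean"))  # 10.0, i.e. 2x
print(merge(2 * x - 1, 2 * x + 1, "sum"))   # 20, i.e. 4x
```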
The toy "knowledge distillation" system can be launched in three different modes, i.e., offline, online and their hybrid. All three modes should have the same outputs, and the correctness of results can be verified by checking the order and values of outputs.
### Offline
The two teachers work in offline mode; start them with the given local file paths.
```shell
export PYTHONPATH=../../../:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=0,1
export NUM_POSTPROCESS_THREADS=10 # default 8
nohup python -u run_teacher1.py --use_cuda true --out_path teacher1_offline.dat > teacher1_offline.log 2>&1&
export CUDA_VISIBLE_DEVICES=2
nohup python -u run_teacher2.py --use_cuda true --out_path teacher2_offline.dat > teacher2_offline.log 2>&1&
```
After both executions have finished, start the student model with the two generated knowledge files.
```shell
export PYTHONPATH=../../../:$PYTHONPATH
python -u run_student.py \
--in_path0 teacher1_offline.dat \
--in_path1 teacher2_offline.dat
```
### Online
The two teachers work in online mode; start them with the given TCP/IP ports. Please make sure that these ports are available.
```shell
export PYTHONPATH=../../../:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=0
nohup python -u run_teacher1.py --use_cuda true --out_port 8080 > teacher1_online.log 2>&1&
export CUDA_VISIBLE_DEVICES=1,2
nohup python -u run_teacher2.py --use_cuda true --out_port 8081 > teacher2_online.log 2>&1&
```
Start the student model with IP addresses that can reach the ports of the two teacher models, e.g., on the same node:
```shell
export PYTHONPATH=../../../:$PYTHONPATH
python -u run_student.py \
--in_address0 127.0.0.1:8080 \
--in_address1 127.0.0.1:8081
```
**Note:** in online mode, the starting order of the teachers and the student does not matter; they will wait for each other to establish the connection.
### Hybrid of offline and online
One teacher works in offline mode and the other works in online mode. This time, start the offline teacher first. After the offline knowledge file is fully generated, start the online teacher and the student at the same time.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from paddleslim.pantheon import Student
from utils import str2bool
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument(
"--in_address0",
type=str,
default=None,
help="Input address for teacher 0. (default: %(default)s)")
parser.add_argument(
"--in_path0",
type=str,
default=None,
help="Input file path for teacher 0. (default: %(default)s)")
parser.add_argument(
"--in_address1",
type=str,
default=None,
help="Input address for teacher 1. (default: %(default)s)")
parser.add_argument(
"--in_path1",
type=str,
default=None,
help="Input file path for teacher 1. (default: %(default)s)")
parser.add_argument(
"--test_send_recv",
type=str2bool,
default=False,
help="Whether to test send/recv interfaces. (default: %(default)s)")
parser.add_argument(
"--batch_size",
type=int,
default=32,
help="The batch size of student model. (default: %(default)s)")
args = parser.parse_args()
return args
def run(args):
if args.in_address0 and args.in_path0:
raise ValueError(
"args.in_address0 and args.in_path0 should not be valid "
"at the same time!")
if not args.in_address0 and not args.in_path0:
raise ValueError(
"One of args.in_address0 and args.in_path0 must be valid!")
if args.in_address1 and args.in_path1:
raise ValueError(
"args.in_address1 and args.in_path1 should not be valid "
"at the same time!")
if not args.in_address1 and not args.in_path1:
raise ValueError(
"One of args.in_address1 and args.in_path1 must be valid")
student = Student(merge_strategy={"result": "sum"})
student.register_teacher(
in_address=args.in_address0, in_path=args.in_path0)
student.register_teacher(
in_address=args.in_address1, in_path=args.in_path1)
student.start()
if args.test_send_recv:
for t in range(2):
for i in range(3):
print(student.recv(t))
student.send("message from student!")
knowledge_desc = student.get_knowledge_desc()
data_generator = student.get_knowledge_generator(
batch_size=args.batch_size, drop_last=False)
for batch_data in data_generator():
batch_size = list(batch_data.values())[0].shape[0]
keys = batch_data.keys()
for i in range(batch_size):
data = {}
for key in keys:
data[key] = batch_data[key][i]
print(data)
if __name__ == '__main__':
args = parse_args()
run(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid as fluid
from utils import parse_args, sample_generator, sample_list_generator, batch_generator
from paddleslim.pantheon import Teacher
def run(args):
if args.out_path and args.out_port:
raise ValueError("args.out_path and args.out_port should not be valid "
"at the same time")
if not args.out_path and not args.out_port:
raise ValueError("One of args.out_path and args.out_port be valid")
# user-defined program: y = 2*x - 1
startup = fluid.Program()
program = fluid.Program()
with fluid.program_guard(program, startup):
inp_x = fluid.layers.data(name='x', shape=[-1, 1], dtype="int64")
y = inp_x * 2 - 1
result = fluid.layers.assign(y)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
teacher = Teacher(out_path=args.out_path, out_port=args.out_port)
teacher.start()
if args.generator_type == "sample_generator":
reader_config = {
"sample_generator": sample_generator(max_n=1000),
"batch_size": args.batch_size,
"drop_last": False
}
elif args.generator_type == "sample_list_generator":
reader_config = {
"sample_list_generator": sample_list_generator(
max_n=1000, batch_size=args.batch_size)
}
else:
reader_config = {
"batch_generator": batch_generator(
max_n=1000, batch_size=args.batch_size)
}
if args.test_send_recv:
teacher.send("greetings from teacher1")
teacher.send({"x": 1, "y": 2})
teacher.send({3, 5})
print("recved {}".format(teacher.recv()))
teacher.start_knowledge_service(
feed_list=[inp_x.name],
schema={"x": inp_x,
"2x-1": y,
"result": result},
program=program,
reader_config=reader_config,
exe=exe,
use_fp16=True,
times=args.serving_times)
if __name__ == '__main__':
args = parse_args()
run(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid as fluid
from utils import parse_args, sample_generator, sample_list_generator, batch_generator
from paddleslim.pantheon import Teacher
def run(args):
if args.out_path and args.out_port:
raise ValueError("args.out_path and args.out_port should not be valid "
"at the same time")
if not args.out_path and not args.out_port:
raise ValueError("One of args.out_path and args.out_port be valid")
# user-defined program: y = 2*x + 1
startup = fluid.Program()
program = fluid.Program()
with fluid.program_guard(program, startup):
inp_x = fluid.layers.data(name='x', shape=[-1, 1], dtype="int64")
y = inp_x * 2 + 1
result = fluid.layers.assign(y)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
teacher = Teacher(out_path=args.out_path, out_port=args.out_port)
teacher.start()
if args.generator_type == "sample_generator":
reader_config = {
"sample_generator": sample_generator(max_n=1000),
"batch_size": args.batch_size,
"drop_last": False
}
elif args.generator_type == "sample_list_generator":
reader_config = {
"sample_list_generator": sample_list_generator(
max_n=1000, batch_size=args.batch_size)
}
else:
reader_config = {
"batch_generator": batch_generator(
max_n=1000, batch_size=args.batch_size)
}
if args.test_send_recv:
teacher.send("greetings from teacher2")
teacher.send([1])
teacher.send({1, 2, 3})
print("recved {}".format(teacher.recv()))
teacher.start_knowledge_service(
feed_list=[inp_x.name],
schema={"2x+1": y,
"result": result},
program=program,
reader_config=reader_config,
exe=exe,
times=args.serving_times)
if __name__ == '__main__':
args = parse_args()
run(args)
import numpy as np
import argparse
def str2bool(v):
return v.lower() in ("true", "t", "1")
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument(
"--out_port",
type=int,
default=None,
help="IP port number for sending out data. (default: %(default)s)")
parser.add_argument(
"--out_path",
type=str,
default=None,
help="The file path to dump knowledge data. (default: %(default)s)")
parser.add_argument(
"--use_cuda",
type=str2bool,
default=False,
help="Whether to use GPU for prediction. (default: %(default)s)")
parser.add_argument(
"--test_send_recv",
type=str2bool,
default=False,
help="Whether to test send/recv interfaces. (default: %(default)s)")
parser.add_argument(
"--generator_type",
type=str,
choices=[
"sample_generator", "sample_list_generator", "batch_generator"
],
default="batch_generator",
help="Which data generator to use. (default: %(default)s)")
parser.add_argument(
"--batch_size",
type=int,
default=32,
help="The batch size per device for data generators. (default: %(default)s)"
)
parser.add_argument(
"--serving_times",
type=int,
default=1,
help="The maximum times of teacher serving knowledge. (default: %(default)s)"
)
args = parser.parse_args()
return args
def sample_generator(max_n):
def wrapper():
for i in range(max_n):
yield [i]
return wrapper
def sample_list_generator(max_n, batch_size=500):
def wrapper():
sample_list = []
for sample in sample_generator(max_n)():
if len(sample_list) < batch_size:
sample_list.append(sample)
if len(sample_list) == batch_size:
yield sample_list
sample_list = []
if len(sample_list) > 0:
yield sample_list
return wrapper
# data_generator
def batch_generator(max_n, batch_size=500):
def wrapper():
batch = []
for sample in sample_generator(max_n)():
if len(batch) < batch_size:
batch.append(sample)
if len(batch) == batch_size:
yield [np.array(batch).astype('int64').reshape((-1, 1))]
batch = []
if len(batch) > 0:
yield [np.array(batch).astype('int64').reshape((-1, 1))]
return wrapper
@@ -19,11 +19,8 @@ from paddleslim import nas
 from paddleslim import analysis
 from paddleslim import dist
 from paddleslim import quant
-from paddleslim import pantheon
 from paddleslim import dygraph
-__all__ = [
-    'models', 'prune', 'nas', 'analysis', 'dist', 'quant', 'pantheon', 'dygraph'
-]
+__all__ = ['models', 'prune', 'nas', 'analysis', 'dist', 'quant', 'dygraph']
 from paddleslim.dygraph import *
 __all__ += dygraph.__all__
...
# Pantheon: Paddle large-scale scalable knowledge distillation framework
Pantheon is a universal solution for knowledge distillation in Paddle Fluid. Its design takes into account many possible behaviors of teacher models. Every teacher and student model in Pantheon works in its own process, and they communicate with each other via local files or TCP/IP ports. Knowledge can easily be transferred to the student model from a single teacher model or from an ensemble of multiple teacher models, where each teacher model can work in online or offline mode independently. Pantheon also provides a highly optimized interface for large-scale prediction with teacher models. Benefiting from the low coupling of teachers and the student, users can allocate computation resources to the different roles according to their computational complexity, and build a large-scale, practical knowledge distillation learning system on Pantheon.
The illustration below shows an application of Pantheon, where the student model is trained with knowledge from multiple online teachers. These teachers may work on the same node as the student model but on different devices, or on different nodes, as long as they can communicate with each other over the network. The student model can send queries to the teachers, which take these queries as input and generate streaming knowledge data for the student. Or, more simply, the student model can read the training data in the **same order** as the teachers, avoiding the need to send queries.
<div align="center">
<img src="images/pantheon_arch.png" width=600 /> <br>
The architecture for one online knowledge distillation system based on Pantheon
</div>
## Prerequisites
- Python 2.7.x or 3.x
- PaddlePaddle >= 1.7.0
- System: MacOS/Linux
## APIs
Pantheon defines two classes **Teacher** and **Student** for the communication and knowledge transfer between teacher and student.
- **Teacher**: used by the teacher model. It can receive queries from the student and write out the knowledge of the teacher model via a TCP/IP port (online mode) or into a local file (offline mode).
- **Student**: used by the student model. It can receive and merge the knowledge from the teachers, and feed it to the student model along with local data for training.
Usually, the public methods of these two classes work in pairs. Their mapping relations and suitable working modes are listed in the following table.
<table>
<tr>
<th rowspan="2">Teacher</th>
<th rowspan="2">Student</th>
<th colspan="2">Supported Graph</th>
<th colspan="2">Mode</th>
<th rowspan="2">Remarks</th>
</tr>
<tr>
<td>static</td>
<td>dynamic</td>
<td>online</td>
<td>offline</td>
</tr>
<tr>
<td><strong>__init__</strong>(<br>&nbsp;&nbsp;&nbsp;&nbsp;out_path=None, <br>&nbsp;&nbsp;&nbsp;&nbsp;out_port=None)</td>
<td><strong>__init__</strong>(<br>&nbsp;&nbsp;&nbsp;&nbsp;merge_strategy=None)</td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td>[1]</td>
</tr>
<tr>
<td></td>
<td><strong>register_teacher</strong>(
<br>&nbsp;&nbsp;&nbsp;&nbsp;in_path=None,
<br>&nbsp;&nbsp;&nbsp;&nbsp;in_address=None)
</td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td>[2]</td>
</tr>
<tr>
<td><strong>start()</strong></td>
<td><strong>start()</strong></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td>[3]</td>
</tr>
<tr>
<td><strong>send</strong>(data)</td>
<td><strong>recv</strong>(teacher_id)</td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center></center></td>
<td>[4]</td>
</tr>
<tr>
<td><strong>recv()</strong></td>
<td><strong>send</strong>(data, <br>&nbsp;&nbsp;&nbsp;
&nbsp;teacher_ids=None)
</td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center></center></td>
<td>[5]</td>
</tr>
<tr>
<td><strong>dump</strong>(knowledge)</td>
<td></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td><center></center></td>
<td><center>√</center></td>
<td>[6]</td>
</tr>
<tr>
<td rowspan="3"><strong>start_knowledge_service</strong>(
<br>&nbsp;&nbsp;&nbsp;&nbsp;feed_list,
<br>&nbsp;&nbsp;&nbsp;&nbsp;schema,
<br>&nbsp;&nbsp;&nbsp;&nbsp;program,
<br>&nbsp;&nbsp;&nbsp;&nbsp;reader_config,
<br>&nbsp;&nbsp;&nbsp;&nbsp;exe,
<br>&nbsp;&nbsp;&nbsp;&nbsp;buf_size=10,
<br>&nbsp;&nbsp;&nbsp;&nbsp;use_fp16=False,
<br>&nbsp;&nbsp;&nbsp;&nbsp;times=1)</td>
<td><strong>get_knowledge_desc</strong>()</td>
<td><center>√</center></td>
<td><center></center></td>
<td><center>√</center></td>
<td><center>√</center></td>
<td rowspan="3">[7]</td>
</tr>
<tr>
<td><strong>get_knowledge_qsize</strong>()</td>
<td><center>√</center></td>
<td><center></center></td>
<td><center>√</center></td>
<td><center>√</center></td>
</tr>
<tr>
<td><strong>get_knowledge_generator</strong>(<br>&nbsp;&nbsp;&nbsp;&nbsp;batch_size,
<br>&nbsp;&nbsp;&nbsp;&nbsp;drop_last=False)</td>
<td><center>√</center></td>
<td><center></center></td>
<td><center>√</center></td>
<td><center>√</center></td>
</tr>
</table>
**Remarks:**
- [1] Declare the teacher object for the teacher model with **out\_path** or **out\_port**, and the student object for the student model with a **merge\_strategy** for knowledge coming from different teachers.
- [2] Register a teacher and assign it an id, starting from zero in the order of registration. **register\_teacher()** can be called multiple times to enable the multiple-teacher mode.
- [3] Establish the TCP/IP connections between teachers and the student, and synchronize all of them.
- [4] Send one piece of data from the teacher to the student.
- [5] Send one piece of data from the student to the teacher.
- [6] Dump one batch of knowledge data into the output file; see the sketch after this list.
- [7] Highly optimized high-level interfaces for building the knowledge-transfer service:
    - **start\_knowledge\_service()** can perform large-scale prediction of the teacher model on multiple devices;
    - it supports auto-merging of knowledge from different teachers;
    - it supports auto-reconnection of the student and teachers.
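For example, the offline mode alone needs only **\_\_init\_\_()**, **start()** and **dump()** on the teacher side. A minimal sketch based on the table above, where the output file name, knowledge keys and shapes are all made-up illustrations:
```python
import numpy as np
from paddleslim.pantheon import Teacher

# Offline mode: knowledge is written into a local file instead of a TCP/IP port.
teacher = Teacher(out_path="knowledge.dat")  # file name is hypothetical
teacher.start()

# Dump one batch of knowledge data; keys are user-defined and values are
# numpy ndarrays whose first dimension is the batch size (shapes made up).
teacher.dump({
    "x": np.random.rand(32, 128).astype("float32"),
    "result": np.random.rand(32, 10).astype("float32"),
})
```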
### About the data format
- **Knowledge**: a dictionary whose keys are specified by users and whose values are numpy ndarray tensors predicted by teacher models. The first dimension of the tensors should be the batch size, and LoDTensor is not supported yet. One can call **get\_knowledge\_desc()** to get the description of the knowledge, which is also a dictionary, covering the shape, data type and LoD level of the knowledge data.
- **Offline knowledge file**: the first line is the knowledge description, and each following line is one batch of knowledge data, all dumped by cPickle.
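For illustration, a file laid out this way could be read back with successive pickle loads; a rough sketch (the file name is hypothetical, and the exact on-disk layout is an internal detail of Pantheon):
```python
import pickle  # cPickle on Python 2

with open("knowledge.dat", "rb") as f:
    desc = pickle.load(f)  # first record: the knowledge description
    while True:
        try:
            # each following record: one batch of knowledge data,
            # a dict of numpy ndarrays keyed by the user-defined names
            batch = pickle.load(f)
        except EOFError:
            break
```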
### Usage
If the separately runnable teacher models and the student model are ready, one can build a trainable knowledge distillation system in two simple steps.
1) Instantiate a **Teacher** object for the teacher model, and launch the knowledge service
```python
from paddleslim.pantheon import Teacher
...
teacher = Teacher(out_path=args.out_path, out_port=args.out_port)
teacher.start()
teacher.start_knowledge_service(
feed_list=[inp_x.name],
schema={"x": inp_x,
"y": y},
program=program,
reader_config={"batch_generator": batch_generator},
exe=exe,
buf_size=100,
times=1)
```
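In the snippet above, `inp_x` and `y` are variables of the teacher's inference program: `feed_list` holds the feed names of that program, while `schema` maps the user-chosen knowledge keys to the variables whose predicted values will be transferred. A sketch of how such variables might be declared (the layer choices here are assumptions, not part of the original example):
```python
import paddle.fluid as fluid

# Hypothetical teacher network; only the variable names match the example.
inp_x = fluid.data(name="x", shape=[-1, 128], dtype="float32")
y = fluid.layers.fc(input=inp_x, size=10, act="softmax")
```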
2) Instantiate a **Student** object, specify the way to merge knowledge, register the teachers, and get the knowledge description and data generator for the student model
```python
from paddleslim.pantheon import Student
...
student = Student(merge_strategy={"result": "sum"})
student.register_teacher(
in_address=args.in_address0, in_path=args.in_path0)
student.register_teacher(
in_address=args.in_address1, in_path=args.in_path1)
student.start()
knowledge_desc = student.get_knowledge_desc()
data_generator = student.get_knowledge_generator(
batch_size=32, drop_last=False)
```
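The data generator can then drive the student's training loop. A hedged sketch, assuming the usual Paddle reader convention in which the returned object is called to obtain an iterator over batches (the key name follows the `merge_strategy` above; the training call is only indicated in a comment):
```python
# Each batch is a dict matching knowledge_desc, e.g. containing the
# merged "result" key produced by the "sum" strategy above.
for batch in data_generator():
    soft_labels = batch["result"]  # summed predictions of the two teachers
    # exe.run(student_program, feed={...}, fetch_list=[loss])  # student step
```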
## Examples
### Toy Example
A toy example is provided to show how the knowledge data is transferred from teachers to the student model and merged, covering the offline and online modes as well as their hybrid. See [demo/pantheon/toy](../../demo/pantheon/toy).
from . import teacher
from . import student
from .teacher import Teacher
from .student import Student

__all__ = teacher.__all__ + student.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
public_authkey = u"aBcXyZ123"
# Sentinel classes used as control signals during knowledge transmission:
# they mark the start and end of a stream and synchronize the peers.
class StartSignal():
    pass


class EndSignal():
    pass


class SyncSignal():
    pass
def convert_dtype(dtype):
    """Convert a fluid VarType enum into the corresponding numpy dtype string.

    Unrecognized types fall through and return None.
    """
    import paddle.fluid as fluid
    if isinstance(dtype, fluid.core.VarDesc.VarType):
        if dtype == fluid.core.VarDesc.VarType.BOOL:
            return 'bool'
        elif dtype == fluid.core.VarDesc.VarType.FP16:
            return 'float16'
        elif dtype == fluid.core.VarDesc.VarType.FP32:
            return 'float32'
        elif dtype == fluid.core.VarDesc.VarType.FP64:
            return 'float64'
        elif dtype == fluid.core.VarDesc.VarType.INT8:
            return 'int8'
        elif dtype == fluid.core.VarDesc.VarType.INT16:
            return 'int16'
        elif dtype == fluid.core.VarDesc.VarType.INT32:
            return 'int32'
        elif dtype == fluid.core.VarDesc.VarType.INT64:
            return 'int64'
        elif dtype == fluid.core.VarDesc.VarType.UINT8:
            return 'uint8'
def check_ip(address):
    """Return True if `address` is a well-formed IP address, else False."""
    import IPy
    try:
        IPy.IP(address)
        return True
    except Exception:
        return False
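# A quick illustration of the helpers above (not part of the original file;
# assumes PaddlePaddle and IPy are installed):
#   convert_dtype(fluid.core.VarDesc.VarType.FP32)  -> 'float32'
#   check_ip("127.0.0.1")                           -> True
#   check_ip("not-an-ip")                           -> False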