From cf6f83d436441eb30e62bd9f0cd389f49080d4cc Mon Sep 17 00:00:00 2001 From: zhouzj <41366441+zzjjay@users.noreply.github.com> Date: Thu, 24 Jun 2021 11:48:01 +0800 Subject: [PATCH] [cherry pick] Delete 'pantheon' module and related documents (#820) --- demo/pantheon/lexical_anlysis/README.md | 40 -- demo/pantheon/lexical_anlysis/README_cn.md | 41 -- demo/pantheon/lexical_anlysis/__init__.py | 4 - demo/pantheon/lexical_anlysis/creator.py | 260 ------- demo/pantheon/lexical_anlysis/downloads.py | 163 ----- demo/pantheon/lexical_anlysis/ernie_reader.py | 160 ----- demo/pantheon/lexical_anlysis/eval.py | 131 ---- demo/pantheon/lexical_anlysis/model_utils.py | 248 ------- .../lexical_anlysis/models/__init__.py | 0 .../lexical_anlysis/models/model_check.py | 73 -- .../models/representation/__init__.py | 0 .../models/representation/ernie.py | 322 --------- .../models/sequence_labeling/__init__.py | 0 .../models/sequence_labeling/nets.py | 174 ----- .../models/transformer_encoder.py | 342 --------- .../lexical_anlysis/preprocess/__init__.py | 0 .../preprocess/ernie/__init__.py | 0 .../preprocess/ernie/task_reader.py | 392 ----------- .../preprocess/ernie/tokenization.py | 370 ---------- .../lexical_anlysis/preprocess/padding.py | 78 --- demo/pantheon/lexical_anlysis/reader.py | 208 ------ demo/pantheon/lexical_anlysis/run_student.sh | 26 - demo/pantheon/lexical_anlysis/run_teacher.sh | 25 - .../pantheon/lexical_anlysis/teacher_ernie.py | 111 --- .../pantheon/lexical_anlysis/train_student.py | 208 ------ demo/pantheon/toy/README.md | 54 -- demo/pantheon/toy/run_student.py | 103 --- demo/pantheon/toy/run_teacher1.py | 81 --- demo/pantheon/toy/run_teacher2.py | 79 --- demo/pantheon/toy/utils.py | 91 --- docs/en/api_en/paddleslim.pantheon.rst | 36 - docs/zh_cn/api_cn/static/dist/pantheon_api.md | 268 ------- paddleslim/__init__.py | 5 +- paddleslim/pantheon/README.md | 206 ------ paddleslim/pantheon/__init__.py | 4 - paddleslim/pantheon/images/pantheon_arch.png | Bin 98589 -> 0 bytes paddleslim/pantheon/student.py | 596 ---------------- paddleslim/pantheon/teacher.py | 662 ------------------ paddleslim/pantheon/utils.py | 61 -- 39 files changed, 1 insertion(+), 5621 deletions(-) delete mode 100644 demo/pantheon/lexical_anlysis/README.md delete mode 100644 demo/pantheon/lexical_anlysis/README_cn.md delete mode 100644 demo/pantheon/lexical_anlysis/__init__.py delete mode 100644 demo/pantheon/lexical_anlysis/creator.py delete mode 100644 demo/pantheon/lexical_anlysis/downloads.py delete mode 100755 demo/pantheon/lexical_anlysis/ernie_reader.py delete mode 100755 demo/pantheon/lexical_anlysis/eval.py delete mode 100755 demo/pantheon/lexical_anlysis/model_utils.py delete mode 100755 demo/pantheon/lexical_anlysis/models/__init__.py delete mode 100755 demo/pantheon/lexical_anlysis/models/model_check.py delete mode 100755 demo/pantheon/lexical_anlysis/models/representation/__init__.py delete mode 100755 demo/pantheon/lexical_anlysis/models/representation/ernie.py delete mode 100755 demo/pantheon/lexical_anlysis/models/sequence_labeling/__init__.py delete mode 100755 demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py delete mode 100755 demo/pantheon/lexical_anlysis/models/transformer_encoder.py delete mode 100644 demo/pantheon/lexical_anlysis/preprocess/__init__.py delete mode 100644 demo/pantheon/lexical_anlysis/preprocess/ernie/__init__.py delete mode 100644 demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py delete mode 100644 
demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py delete mode 100644 demo/pantheon/lexical_anlysis/preprocess/padding.py delete mode 100644 demo/pantheon/lexical_anlysis/reader.py delete mode 100644 demo/pantheon/lexical_anlysis/run_student.sh delete mode 100755 demo/pantheon/lexical_anlysis/run_teacher.sh delete mode 100644 demo/pantheon/lexical_anlysis/teacher_ernie.py delete mode 100644 demo/pantheon/lexical_anlysis/train_student.py delete mode 100644 demo/pantheon/toy/README.md delete mode 100644 demo/pantheon/toy/run_student.py delete mode 100644 demo/pantheon/toy/run_teacher1.py delete mode 100644 demo/pantheon/toy/run_teacher2.py delete mode 100644 demo/pantheon/toy/utils.py delete mode 100644 docs/en/api_en/paddleslim.pantheon.rst delete mode 100644 docs/zh_cn/api_cn/static/dist/pantheon_api.md delete mode 100644 paddleslim/pantheon/README.md delete mode 100644 paddleslim/pantheon/__init__.py delete mode 100644 paddleslim/pantheon/images/pantheon_arch.png delete mode 100644 paddleslim/pantheon/student.py delete mode 100644 paddleslim/pantheon/teacher.py delete mode 100644 paddleslim/pantheon/utils.py
diff --git a/demo/pantheon/lexical_anlysis/README.md b/demo/pantheon/lexical_anlysis/README.md deleted file mode 100644 index ec3af05d..00000000 --- a/demo/pantheon/lexical_anlysis/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Distillation example: Chinese lexical analysis -We demonstrate how to use the Pantheon framework for online distillation of a Chinese lexical analysis model on a sample dataset. The effect of large-scale online distillation is shown below: -| Model | Precision | Recall | F1-score | -| ------ | ------ | ------ | ------ | -| BiGRU | 89.2 | 89.4 | 89.3 | -| BERT fine-tuned | 90.2 | 90.4 | 90.3 | -| ERNIE fine-tuned | 91.7 | 91.7 | 91.7 | -| DistillBiGRU | 90.20 | 90.52 | 90.36 | - -BiGRU is a BiGRU-based LAC model trained from scratch; BERT fine-tuned fine-tunes the LAC task on the BERT base model; ERNIE fine-tuned fine-tunes the LAC task on the ERNIE base model; DistillBiGRU is trained through large-scale online distillation with ERNIE fine-tuned as the teacher model. - -## Introduction - -Lexical Analysis of Chinese, or LAC for short, is a lexical analysis model that completes the tasks of Chinese word segmentation, part-of-speech tagging, and named entity recognition in a single model. We conduct an overall evaluation of word segmentation, part-of-speech tagging, and named entity recognition on a self-built dataset. We use the fine-tuned [ERNIE](https://github.com/PaddlePaddle/LARK/tree/develop/ERNIE) model as the Teacher model and a GRU network as the Student model, as required by the Pantheon framework for online distillation. - -#### 1. Download the training dataset - -Download the dataset file; after decompression, a `./data/` folder will be created. -```bash -python downloads.py dataset -``` - -#### 2. Download the Teacher model - -```bash -# download ERNIE finetuned model -python downloads.py finetuned -python downloads.py conf -``` - -#### 3. Distilling the Student model -```bash -# start teacher service -bash run_teacher.sh - -# start student service -bash run_student.sh -``` - -> If you want to learn more about LAC, you can refer to this repo: https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/lexical_analysis \ No newline at end of file
diff --git a/demo/pantheon/lexical_anlysis/README_cn.md b/demo/pantheon/lexical_anlysis/README_cn.md deleted file mode 100644 index 77e4a944..00000000 --- a/demo/pantheon/lexical_anlysis/README_cn.md +++ /dev/null @@ -1,41 +0,0 @@ -# Distillation example: Chinese lexical analysis -On a sample dataset, we demonstrate how to use the Pantheon framework for online distillation of a Chinese lexical analysis model. The results of large-scale online distillation are shown in the table below: - -| Model | Precision | Recall | F1-score | -| ------ | ------ | ------ | ------ | -| BiGRU | 89.2 | 89.4 | 89.3 | -| BERT fine-tuned | 90.2 | 90.4 | 90.3 | -| ERNIE fine-tuned | 91.7 | 91.7 | 91.7 | -| DistillBiGRU | 90.20 | 90.52 | 90.36 | - -BiGRU trains the LAC task from scratch with a bidirectional GRU network; BERT fine-tuned fine-tunes the LAC task on the BERT base model; ERNIE fine-tuned fine-tunes the LAC task on the ERNIE base model; DistillBiGRU trains the LAC task through large-scale distillation with the ERNIE fine-tuned model as the teacher model. - -## Introduction - -Lexical Analysis of Chinese (LAC) is a joint lexical analysis model that completes Chinese word segmentation, part-of-speech tagging, and named entity recognition in a single model. We evaluate word segmentation, part-of-speech tagging, and named entity recognition as a whole on a self-built dataset. We use the fine-tuned [ERNIE](https://github.com/PaddlePaddle/LARK/tree/develop/ERNIE) model as the Teacher model, a GRU network as the Student model, and the Pantheon framework for online distillation. - -#### 1. Download the training dataset - -Download the dataset file; decompressing it creates a `./data/` folder. -```bash -python downloads.py dataset -``` - -#### 2. Download the Teacher model - -```bash -# download ERNIE finetuned model -python downloads.py finetuned -python downloads.py conf -``` - -#### 3. Distilling the Student model -```bash -# start teacher service -bash run_teacher.sh - -# start student service -bash run_student.sh -``` - -> If you want a detailed understanding of how LAC works, you can refer to this repo: https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/lexical_analysis
diff --git a/demo/pantheon/lexical_anlysis/__init__.py b/demo/pantheon/lexical_anlysis/__init__.py deleted file mode 100644 index bcc99e78..00000000 --- a/demo/pantheon/lexical_anlysis/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .teacher import Teacher -from .student import Student - -__all__ = ['Teacher', 'Student']
diff --git a/demo/pantheon/lexical_anlysis/creator.py b/demo/pantheon/lexical_anlysis/creator.py deleted file mode 100644 index 48324091..00000000 --- a/demo/pantheon/lexical_anlysis/creator.py +++ /dev/null @@ -1,260 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
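The distillation objective behind the DistillBiGRU row above is the soft-label loss that `models/sequence_labeling/nets.py` (later in this patch) defines as `pred()`: the cross entropy between the teacher's softened emission distribution and the student's temperature-scaled log-probabilities. A minimal NumPy sketch of that arithmetic, for reference — the function name is illustrative, and the 57-label shape follows the `alpha shape (-1, 57)` comment in the deleted code:

```python
import numpy as np

def soft_label_distill_loss(student_logits, teacher_logits, t=1.0):
    """Mean of -softmax(teacher) * log_softmax(student / t), the same
    quantity computed by pred() in models/sequence_labeling/nets.py."""
    def log_softmax(x):
        x = x - x.max(axis=-1, keepdims=True)  # numerical stability
        return x - np.log(np.exp(x).sum(axis=-1, keepdims=True))

    teacher_prob = np.exp(log_softmax(teacher_logits))  # soft targets
    return float(np.mean(-teacher_prob * log_softmax(student_logits / t)))

# Toy check on [num_tokens, num_labels] emissions (57 labels, as in LAC):
# by Gibbs' inequality the loss is minimized when the student matches
# the teacher distribution exactly.
teacher = np.random.randn(4, 57)
student = np.random.randn(4, 57)
assert (soft_label_distill_loss(teacher, teacher)
        <= soft_label_distill_loss(student, teacher))
```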
-""" -Define the function to create lexical analysis model and model's data reader -""" -import sys -import os -import math -import numpy as np -import paddle -import paddle.fluid as fluid -from paddle.fluid.initializer import NormalInitializer - -from reader import Dataset -from ernie_reader import SequenceLabelReader - -from models.sequence_labeling import nets -from models.representation.ernie import ernie_encoder, ernie_pyreader - - -def create_model(args, vocab_size, num_labels, mode='train'): - """create lac model""" - - # model's input data - words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1) - targets = fluid.data( - name='targets', shape=[-1, 1], dtype='int64', lod_level=1) - if mode == "train": - print("create model mode: ", mode) - teacher_crf_decode = fluid.data( - name='teacher_crf_decode', shape=[-1, 1], dtype='float32', lod_level=1) - else: - print("create model mode: ", mode) - teacher_crf_decode = None - - feed_list = [words, targets] - if teacher_crf_decode: - feed_list.append(teacher_crf_decode) - - pyreader = fluid.io.DataLoader.from_generator( - feed_list=feed_list, - capacity=200, - use_double_buffer=True, - iterable=False) - # for test or train process - avg_cost, crf_avg_cost, teacher_cost, crf_decode= nets.lex_net( - words, args, vocab_size, num_labels, teacher_crf_decode,for_infer=False, target=targets) - - (precision, recall, f1_score, num_infer_chunks, num_label_chunks, - num_correct_chunks) = fluid.layers.chunk_eval( - input=crf_decode, - label=targets, - chunk_scheme="IOB", - num_chunk_types=int(math.ceil((num_labels - 1) / 2.0))) - chunk_evaluator = fluid.metrics.ChunkEvaluator() - chunk_evaluator.reset() - - ret = { - "pyreader": pyreader, - "words": words, - "targets": targets, - "avg_cost": avg_cost, - "crf_avg_cost": crf_avg_cost, - "teacher_cost": teacher_cost, - "crf_decode": crf_decode, - "precision": precision, - "recall": recall, - "f1_score": f1_score, - "chunk_evaluator": chunk_evaluator, - "num_infer_chunks": num_infer_chunks, - "num_label_chunks": num_label_chunks, - "num_correct_chunks": num_correct_chunks - } - return ret - -def create_lexnet_data_generator(args, - reader, - file_name, - place, - mode='train'): - if mode == 'train': - def wrapper(): - batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, [] - emi_lens = [] - for epoch in range(args.epoch): - print("data epoch: {}".format(epoch)) - for instance in reader.file_reader(file_name, mode="train")(): - words, labels, emission = instance - if len(seq_lens) < args.batch_size: - batch_words.append(words) - batch_labels.append(labels) - if batch_emissions is not None: - batch_emissions = np.concatenate((batch_emissions, emission)) - else: - batch_emissions = emission - seq_lens.append(len(words)) - emi_lens.append(emission.shape[0]) - if len(seq_lens) == args.batch_size: - - #print("batch words len", [len(seq) for seq in batch_words]) - #print("batch labels len", [len(seq) for seq in batch_labels]) - #print("emi lens:", emi_lens) - #print("emission first dim:", batch_emissions.shape[0]) - #print("reduced seq_lens:", sum(seq_lens)) - t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place) - t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place) - t_emissions = fluid.create_lod_tensor(batch_emissions, [seq_lens], place) - yield t_words, t_labels, t_emissions - batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, [] - emi_lens = [] - - if len(seq_lens) > 0: - t_words = fluid.create_lod_tensor(batch_words, 
[seq_lens], place) - t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place) - t_emissions = fluid.create_lod_tensor(batch_emissions, [seq_lens], place) - yield t_words, t_labels, t_emissions - batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, [] - - else: - def wrapper(): - batch_words, batch_labels, seq_lens = [], [], [] - for instance in reader.file_reader(file_name, mode="test")(): - words, labels = instance - if len(seq_lens) < args.batch_size: - batch_words.append(words) - batch_labels.append(labels) - seq_lens.append(len(words)) - if len(seq_lens) == args.batch_size: - t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place) - t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place) - yield t_words, t_labels - batch_words, batch_labels, seq_lens = [], [], [] - - if len(seq_lens) > 0: - t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place) - t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place) - yield t_words, t_labels - batch_words, batch_labels, seq_lens = [], [], [] - return wrapper - -def create_pyreader(args, - file_name, - feed_list, - place, - model='lac', - reader=None, - return_reader=False, - mode='train'): - reader = SequenceLabelReader( - vocab_path=args.vocab_path, - label_map_config=args.label_map_config, - max_seq_len=args.max_seq_len, - do_lower_case=args.do_lower_case, - random_seed=args.random_seed) - return reader.data_generator(file_name,args.batch_size,args.epoch,shuffle=False,phase="train") - - -def create_ernie_model(args, ernie_config): - """ - Create Model for LAC based on ERNIE encoder - """ - # ERNIE's input data - - src_ids = fluid.data( - name='src_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') - sent_ids = fluid.data( - name='sent_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') - pos_ids = fluid.data( - name='pos_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') - input_mask = fluid.data( - name='input_mask', shape=[-1, args.max_seq_len, 1], dtype='float32') - - padded_labels = fluid.data( - name='padded_labels', shape=[-1, args.max_seq_len, 1], dtype='int64') - - seq_lens = fluid.data( - name='seq_lens', shape=[-1], dtype='int64', lod_level=0) - - squeeze_labels = fluid.layers.squeeze(padded_labels, axes=[-1]) - - # ernie_pyreader - ernie_inputs = { - "src_ids": src_ids, - "sent_ids": sent_ids, - "pos_ids": pos_ids, - "input_mask": input_mask, - "seq_lens": seq_lens - } - embeddings = ernie_encoder(ernie_inputs, ernie_config=ernie_config) - - padded_token_embeddings = embeddings["padded_token_embeddings"] - - emission = fluid.layers.fc( - size=args.num_labels, - input=padded_token_embeddings, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-args.init_bound, high=args.init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4)), - num_flatten_dims=2) - - crf_cost = fluid.layers.linear_chain_crf( - input=emission, - label=padded_labels, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=args.crf_learning_rate), - length=seq_lens) - - avg_cost = fluid.layers.mean(x=crf_cost) - crf_decode = fluid.layers.crf_decoding( - input=emission, - param_attr=fluid.ParamAttr(name='crfw'), - length=seq_lens) - - (precision, recall, f1_score, num_infer_chunks, num_label_chunks, - num_correct_chunks) = fluid.layers.chunk_eval( - input=crf_decode, - label=squeeze_labels, - chunk_scheme="IOB", - num_chunk_types=int(math.ceil((args.num_labels - 1) / 2.0)), - seq_length=seq_lens) - chunk_evaluator = 
fluid.metrics.ChunkEvaluator() - chunk_evaluator.reset() - - ret = { - "feed_list": - [src_ids, sent_ids, pos_ids, input_mask, padded_labels, seq_lens], - "words": src_ids, - "pos_ids": pos_ids, - "sent_ids": sent_ids, - "input_mask": input_mask, - "labels": padded_labels, - "seq_lens": seq_lens, - "avg_cost": avg_cost, - "crf_decode": crf_decode, - "precision": precision, - "recall": recall, - "f1_score": f1_score, - "chunk_evaluator": chunk_evaluator, - "num_infer_chunks": num_infer_chunks, - "num_label_chunks": num_label_chunks, - "num_correct_chunks": num_correct_chunks, - "emission": emission, - "alpha": None - } - - return ret
diff --git a/demo/pantheon/lexical_anlysis/downloads.py b/demo/pantheon/lexical_anlysis/downloads.py deleted file mode 100644 index c0aae6ec..00000000 --- a/demo/pantheon/lexical_anlysis/downloads.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Download script: download the dataset and pretrained models. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import io -import os -import sys -import time -import hashlib -import tarfile -import requests - -FILE_INFO = { - 'BASE_URL': 'https://baidu-nlp.bj.bcebos.com/', - 'DATA': { - 'name': 'lexical_analysis-dataset-2.0.0.tar.gz', - 'md5': '71e4a9a36d0f0177929a1bccedca7dba' - }, - 'FINETURN_MODEL': { - 'name': 'lexical_analysis_finetuned-1.0.0.tar.gz', - 'md5': "ee2c7614b06dcfd89561fbbdaac34342" - }, - 'CONF': { - 'name': 'conf.tar.gz', - 'md5': "7a0fe28db46db496fff4361eebaa6515", - 'url': 'https://paddlemodels.bj.bcebos.com/PaddleSlim/pantheon/lexical_analysis/', - } -} - - -def usage(): - desc = ("\nDownload datasets and pretrained models for LAC.\n" - "Usage:\n" - " 1. python downloads.py all\n" - " 2. python downloads.py dataset\n" - " 3. python downloads.py finetuned\n" - " 4. python downloads.py conf\n") - print(desc) - - -def md5file(fname): - hash_md5 = hashlib.md5() - with io.open(fname, "rb") as fin: - for chunk in iter(lambda: fin.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() - - -def extract(fname, dir_path): - """ - Extract tar.gz file - """ - try: - tar = tarfile.open(fname, "r") - file_names = tar.getnames() - for file_name in file_names: - tar.extract(file_name, dir_path) - print(file_name) - tar.close() - except Exception as e: - raise e - - -def _download(url, filename, md5sum): - """ - Download file and check md5 - """ - retry = 0 - retry_limit = 3 - chunk_size = 4096 - while not (os.path.exists(filename) and md5file(filename) == md5sum): - if retry < retry_limit: - retry += 1 - else: - raise RuntimeError( - "Cannot download dataset ({0}) after {1} retries.".format( - url, retry_limit)) - try: - start = time.time() - size = 0 - res = requests.get(url, stream=True) - filesize = int(res.headers['content-length']) - if res.status_code == 200: - print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024)) - # save by chunk - with io.open(filename, "wb") as fout: - for chunk in res.iter_content(chunk_size=chunk_size): - if chunk: - fout.write(chunk) - size += len(chunk) - pr = '>' * int(size * 50 / filesize) - print( - '\r[Process ]: %s%.2f%%' % - (pr, float(size / filesize * 100)), - end='') - end = time.time() - print("\n[CostTime]: %.2f s" % (end - start)) - except Exception as e: - print(e) - - -def download(name, dir_path): - if name == 'CONF': - url = FILE_INFO[name]['url'] + FILE_INFO[name]['name'] - else: - url = FILE_INFO['BASE_URL'] + FILE_INFO[name]['name'] - file_path = os.path.join(dir_path, FILE_INFO[name]['name']) - - if not os.path.exists(dir_path): - os.makedirs(dir_path) - - # download data - print("Downloading : %s" % name) - _download(url, file_path, FILE_INFO[name]['md5']) - - # extract data - print("Extracting : %s" % file_path) - extract(file_path, dir_path) - os.remove(file_path) - - -if __name__ == '__main__': - if len(sys.argv) != 2: - usage() - sys.exit(1) - pwd = os.path.join(os.path.dirname(__file__), './') - ernie_dir = os.path.join(os.path.dirname(__file__), './pretrained') - - if sys.argv[1] == 'all': - download('DATA', pwd) - download('FINETURN_MODEL', pwd) - download('CONF', pwd) - elif sys.argv[1] == "dataset": - download('DATA', pwd) - elif sys.argv[1] == "finetuned": - download('FINETURN_MODEL', pwd) - elif sys.argv[1] == "conf": - download('CONF', pwd) - else: - usage() -
diff --git a/demo/pantheon/lexical_anlysis/ernie_reader.py b/demo/pantheon/lexical_anlysis/ernie_reader.py deleted file mode 100755 index 5e8b6e4b..00000000 --- a/demo/pantheon/lexical_anlysis/ernie_reader.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
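As a quick sanity check outside the demo, the md5-verify step that `_download()` above loops on can be reproduced in a few lines. A minimal sketch — the archive name and checksum are the `DATA` entry from `FILE_INFO` above, and the helper name is illustrative (note `downloads.py` deletes the archive after extracting, so run this on a manually fetched copy from `BASE_URL`):

```python
import hashlib

def md5_of(path, chunk_size=4096):
    # stream the file through MD5, same pattern as md5file() above
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "71e4a9a36d0f0177929a1bccedca7dba"
print(md5_of("lexical_analysis-dataset-2.0.0.tar.gz") == expected)
```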
-""" -This module provides reader for ernie model -""" - -import sys - -from collections import namedtuple -import numpy as np - -sys.path.append("..") -from preprocess.ernie.task_reader import BaseReader, tokenization - - -def pad_batch_data(insts, - pad_idx=0, - max_len=128, - return_pos=False, - return_input_mask=False, - return_max_len=False, - return_num_token=False, - return_seq_lens=False): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and input mask. - """ - return_list = [] - # max_len = max(len(inst) for inst in insts) - max_len = max_len - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. - - inst_data = np.array( - [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts]) - return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] - - # position data - if return_pos: - inst_pos = np.array([ - list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) - for inst in insts - ]) - - return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] - - if return_input_mask: - # This is used to avoid attention on paddings. - input_mask_data = np.array([[1] * len(inst) + [0] * - (max_len - len(inst)) for inst in insts]) - input_mask_data = np.expand_dims(input_mask_data, axis=-1) - return_list += [input_mask_data.astype("float32")] - - if return_max_len: - return_list += [max_len] - - if return_num_token: - num_token = 0 - for inst in insts: - num_token += len(inst) - return_list += [num_token] - - if return_seq_lens: - seq_lens = np.array([len(inst) for inst in insts]) - return_list += [seq_lens.astype("int64").reshape([-1])] - - return return_list if len(return_list) > 1 else return_list[0] - - -class SequenceLabelReader(BaseReader): - """SequenceLabelReader""" - - def _pad_batch_records(self, batch_records): - batch_token_ids = [record.token_ids for record in batch_records] - batch_text_type_ids = [record.text_type_ids for record in batch_records] - batch_position_ids = [record.position_ids for record in batch_records] - batch_label_ids = [record.label_ids for record in batch_records] - - # padding - padded_token_ids, input_mask, batch_seq_lens = pad_batch_data( - batch_token_ids, - max_len=self.max_seq_len, - pad_idx=self.pad_id, - return_input_mask=True, - return_seq_lens=True) - padded_text_type_ids = pad_batch_data( - batch_text_type_ids, max_len=self.max_seq_len, pad_idx=self.pad_id) - padded_position_ids = pad_batch_data( - batch_position_ids, max_len=self.max_seq_len, pad_idx=self.pad_id) - padded_label_ids = pad_batch_data( - batch_label_ids, - max_len=self.max_seq_len, - pad_idx=len(self.label_map) - 1) - - return_list = [ - padded_token_ids, padded_text_type_ids, padded_position_ids, - input_mask, padded_label_ids, batch_seq_lens - ] - return return_list - - def _reseg_token_label(self, tokens, labels, tokenizer): - assert len(tokens) == len(labels) - ret_tokens = [] - ret_labels = [] - for token, label in zip(tokens, labels): - sub_token = tokenizer.tokenize(token) - if len(sub_token) == 0: - continue - ret_tokens.extend(sub_token) - ret_labels.append(label) - if len(sub_token) < 2: - continue - sub_label = label - if label.startswith("B-"): - sub_label = "I-" + label[2:] - ret_labels.extend([sub_label] * (len(sub_token) - 1)) - - assert len(ret_tokens) == len(ret_labels) - return ret_tokens, ret_labels - - def _convert_example_to_record(self, example, max_seq_length, tokenizer): - 
tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")  # fields are separated by the \x02 control character - labels = tokenization.convert_to_unicode(example.label).split(u"\2") - tokens, labels = self._reseg_token_label(tokens, labels, tokenizer) - - if len(tokens) > max_seq_length - 2: - tokens = tokens[0:(max_seq_length - 2)] - labels = labels[0:(max_seq_length - 2)] - tokens = ["[CLS]"] + tokens + ["[SEP]"] - token_ids = tokenizer.convert_tokens_to_ids(tokens) - position_ids = list(range(len(token_ids))) - text_type_ids = [0] * len(token_ids) - no_entity_id = len(self.label_map) - 1 - labels = [ - label if label in self.label_map else u"O" for label in labels - ] - label_ids = [no_entity_id] + [ - self.label_map[label] for label in labels - ] + [no_entity_id] - - Record = namedtuple( - 'Record', - ['token_ids', 'text_type_ids', 'position_ids', 'label_ids']) - record = Record( - token_ids=token_ids, - text_type_ids=text_type_ids, - position_ids=position_ids, - label_ids=label_ids) - return record
diff --git a/demo/pantheon/lexical_anlysis/eval.py b/demo/pantheon/lexical_anlysis/eval.py deleted file mode 100755 index b7a9072b..00000000 --- a/demo/pantheon/lexical_anlysis/eval.py +++ /dev/null @@ -1,131 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time -import sys - -import paddle.fluid as fluid -import paddle - -import model_utils -import reader -import creator -sys.path.append('models/') -from model_check import check_cuda -from model_check import check_version - -parser = argparse.ArgumentParser(__doc__) -# 1. model parameters -model_g = model_utils.ArgumentGroup(parser, "model", "model configuration") -model_g.add_arg("word_emb_dim", int, 128, - "The dimension in which a word is embedded.") -model_g.add_arg("grnn_hidden_dim", int, 128, - "The number of hidden nodes in the GRNN layer.") -model_g.add_arg("bigru_num", int, 2, - "The number of bi_gru layers in the network.") -model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.") - -# 2. data parameters -data_g = model_utils.ArgumentGroup(parser, "data", "data paths") -data_g.add_arg("word_dict_path", str, "./conf/word.dic", - "The path of the word dictionary.") -data_g.add_arg("label_dict_path", str, "./conf/tag.dic", - "The path of the label dictionary.") -data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", - "The path of the word replacement dictionary.") -data_g.add_arg("test_data", str, "./data/test.tsv", - "The path of the file that contains the test data.") -data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model") -data_g.add_arg( - "batch_size", int, 200, - "The number of sequences contained in a mini-batch, " - "or the maximum number of tokens (including paddings) contained in a mini-batch."
-) - - -def do_eval(args): - print('do_eval...........') - dataset = reader.Dataset(args) - - test_program = fluid.Program() - with fluid.program_guard(test_program, fluid.default_startup_program()): - with fluid.unique_name.guard(): - test_ret = creator.create_model( - args, dataset.vocab_size, dataset.num_labels, mode='test') - test_program = test_program.clone(for_test=True) - - # init executor - if args.use_cuda: - place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) - else: - place = fluid.CPUPlace() - - pyreader = creator.create_pyreader( - args, - file_name=args.test_data, - feed_list=test_ret['feed_list'], - place=place, - model='lac', - reader=dataset, - mode='test') - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - # load model - model_utils.init_checkpoint(exe, args.init_checkpoint, test_program) - test_process( - exe=exe, program=test_program, reader=pyreader, test_ret=test_ret) - - -def test_process(exe, program, reader, test_ret): - """ - the function to execute the infer process - :param exe: the fluid Executor - :param program: the infer_program - :param reader: data reader - :return: the list of prediction result - """ - print('test_process...........') - test_ret["chunk_evaluator"].reset() - start_time = time.time() - reader.start() - while True: - try: - nums_infer, nums_label, nums_correct = exe.run( - program, - fetch_list=[ - test_ret["num_infer_chunks"], - test_ret["num_label_chunks"], - test_ret["num_correct_chunks"], - ]) - test_ret["chunk_evaluator"].update(nums_infer, nums_label, nums_correct) - except fluid.core.EOFException: - reader.reset() - break - - precision, recall, f1 = test_ret["chunk_evaluator"].eval() - end_time = time.time() - print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s" % - (precision, recall, f1, end_time - start_time)) - - -if __name__ == '__main__': - args = parser.parse_args() - check_cuda(args.use_cuda) - check_version() - do_eval(args) diff --git a/demo/pantheon/lexical_anlysis/model_utils.py b/demo/pantheon/lexical_anlysis/model_utils.py deleted file mode 100755 index d9f10b17..00000000 --- a/demo/pantheon/lexical_anlysis/model_utils.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
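For reference, the three counters that `test_process()` above fetches per batch combine into the reported metrics exactly as `fluid.metrics.ChunkEvaluator` computes them internally; a minimal sketch of that arithmetic (the helper name is illustrative):

```python
def chunk_prf1(num_infer, num_label, num_correct):
    """Chunk-level precision/recall/F1 from the chunk_eval counters."""
    precision = num_correct / num_infer if num_infer else 0.0
    recall = num_correct / num_label if num_label else 0.0
    # if num_correct > 0 then both precision and recall are nonzero
    f1 = 2 * precision * recall / (precision + recall) if num_correct else 0.0
    return precision, recall, f1

# e.g. 900 predicted chunks, 1000 gold chunks, 850 of them matching:
print("[test] P: %.5f, R: %.5f, F1: %.5f" % chunk_prf1(900, 1000, 850))
```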
-""" -util tools -""" -from __future__ import print_function -import os -import sys -import numpy as np -import paddle.fluid as fluid -import yaml -import io - - -def str2bool(v): - """ - argparse does not support True or False in python - """ - return v.lower() in ("true", "t", "1") - - -class ArgumentGroup(object): - """ - Put arguments to one group - """ - - def __init__(self, parser, title, des): - """none""" - self._group = parser.add_argument_group(title=title, description=des) - - def add_arg(self, name, type, default, help, **kwargs): - """ Add argument """ - type = str2bool if type == bool else type - self._group.add_argument( - "--" + name, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -def load_yaml(parser, file_name, **kwargs): - with io.open(file_name, 'r', encoding='utf8') as f: - args = yaml.load(f) - for title in args: - group = parser.add_argument_group(title=title, description='') - for name in args[title]: - _type = type(args[title][name]['val']) - _type = str2bool if _type == bool else _type - group.add_argument( - "--" + name, - default=args[title][name]['val'], - type=_type, - help=args[title][name]['meaning'] + - ' Default: %(default)s.', - **kwargs) - - -def print_arguments(args): - """none""" - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).items()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -def to_str(string, encoding="utf-8"): - """convert to str for print""" - if sys.version_info.major == 3: - if isinstance(string, bytes): - return string.decode(encoding) - elif sys.version_info.major == 2: - if isinstance(string, unicode): - if os.name == 'nt': - return string - else: - return string.encode(encoding) - return string - - -def to_lodtensor(data, place): - """ - Convert data in list into lodtensor. 
- """ - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.Tensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def parse_result(words, crf_decode, dataset): - """ parse result """ - offset_list = (crf_decode.lod())[0] - words = np.array(words) - crf_decode = np.array(crf_decode) - batch_size = len(offset_list) - 1 - - batch_out = [] - for sent_index in range(batch_size): - begin, end = offset_list[sent_index], offset_list[sent_index + 1] - sent = [dataset.id2word_dict[str(id[0])] for id in words[begin:end]] - tags = [ - dataset.id2label_dict[str(id[0])] for id in crf_decode[begin:end] - ] - - sent_out = [] - tags_out = [] - partial_word = "" - for ind, tag in enumerate(tags): - # for the first word - if partial_word == "": - partial_word = sent[ind] - tags_out.append(tag.split('-')[0]) - continue - - # for the beginning of word - if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"): - sent_out.append(partial_word) - tags_out.append(tag.split('-')[0]) - partial_word = sent[ind] - continue - - partial_word += sent[ind] - - # append the last word, except for len(tags)=0 - if len(sent_out) < len(tags_out): - sent_out.append(partial_word) - - batch_out.append([sent_out, tags_out]) - return batch_out - - -def parse_padding_result(words, crf_decode, seq_lens, dataset): - """ parse padding result """ - words = np.squeeze(words) - batch_size = len(seq_lens) - - batch_out = [] - for sent_index in range(batch_size): - - sent = [ - dataset.id2word_dict[str(id)] - for id in words[sent_index][1:seq_lens[sent_index] - 1] - ] - tags = [ - dataset.id2label_dict[str(id)] - for id in crf_decode[sent_index][1:seq_lens[sent_index] - 1] - ] - - sent_out = [] - tags_out = [] - partial_word = "" - for ind, tag in enumerate(tags): - # for the first word - if partial_word == "": - partial_word = sent[ind] - tags_out.append(tag.split('-')[0]) - continue - - # for the beginning of word - if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"): - sent_out.append(partial_word) - tags_out.append(tag.split('-')[0]) - partial_word = sent[ind] - continue - - partial_word += sent[ind] - - # append the last word, except for len(tags)=0 - if len(sent_out) < len(tags_out): - sent_out.append(partial_word) - - batch_out.append([sent_out, tags_out]) - return batch_out - - -def init_checkpoint(exe, init_checkpoint_path, main_program): - """ - Init CheckPoint - """ - assert os.path.exists( - init_checkpoint_path), "[%s] can't be found." % init_checkpoint_path - - def existed_persistables(var): - """ - Whether var is persistable and exists under the checkpoint path. - """ - if not fluid.io.is_persistable(var): - return False - if os.path.exists(os.path.join(init_checkpoint_path, var.name)): - print("INIT {}".format(var.name)) - return True - else: - print("SKIP {}".format(var.name)) - return False - - fluid.io.load_vars( - exe, - init_checkpoint_path, - main_program=main_program, - predicate=existed_persistables) - print("Load model from {}".format(init_checkpoint_path)) - - -def init_pretraining_params(exe, - pretraining_params_path, - main_program, - use_fp16=False): - """load params of pretrained model, NOT including moment, learning_rate""" - assert os.path.exists(pretraining_params_path - ), "[%s] can't be found."
% pretraining_params_path - - def _existed_params(var): - if not isinstance(var, fluid.framework.Parameter): - return False - if os.path.exists(os.path.join(pretraining_params_path, var.name)): - print("INIT {}".format(var.name)) - return True - else: - print("SKIP {}".format(var.name)) - return False - - fluid.io.load_vars( - exe, - pretraining_params_path, - main_program=main_program, - predicate=_existed_params) - print("Load pretraining parameters from {}.".format( - pretraining_params_path))
diff --git a/demo/pantheon/lexical_anlysis/models/__init__.py b/demo/pantheon/lexical_anlysis/models/__init__.py deleted file mode 100755 index e69de29b..00000000
diff --git a/demo/pantheon/lexical_anlysis/models/model_check.py b/demo/pantheon/lexical_anlysis/models/model_check.py deleted file mode 100755 index 51713452..00000000 --- a/demo/pantheon/lexical_anlysis/models/model_check.py +++ /dev/null @@ -1,73 +0,0 @@ -#encoding=utf8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import paddle -import paddle.fluid as fluid - - -def check_cuda(use_cuda, err = \ - "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \ - Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n" - ): - """ - Log error and exit when use_cuda=True is set with a CPU-only - paddlepaddle build. - """ - try: - if use_cuda == True and fluid.is_compiled_with_cuda() == False: - print(err) - sys.exit(1) - except Exception as e: - pass - -def check_version(): - """ - Log error and exit when the installed version of paddlepaddle is - not satisfied. - """ - err = "PaddlePaddle version 1.6 or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ - "Please make sure the version is good with your code." \ - - try: - fluid.require_version('1.6.0') - except Exception as e: - print(err) - sys.exit(1) - - -if __name__ == "__main__": - check_cuda(True) - - check_cuda(False) - - check_cuda(True, "This is only for testing.")
diff --git a/demo/pantheon/lexical_anlysis/models/representation/__init__.py b/demo/pantheon/lexical_anlysis/models/representation/__init__.py deleted file mode 100755 index e69de29b..00000000
diff --git a/demo/pantheon/lexical_anlysis/models/representation/ernie.py b/demo/pantheon/lexical_anlysis/models/representation/ernie.py deleted file mode 100755 index ced3196f..00000000 --- a/demo/pantheon/lexical_anlysis/models/representation/ernie.py +++ /dev/null @@ -1,322 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This module provides ErnieModel and ErnieConfig -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import json - -import six -import paddle.fluid as fluid -import paddlehub as hub  # required by ernie_encoder_with_paddle_hub() below - -from models.transformer_encoder import encoder, pre_process_layer - - -def ernie_pyreader(args, pyreader_name): - """define standard ernie pyreader""" - src_ids = fluid.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64') - sent_ids = fluid.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64') - pos_ids = fluid.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64') - input_mask = fluid.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='float32') - labels = fluid.data(name='5', shape=[-1, 1], dtype='int64') - seq_lens = fluid.data(name='6', shape=[-1], dtype='int64') - - pyreader = fluid.io.DataLoader.from_generator( - feed_list=[src_ids, sent_ids, pos_ids, input_mask, labels, seq_lens], - capacity=50, - iterable=False, - use_double_buffer=True) - - ernie_inputs = { - "src_ids": src_ids, - "sent_ids": sent_ids, - "pos_ids": pos_ids, - "input_mask": input_mask, - "seq_lens": seq_lens - } - return pyreader, ernie_inputs, labels - - -def ernie_encoder_with_paddle_hub(ernie_inputs, max_seq_len): - ernie = hub.Module(name="ernie") - inputs, outputs, program = ernie.context( - trainable=True, max_seq_len=max_seq_len, learning_rate=1) - - main_program = fluid.default_main_program() - input_dict = { - inputs["input_ids"].name: ernie_inputs["src_ids"], - inputs["segment_ids"].name: ernie_inputs["sent_ids"], - inputs["position_ids"].name: ernie_inputs["pos_ids"], - inputs["input_mask"].name: ernie_inputs["input_mask"] - } - - hub.connect_program( - pre_program=main_program, - next_program=program, - input_dict=input_dict, - inplace=True) - - enc_out = outputs["sequence_output"] - unpad_enc_out = fluid.layers.sequence_unpad( - enc_out, length=ernie_inputs["seq_lens"]) - cls_feats = outputs["pooled_output"] - - embeddings = { - "sentence_embeddings": cls_feats, - "token_embeddings": unpad_enc_out, - "padded_token_embeddings": enc_out - } - - for k, v in embeddings.items(): - v.persistable = True - - return embeddings - - -def ernie_encoder(ernie_inputs, ernie_config): - """return sentence embedding and token embeddings""" - - ernie = ErnieModel( - src_ids=ernie_inputs["src_ids"], - position_ids=ernie_inputs["pos_ids"], - sentence_ids=ernie_inputs["sent_ids"], - input_mask=ernie_inputs["input_mask"], - config=ernie_config) - - enc_out = ernie.get_sequence_output() - unpad_enc_out = fluid.layers.sequence_unpad( - enc_out, length=ernie_inputs["seq_lens"]) - cls_feats = ernie.get_pooled_output() - - embeddings = { - "sentence_embeddings": cls_feats, - "token_embeddings": unpad_enc_out, - "padded_token_embeddings": enc_out - } - - for k, v in embeddings.items(): - v.persistable = True - - return embeddings - - -class ErnieConfig(object): - """ErnieConfig""" - - def __init__(self, config_path):
self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing Ernie model config file '%s'" % - config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - """print config""" - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class ErnieModel(object): - """ErnieModel""" - - def __init__(self, - src_ids, - position_ids, - sentence_ids, - input_mask, - config, - weight_sharing=True, - use_fp16=False): - - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self._weight_sharing = weight_sharing - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. - self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range']) - - self._build_model(src_ids, position_ids, sentence_ids, input_mask) - - def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): - # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding( - input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr( - name=self._word_emb_name, initializer=self._param_initializer), - is_sparse=False) - position_emb_out = fluid.layers.embedding( - input=position_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr( - name=self._pos_emb_name, initializer=self._param_initializer)) - - sent_emb_out = fluid.layers.embedding( - sentence_ids, - size=[self._sent_types, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr( - name=self._sent_emb_name, initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer( - emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') - - if self._dtype == "float16": - input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) - self_attn_mask = fluid.layers.matmul( - x=input_mask, y=input_mask, transpose_y=True) - - self_attn_mask = fluid.layers.scale( - x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack( - x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder( - enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - 
hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name='encoder') - - def get_sequence_output(self): - """Get embedding of each token for squence labeling""" - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - next_sent_feat = fluid.layers.slice( - input=self._enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.fc( - input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr( - name="pooled_fc.w_0", initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_pretraining_output(self, mask_label, mask_pos, labels): - """Get the loss & accuracy for pretraining""" - - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - next_sent_feat = self.get_pooled_output() - reshaped_emb_out = fluid.layers.reshape( - x=self._enc_out, shape=[-1, self._emb_size]) - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc( - input=mask_feat, - size=self._emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr( - name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer( - mask_trans_feat, 'n', name='mask_lm_trans') - - mask_lm_out_bias_attr = fluid.ParamAttr( - name="mask_lm_out_fc.b_0", - initializer=fluid.initializer.Constant(value=0.0)) - if self._weight_sharing: - fc_out = fluid.layers.matmul( - x=mask_trans_feat, - y=fluid.default_main_program().global_block().var( - self._word_emb_name), - transpose_y=True) - fc_out += fluid.layers.create_parameter( - shape=[self._voc_size], - dtype=self._dtype, - attr=mask_lm_out_bias_attr, - is_bias=True) - - else: - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr( - name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - - mask_lm_loss = fluid.layers.softmax_with_cross_entropy( - logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) - - next_sent_fc_out = fluid.layers.fc( - input=next_sent_feat, - size=2, - param_attr=fluid.ParamAttr( - name="next_sent_fc.w_0", initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - - next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy( - logits=next_sent_fc_out, label=labels, return_softmax=True) - - next_sent_acc = fluid.layers.accuracy( - input=next_sent_softmax, label=labels) - - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) - - loss = mean_next_sent_loss + mean_mask_lm_loss - return next_sent_acc, mean_mask_lm_loss, loss diff --git a/demo/pantheon/lexical_anlysis/models/sequence_labeling/__init__.py b/demo/pantheon/lexical_anlysis/models/sequence_labeling/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py b/demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py deleted file mode 100755 index 414e89b0..00000000 --- a/demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -The function lex_net(args) defines the lexical analysis network structure -""" -import sys -import os -import math - -import paddle.fluid as fluid -from paddle.fluid.initializer import NormalInitializer - - -def lex_net(word, args, vocab_size, num_labels, teacher_crf_decode=None, for_infer=True, target=None): - """ - define the lexical analysis network structure - word: stores the input of the model - for_infer: a boolean value, indicating if the model to be created is for training or predicting. - - return: - for infer: return the CRF decoding (prediction) only - otherwise: return the combined loss, the CRF loss, the distillation loss, and the CRF decoding - """ - word_emb_dim = args.word_emb_dim - grnn_hidden_dim = args.grnn_hidden_dim - emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(args) else 1.0 - crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(args) else 1.0 - bigru_num = args.bigru_num - init_bound = 0.1 - IS_SPARSE = True - - def _bigru_layer(input_feature): - """ - define the bidirectional gru layer - """ - pre_gru = fluid.layers.fc( - input=input_feature, - size=grnn_hidden_dim * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - gru = fluid.layers.dynamic_gru( - input=pre_gru, - size=grnn_hidden_dim, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - - pre_gru_r = fluid.layers.fc( - input=input_feature, - size=grnn_hidden_dim * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - gru_r = fluid.layers.dynamic_gru( - input=pre_gru_r, - size=grnn_hidden_dim, - is_reverse=True, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - - bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1) - return bi_merge - - def log_softmax(logits, axis=-1): - logsoftmax = logits - fluid.layers.log(fluid.layers.reduce_sum(fluid.layers.exp(logits), axis)) - return logsoftmax - - def cross_entropy(student, teacher): - ce_loss = -1.0 * fluid.layers.reduce_sum(teacher * fluid.layers.log(student), dim=1) - ce_loss = fluid.layers.sequence_pool(ce_loss, "sum") - return ce_loss - - def kl_div(student, teacher): - ce_loss = fluid.layers.reduce_sum(teacher * (fluid.layers.log(teacher) - fluid.layers.log(student)), dim=1) - ce_loss = fluid.layers.sequence_pool(ce_loss, "sum") - return ce_loss - - def pred(student, teacher, t=1.0): - # soft-label cross entropy between the teacher's probabilities and the - # student's temperature-scaled log-probabilities - return fluid.layers.reduce_mean(-1.0 * fluid.layers.softmax(teacher) * log_softmax(student / t)) - - def normalize(alpha): - """ alpha shape (-1, 57) - """ - tag_num = alpha.shape[1] - sum_alpha = fluid.layers.reduce_sum(alpha, dim=1) -
sum_alpha = fluid.layers.unsqueeze(sum_alpha, axes=[1]) - sum_alpha = fluid.layers.expand(sum_alpha, [1, tag_num]) - norm_alpha = alpha / sum_alpha - return norm_alpha - - def _net_conf(word, target=None): - """ - Configure the network - """ - word_embedding = fluid.embedding( - input=word, - size=[vocab_size, word_emb_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr=fluid.ParamAttr( - learning_rate=emb_lr, - name="word_emb", - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound))) - - input_feature = word_embedding - for i in range(bigru_num): - bigru_output = _bigru_layer(input_feature) - input_feature = bigru_output - - emission = fluid.layers.fc( - size=num_labels, - input=bigru_output, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - - if target is not None: - crf_cost = fluid.layers.linear_chain_crf( - input=emission, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=crf_lr)) - if teacher_crf_decode is not None: - teacher_cost = pred(student=emission, teacher=teacher_crf_decode,t=1.0) - else: - teacher_cost = 0 - print('no teacher emission') - crf_avg_cost = fluid.layers.mean(x=crf_cost) - alpha, beta = 0.5, 0.5 - print("alpha * crf_avg_cost + beta * teacher_cost: ", alpha, beta) - avg_cost = alpha * crf_avg_cost+ beta * teacher_cost - crf_decode = fluid.layers.crf_decoding( - input=emission, param_attr=fluid.ParamAttr(name='crfw')) - return avg_cost, crf_avg_cost, teacher_cost, crf_decode - - else: - size = emission.shape[1] - fluid.layers.create_parameter( - shape=[size + 2, size], dtype=emission.dtype, name='crfw') - crf_decode = fluid.layers.crf_decoding( - input=emission, param_attr=fluid.ParamAttr(name='crfw')) - - return crf_decode - - if for_infer: - return _net_conf(word) - - else: - # assert target != None, "target is necessary for training" - return _net_conf(word, target) diff --git a/demo/pantheon/lexical_anlysis/models/transformer_encoder.py b/demo/pantheon/lexical_anlysis/models/transformer_encoder.py deleted file mode 100755 index 77908896..00000000 --- a/demo/pantheon/lexical_anlysis/models/transformer_encoder.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer encoder.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. 
Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError( - "Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. - """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_query_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_key_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_value_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape( - x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: - return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape( - x=trans_x, - shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout( - weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. 
- k = cache["k"] = layers.concat( - [layers.reshape( - cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat( - [layers.reshape( - cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, - dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. - proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_output_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - return proj_out - - -def positionwise_feed_forward(x, - d_inner_hid, - d_hid, - dropout_rate, - hidden_act, - param_initializer=None, - name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr( - name=name + '_fc_0.w_0', - initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - if dropout_rate: - hidden = layers.dropout( - hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., - name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out_dtype = out.dtype - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float32") - out = layers.layer_norm( - out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr( - name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.))) - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float16") - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout( - out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. 
- """ - attn_output = multi_head_attention( - pre_process_layer( - enc_input, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_att'), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - attn_output = post_process_layer( - enc_input, - attn_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_att') - ffd_output = positionwise_feed_forward( - pre_process_layer( - attn_output, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_ffn'), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - return post_process_layer( - attn_output, - ffd_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_ffn') - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i)) - enc_input = enc_output - enc_output = pre_process_layer( - enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") - - return enc_output diff --git a/demo/pantheon/lexical_anlysis/preprocess/__init__.py b/demo/pantheon/lexical_anlysis/preprocess/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/demo/pantheon/lexical_anlysis/preprocess/ernie/__init__.py b/demo/pantheon/lexical_anlysis/preprocess/ernie/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py b/demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py deleted file mode 100644 index b3a8a0d7..00000000 --- a/demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py +++ /dev/null @@ -1,392 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
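For reference, the masking scheme used by `multi_head_attention` above (adding `attn_bias` to the logits before the softmax) is easy to check outside Paddle. Below is a minimal numpy sketch; the shapes and names are illustrative only and are not taken from this repo:

```python
import numpy as np

def masked_attention(q, k, v, attn_bias):
    # Scaled dot-product attention with an additive bias mask, mirroring
    # how attn_bias is added to the attention logits before softmax above.
    d_key = q.shape[-1]
    logits = q @ k.T / np.sqrt(d_key) + attn_bias
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)  # row-wise softmax
    return weights @ v

q = k = v = np.random.rand(3, 4).astype("float32")
bias = np.zeros((3, 3), dtype="float32")
bias[:, 2] = -1e9  # mask the third position for every query
out = masked_attention(q, k, v, bias)
# weights[:, 2] collapse to ~0, so position 2 contributes nothing to `out`
```

A bias of `-1e9` drives the post-softmax weight of a masked position to effectively zero, which is exactly why masked positions are not considered in the attention weights.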
-""" -This module provides reader for classification and sequence labing -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from collections import namedtuple -import csv -import json - -import numpy as np - -from preprocess.ernie import tokenization -from preprocess.padding import pad_batch_data -import io - -def csv_reader(fd, delimiter='\t'): - def gen(): - for i in fd: - slots = i.rstrip('\n').split(delimiter) - if len(slots) == 1: - yield slots, - else: - yield slots - return gen() - -class BaseReader(object): - """BaseReader for classify and sequence labeling task""" - - def __init__(self, - vocab_path, - label_map_config=None, - max_seq_len=512, - do_lower_case=True, - in_tokens=False, - random_seed=None): - self.max_seq_len = max_seq_len - self.tokenizer = tokenization.FullTokenizer( - vocab_file=vocab_path, do_lower_case=do_lower_case) - self.vocab = self.tokenizer.vocab - self.pad_id = self.vocab["[PAD]"] - self.cls_id = self.vocab["[CLS]"] - self.sep_id = self.vocab["[SEP]"] - self.in_tokens = in_tokens - - np.random.seed(random_seed) - - self.current_example = 0 - self.current_epoch = 0 - self.num_examples = 0 - - if label_map_config: - with open(label_map_config) as f: - self.label_map = json.load(f) - else: - self.label_map = None - - def get_train_progress(self): - """Gets progress for training phase.""" - return self.current_example, self.current_epoch - - def _read_tsv(self, input_file, quotechar=None): - """Reads a tab separated value file.""" - with io.open(input_file, "r", encoding="utf8") as f: - reader = csv_reader(f, delimiter="\t") - headers = next(reader) - Example = namedtuple('Example', headers) - - examples = [] - for line in reader: - example = Example(*line) - examples.append(example) - return examples - - def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - def _convert_example_to_record(self, example, max_seq_length, tokenizer): - """Converts a single `Example` into a single `Record`.""" - - text_a = tokenization.convert_to_unicode(example.text_a) - tokens_a = tokenizer.tokenize(text_a) - tokens_b = None - if "text_b" in example._fields: - text_b = tokenization.convert_to_unicode(example.text_b) - tokens_b = tokenizer.tokenize(text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] - - # The convention in BERT/ERNIE is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . 
[SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - text_type_ids = [] - tokens.append("[CLS]") - text_type_ids.append(0) - for token in tokens_a: - tokens.append(token) - text_type_ids.append(0) - tokens.append("[SEP]") - text_type_ids.append(0) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - text_type_ids.append(1) - tokens.append("[SEP]") - text_type_ids.append(1) - - token_ids = tokenizer.convert_tokens_to_ids(tokens) - position_ids = list(range(len(token_ids))) - - if self.label_map: - label_id = self.label_map[example.label] - else: - label_id = example.label - - Record = namedtuple( - 'Record', - ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid']) - - qid = None - if "qid" in example._fields: - qid = example.qid - - record = Record( - token_ids=token_ids, - text_type_ids=text_type_ids, - position_ids=position_ids, - label_id=label_id, - qid=qid) - return record - - def _prepare_batch_data(self, examples, batch_size, phase=None): - """generate batch records""" - batch_records, max_len = [], 0 - for index, example in enumerate(examples): - if phase == "train": - self.current_example = index - record = self._convert_example_to_record(example, self.max_seq_len, - self.tokenizer) - max_len = max(max_len, len(record.token_ids)) - if self.in_tokens: - to_append = (len(batch_records) + 1) * max_len <= batch_size - else: - to_append = len(batch_records) < batch_size - if to_append: - batch_records.append(record) - else: - yield self._pad_batch_records(batch_records) - batch_records, max_len = [record], len(record.token_ids) - - if batch_records: - yield self._pad_batch_records(batch_records) - - def get_num_examples(self, input_file): - """return total number of examples""" - examples = self._read_tsv(input_file) - return len(examples) - - def data_generator(self, - input_file, - batch_size, - epoch, - shuffle=True, - phase=None): - """return generator which yields batch data for pyreader""" - examples = self._read_tsv(input_file) - - def _wrapper(): - for epoch_index in range(epoch): - if phase == "train": - self.current_example = 0 - self.current_epoch = epoch_index - if shuffle: - np.random.shuffle(examples) - - for batch_data in self._prepare_batch_data( - examples, batch_size, phase=phase): - yield batch_data - - return _wrapper - - -class ClassifyReader(BaseReader): - """ClassifyReader""" - - def _read_tsv(self, input_file, quotechar=None): - """Reads a tab separated value file.""" - with io.open(input_file, "r", encoding="utf8") as f: - reader = csv_reader(f, delimiter="\t") - headers = next(reader) - text_indices = [ - index for index, h in enumerate(headers) if h != "label" - ] - Example = namedtuple('Example', headers) - - examples = [] - for line in reader: - for index, text in enumerate(line): - if index in text_indices: - line[index] = text.replace(' ', '') - example = Example(*line) - examples.append(example) - return 
examples - - def _pad_batch_records(self, batch_records): - batch_token_ids = [record.token_ids for record in batch_records] - batch_text_type_ids = [record.text_type_ids for record in batch_records] - batch_position_ids = [record.position_ids for record in batch_records] - batch_labels = [record.label_id for record in batch_records] - batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1]) - - # padding - padded_token_ids, input_mask, seq_lens = pad_batch_data( - batch_token_ids, - pad_idx=self.pad_id, - return_input_mask=True, - return_seq_lens=True) - padded_text_type_ids = pad_batch_data( - batch_text_type_ids, pad_idx=self.pad_id) - padded_position_ids = pad_batch_data( - batch_position_ids, pad_idx=self.pad_id) - - return_list = [ - padded_token_ids, padded_text_type_ids, padded_position_ids, - input_mask, batch_labels, seq_lens - ] - - return return_list - - -class SequenceLabelReader(BaseReader): - """SequenceLabelReader""" - - def _pad_batch_records(self, batch_records): - batch_token_ids = [record.token_ids for record in batch_records] - batch_text_type_ids = [record.text_type_ids for record in batch_records] - batch_position_ids = [record.position_ids for record in batch_records] - batch_label_ids = [record.label_ids for record in batch_records] - - # padding - padded_token_ids, input_mask, batch_seq_lens = pad_batch_data( - batch_token_ids, - pad_idx=self.pad_id, - return_input_mask=True, - return_seq_lens=True) - padded_text_type_ids = pad_batch_data( - batch_text_type_ids, pad_idx=self.pad_id) - padded_position_ids = pad_batch_data( - batch_position_ids, pad_idx=self.pad_id) - padded_label_ids = pad_batch_data( - batch_label_ids, pad_idx=len(self.label_map) - 1) - - return_list = [ - padded_token_ids, padded_text_type_ids, padded_position_ids, - input_mask, padded_label_ids, batch_seq_lens - ] - return return_list - - def _reseg_token_label(self, tokens, labels, tokenizer): - assert len(tokens) == len(labels) - ret_tokens = [] - ret_labels = [] - for token, label in zip(tokens, labels): - sub_token = tokenizer.tokenize(token) - if len(sub_token) == 0: - continue - ret_tokens.extend(sub_token) - ret_labels.append(label) - if len(sub_token) < 2: - continue - sub_label = label - if label.startswith("B-"): - sub_label = "I-" + label[2:] - ret_labels.extend([sub_label] * (len(sub_token) - 1)) - - assert len(ret_tokens) == len(ret_labels) - return ret_tokens, ret_labels - - def _convert_example_to_record(self, example, max_seq_length, tokenizer): - tokens = tokenization.convert_to_unicode(example.text_a).split(u"") - labels = tokenization.convert_to_unicode(example.label).split(u"") - tokens, labels = self._reseg_token_label(tokens, labels, tokenizer) - - if len(tokens) > max_seq_length - 2: - tokens = tokens[0:(max_seq_length - 2)] - labels = labels[0:(max_seq_length - 2)] - - tokens = ["[CLS]"] + tokens + ["[SEP]"] - token_ids = tokenizer.convert_tokens_to_ids(tokens) - position_ids = list(range(len(token_ids))) - text_type_ids = [0] * len(token_ids) - no_entity_id = len(self.label_map) - 1 - labels = [ - label if label in self.label_map else u"O" for label in labels - ] - label_ids = [no_entity_id] + [ - self.label_map[label] for label in labels - ] + [no_entity_id] - - Record = namedtuple( - 'Record', - ['token_ids', 'text_type_ids', 'position_ids', 'label_ids']) - record = Record( - token_ids=token_ids, - text_type_ids=text_type_ids, - position_ids=position_ids, - label_ids=label_ids) - return record - - -class ExtractEmbeddingReader(BaseReader): - 
"""ExtractEmbeddingReader""" - - def _pad_batch_records(self, batch_records): - batch_token_ids = [record.token_ids for record in batch_records] - batch_text_type_ids = [record.text_type_ids for record in batch_records] - batch_position_ids = [record.position_ids for record in batch_records] - - # padding - padded_token_ids, input_mask, seq_lens = pad_batch_data( - batch_token_ids, - pad_idx=self.pad_id, - return_input_mask=True, - return_seq_lens=True) - padded_text_type_ids = pad_batch_data( - batch_text_type_ids, pad_idx=self.pad_id) - padded_position_ids = pad_batch_data( - batch_position_ids, pad_idx=self.pad_id) - - return_list = [ - padded_token_ids, padded_text_type_ids, padded_position_ids, - input_mask, seq_lens - ] - - return return_list - - -if __name__ == '__main__': - pass diff --git a/demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py b/demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py deleted file mode 100644 index 2a06a581..00000000 --- a/demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py +++ /dev/null @@ -1,370 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import unicodedata -import six -import io - -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - elif isinstance(text, unicode): - return text - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def printable_text(text): - """Returns text encoded in a way suitable for print or `tf.logging`.""" - - # These functions want `str` for both Python2 and Python3, but in one case - # it's a Unicode string and in the other it's a byte string. 
- if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text - elif isinstance(text, unicode): - return text.encode("utf-8") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - fin = io.open(vocab_file, encoding="utf8") - for num, line in enumerate(fin): - items = convert_to_unicode(line.strip()).split("\t") - if len(items) > 2: - break - token = items[0] - index = items[1] if len(items) == 2 else num - token = token.strip() - vocab[token] = int(index) - return vocab - - -def convert_by_vocab(vocab, items): - """Converts a sequence of [tokens|ids] using the vocab.""" - output = [] - for item in items: - output.append(vocab[item]) - return output - - -def convert_tokens_to_ids(vocab, tokens): - return convert_by_vocab(vocab, tokens) - - -def convert_ids_to_tokens(inv_vocab, ids): - return convert_by_vocab(inv_vocab, ids) - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a peice of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class FullTokenizer(object): - """Runs end-to-end tokenziation.""" - - def __init__(self, vocab_file, do_lower_case=True): - self.vocab = load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - - return split_tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab(self.vocab, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab(self.inv_vocab, ids) - - -class CharTokenizer(object): - """Runs end-to-end tokenziation.""" - - def __init__(self, vocab_file, do_lower_case=True): - self.vocab = load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - - def tokenize(self, text): - split_tokens = [] - for token in text.lower().split(" "): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - - return split_tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab(self.vocab, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab(self.inv_vocab, ids) - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, do_lower_case=True): - """Constructs a BasicTokenizer. - - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = convert_to_unicode(text) - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. 
This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - text = self._tokenize_chinese_chars(text) - - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenziation.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. 
This should have - already been passed through `BasicTokenizer. - - Returns: - A list of wordpiece tokens. - """ - - text = convert_to_unicode(text) - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False diff --git a/demo/pantheon/lexical_anlysis/preprocess/padding.py b/demo/pantheon/lexical_anlysis/preprocess/padding.py deleted file mode 100644 index 82171e68..00000000 --- a/demo/pantheon/lexical_anlysis/preprocess/padding.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Mask, padding and batching. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - - -def pad_batch_data(insts, - pad_idx=0, - return_pos=False, - return_input_mask=False, - return_max_len=False, - return_num_token=False, - return_seq_lens=False): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and input mask. 
- """ - return_list = [] - max_len = max(len(inst) for inst in insts) - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. - - inst_data = np.array( - [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts]) - return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] - - # position data - if return_pos: - inst_pos = np.array([ - list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) - for inst in insts - ]) - - return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] - - if return_input_mask: - # This is used to avoid attention on paddings. - input_mask_data = np.array([[1] * len(inst) + [0] * - (max_len - len(inst)) for inst in insts]) - input_mask_data = np.expand_dims(input_mask_data, axis=-1) - return_list += [input_mask_data.astype("float32")] - - if return_max_len: - return_list += [max_len] - - if return_num_token: - num_token = 0 - for inst in insts: - num_token += len(inst) - return_list += [num_token] - - if return_seq_lens: - seq_lens = np.array([len(inst) for inst in insts]) - return_list += [seq_lens.astype("int64").reshape([-1])] - - return return_list if len(return_list) > 1 else return_list[0] - - -if __name__ == "__main__": - pass diff --git a/demo/pantheon/lexical_anlysis/reader.py b/demo/pantheon/lexical_anlysis/reader.py deleted file mode 100644 index 11958919..00000000 --- a/demo/pantheon/lexical_anlysis/reader.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -The file_reader converts raw corpus to input. 
-""" - -import os -import argparse -import __future__ -import io -import glob -from paddleslim.pantheon import Student -import random -import numpy as np -import six - -def load_kv_dict(dict_path, - reverse=False, - delimiter="\t", - key_func=None, - value_func=None): - """ - Load key-value dict from file - """ - result_dict = {} - for line in io.open(dict_path, "r", encoding='utf8'): - terms = line.strip("\n").split(delimiter) - if len(terms) != 2: - continue - if reverse: - value, key = terms - else: - key, value = terms - if key in result_dict: - raise KeyError("key duplicated with [%s]" % (key)) - if key_func: - key = key_func(key) - if value_func: - value = value_func(value) - result_dict[key] = value - return result_dict - - -class Dataset(object): - """data reader""" - - def __init__(self, args, mode="train"): - # read dict - self.word2id_dict = load_kv_dict( - args.word_dict_path, reverse=True, value_func=int) - self.id2word_dict = load_kv_dict(args.word_dict_path) - self.label2id_dict = load_kv_dict( - args.label_dict_path, reverse=True, value_func=int) - self.id2label_dict = load_kv_dict(args.label_dict_path) - self.word_replace_dict = load_kv_dict(args.word_rep_dict_path) - self._student = Student() - self._student.register_teacher(in_address=args.in_address) - self._student.start() - self._know_desc = self._student.get_knowledge_desc() - self._know_data_generator = self._student.get_knowledge_generator(batch_size=1, drop_last=False)() - self._train_shuffle_buf_size = args.traindata_shuffle_buffer - - @property - def vocab_size(self): - """vocabuary size""" - return max(self.word2id_dict.values()) + 1 - - @property - def num_labels(self): - """num_labels""" - return max(self.label2id_dict.values()) + 1 - - def get_num_examples(self, filename): - """num of line of file""" - return sum(1 for line in io.open(filename, "r", encoding='utf8')) - - def word_to_ids(self, words): - """convert word to word index""" - word_ids = [] - for word in words: - word = self.word_replace_dict.get(word, word) - if word not in self.word2id_dict: - word = "OOV" - word_id = self.word2id_dict[word] - word_ids.append(word_id) - - return word_ids - - def label_to_ids(self, labels): - """convert label to label index""" - label_ids = [] - for label in labels: - if label not in self.label2id_dict: - label = "O" - label_id = self.label2id_dict[label] - label_ids.append(label_id) - return label_ids - - def file_reader(self, filename, max_seq_len=126, mode="train"): - """ - yield (word_idx, target_idx, teacher_emission) one by one from file, - or yield (word_idx, ) in `infer` mode - """ - - def wrapper(): - invalid_samples = 0 - fread = io.open(filename, "r", encoding="utf-8") - if mode == "infer": - for line in fread: - words = line.strip() - word_ids = self.word_to_ids(words) - yield (word_ids[0:max_seq_len], ) - elif mode == "test": - headline = next(fread) - headline = headline.strip().split('\t') - assert len(headline) == 2 and headline[ - 0] == "text_a" and headline[1] == "label" - for line in fread: - words, labels = line.strip("\n").split("\t") - if len(words) < 1: - continue - word_ids = self.word_to_ids(words.split("\002")) - label_ids = self.label_to_ids(labels.split("\002")) - assert len(word_ids) == len(label_ids) - yield word_ids[0:max_seq_len], label_ids[0:max_seq_len] - else: - headline = next(fread) - headline = headline.strip().split('\t') - assert len(headline) == 2 and headline[ - 0] == "text_a" and headline[1] == "label" - buf = [] - for line in fread: - words, labels = 
line.strip("\n").split("\t") - if len(words) < 1: - continue - word_ids = self.word_to_ids(words.split("\002")) - label_ids = self.label_to_ids(labels.split("\002")) - if six.PY2: - know_data = self._know_data_generator.next() - else: - know_data = self._know_data_generator.__next__() - teacher_crf_decode = know_data["crf_decode"] - - if len(teacher_crf_decode.shape) == 1: - teacher_crf_decode = np.reshape(teacher_crf_decode, [-1, 1]) - teacher_seq_len = know_data["seq_lens"] - assert len(word_ids) == len(label_ids) - - real_len = len(word_ids) if len(word_ids) < max_seq_len else max_seq_len - if real_len == teacher_seq_len[0] - 2: - teacher_crf_decode_range = teacher_crf_decode[0][1:teacher_seq_len[0]-1] - teacher_crf_decode_range = np.reshape(teacher_crf_decode_range, [-1, 1]) - buf.append([word_ids[0:max_seq_len], label_ids[0:max_seq_len], teacher_crf_decode_range]) - #buf.append([word_ids[0:max_seq_len], label_ids[0:max_seq_len], teacher_crf_decode[0][1:teacher_seq_len[0]-1]]) - if len(buf) > self._train_shuffle_buf_size: - buf_ids = range(len(buf)) - random.shuffle(buf_ids) - for idx in buf_ids: - yield buf[idx] - buf = [] - else: - invalid_samples += 1 - if len(buf) > 0: - buf_ids = list(range(len(buf))) - random.shuffle(buf_ids) - for idx in buf_ids: - yield buf[idx] - - print("invalid samples in one epoch: {}".format(invalid_samples)) - fread.close() - return wrapper - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(__doc__) - parser.add_argument( - "--word_dict_path", - type=str, - default="./conf/word.dic", - help="word dict") - parser.add_argument( - "--label_dict_path", - type=str, - default="./conf/tag.dic", - help="label dict") - parser.add_argument( - "--word_rep_dict_path", - type=str, - default="./conf/q2b.dic", - help="word replace dict") - args = parser.parse_args() - dataset = Dataset(args) - # data_generator = dataset.file_reader("data/train.tsv") - #for word_idx, target_idx in data_generator(): - # print(word_idx, target_idx) - # print(len(word_idx), len(target_idx)) - # break diff --git a/demo/pantheon/lexical_anlysis/run_student.sh b/demo/pantheon/lexical_anlysis/run_student.sh deleted file mode 100644 index a4b0a241..00000000 --- a/demo/pantheon/lexical_anlysis/run_student.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -export CUDA_VISIBLE_DEVICES=5,6 -python -u train_student.py \ - --train_data ./data/train.tsv \ - --test_data ./data/test.tsv \ - --model_save_dir ./teacher_ernie_init_lac_1gru_emb128 \ - --validation_steps 1000 \ - --save_steps 1000 \ - --print_steps 100 \ - --batch_size 32 \ - --epoch 10 \ - --traindata_shuffle_buffer 20000 \ - --word_emb_dim 128 \ - --grnn_hidden_dim 128 \ - --bigru_num 1 \ - --base_learning_rate 1e-3 \ - --emb_learning_rate 2 \ - --crf_learning_rate 0.2 \ - --word_dict_path ./conf/word.dic \ - --label_dict_path ./conf/tag.dic \ - --word_rep_dict_path ./conf/q2b.dic \ - --enable_ce false \ - --use_cuda true \ - --in_address "127.0.0.1:5002" - diff --git a/demo/pantheon/lexical_anlysis/run_teacher.sh b/demo/pantheon/lexical_anlysis/run_teacher.sh deleted file mode 100755 index d0acc194..00000000 --- a/demo/pantheon/lexical_anlysis/run_teacher.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -export FLAGS_sync_nccl_allreduce=0 -export FLAGS_eager_delete_tensor_gb=1 -export FLAGS_fraction_of_gpu_memory_to_use=0.99 - -export CUDA_VISIBLE_DEVICES=5,6 # which GPU to use -ERNIE_FINETUNED_MODEL_PATH=./model_finetuned -DATA_PATH=./data/ - -python -u teacher_ernie.py \ - --ernie_config_path "conf/ernie_config.json" \ - 
--init_checkpoint "${ERNIE_FINETUNED_MODEL_PATH}" \ - --init_bound 0.1 \ - --vocab_path "conf/vocab.txt" \ - --batch_size 32 \ - --random_seed 0 \ - --num_labels 57 \ - --max_seq_len 128 \ - --test_data "${DATA_PATH}/train.tsv" \ - --label_map_config "./conf/label_map.json" \ - --do_lower_case true \ - --use_cuda true \ - --out_port=5002 - diff --git a/demo/pantheon/lexical_anlysis/teacher_ernie.py b/demo/pantheon/lexical_anlysis/teacher_ernie.py deleted file mode 100644 index 9235fda1..00000000 --- a/demo/pantheon/lexical_anlysis/teacher_ernie.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Baidu's open-source Lexical Analysis tool for Chinese, including: - 1. Word Segmentation, - 2. Part-of-Speech Tagging - 3. Named Entity Recognition -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time -import argparse -import numpy as np -import multiprocessing -import sys -from collections import namedtuple -from paddleslim.pantheon import Teacher -import paddle.fluid as fluid - -import creator -import model_utils -print('model representation') -from models.representation.ernie import ErnieConfig -print('model check') -from models.model_check import check_cuda -from models.model_check import check_version - - - -def do_eval(args): - # init executor - if args.use_cuda: - place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) - else: - place = fluid.CPUPlace() - print('ernie config') - ernie_config = ErnieConfig(args.ernie_config_path) - ernie_config.print_config() - test_program = fluid.Program() - print('test program') - with fluid.program_guard(test_program, fluid.default_startup_program()): - with fluid.unique_name.guard(): - test_ret = creator.create_ernie_model(args, ernie_config) - test_program = test_program.clone(for_test=True) - #print('create pyreader') - pyreader = creator.create_pyreader( - args, - file_name=args.test_data, - feed_list=[ret.name for ret in test_ret['feed_list']], - model="ernie", - place=place, - return_reader=True, - mode='test') - - #data_inter = reader.data_generator(args.test_data, args.batch_size, 1, shuffle=False, phase="train") - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - # load model - if not args.init_checkpoint: - raise ValueError( - "args 'init_checkpoint' should be set if only doing test or infer!") - model_utils.init_checkpoint(exe, args.init_checkpoint, test_program) - - teacher = Teacher(out_path=None, out_port=int(args.out_port)) - teacher.start() - print('run teacher......') - - test_ret["chunk_evaluator"].reset() - - reader_config = {"batch_generator": pyreader} - - teacher.start_knowledge_service( - feed_list=[test_ret["words"].name, test_ret["sent_ids"].name, test_ret["pos_ids"].name, test_ret["input_mask"].name, test_ret["labels"].name, test_ret["seq_lens"].name], - 
schema={"crf_decode":test_ret["crf_decode"],"seq_lens":test_ret["seq_lens"]}, - program=test_program, - reader_config=reader_config, - exe=exe, - times=10) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(__doc__) - model_utils.load_yaml(parser, './conf/ernie_args.yaml') - - # config for pantheon teacher - parser.add_argument('--out_path', type=str, default=None, help="The path to dump knowledge for offline mode.") - parser.add_argument('--out_port', type=str, default=None, help="The IP port number to send out knowledge for \ - online mode, should be unique when launching multiple teachers in \ - the same node.") - - args = parser.parse_args() - check_cuda(args.use_cuda) - check_version() - model_utils.print_arguments(args) - do_eval(args) diff --git a/demo/pantheon/lexical_anlysis/train_student.py b/demo/pantheon/lexical_anlysis/train_student.py deleted file mode 100644 index 5a553431..00000000 --- a/demo/pantheon/lexical_anlysis/train_student.py +++ /dev/null @@ -1,208 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import math -import time -import random -import argparse -import multiprocessing - -import numpy as np -import paddle -import paddle.fluid as fluid - -import reader -import model_utils -import creator -from eval import test_process -from models.model_check import check_cuda -from models.model_check import check_version - -# the function to train model -def do_train(args): - # init executor - if args.use_cuda: - place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) - dev_count = fluid.core.get_cuda_device_count() - else: - dev_count = min(multiprocessing.cpu_count(), args.cpu_num) - if (dev_count < args.cpu_num): - print( - "WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. 
" - "Change the cpu_num from %d to %d" % - (dev_count, args.cpu_num, dev_count)) - os.environ['CPU_NUM'] = str(dev_count) - place = fluid.CPUPlace() - - train_program = fluid.Program() - test_program = fluid.Program() - startup_program = fluid.Program() - - dataset = reader.Dataset(args) - with fluid.program_guard(train_program, startup_program): - #train_program.random_seed = args.random_seed - startup_program.random_seed = args.random_seed - - with fluid.unique_name.guard(): - train_ret = creator.create_model( - args, dataset.vocab_size, dataset.num_labels, mode='train') - - optimizer = fluid.optimizer.Adam( - learning_rate=args.base_learning_rate) - optimizer.minimize(train_ret["avg_cost"]) - - with fluid.program_guard(test_program, startup_program): - with fluid.unique_name.guard(): - test_ret = creator.create_model( - args, dataset.vocab_size, dataset.num_labels, mode='test') - - test_program = test_program.clone(for_test=True) - - exe = fluid.Executor(place) - exe.run(startup_program) - - if args.init_checkpoint: - model_utils.init_checkpoint(exe, args.init_checkpoint, train_program) - if dev_count > 1: - device = "GPU" if args.use_cuda else "CPU" - print("%d %s are used to train model" % (dev_count, device)) - # multi cpu/gpu config - exec_strategy = fluid.ExecutionStrategy() - - build_strategy = fluid.compiler.BuildStrategy() - - compiled_prog = fluid.compiler.CompiledProgram( - train_program).with_data_parallel( - loss_name=train_ret['avg_cost'].name, - build_strategy=build_strategy, - exec_strategy=exec_strategy) - else: - compiled_prog = fluid.compiler.CompiledProgram(train_program) - - # start training - num_train_examples = dataset.get_num_examples(args.train_data) - max_train_steps = args.epoch * num_train_examples // args.batch_size - print("Num train examples: %d" % num_train_examples) - print("Max train steps: %d" % max_train_steps) - - train_generator = creator.create_lexnet_data_generator(args, - reader=dataset, - file_name=args.train_data, - place=place, - mode='train') - test_generator = creator.create_lexnet_data_generator(args, - reader=dataset, - file_name=args.test_data, - place=place, - mode='test') - - train_reader, test_reader = train_ret['pyreader'], test_ret['pyreader'] - train_reader.set_batch_generator(train_generator, places=place) - test_reader.set_batch_generator(test_generator, places=place) - - ce_info = [] - step = 0 - ce_time = 0 - train_reader.start() - while True: - try: - # this is for minimizing the fetching op, saving the training speed. 
- if step % args.print_steps == 0: - fetch_list = [ - train_ret["avg_cost"], train_ret["precision"], - train_ret["recall"], train_ret["f1_score"], - train_ret["crf_avg_cost"], train_ret["teacher_cost"] - ] - else: - fetch_list = [] - - start_time = time.time() - outputs = exe.run( - program=compiled_prog, - fetch_list=fetch_list) - - end_time = time.time() - if step % args.print_steps == 0: - avg_cost, precision, recall, f1_score, crf_avg_cost, teacher_cost = [ - np.mean(x) for x in outputs - ] - print("Data loader queue size: %d " % train_reader.queue.size()) - print( - "[train] step = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, crf_avg_cost: %.5f, teacher_cost: %.5f, elapsed time %.5f" - % (step, avg_cost, precision, recall, f1_score, crf_avg_cost, teacher_cost, - end_time - start_time)) - - if step % args.validation_steps == 0: - test_process(exe, test_program, test_reader, test_ret) - - ce_time += end_time - start_time - ce_info.append([ce_time, avg_cost, precision, recall, f1_score]) - - # save checkpoints - if step % args.save_steps == 0 and step != 0: - save_path = os.path.join(args.model_save_dir, - "step_" + str(step)) - fluid.io.save_persistables(exe, save_path, train_program) - step += 1 - except fluid.core.EOFException: - train_reader.reset() - break - - if args.enable_ce: - card_num = get_cards() - ce_cost = 0 - ce_f1 = 0 - ce_p = 0 - ce_r = 0 - ce_time = 0 - try: - ce_time = ce_info[-2][0] - ce_cost = ce_info[-2][1] - ce_p = ce_info[-2][2] - ce_r = ce_info[-2][3] - ce_f1 = ce_info[-2][4] - except: - print("ce info error") - print("kpis\teach_step_duration_card%s\t%s" % (card_num, ce_time)) - print("kpis\ttrain_cost_card%s\t%f" % (card_num, ce_cost)) - print("kpis\ttrain_precision_card%s\t%f" % (card_num, ce_p)) - print("kpis\ttrain_recall_card%s\t%f" % (card_num, ce_r)) - print("kpis\ttrain_f1_card%s\t%f" % (card_num, ce_f1)) - - -def get_cards(): - num = 0 - cards = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cards != '': - num = len(cards.split(",")) - return num - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(__doc__) - model_utils.load_yaml(parser, 'conf/args.yaml') - - # config for pantheon student - parser.add_argument('--in_path', type=str, default=None, help="The path of dumped knowledge from teacher for offline mode.") - parser.add_argument('--in_address', type=str, default=None, help="The IP port number to receive knowledge from teacher for \ - online mode") - - args = parser.parse_args() - check_cuda(args.use_cuda) - check_version() - do_train(args) diff --git a/demo/pantheon/toy/README.md b/demo/pantheon/toy/README.md deleted file mode 100644 index 3cb561b4..00000000 --- a/demo/pantheon/toy/README.md +++ /dev/null @@ -1,54 +0,0 @@ -## Toy example for Pantheon - -See more details about Pantheon in [PaddleSlim/Pantheon](../../../paddleslim/pantheon). - -Here implements two teacher models (not trainable, just for demo): teacher1 takes an integer **x** as input and predicts value **2x-1**, see in [run_teacher1.py](run_teacher1.py); teacher2 also takes **x** as input and predicts **2x+1**, see in [run_teacher2.py](run_teacher2.py). They two share a data reader to read a sequence of increasing natural numbers from zero to some positive inter **max_n** as input and generate different knowledge. And the schema keys for knowledge in teacher1 is [**"x", "2x-1", "result"**], and [**"2x+1", "result"**] for knowledge in teacher2, in which **"result"** is the common schema and the copy of two predictions respectively. 
On instantiating the **Student** object, the merging strategy for the common schema **"result"** should be specified, and the schema keys for the merged knowledge will be [**"x", "2x-1", "2x+1", "result"**], with the merged **"result"** equal to **"2x"** when the merging strategy is **"mean"** and **"4x"** when it is **"sum"**. The student model receives the merged knowledge from the teachers and prints it out, see [run_student.py](run_student.py).
-
-The toy "knowledge distillation" system can be launched in three different modes, i.e., offline, online, and a hybrid of the two. All three modes should produce the same outputs, and the correctness of the results can be verified by checking the order and values of the outputs.
-
-### Offline
-
- The two teachers work in offline mode; start them with the given local file paths.
-
- ```shell
-export PYTHONPATH=../../../:$PYTHONPATH
-export CUDA_VISIBLE_DEVICES=0,1
-export NUM_POSTPROCESS_THREADS=10 # default 8
-nohup python -u run_teacher1.py --use_cuda true --out_path teacher1_offline.dat > teacher1_offline.log 2>&1&
-export CUDA_VISIBLE_DEVICES=2
-nohup python -u run_teacher2.py --use_cuda true --out_path teacher2_offline.dat > teacher2_offline.log 2>&1&
- ```
- After both executions have finished, start the student model with the two generated knowledge files.
-
- ```shell
-export PYTHONPATH=../../../:$PYTHONPATH
- python -u run_student.py \
-     --in_path0 teacher1_offline.dat \
-     --in_path1 teacher2_offline.dat
- ```
-
-
-### Online
-
-The two teachers work in online mode; start them with the given TCP/IP ports. Please make sure that the TCP/IP ports are available.
-
-```shell
-export PYTHONPATH=../../../:$PYTHONPATH
-export CUDA_VISIBLE_DEVICES=0
-nohup python -u run_teacher1.py --use_cuda true --out_port 8080 > teacher1_online.log 2>&1&
-export CUDA_VISIBLE_DEVICES=1,2
-nohup python -u run_teacher2.py --use_cuda true --out_port 8081 > teacher2_online.log 2>&1&
-```
-Start the student model with IP addresses that can reach the ports of the two teacher models, e.g., in the same node:
-
-```shell
-export PYTHONPATH=../../../:$PYTHONPATH
-python -u run_student.py \
-    --in_address0 127.0.0.1:8080 \
-    --in_address1 127.0.0.1:8081 \
-```
-**Note:** in online mode, the starting order of the teachers and the student doesn't matter; they will wait for each other to establish the connection.
-
-### Hybrid of offline and online
-
-One teacher works in offline mode and the other works in online mode. This time, start the offline teacher first. After the offline knowledge file is ready, start the online teacher and the student at the same time.
diff --git a/demo/pantheon/toy/run_student.py b/demo/pantheon/toy/run_student.py
deleted file mode 100644
index b2ede92f..00000000
--- a/demo/pantheon/toy/run_student.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
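A quick sanity check of the merge arithmetic described in the toy README above, written as plain numpy rather than the pantheon implementation (the dict layout is assumed for illustration):

```python
import numpy as np

x = np.arange(5)
# Knowledge dicts roughly as the two toy teachers would emit them
teacher1 = {"x": x, "2x-1": 2 * x - 1, "result": 2 * x - 1}
teacher2 = {"2x+1": 2 * x + 1, "result": 2 * x + 1}

mean_result = (teacher1["result"] + teacher2["result"]) / 2  # "mean" -> 2x
sum_result = teacher1["result"] + teacher2["result"]         # "sum"  -> 4x
assert (mean_result == 2 * x).all() and (sum_result == 4 * x).all()
```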
-
-import argparse
-from paddleslim.pantheon import Student
-
-from utils import str2bool
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument(
-        "--in_address0",
-        type=str,
-        default=None,
-        help="Input address for teacher 0. (default: %(default)s)")
-    parser.add_argument(
-        "--in_path0",
-        type=str,
-        default=None,
-        help="Input file path for teacher 0. (default: %(default)s)")
-    parser.add_argument(
-        "--in_address1",
-        type=str,
-        default=None,
-        help="Input address for teacher 1. (default: %(default)s)")
-    parser.add_argument(
-        "--in_path1",
-        type=str,
-        default=None,
-        help="Input file path for teacher 1. (default: %(default)s)")
-    parser.add_argument(
-        "--test_send_recv",
-        type=str2bool,
-        default=False,
-        help="Whether to test send/recv interfaces. (default: %(default)s)")
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=32,
-        help="The batch size of student model. (default: %(default)s)")
-    args = parser.parse_args()
-    return args
-
-
-def run(args):
-    if args.in_address0 and args.in_path0:
-        raise ValueError(
-            "args.in_address0 and args.in_path0 should not be valid "
-            "at the same time!")
-    if not args.in_address0 and not args.in_path0:
-        raise ValueError(
-            "One of args.in_address0 and args.in_path0 must be valid!")
-
-    if args.in_address1 and args.in_path1:
-        raise ValueError(
-            "args.in_address1 and args.in_path1 should not be valid "
-            "at the same time!")
-    if not args.in_address1 and not args.in_path1:
-        raise ValueError(
-            "One of args.in_address1 and args.in_path1 must be valid!")
-
-    student = Student(merge_strategy={"result": "sum"})
-
-    student.register_teacher(
-        in_address=args.in_address0, in_path=args.in_path0)
-    student.register_teacher(
-        in_address=args.in_address1, in_path=args.in_path1)
-    student.start()
-
-    if args.test_send_recv:
-        for t in range(2):
-            for i in range(3):
-                print(student.recv(t))
-            student.send("message from student!")
-
-    knowledge_desc = student.get_knowledge_desc()
-    data_generator = student.get_knowledge_generator(
-        batch_size=args.batch_size, drop_last=False)
-    # print the merged knowledge sample by sample
-    for batch_data in data_generator():
-        batch_size = list(batch_data.values())[0].shape[0]
-        keys = batch_data.keys()
-        for i in range(batch_size):
-            data = {}
-            for key in keys:
-                data[key] = batch_data[key][i]
-            print(data)
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    run(args)
diff --git a/demo/pantheon/toy/run_teacher1.py b/demo/pantheon/toy/run_teacher1.py
deleted file mode 100644
index 1e0e0898..00000000
--- a/demo/pantheon/toy/run_teacher1.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import paddle.fluid as fluid
-
-from utils import parse_args, sample_generator, sample_list_generator, batch_generator
-from paddleslim.pantheon import Teacher
-
-
-def run(args):
-    if args.out_path and args.out_port:
-        raise ValueError("args.out_path and args.out_port should not be valid "
-                         "at the same time!")
-    if not args.out_path and not args.out_port:
-        raise ValueError("One of args.out_path and args.out_port must be valid!")
-
-    # user-defined program: y = 2*x - 1
-    startup = fluid.Program()
-    program = fluid.Program()
-    with fluid.program_guard(program, startup):
-        inp_x = fluid.layers.data(name='x', shape=[-1, 1], dtype="int64")
-        y = inp_x * 2 - 1
-        result = fluid.layers.assign(y)
-
-    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(startup)
-
-    teacher = Teacher(out_path=args.out_path, out_port=args.out_port)
-    teacher.start()
-
-    if args.generator_type == "sample_generator":
-        reader_config = {
-            "sample_generator": sample_generator(max_n=1000),
-            "batch_size": args.batch_size,
-            "drop_last": False
-        }
-    elif args.generator_type == "sample_list_generator":
-        reader_config = {
-            "sample_list_generator": sample_list_generator(
-                max_n=1000, batch_size=args.batch_size)
-        }
-    else:
-        reader_config = {
-            "batch_generator": batch_generator(
-                max_n=1000, batch_size=args.batch_size)
-        }
-
-    if args.test_send_recv:
-        teacher.send("greetings from teacher1")
-        teacher.send({"x": 1, "y": 2})
-        teacher.send({3, 5})
-        print("recved {}".format(teacher.recv()))
-
-    teacher.start_knowledge_service(
-        feed_list=[inp_x.name],
-        schema={"x": inp_x,
-                "2x-1": y,
-                "result": result},
-        program=program,
-        reader_config=reader_config,
-        exe=exe,
-        use_fp16=True,
-        times=args.serving_times)
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    run(args)
diff --git a/demo/pantheon/toy/run_teacher2.py b/demo/pantheon/toy/run_teacher2.py
deleted file mode 100644
index 5d45fec9..00000000
--- a/demo/pantheon/toy/run_teacher2.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import paddle.fluid as fluid
-
-from utils import parse_args, sample_generator, sample_list_generator, batch_generator
-from paddleslim.pantheon import Teacher
-
-
-def run(args):
-    if args.out_path and args.out_port:
-        raise ValueError("args.out_path and args.out_port should not be valid "
-                         "at the same time!")
-    if not args.out_path and not args.out_port:
-        raise ValueError("One of args.out_path and args.out_port must be valid!")
-
-    # user-defined program: y = 2*x + 1
-    startup = fluid.Program()
-    program = fluid.Program()
-    with fluid.program_guard(program, startup):
-        inp_x = fluid.layers.data(name='x', shape=[-1, 1], dtype="int64")
-        y = inp_x * 2 + 1
-        result = fluid.layers.assign(y)
-
-    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(startup)
-
-    teacher = Teacher(out_path=args.out_path, out_port=args.out_port)
-    teacher.start()
-
-    if args.generator_type == "sample_generator":
-        reader_config = {
-            "sample_generator": sample_generator(max_n=1000),
-            "batch_size": args.batch_size,
-            "drop_last": False
-        }
-    elif args.generator_type == "sample_list_generator":
-        reader_config = {
-            "sample_list_generator": sample_list_generator(
-                max_n=1000, batch_size=args.batch_size)
-        }
-    else:
-        reader_config = {
-            "batch_generator": batch_generator(
-                max_n=1000, batch_size=args.batch_size)
-        }
-
-    if args.test_send_recv:
-        teacher.send("greetings from teacher2")
-        teacher.send([1])
-        teacher.send({1, 2, 3})
-        print("recved {}".format(teacher.recv()))
-
-    teacher.start_knowledge_service(
-        feed_list=[inp_x.name],
-        schema={"2x+1": y,
-                "result": result},
-        program=program,
-        reader_config=reader_config,
-        exe=exe,
-        times=args.serving_times)
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    run(args)
diff --git a/demo/pantheon/toy/utils.py b/demo/pantheon/toy/utils.py
deleted file mode 100644
index af88d2a6..00000000
--- a/demo/pantheon/toy/utils.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import numpy as np
-import argparse
-
-
-def str2bool(v):
-    return v.lower() in ("true", "t", "1")
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument(
-        "--out_port",
-        type=int,
-        default=None,
-        help="IP port number for sending out data. (default: %(default)s)")
-    parser.add_argument(
-        "--out_path",
-        type=str,
-        default=None,
-        help="The file path to dump knowledge data. (default: %(default)s)")
-    parser.add_argument(
-        "--use_cuda",
-        type=str2bool,
-        default=False,
-        help="Whether to use GPU for prediction. (default: %(default)s)")
-    parser.add_argument(
-        "--test_send_recv",
-        type=str2bool,
-        default=False,
-        help="Whether to test send/recv interfaces. (default: %(default)s)")
-    parser.add_argument(
-        "--generator_type",
-        type=str,
-        choices=[
-            "sample_generator", "sample_list_generator", "batch_generator"
-        ],
-        default="batch_generator",
-        help="Which data generator to use. (default: %(default)s)")
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=32,
-        help="The batch size per device for data generators. (default: %(default)s)")
-    parser.add_argument(
-        "--serving_times",
-        type=int,
-        default=1,
-        help="The maximum times of teacher serving knowledge. (default: %(default)s)")
(default: %(default)s)" - ) - args = parser.parse_args() - return args - - -def sample_generator(max_n): - def wrapper(): - for i in range(max_n): - yield [i] - - return wrapper - - -def sample_list_generator(max_n, batch_size=500): - def wrapper(): - sample_list = [] - for sample in sample_generator(max_n)(): - if len(sample_list) < batch_size: - sample_list.append(sample) - if len(sample_list) == batch_size: - yield sample_list - sample_list = [] - if len(sample_list) > 0: - yield sample_list - - return wrapper - - -# data_generator -def batch_generator(max_n, batch_size=500): - def wrapper(): - batch = [] - for sample in sample_generator(max_n)(): - if len(batch) < batch_size: - batch.append(sample) - if len(batch) == batch_size: - yield [np.array(batch).astype('int64').reshape((-1, 1))] - batch = [] - if len(batch) > 0: - yield [np.array(batch).astype('int64').reshape((-1, 1))] - - return wrapper diff --git a/docs/en/api_en/paddleslim.pantheon.rst b/docs/en/api_en/paddleslim.pantheon.rst deleted file mode 100644 index 59f48ce9..00000000 --- a/docs/en/api_en/paddleslim.pantheon.rst +++ /dev/null @@ -1,36 +0,0 @@ -paddleslim\.pantheon package -============================ - -.. automodule:: paddleslim.pantheon - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -paddleslim\.pantheon\.student module ------------------------------------- - -.. automodule:: paddleslim.pantheon.student - :members: - :undoc-members: - :show-inheritance: - -paddleslim\.pantheon\.teacher module ------------------------------------- - -.. automodule:: paddleslim.pantheon.teacher - :members: - :undoc-members: - :show-inheritance: - -paddleslim\.pantheon\.utils module ----------------------------------- - -.. automodule:: paddleslim.pantheon.utils - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/zh_cn/api_cn/static/dist/pantheon_api.md b/docs/zh_cn/api_cn/static/dist/pantheon_api.md deleted file mode 100644 index 87fc6724..00000000 --- a/docs/zh_cn/api_cn/static/dist/pantheon_api.md +++ /dev/null @@ -1,268 +0,0 @@ -# 大规模可扩展知识蒸馏框架 Pantheon - -## Teacher - -pantheon.Teacher() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L78) - -: The class defined for the teacher model. Generate knowledge data and transfer them to the student model. - -**Args:** - -- **out\_path (str|None)** - The path to dump knowledge data for offline mode. - -- **out\_port (int|None)** - The IP port number to send out knowledge for online mode, should be unique when launching multiple teachers in the same node. - -**Return:** An object of class Teacher - - -pantheon.Teacher.start() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L133) - -: Start teacher service, sychronize with student and launch the thread - to monitor commands from student. - -**Args:** None - -**Return:** None - - -pantheon.Teacher.send(data) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L181) - -: Send one data object to student. - -**Args:** - -- **data (Python data):** - The data to be sent, can be any type of Python data object. - -**Return:** None - - -pantheon.Teacher.recv() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L196) - -: Recieve one data object from student. - -**Args:** None - -**Return:** - -- The received data, can be any type of Python data object. 
-
-
-pantheon.Teacher.dump(knowledge) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L214)
-
-: Dump one batch of knowledge data into the output file; only used in offline mode.
-
-**Args:**
-
-- **knowledge (dict):** - The knowledge data to be dumped.
-
-**Return:** None
-
-
-pantheon.Teacher.start\_knowledge\_service(feed\_list, schema, program, reader\_config, exe, buf\_size=10, use\_fp16=False, times=1) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L259)
-
-: Start the knowledge service to generate and transfer knowledge data. In GPU mode, the devices used for knowledge prediction are determined by the environment variable **FLAGS\_selected\_gpus**, or by **CUDA\_VISIBLE\_DEVICES** if the former is not set; in CPU mode they are determined by **CPU\_NUM** (default 1). Only supported in static graph.
-
-**Args:**
-
-- **feed\_list (list):** - A list of feed Variables or their names for the input teacher Program.
-- **schema (dict):** - A dict specifying the keys and the fetched Variables used to generate knowledge.
-- **program (fluid.Program):** - The inference Program of the teacher model.
-- **reader\_config (dict):** - The config for the data reader. It supports all three types of generators used by [fluid.io.PyReader](https://www.paddlepaddle.org.cn/documentation/docs/en/api/io/PyReader.html) and [fluid.io.DataLoader](https://www.paddlepaddle.org.cn/documentation/docs/en/api/io/DataLoader.html#dataloader); each config contains the key-value pair of the generator type and a generator object, plus other necessary argument pairs. See the following:
-
-  1) **sample generator:**
-
-  ```
-  reader_config={"sample_generator": some_sample_generator,
-                 "batch_size": batch_size, "drop_last": drop_last}
-  # drop_last set to True by default
-  ```
-
-  2) **sample list generator:**
-
-  ```
-  reader_config={"sample_list_generator": some_sample_list_generator}
-  ```
-
-  3) **batch generator:**
-
-  ```
-  reader_config={"batch_generator": some_batch_generator}
-  ```
-
-  The config is parsed in the order 1) to 3), and any unrelated keys in these configs are ignored.
-
-- **exe (fluid.Executor):** The executor to run the input program.
-- **buf\_size (int):** The size of the buffers for the data reader and the knowledge writer on each device.
-- **use\_fp16 (bool):** Whether to transfer the knowledge data as float16 when the original data type is float32. Default False.
-- **times (int):** The maximum number of repeated serving times, default 1. Each time the public method **get\_knowledge\_generator()** of the **Student** object is called, the serving count increases by one, until it reaches the maximum and the service ends. Only valid in online mode; ignored in offline mode.
-
-**Return:** None
-
-**Examples:**
-
-```python
-import paddle
-import paddle.fluid as fluid
-from paddleslim.pantheon import Teacher
-
-startup = fluid.Program()
-program = fluid.Program()
-with fluid.program_guard(program, startup):
-    images = fluid.data(
-        name='pixel', shape=[None, 3 * 32 * 32], dtype='float32')
-    labels = fluid.data(name='label', shape=[None, 1], dtype='int64')
-    logits = fluid.layers.fc(input=images, size=10)
-    loss = fluid.layers.softmax_with_cross_entropy(logits, labels)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-exe.run(startup)
-
-train_reader = paddle.fluid.io.batch(
-    paddle.dataset.cifar.train10(), batch_size=32)
-
-teacher = Teacher(out_path="example_knowledge.dat",  # offline mode
-                  #out_port=5000                     # online mode
-                  )
-teacher.start()
-
-teacher.start_knowledge_service(
-    feed_list=[images, labels],
-    schema={"logits": logits,
-            "labels": labels},
-    program=program,
-    reader_config={"sample_list_generator": train_reader},
-    exe=exe)
-```
-
-!!! note "Note"
-    This example should be run together with the example of class **Student**.
-
-
-## Student
-
-pantheon.Student(merge_strategy=None) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L34)
-
-: The class defined for the student model. It receives knowledge data from teacher models and merges them.
-
-**Args:**
-
-- **merge\_strategy (dict|None):** - A dict whose keys are the common schemas shared by different teachers; each value specifies the merging strategy for the corresponding schema, currently supporting **sum** and **mean**.
-
-**Return:** An object of class Student.
-
-
-pantheon.Student.register\_teacher(in\_path=None, in\_address=None) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L72)
-
-: Register one teacher model and assign it an id according to its registration order, together with the file path (offline mode) or IP address (online mode) that the teacher model writes knowledge data to.
-
-**Args:**
-
-- **in\_path (str|None):** The input file path. Default None.
-- **in\_address (str|None):** The input IP address, in the format "<IP\_address>:<IP\_port>" (e.g. "127.0.0.1:8080"). Default None.
-
-**Return:** None
-
-
-pantheon.Student.start() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L213)
-
-: End teachers' registration and synchronize with all of them.
-
-**Args:** None
-
-**Return:** None
-
-pantheon.Student.send(self, data, teacher_ids=None) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L240)
-
-: Send data to teachers.
-
-**Args:**
-
-- **data (Python data):** - A Python data object to be sent.
-- **teacher_ids (list|None):** - A list of ids of the teachers to send data to. If set to None, send the data to all teachers. Default None.
-
-**Return:** None
-
-pantheon.Student.recv(teacher_id) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L262)
-
-: Receive data from one teacher.
-
-**Args:**
-
-- **teacher\_id (int):** - The id of the teacher to receive data from.
-
-**Return:**
-
-- The received data object.
-
-pantheon.Student.get\_knowledge\_desc() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L283)
-
-: Get the description of the knowledge, including the shape, data type and lod level for each schema.
-
-**Args:** None
-
-**Return:**
-
-- The knowledge description, which is a dict.
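-
-As a minimal sketch of inspecting the returned description, assuming a started **Student** object named `student` with its teachers registered (the layout of each entry below is only indicative, not part of the documented API):
-
-```python
-desc = student.get_knowledge_desc()
-for schema, info in desc.items():
-    # each entry describes one schema, e.g. its shape, dtype and lod level
-    print("schema: {}, description: {}".format(schema, info))
-```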
-
-
-pantheon.Student.get\_knowledge\_qsize() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L318)
-
-: Get the real-time size of the knowledge queue. If this size is denoted as **qsize**, it means that **qsize** batches of knowledge data have already been pushed into the knowledge queue and are waiting for the knowledge generator to pop them out. The size changes dynamically and is capped at 100, the capacity of the knowledge queue.
-
-**Args:** None
-
-**Return:**
-
-- The real-time size of the knowledge queue.
-
-pantheon.Student.get\_knowledge\_generator(batch\_size, drop\_last=False) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L334)
-
-: Get the generator for knowledge data; returns None if the previous generator has not finished yet.
-
-**Args:**
-
-- **batch\_size (int):** - The batch size of the returned knowledge data.
-- **drop\_last (bool):** - Whether to drop the last batch if its size is less than the batch size.
-
-**Return:**
-
-- The wrapper of the knowledge data generator.
-
-**Examples:**
-
-```python
-from paddleslim.pantheon import Student
-
-student = Student()
-
-student.register_teacher(in_path="example_knowledge.dat",  # offline mode
-                         #in_address="127.0.0.1:5000"      # online mode
-                         )
-student.start()
-
-knowledge_desc = student.get_knowledge_desc()
-data_generator = student.get_knowledge_generator(
-    batch_size=128, drop_last=True)
-
-# get knowledge data
-for knowledge in data_generator():
-    print("knowledge queue size: {}".format(student.get_knowledge_qsize()))
-
-    # do something else
-```
-
-!!! note "Note"
-    This example should be run together with the example of class **Teacher**.
diff --git a/paddleslim/__init__.py b/paddleslim/__init__.py
index 9d312329..ff857786 100644
--- a/paddleslim/__init__.py
+++ b/paddleslim/__init__.py
@@ -19,11 +19,8 @@ from paddleslim import nas
 from paddleslim import analysis
 from paddleslim import dist
 from paddleslim import quant
-from paddleslim import pantheon
 from paddleslim import dygraph
-__all__ = [
-    'models', 'prune', 'nas', 'analysis', 'dist', 'quant', 'pantheon', 'dygraph'
-]
+__all__ = ['models', 'prune', 'nas', 'analysis', 'dist', 'quant', 'dygraph']
 
 from paddleslim.dygraph import *
 __all__ += dygraph.__all__
diff --git a/paddleslim/pantheon/README.md b/paddleslim/pantheon/README.md
deleted file mode 100644
index 7cd10928..00000000
--- a/paddleslim/pantheon/README.md
+++ /dev/null
@@ -1,206 +0,0 @@
-# Pantheon: Paddle large-scale scalable knowledge distillation framework
-
-Pantheon is a universal solution for knowledge distillation in Paddle Fluid. Its design takes into account many possible behaviors of teacher models. Every teacher and student model in Pantheon works in a separate process, and they communicate with each other via local files or TCP/IP ports. The knowledge can easily be transferred to the student model from a single teacher model or from an ensemble of multiple teacher models, in which each teacher model can work in online or offline mode independently. Pantheon also provides a highly optimized interface for the large-scale prediction of teacher models. Benefiting from the low coupling between teachers and the student, users can allocate computation resources to the different roles according to their computational complexity, and build a large-scale and practical knowledge distillation learning system on Pantheon.
-
-The illustration below shows an application of Pantheon, where the student model is trained with knowledge from multiple online teachers.
-These teachers may work on the same node as the student model but on different devices, or on different nodes, as long as they can communicate with each other over the network. The student model can send queries to the teachers, and the latter take these queries as input and generate streaming knowledge data for the former. Or, in a simpler way, the student model can read the training data in the **same order** as the teachers, avoiding the procedure of sending queries. A sketch of the query path follows below.
-
-
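-The query path can be sketched with the auxiliary send/recv interfaces paired up in the table below; whether queries are routed this way is an assumption made here for illustration, and the query content is made up:
-
-```python
-# student side (sketch): send a query, e.g. a batch of sample ids, to teacher 0 only
-student.send({"sample_ids": [0, 1, 2, 3]}, teacher_ids=[0])
-
-# teacher side (sketch): receive the query and use it to drive prediction
-query = teacher.recv()
-```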
-The corresponding interfaces of **Teacher** and **Student** are paired up as follows (remarks [1] to [7] annotate each pair; the original table additionally marks each interface's supported graph type, static or dynamic, and its working mode, online or offline):
-
-| Teacher | Student | remarks |
-| ------ | ------ | ------ |
-| `__init__(out_path=None, out_port=None)` | `__init__(merge_strategy=None)` | [1] |
-|  | `register_teacher(in_path=None, in_address=None)` | [2] |
-| `start()` | `start()` | [3] |
-| `send(data)` | `recv(teacher_id)` | [4] |
-| `recv()` | `send(data, teacher_ids=None)` | [5] |
-| `dump(knowledge)` |  | [6] |
-| `start_knowledge_service(feed_list, schema, program, reader_config, exe, buf_size=10, use_fp16=False, times=1)` | `get_knowledge_desc()`<br>`get_knowledge_qsize()`<br>`get_knowledge_generator(batch_size, drop_last=False)` | [7] |
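-
-As a concrete pairing of these interfaces, below is a minimal sketch of a student merging one offline and one online teacher (the hybrid setting from the toy example); the file path and address are illustrative:
-
-```python
-from paddleslim.pantheon import Student
-
-# merge the common schema "result" by averaging the teachers' predictions
-student = Student(merge_strategy={"result": "mean"})
-student.register_teacher(in_path="teacher1_offline.dat")  # offline teacher
-student.register_teacher(in_address="127.0.0.1:8081")     # online teacher
-student.start()
-
-knowledge_desc = student.get_knowledge_desc()
-data_generator = student.get_knowledge_generator(batch_size=32, drop_last=False)
-```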