From 5b92f265a32a82e5f489eb6f6828a6b9792123e8 Mon Sep 17 00:00:00 2001 From: Xing Wu Date: Thu, 21 May 2020 16:57:14 +0800 Subject: [PATCH] upload lexical_anlysis_toy (#268) * upload lexical_anlysis_toy --- demo/pantheon/lexical_anlysis/README.md | 40 ++ demo/pantheon/lexical_anlysis/README_cn.md | 41 ++ demo/pantheon/lexical_anlysis/__init__.py | 4 + demo/pantheon/lexical_anlysis/creator.py | 260 ++++++++++++ demo/pantheon/lexical_anlysis/downloads.py | 163 ++++++++ demo/pantheon/lexical_anlysis/ernie_reader.py | 160 +++++++ demo/pantheon/lexical_anlysis/eval.py | 131 ++++++ demo/pantheon/lexical_anlysis/model_utils.py | 248 +++++++++++ .../lexical_anlysis/models/__init__.py | 0 .../lexical_anlysis/models/model_check.py | 73 ++++ .../models/representation/__init__.py | 0 .../models/representation/ernie.py | 322 ++++++++++++++ .../models/sequence_labeling/__init__.py | 0 .../models/sequence_labeling/nets.py | 174 ++++++++ .../models/transformer_encoder.py | 342 +++++++++++++++ .../lexical_anlysis/preprocess/__init__.py | 0 .../preprocess/ernie/__init__.py | 0 .../preprocess/ernie/task_reader.py | 392 ++++++++++++++++++ .../preprocess/ernie/tokenization.py | 370 +++++++++++++++++ .../lexical_anlysis/preprocess/padding.py | 78 ++++ demo/pantheon/lexical_anlysis/reader.py | 208 ++++++++++ demo/pantheon/lexical_anlysis/run_student.sh | 26 ++ demo/pantheon/lexical_anlysis/run_teacher.sh | 25 ++ .../pantheon/lexical_anlysis/teacher_ernie.py | 111 +++++ .../pantheon/lexical_anlysis/train_student.py | 208 ++++++++++ 25 files changed, 3376 insertions(+) create mode 100644 demo/pantheon/lexical_anlysis/README.md create mode 100644 demo/pantheon/lexical_anlysis/README_cn.md create mode 100644 demo/pantheon/lexical_anlysis/__init__.py create mode 100644 demo/pantheon/lexical_anlysis/creator.py create mode 100644 demo/pantheon/lexical_anlysis/downloads.py create mode 100755 demo/pantheon/lexical_anlysis/ernie_reader.py create mode 100755 demo/pantheon/lexical_anlysis/eval.py create mode 100755 demo/pantheon/lexical_anlysis/model_utils.py create mode 100755 demo/pantheon/lexical_anlysis/models/__init__.py create mode 100755 demo/pantheon/lexical_anlysis/models/model_check.py create mode 100755 demo/pantheon/lexical_anlysis/models/representation/__init__.py create mode 100755 demo/pantheon/lexical_anlysis/models/representation/ernie.py create mode 100755 demo/pantheon/lexical_anlysis/models/sequence_labeling/__init__.py create mode 100755 demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py create mode 100755 demo/pantheon/lexical_anlysis/models/transformer_encoder.py create mode 100644 demo/pantheon/lexical_anlysis/preprocess/__init__.py create mode 100644 demo/pantheon/lexical_anlysis/preprocess/ernie/__init__.py create mode 100644 demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py create mode 100644 demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py create mode 100644 demo/pantheon/lexical_anlysis/preprocess/padding.py create mode 100644 demo/pantheon/lexical_anlysis/reader.py create mode 100644 demo/pantheon/lexical_anlysis/run_student.sh create mode 100755 demo/pantheon/lexical_anlysis/run_teacher.sh create mode 100644 demo/pantheon/lexical_anlysis/teacher_ernie.py create mode 100644 demo/pantheon/lexical_anlysis/train_student.py diff --git a/demo/pantheon/lexical_anlysis/README.md b/demo/pantheon/lexical_anlysis/README.md new file mode 100644 index 00000000..ec3af05d --- /dev/null +++ b/demo/pantheon/lexical_anlysis/README.md @@ -0,0 +1,40 @@ +# Distillation 
example: Chinese lexical analysis
+This example demonstrates how to use the Pantheon framework for online distillation of a Chinese lexical analysis model on a sample dataset. The effect of large-scale online distillation is shown below:
+
+| model | Precision | Recall | F1-score|
+| ------ | ------ | ------ | ------ |
+| BiGRU | 89.2 | 89.4 | 89.3 |
+| BERT fine-tuned | 90.2 | 90.4 | 90.3 |
+| ERNIE fine-tuned | 91.7 | 91.7 | 91.7 |
+| DistillBiGRU | 90.20 | 90.52 | 90.36 |
+
+BiGRU trains a BiGRU-based LAC model from scratch; BERT fine-tuned fine-tunes the LAC task on the BERT base model; ERNIE fine-tuned fine-tunes the LAC task on the ERNIE base model; DistillBiGRU is trained through large-scale online distillation with the ERNIE fine-tuned model as the teacher model.
+
+## Introduction
+
+Lexical Analysis of Chinese, or LAC for short, is a joint lexical analysis model that completes the tasks of Chinese word segmentation, part-of-speech tagging, and named entity recognition in a single model. We conduct an overall evaluation of word segmentation, part-of-speech tagging, and named entity recognition on a self-built dataset. We use the fine-tuned [ERNIE](https://github.com/PaddlePaddle/LARK/tree/develop/ERNIE) model as the Teacher model and a GRU network as the Student model, as required by the Pantheon framework for online distillation.
+
+#### 1. Download the training dataset
+
+Download the dataset file; after decompression, a `./data/` folder will be created.
+```bash
+python downloads.py dataset
+```
+
+#### 2. Download the Teacher model
+
+```bash
+# download ERNIE finetuned model
+python downloads.py finetuned
+python downloads.py conf
+```
+
+#### 3. Distill the Student model
+```bash
+# start teacher service
+bash run_teacher.sh
+
+# start student service
+bash run_student.sh
+```
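+
+Under the hood, `run_teacher.sh` and `run_student.sh` are expected to launch `teacher_ernie.py` and `train_student.py`, which connect the two sides through the `paddleslim.pantheon` API. The sketch below shows the general shape of that wiring, assuming the standard `Teacher.start_knowledge_service` and `Student.get_knowledge_generator` interfaces; the port number, schema key, and variable names (`feed_vars`, `emission`, `teacher_program`, `batch_generator`, `exe`) are illustrative placeholders, not the demo's actual configuration:
+
+```python
+# teacher side (schematic): publish "knowledge" (e.g. CRF emissions) on a port
+from paddleslim.pantheon import Teacher
+
+teacher = Teacher(out_path=None, out_port=5000)  # illustrative port
+teacher.start()
+teacher.start_knowledge_service(
+    feed_list=feed_vars,              # the teacher program's input variables
+    schema={"crf_decode": emission},  # tensors to send to the student
+    program=teacher_program,
+    reader_config={"batch_generator": batch_generator},
+    exe=exe,
+    times=1)
+
+# student side (schematic): consume the teacher's knowledge during training
+from paddleslim.pantheon import Student
+
+student = Student(merge_strategy=None)
+student.register_teacher(in_path=None, in_port=5000)
+student.start()
+knowledge_generator = student.get_knowledge_generator(
+    batch_size=32, drop_last=False)
+```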
+
+> If you want to learn more about LAC, you can refer to this repo: https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/lexical_analysis
\ No newline at end of file
diff --git a/demo/pantheon/lexical_anlysis/README_cn.md b/demo/pantheon/lexical_anlysis/README_cn.md
new file mode 100644
index 00000000..77e4a944
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/README_cn.md
@@ -0,0 +1,41 @@
+# 蒸馏样例:中文词法分析
+我们在样例数据集上,对中文词法分析模型,演示了如何使用Pantheon框架进行在线蒸馏。大规模在线蒸馏的效果如下表所示:
+
+| 模型 | 精度 | 召回率 | F1值|
+| ------ | ------ | ------ | ------ |
+| BiGRU | 89.2 | 89.4 | 89.3 |
+| BERT fine-tuned | 90.2 | 90.4 | 90.3 |
+| ERNIE fine-tuned | 91.7 | 91.7 | 91.7 |
+| DistillBiGRU | 90.20 | 90.52 | 90.36 |
+
+BiGRU 是使用双向GRU网络从头训练LAC任务;BERT fine-tuned 是在BERT base模型上微调LAC任务;ERNIE fine-tuned 是在ERNIE base模型上微调LAC任务;DistillBiGRU 是使用ERNIE fine-tuned模型作为teacher模型,通过大规模蒸馏训练LAC任务。
+
+## 简介
+
+Lexical Analysis of Chinese,简称 LAC,是一个联合的词法分析模型,在单个模型中完成中文分词、词性标注、专名识别任务。我们在自建的数据集上对分词、词性标注、专名识别任务进行了整体评估。我们使用经过finetune的 [ERNIE](https://github.com/PaddlePaddle/LARK/tree/develop/ERNIE) 模型作为Teacher模型,使用GRU作为Student模型,使用Pantheon框架进行在线蒸馏。
+
+#### 1. 下载训练数据集
+
+下载数据集文件,解压后会生成 `./data/` 文件夹
+```bash
+python downloads.py dataset
+```
+
+#### 2. 下载Teacher模型
+
+```bash
+# download ERNIE finetuned model
+python downloads.py finetuned
+python downloads.py conf
+```
+
+#### 3. 蒸馏Student模型
+```bash
+# start teacher service
+bash run_teacher.sh
+
+# start student service
+bash run_student.sh
+```
+
+> 如果你想详细了解LAC的原理,可以参考相关repo: https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/lexical_analysis
diff --git a/demo/pantheon/lexical_anlysis/__init__.py b/demo/pantheon/lexical_anlysis/__init__.py
new file mode 100644
index 00000000..bcc99e78
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/__init__.py
@@ -0,0 +1,4 @@
+from .teacher import Teacher
+from .student import Student
+
+__all__ = ["Teacher", "Student"]
diff --git a/demo/pantheon/lexical_anlysis/creator.py b/demo/pantheon/lexical_anlysis/creator.py
new file mode 100644
index 00000000..48324091
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/creator.py
@@ -0,0 +1,260 @@
+# -*- coding: UTF-8 -*-
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Define the functions that create the lexical analysis model and the model's data readers
+"""
+import sys
+import os
+import math
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import NormalInitializer
+
+from reader import Dataset
+from ernie_reader import SequenceLabelReader
+
+from models.sequence_labeling import nets
+from models.representation.ernie import ernie_encoder, ernie_pyreader
+
+
+def create_model(args, vocab_size, num_labels, mode='train'):
+    """create lac model"""
+
+    # model's input data
+    words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1)
+    targets = fluid.data(
+        name='targets', shape=[-1, 1], dtype='int64', lod_level=1)
+    if mode == "train":
+        print("create model mode: ", mode)
+        teacher_crf_decode = fluid.data(
+            name='teacher_crf_decode',
+            shape=[-1, 1],
+            dtype='float32',
+            lod_level=1)
+    else:
+        print("create model mode: ", mode)
+        teacher_crf_decode = None
+
+    feed_list = [words, targets]
+    if teacher_crf_decode is not None:
+        feed_list.append(teacher_crf_decode)
+
+    pyreader = fluid.io.DataLoader.from_generator(
+        feed_list=feed_list,
+        capacity=200,
+        use_double_buffer=True,
+        iterable=False)
+    # for the test or train process
+    avg_cost, crf_avg_cost, teacher_cost, crf_decode = nets.lex_net(
+        words,
+        args,
+        vocab_size,
+        num_labels,
+        teacher_crf_decode,
+        for_infer=False,
+        target=targets)
+
+    (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
+     num_correct_chunks) = fluid.layers.chunk_eval(
+         input=crf_decode,
+         label=targets,
+         chunk_scheme="IOB",
+         num_chunk_types=int(math.ceil((num_labels - 1) / 2.0)))
+    chunk_evaluator = fluid.metrics.ChunkEvaluator()
+    chunk_evaluator.reset()
+
+    ret = {
+        "pyreader": pyreader,
+        "words": words,
+        "targets": targets,
+        "avg_cost": avg_cost,
+        "crf_avg_cost": crf_avg_cost,
+        "teacher_cost": teacher_cost,
+        "crf_decode": crf_decode,
+        "precision": precision,
+        "recall": recall,
+        "f1_score": f1_score,
+        "chunk_evaluator": chunk_evaluator,
+        "num_infer_chunks": num_infer_chunks,
+        "num_label_chunks": num_label_chunks,
+        "num_correct_chunks": num_correct_chunks
+    }
+    return ret
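+
+# Illustrative usage (not part of the original demo code): a training script
+# would typically consume the handles returned by create_model() like this:
+#
+#     train_ret = create_model(args, vocab_size, num_labels, mode='train')
+#     train_ret['pyreader'].set_batch_generator(generator, places=place)
+#     exe.run(fluid.default_startup_program())
+#     train_ret['pyreader'].start()
+#     cost = exe.run(program, fetch_list=[train_ret['avg_cost']])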
+ +def create_lexnet_data_generator(args, + reader, + file_name, + place, + mode='train'): + if mode == 'train': + def wrapper(): + batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, [] + emi_lens = [] + for epoch in range(args.epoch): + print("data epoch: {}".format(epoch)) + for instance in reader.file_reader(file_name, mode="train")(): + words, labels, emission = instance + if len(seq_lens) < args.batch_size: + batch_words.append(words) + batch_labels.append(labels) + if batch_emissions is not None: + batch_emissions = np.concatenate((batch_emissions, emission)) + else: + batch_emissions = emission + seq_lens.append(len(words)) + emi_lens.append(emission.shape[0]) + if len(seq_lens) == args.batch_size: + + #print("batch words len", [len(seq) for seq in batch_words]) + #print("batch labels len", [len(seq) for seq in batch_labels]) + #print("emi lens:", emi_lens) + #print("emission first dim:", batch_emissions.shape[0]) + #print("reduced seq_lens:", sum(seq_lens)) + t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place) + t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place) + t_emissions = fluid.create_lod_tensor(batch_emissions, [seq_lens], place) + yield t_words, t_labels, t_emissions + batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, [] + emi_lens = [] + + if len(seq_lens) > 0: + t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place) + t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place) + t_emissions = fluid.create_lod_tensor(batch_emissions, [seq_lens], place) + yield t_words, t_labels, t_emissions + batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, [] + + else: + def wrapper(): + batch_words, batch_labels, seq_lens = [], [], [] + for instance in reader.file_reader(file_name, mode="test")(): + words, labels = instance + if len(seq_lens) < args.batch_size: + batch_words.append(words) + batch_labels.append(labels) + seq_lens.append(len(words)) + if len(seq_lens) == args.batch_size: + t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place) + t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place) + yield t_words, t_labels + batch_words, batch_labels, seq_lens = [], [], [] + + if len(seq_lens) > 0: + t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place) + t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place) + yield t_words, t_labels + batch_words, batch_labels, seq_lens = [], [], [] + return wrapper + +def create_pyreader(args, + file_name, + feed_list, + place, + model='lac', + reader=None, + return_reader=False, + mode='train'): + reader = SequenceLabelReader( + vocab_path=args.vocab_path, + label_map_config=args.label_map_config, + max_seq_len=args.max_seq_len, + do_lower_case=args.do_lower_case, + random_seed=args.random_seed) + return reader.data_generator(file_name,args.batch_size,args.epoch,shuffle=False,phase="train") + + +def create_ernie_model(args, ernie_config): + """ + Create Model for LAC based on ERNIE encoder + """ + # ERNIE's input data + + src_ids = fluid.data( + name='src_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') + sent_ids = fluid.data( + name='sent_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') + pos_ids = fluid.data( + name='pos_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') + input_mask = fluid.data( + name='input_mask', shape=[-1, args.max_seq_len, 1], dtype='float32') + + padded_labels = fluid.data( + name='padded_labels', shape=[-1, args.max_seq_len, 1], dtype='int64') + + seq_lens 
= fluid.data( + name='seq_lens', shape=[-1], dtype='int64', lod_level=0) + + squeeze_labels = fluid.layers.squeeze(padded_labels, axes=[-1]) + + # ernie_pyreader + ernie_inputs = { + "src_ids": src_ids, + "sent_ids": sent_ids, + "pos_ids": pos_ids, + "input_mask": input_mask, + "seq_lens": seq_lens + } + embeddings = ernie_encoder(ernie_inputs, ernie_config=ernie_config) + + padded_token_embeddings = embeddings["padded_token_embeddings"] + + emission = fluid.layers.fc( + size=args.num_labels, + input=padded_token_embeddings, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-args.init_bound, high=args.init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4)), + num_flatten_dims=2) + + crf_cost = fluid.layers.linear_chain_crf( + input=emission, + label=padded_labels, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=args.crf_learning_rate), + length=seq_lens) + + avg_cost = fluid.layers.mean(x=crf_cost) + crf_decode = fluid.layers.crf_decoding( + input=emission, + param_attr=fluid.ParamAttr(name='crfw'), + length=seq_lens) + + (precision, recall, f1_score, num_infer_chunks, num_label_chunks, + num_correct_chunks) = fluid.layers.chunk_eval( + input=crf_decode, + label=squeeze_labels, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((args.num_labels - 1) / 2.0)), + seq_length=seq_lens) + chunk_evaluator = fluid.metrics.ChunkEvaluator() + chunk_evaluator.reset() + + ret = { + "feed_list": + [src_ids, sent_ids, pos_ids, input_mask, padded_labels, seq_lens], + "words": src_ids, + "pos_ids":pos_ids, + "sent_ids":sent_ids, + "input_mask":input_mask, + "labels": padded_labels, + "seq_lens": seq_lens, + "avg_cost": avg_cost, + "crf_decode": crf_decode, + "precision": precision, + "recall": recall, + "f1_score": f1_score, + "chunk_evaluator": chunk_evaluator, + "num_infer_chunks": num_infer_chunks, + "num_label_chunks": num_label_chunks, + "num_correct_chunks": num_correct_chunks, + "emission":emission, + "alpha": None + } + + return ret diff --git a/demo/pantheon/lexical_anlysis/downloads.py b/demo/pantheon/lexical_anlysis/downloads.py new file mode 100644 index 00000000..c0aae6ec --- /dev/null +++ b/demo/pantheon/lexical_anlysis/downloads.py @@ -0,0 +1,163 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Download script, download dataset and pretrain models. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import io +import os +import sys +import time +import hashlib +import tarfile +import requests + +FILE_INFO = { + 'BASE_URL': 'https://baidu-nlp.bj.bcebos.com/', + 'DATA': { + 'name': 'lexical_analysis-dataset-2.0.0.tar.gz', + 'md5': '71e4a9a36d0f0177929a1bccedca7dba' + }, + 'FINETURN_MODEL': { + 'name': 'lexical_analysis_finetuned-1.0.0.tar.gz', + 'md5': "ee2c7614b06dcfd89561fbbdaac34342" + }, + 'CONF': { + 'name': 'conf.tar.gz', + 'md5': "7a0fe28db46db496fff4361eebaa6515", + 'url': 'https://paddlemodels.bj.bcebos.com/PaddleSlim/pantheon/lexical_analysis/', + } +} + + +def usage(): + desc = ("\nDownload datasets and pretrained models for LAC.\n" + "Usage:\n" + " 1. python download.py all\n" + " 2. python download.py dataset\n" + " 3. python download.py finetuned\n" + " 4. python download.py conf\n") + print(desc) + + +def md5file(fname): + hash_md5 = hashlib.md5() + with io.open(fname, "rb") as fin: + for chunk in iter(lambda: fin.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + +def extract(fname, dir_path): + """ + Extract tar.gz file + """ + try: + tar = tarfile.open(fname, "r") + file_names = tar.getnames() + for file_name in file_names: + tar.extract(file_name, dir_path) + print(file_name) + tar.close() + except Exception as e: + raise e + + +def _download(url, filename, md5sum): + """ + Download file and check md5 + """ + retry = 0 + retry_limit = 3 + chunk_size = 4096 + while not (os.path.exists(filename) and md5file(filename) == md5sum): + if retry < retry_limit: + retry += 1 + else: + raise RuntimeError( + "Cannot download dataset ({0}) with retry {1} times.".format( + url, retry_limit)) + try: + start = time.time() + size = 0 + res = requests.get(url, stream=True) + filesize = int(res.headers['content-length']) + if res.status_code == 200: + print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024)) + # save by chunk + with io.open(filename, "wb") as fout: + for chunk in res.iter_content(chunk_size=chunk_size): + if chunk: + fout.write(chunk) + size += len(chunk) + pr = '>' * int(size * 50 / filesize) + print( + '\r[Process ]: %s%.2f%%' % + (pr, float(size / filesize * 100)), + end='') + end = time.time() + print("\n[CostTime]: %.2f s" % (end - start)) + except Exception as e: + print(e) + + +def download(name, dir_path): + # import ipdb; ipdb.set_trace() + if name == 'CONF': + url = FILE_INFO[name]['url'] + FILE_INFO[name]['name'] + else: + url = FILE_INFO['BASE_URL'] + FILE_INFO[name]['name'] + file_path = os.path.join(dir_path, FILE_INFO[name]['name']) + + if not os.path.exists(dir_path): + os.makedirs(dir_path) + + # download data + print("Downloading : %s" % name) + _download(url, file_path, FILE_INFO[name]['md5']) + + # extract data + print("Extracting : %s" % file_path) + extract(file_path, dir_path) + os.remove(file_path) + + +if __name__ == '__main__': + if len(sys.argv) != 2: + usage() + sys.exit(1) + pwd = os.path.join(os.path.dirname(__file__), './') + ernie_dir = os.path.join(os.path.dirname(__file__), './pretrained') + + if sys.argv[1] == 'all': + download('DATA', pwd) + download('FINETURN_MODEL', pwd) + download('CONF', pwd) + + if sys.argv[1] == "dataset": + download('DATA', pwd) + + elif sys.argv[1] == "finetuned": + download('FINETURN_MODEL', pwd) + + elif sys.argv[1] == "conf": + download('CONF', pwd) + + else: + usage() + diff --git a/demo/pantheon/lexical_anlysis/ernie_reader.py 
diff --git a/demo/pantheon/lexical_anlysis/ernie_reader.py b/demo/pantheon/lexical_anlysis/ernie_reader.py
new file mode 100755
index 00000000..5e8b6e4b
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/ernie_reader.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module provides the reader for the ERNIE model
+"""
+
+import sys
+
+from collections import namedtuple
+import numpy as np
+
+sys.path.append("..")
+from preprocess.ernie.task_reader import BaseReader, tokenization
+
+
+def pad_batch_data(insts,
+                   pad_idx=0,
+                   max_len=128,
+                   return_pos=False,
+                   return_input_mask=False,
+                   return_max_len=False,
+                   return_num_token=False,
+                   return_seq_lens=False):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and input mask.
+    """
+    return_list = []
+    # Unlike the dynamic variant (max_len = max(len(inst) for inst in insts)),
+    # every batch here is padded to the fixed length given by max_len.
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and make no effect on parameter gradients.
+
+    inst_data = np.array(
+        [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
+    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+
+    # position data
+    if return_pos:
+        inst_pos = np.array([
+            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
+            for inst in insts
+        ])
+
+        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+
+    if return_input_mask:
+        # This is used to avoid attention on paddings.
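+        # input_mask marks real tokens with 1.0 and padding positions with
+        # 0.0; the encoder later turns it into a large negative attention
+        # bias so that padded positions get near-zero attention weight.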
+        input_mask_data = np.array([[1] * len(inst) + [0] *
+                                    (max_len - len(inst)) for inst in insts])
+        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
+        return_list += [input_mask_data.astype("float32")]
+
+    if return_max_len:
+        return_list += [max_len]
+
+    if return_num_token:
+        num_token = 0
+        for inst in insts:
+            num_token += len(inst)
+        return_list += [num_token]
+
+    if return_seq_lens:
+        seq_lens = np.array([len(inst) for inst in insts])
+        return_list += [seq_lens.astype("int64").reshape([-1])]
+
+    return return_list if len(return_list) > 1 else return_list[0]
+
+
+class SequenceLabelReader(BaseReader):
+    """SequenceLabelReader"""
+
+    def _pad_batch_records(self, batch_records):
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [
+            record.text_type_ids for record in batch_records
+        ]
+        batch_position_ids = [record.position_ids for record in batch_records]
+        batch_label_ids = [record.label_ids for record in batch_records]
+
+        # padding
+        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
+            batch_token_ids,
+            max_len=self.max_seq_len,
+            pad_idx=self.pad_id,
+            return_input_mask=True,
+            return_seq_lens=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
+        padded_label_ids = pad_batch_data(
+            batch_label_ids,
+            max_len=self.max_seq_len,
+            pad_idx=len(self.label_map) - 1)
+
+        return_list = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            input_mask, padded_label_ids, batch_seq_lens
+        ]
+        return return_list
+
+    def _reseg_token_label(self, tokens, labels, tokenizer):
+        assert len(tokens) == len(labels)
+        ret_tokens = []
+        ret_labels = []
+        for token, label in zip(tokens, labels):
+            sub_token = tokenizer.tokenize(token)
+            if len(sub_token) == 0:
+                continue
+            ret_tokens.extend(sub_token)
+            ret_labels.append(label)
+            if len(sub_token) < 2:
+                continue
+            # a B- label only applies to the first sub-token; the remaining
+            # sub-tokens of the same word get the matching I- label
+            sub_label = label
+            if label.startswith("B-"):
+                sub_label = "I-" + label[2:]
+            ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+        assert len(ret_tokens) == len(ret_labels)
+        return ret_tokens, ret_labels
+
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        # tokens and labels in the input file are joined by the "\2" control
+        # character (which may render as an empty string in some viewers)
+        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
+        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
+        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
+
+        if len(tokens) > max_seq_length - 2:
+            tokens = tokens[0:(max_seq_length - 2)]
+            labels = labels[0:(max_seq_length - 2)]
+        tokens = ["[CLS]"] + tokens + ["[SEP]"]
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        position_ids = list(range(len(token_ids)))
+        text_type_ids = [0] * len(token_ids)
+        no_entity_id = len(self.label_map) - 1
+        labels = [
+            label if label in self.label_map else u"O" for label in labels
+        ]
+        label_ids = [no_entity_id] + [
+            self.label_map[label] for label in labels
+        ] + [no_entity_id]
+
+        Record = namedtuple(
+            'Record',
+            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
+        record = Record(
+            token_ids=token_ids,
+            text_type_ids=text_type_ids,
+            position_ids=position_ids,
+            label_ids=label_ids)
+        return record
diff --git a/demo/pantheon/lexical_anlysis/eval.py b/demo/pantheon/lexical_anlysis/eval.py
new file mode 100755
index 00000000..b7a9072b
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/eval.py
@@ -0,0 +1,131 @@
+# -*- coding: UTF-8 -*-
+# Copyright (c) 2019 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import time +import sys + +import paddle.fluid as fluid +import paddle + +import model_utils +import reader +import creator +sys.path.append('models/') +from model_check import check_cuda +from model_check import check_version + +parser = argparse.ArgumentParser(__doc__) +# 1. model parameters +model_g = model_utils.ArgumentGroup(parser, "model", "model configuration") +model_g.add_arg("word_emb_dim", int, 128, + "The dimension in which a word is embedded.") +model_g.add_arg("grnn_hidden_dim", int, 128, + "The number of hidden nodes in the GRNN layer.") +model_g.add_arg("bigru_num", int, 2, + "The number of bi_gru layers in the network.") +model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.") + +# 2. data parameters +data_g = model_utils.ArgumentGroup(parser, "data", "data paths") +data_g.add_arg("word_dict_path", str, "./conf/word.dic", + "The path of the word dictionary.") +data_g.add_arg("label_dict_path", str, "./conf/tag.dic", + "The path of the label dictionary.") +data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", + "The path of the word replacement Dictionary.") +data_g.add_arg("test_data", str, "./data/test.tsv", + "The folder where the training data is located.") +data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model") +data_g.add_arg( + "batch_size", int, 200, + "The number of sequences contained in a mini-batch, " + "or the maximum number of tokens (include paddings) contained in a mini-batch." 
+) + + +def do_eval(args): + print('do_eval...........') + dataset = reader.Dataset(args) + + test_program = fluid.Program() + with fluid.program_guard(test_program, fluid.default_startup_program()): + with fluid.unique_name.guard(): + test_ret = creator.create_model( + args, dataset.vocab_size, dataset.num_labels, mode='test') + test_program = test_program.clone(for_test=True) + + # init executor + if args.use_cuda: + place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) + else: + place = fluid.CPUPlace() + + pyreader = creator.create_pyreader( + args, + file_name=args.test_data, + feed_list=test_ret['feed_list'], + place=place, + model='lac', + reader=dataset, + mode='test') + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + # load model + model_utils.init_checkpoint(exe, args.init_checkpoint, test_program) + test_process( + exe=exe, program=test_program, reader=pyreader, test_ret=test_ret) + + +def test_process(exe, program, reader, test_ret): + """ + the function to execute the infer process + :param exe: the fluid Executor + :param program: the infer_program + :param reader: data reader + :return: the list of prediction result + """ + print('test_process...........') + test_ret["chunk_evaluator"].reset() + start_time = time.time() + reader.start() + while True: + try: + nums_infer, nums_label, nums_correct = exe.run( + program, + fetch_list=[ + test_ret["num_infer_chunks"], + test_ret["num_label_chunks"], + test_ret["num_correct_chunks"], + ]) + test_ret["chunk_evaluator"].update(nums_infer, nums_label, nums_correct) + except fluid.core.EOFException: + reader.reset() + break + + precision, recall, f1 = test_ret["chunk_evaluator"].eval() + end_time = time.time() + print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s" % + (precision, recall, f1, end_time - start_time)) + + +if __name__ == '__main__': + args = parser.parse_args() + check_cuda(args.use_cuda) + check_version() + do_eval(args) diff --git a/demo/pantheon/lexical_anlysis/model_utils.py b/demo/pantheon/lexical_anlysis/model_utils.py new file mode 100755 index 00000000..d9f10b17 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/model_utils.py @@ -0,0 +1,248 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +util tools +""" +from __future__ import print_function +import os +import sys +import numpy as np +import paddle.fluid as fluid +import yaml +import io + + +def str2bool(v): + """ + argparse does not support True or False in python + """ + return v.lower() in ("true", "t", "1") + + +class ArgumentGroup(object): + """ + Put arguments to one group + """ + + def __init__(self, parser, title, des): + """none""" + self._group = parser.add_argument_group(title=title, description=des) + + def add_arg(self, name, type, default, help, **kwargs): + """ Add argument """ + type = str2bool if type == bool else type + self._group.add_argument( + "--" + name, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +def load_yaml(parser, file_name, **kwargs): + with io.open(file_name, 'r', encoding='utf8') as f: + args = yaml.load(f) + for title in args: + group = parser.add_argument_group(title=title, description='') + for name in args[title]: + _type = type(args[title][name]['val']) + _type = str2bool if _type == bool else _type + group.add_argument( + "--" + name, + default=args[title][name]['val'], + type=_type, + help=args[title][name]['meaning'] + + ' Default: %(default)s.', + **kwargs) + + +def print_arguments(args): + """none""" + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).items()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def to_str(string, encoding="utf-8"): + """convert to str for print""" + if sys.version_info.major == 3: + if isinstance(string, bytes): + return string.decode(encoding) + elif sys.version_info.major == 2: + if isinstance(string, unicode): + if os.name == 'nt': + return string + else: + return string.encode(encoding) + return string + + +def to_lodtensor(data, place): + """ + Convert data in list into lodtensor. 
+ """ + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.Tensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def parse_result(words, crf_decode, dataset): + """ parse result """ + offset_list = (crf_decode.lod())[0] + words = np.array(words) + crf_decode = np.array(crf_decode) + batch_size = len(offset_list) - 1 + + batch_out = [] + for sent_index in range(batch_size): + begin, end = offset_list[sent_index], offset_list[sent_index + 1] + sent = [dataset.id2word_dict[str(id[0])] for id in words[begin:end]] + tags = [ + dataset.id2label_dict[str(id[0])] for id in crf_decode[begin:end] + ] + + sent_out = [] + tags_out = [] + parital_word = "" + for ind, tag in enumerate(tags): + # for the first word + if parital_word == "": + parital_word = sent[ind] + tags_out.append(tag.split('-')[0]) + continue + + # for the beginning of word + if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"): + sent_out.append(parital_word) + tags_out.append(tag.split('-')[0]) + parital_word = sent[ind] + continue + + parital_word += sent[ind] + + # append the last word, except for len(tags)=0 + if len(sent_out) < len(tags_out): + sent_out.append(parital_word) + + batch_out.append([sent_out, tags_out]) + return batch_out + + +def parse_padding_result(words, crf_decode, seq_lens, dataset): + """ parse padding result """ + words = np.squeeze(words) + batch_size = len(seq_lens) + + batch_out = [] + for sent_index in range(batch_size): + + sent = [ + dataset.id2word_dict[str(id)] + for id in words[sent_index][1:seq_lens[sent_index] - 1] + ] + tags = [ + dataset.id2label_dict[str(id)] + for id in crf_decode[sent_index][1:seq_lens[sent_index] - 1] + ] + + sent_out = [] + tags_out = [] + parital_word = "" + for ind, tag in enumerate(tags): + # for the first word + if parital_word == "": + parital_word = sent[ind] + tags_out.append(tag.split('-')[0]) + continue + + # for the beginning of word + if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"): + sent_out.append(parital_word) + tags_out.append(tag.split('-')[0]) + parital_word = sent[ind] + continue + + parital_word += sent[ind] + + # append the last word, except for len(tags)=0 + if len(sent_out) < len(tags_out): + sent_out.append(parital_word) + + batch_out.append([sent_out, tags_out]) + return batch_out + + +def init_checkpoint(exe, init_checkpoint_path, main_program): + """ + Init CheckPoint + """ + assert os.path.exists( + init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path + + def existed_persitables(var): + """ + If existed presitabels + """ + if not fluid.io.is_persistable(var): + return False + if os.path.exists(os.path.join(init_checkpoint_path, var.name)): + print("INIT {}".format(var.name)) + return True + else: + print("SKIP {}".format(var.name)) + return False + + fluid.io.load_vars( + exe, + init_checkpoint_path, + main_program=main_program, + predicate=existed_persitables) + print("Load model from {}".format(init_checkpoint_path)) + + +def init_pretraining_params(exe, + pretraining_params_path, + main_program, + use_fp16=False): + """load params of pretrained model, NOT including moment, learning_rate""" + assert os.path.exists(pretraining_params_path + ), "[%s] cann't be found." 
% pretraining_params_path

    def _existed_params(var):
        if not isinstance(var, fluid.framework.Parameter):
            return False
        if os.path.exists(os.path.join(pretraining_params_path, var.name)):
            print("INIT {}".format(var.name))
            return True
        else:
            print("SKIP {}".format(var.name))
            return False

    fluid.io.load_vars(
        exe,
        pretraining_params_path,
        main_program=main_program,
        predicate=_existed_params)
    print("Load pretraining parameters from {}.".format(
        pretraining_params_path))
diff --git a/demo/pantheon/lexical_anlysis/models/__init__.py b/demo/pantheon/lexical_anlysis/models/__init__.py
new file mode 100755
index 00000000..e69de29b
diff --git a/demo/pantheon/lexical_anlysis/models/model_check.py b/demo/pantheon/lexical_anlysis/models/model_check.py
new file mode 100755
index 00000000..51713452
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/models/model_check.py
@@ -0,0 +1,73 @@
+#encoding=utf8
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle
+import paddle.fluid as fluid
+
+
+def check_cuda(use_cuda, err = \
+    "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
+     Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
+              ):
+    """
+    Log error and exit when use_cuda is set to True but the installed
+    paddlepaddle is the CPU-only version.
+    """
+    try:
+        if use_cuda == True and fluid.is_compiled_with_cuda() == False:
+            print(err)
+            sys.exit(1)
+    except Exception as e:
+        pass
+
+
+def check_version():
+    """
+    Log error and exit when the installed version of paddlepaddle is
+    not satisfied.
+    """
+    err = "PaddlePaddle version 1.6 or higher is required, " \
+          "or a suitable develop version is satisfied as well. \n" \
+          "Please make sure the version is good with your code." \
+
+    try:
+        fluid.require_version('1.6.0')
+    except Exception as e:
+        print(err)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    check_cuda(True)
+
+    check_cuda(False)
+
+    check_cuda(True, "This is only for testing.")
diff --git a/demo/pantheon/lexical_anlysis/models/representation/__init__.py b/demo/pantheon/lexical_anlysis/models/representation/__init__.py
new file mode 100755
index 00000000..e69de29b
diff --git a/demo/pantheon/lexical_anlysis/models/representation/ernie.py b/demo/pantheon/lexical_anlysis/models/representation/ernie.py
new file mode 100755
index 00000000..ced3196f
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/models/representation/ernie.py
@@ -0,0 +1,322 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module provides ErnieModel and ErnieConfig
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+import six
+import paddle.fluid as fluid
+
+from models.transformer_encoder import encoder, pre_process_layer
+
+
+def ernie_pyreader(args, pyreader_name):
+    """define standard ernie pyreader"""
+    src_ids = fluid.data(
+        name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    sent_ids = fluid.data(
+        name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    pos_ids = fluid.data(
+        name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    input_mask = fluid.data(
+        name='4', shape=[-1, args.max_seq_len, 1], dtype='float32')
+    labels = fluid.data(name='5', shape=[-1, 1], dtype='int64')
+    seq_lens = fluid.data(name='6', shape=[-1], dtype='int64')
+
+    pyreader = fluid.io.DataLoader.from_generator(
+        feed_list=[src_ids, sent_ids, pos_ids, input_mask, labels, seq_lens],
+        capacity=50,
+        iterable=False,
+        use_double_buffer=True)
+
+    ernie_inputs = {
+        "src_ids": src_ids,
+        "sent_ids": sent_ids,
+        "pos_ids": pos_ids,
+        "input_mask": input_mask,
+        "seq_lens": seq_lens
+    }
+    return pyreader, ernie_inputs, labels
+
+
+def ernie_encoder_with_paddle_hub(ernie_inputs, max_seq_len):
+    # NOTE: this helper depends on the external paddlehub package; import it
+    # lazily so the rest of the module works without PaddleHub installed.
+    import paddlehub as hub
+
+    ernie = hub.Module(name="ernie")
+    inputs, outputs, program = ernie.context(
+        trainable=True, max_seq_len=max_seq_len, learning_rate=1)
+
+    main_program = fluid.default_main_program()
+    input_dict = {
+        inputs["input_ids"].name: ernie_inputs["src_ids"],
+        inputs["segment_ids"].name: ernie_inputs["sent_ids"],
+        inputs["position_ids"].name: ernie_inputs["pos_ids"],
+        inputs["input_mask"].name: ernie_inputs["input_mask"]
+    }
+
+    hub.connect_program(
+        pre_program=main_program,
+        next_program=program,
+        input_dict=input_dict,
+        inplace=True)
+
+    enc_out = outputs["sequence_output"]
+    unpad_enc_out = fluid.layers.sequence_unpad(
+        enc_out, length=ernie_inputs["seq_lens"])
+    cls_feats = outputs["pooled_output"]
+
+    embeddings = {
+        "sentence_embeddings": cls_feats,
+        "token_embeddings": unpad_enc_out,
+        "padded_token_embeddings": enc_out
+    }
+
+    for k, v in embeddings.items():
+        v.persistable = True
+
+    return embeddings
+
+
+def ernie_encoder(ernie_inputs, ernie_config):
+    """return sentence embedding and token embeddings"""
+
+    ernie = ErnieModel(
+        src_ids=ernie_inputs["src_ids"],
+        position_ids=ernie_inputs["pos_ids"],
+        sentence_ids=ernie_inputs["sent_ids"],
+        input_mask=ernie_inputs["input_mask"],
+        config=ernie_config)
+
+    enc_out = ernie.get_sequence_output()
+    unpad_enc_out = fluid.layers.sequence_unpad(
+        enc_out, length=ernie_inputs["seq_lens"])
+    cls_feats = ernie.get_pooled_output()
+
+    embeddings = {
+        "sentence_embeddings": cls_feats,
+        "token_embeddings": unpad_enc_out,
+        "padded_token_embeddings": enc_out
+    }
+
+    for k, v in embeddings.items():
+        v.persistable = True
+
+    return embeddings
+
+
+class ErnieConfig(object):
+    """ErnieConfig"""
+
+    def __init__(self, config_path):
+        
self._config_dict = self._parse(config_path) + + def _parse(self, config_path): + try: + with open(config_path) as json_file: + config_dict = json.load(json_file) + except Exception: + raise IOError("Error in parsing Ernie model config file '%s'" % + config_path) + else: + return config_dict + + def __getitem__(self, key): + return self._config_dict[key] + + def print_config(self): + """print config""" + for arg, value in sorted(six.iteritems(self._config_dict)): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +class ErnieModel(object): + """ErnieModel""" + + def __init__(self, + src_ids, + position_ids, + sentence_ids, + input_mask, + config, + weight_sharing=True, + use_fp16=False): + + self._emb_size = config['hidden_size'] + self._n_layer = config['num_hidden_layers'] + self._n_head = config['num_attention_heads'] + self._voc_size = config['vocab_size'] + self._max_position_seq_len = config['max_position_embeddings'] + self._sent_types = config['type_vocab_size'] + self._hidden_act = config['hidden_act'] + self._prepostprocess_dropout = config['hidden_dropout_prob'] + self._attention_dropout = config['attention_probs_dropout_prob'] + self._weight_sharing = weight_sharing + + self._word_emb_name = "word_embedding" + self._pos_emb_name = "pos_embedding" + self._sent_emb_name = "sent_embedding" + self._dtype = "float16" if use_fp16 else "float32" + + # Initialize all weigths by truncated normal initializer, and all biases + # will be initialized by constant zero by default. + self._param_initializer = fluid.initializer.TruncatedNormal( + scale=config['initializer_range']) + + self._build_model(src_ids, position_ids, sentence_ids, input_mask) + + def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): + # padding id in vocabulary must be set to 0 + emb_out = fluid.layers.embedding( + input=src_ids, + size=[self._voc_size, self._emb_size], + dtype=self._dtype, + param_attr=fluid.ParamAttr( + name=self._word_emb_name, initializer=self._param_initializer), + is_sparse=False) + position_emb_out = fluid.layers.embedding( + input=position_ids, + size=[self._max_position_seq_len, self._emb_size], + dtype=self._dtype, + param_attr=fluid.ParamAttr( + name=self._pos_emb_name, initializer=self._param_initializer)) + + sent_emb_out = fluid.layers.embedding( + sentence_ids, + size=[self._sent_types, self._emb_size], + dtype=self._dtype, + param_attr=fluid.ParamAttr( + name=self._sent_emb_name, initializer=self._param_initializer)) + + emb_out = emb_out + position_emb_out + emb_out = emb_out + sent_emb_out + + emb_out = pre_process_layer( + emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') + + if self._dtype == "float16": + input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) + self_attn_mask = fluid.layers.matmul( + x=input_mask, y=input_mask, transpose_y=True) + + self_attn_mask = fluid.layers.scale( + x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) + n_head_self_attn_mask = fluid.layers.stack( + x=[self_attn_mask] * self._n_head, axis=1) + n_head_self_attn_mask.stop_gradient = True + + self._enc_out = encoder( + enc_input=emb_out, + attn_bias=n_head_self_attn_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._emb_size // self._n_head, + d_value=self._emb_size // self._n_head, + d_model=self._emb_size, + d_inner_hid=self._emb_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + 
hidden_act=self._hidden_act, + preprocess_cmd="", + postprocess_cmd="dan", + param_initializer=self._param_initializer, + name='encoder') + + def get_sequence_output(self): + """Get embedding of each token for squence labeling""" + return self._enc_out + + def get_pooled_output(self): + """Get the first feature of each sequence for classification""" + next_sent_feat = fluid.layers.slice( + input=self._enc_out, axes=[1], starts=[0], ends=[1]) + next_sent_feat = fluid.layers.fc( + input=next_sent_feat, + size=self._emb_size, + act="tanh", + param_attr=fluid.ParamAttr( + name="pooled_fc.w_0", initializer=self._param_initializer), + bias_attr="pooled_fc.b_0") + return next_sent_feat + + def get_pretraining_output(self, mask_label, mask_pos, labels): + """Get the loss & accuracy for pretraining""" + + mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') + + # extract the first token feature in each sentence + next_sent_feat = self.get_pooled_output() + reshaped_emb_out = fluid.layers.reshape( + x=self._enc_out, shape=[-1, self._emb_size]) + # extract masked tokens' feature + mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) + + # transform: fc + mask_trans_feat = fluid.layers.fc( + input=mask_feat, + size=self._emb_size, + act=self._hidden_act, + param_attr=fluid.ParamAttr( + name='mask_lm_trans_fc.w_0', + initializer=self._param_initializer), + bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) + # transform: layer norm + mask_trans_feat = pre_process_layer( + mask_trans_feat, 'n', name='mask_lm_trans') + + mask_lm_out_bias_attr = fluid.ParamAttr( + name="mask_lm_out_fc.b_0", + initializer=fluid.initializer.Constant(value=0.0)) + if self._weight_sharing: + fc_out = fluid.layers.matmul( + x=mask_trans_feat, + y=fluid.default_main_program().global_block().var( + self._word_emb_name), + transpose_y=True) + fc_out += fluid.layers.create_parameter( + shape=[self._voc_size], + dtype=self._dtype, + attr=mask_lm_out_bias_attr, + is_bias=True) + + else: + fc_out = fluid.layers.fc(input=mask_trans_feat, + size=self._voc_size, + param_attr=fluid.ParamAttr( + name="mask_lm_out_fc.w_0", + initializer=self._param_initializer), + bias_attr=mask_lm_out_bias_attr) + + mask_lm_loss = fluid.layers.softmax_with_cross_entropy( + logits=fc_out, label=mask_label) + mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) + + next_sent_fc_out = fluid.layers.fc( + input=next_sent_feat, + size=2, + param_attr=fluid.ParamAttr( + name="next_sent_fc.w_0", initializer=self._param_initializer), + bias_attr="next_sent_fc.b_0") + + next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy( + logits=next_sent_fc_out, label=labels, return_softmax=True) + + next_sent_acc = fluid.layers.accuracy( + input=next_sent_softmax, label=labels) + + mean_next_sent_loss = fluid.layers.mean(next_sent_loss) + + loss = mean_next_sent_loss + mean_mask_lm_loss + return next_sent_acc, mean_mask_lm_loss, loss diff --git a/demo/pantheon/lexical_anlysis/models/sequence_labeling/__init__.py b/demo/pantheon/lexical_anlysis/models/sequence_labeling/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py b/demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py new file mode 100755 index 00000000..414e89b0 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py @@ -0,0 +1,174 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The function lex_net(args) defines the lexical analysis network structure
+"""
+import sys
+import os
+import math
+
+import paddle.fluid as fluid
+from paddle.fluid.initializer import NormalInitializer
+
+
+def lex_net(word,
+            args,
+            vocab_size,
+            num_labels,
+            teacher_crf_decode=None,
+            for_infer=True,
+            target=None):
+    """
+    define the lexical analysis network structure
+    word: stores the input of the model
+    for_infer: a boolean value, indicating if the model to be created is for
+        training or predicting.
+
+    return:
+        for infer: the prediction
+        otherwise: the combined loss, the CRF loss, the distillation loss
+            and the prediction
+    """
+    word_emb_dim = args.word_emb_dim
+    grnn_hidden_dim = args.grnn_hidden_dim
+    emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(args) else 1.0
+    crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(args) else 1.0
+    bigru_num = args.bigru_num
+    init_bound = 0.1
+    IS_SPARSE = True
+
+    def _bigru_layer(input_feature):
+        """
+        define the bidirectional gru layer
+        """
+        pre_gru = fluid.layers.fc(
+            input=input_feature,
+            size=grnn_hidden_dim * 3,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+        gru = fluid.layers.dynamic_gru(
+            input=pre_gru,
+            size=grnn_hidden_dim,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        pre_gru_r = fluid.layers.fc(
+            input=input_feature,
+            size=grnn_hidden_dim * 3,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+        gru_r = fluid.layers.dynamic_gru(
+            input=pre_gru_r,
+            size=grnn_hidden_dim,
+            is_reverse=True,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
+        return bi_merge
+
+    def log_softmax(logits, axis=-1):
+        # plain log-softmax: log(p) = logits - log(sum(exp(logits)))
+        logsoftmax = logits - fluid.layers.log(
+            fluid.layers.reduce_sum(fluid.layers.exp(logits), axis))
+        return logsoftmax
+
+    def cross_entropy(student, teacher):
+        # token-level soft-label cross entropy, summed over each sequence
+        ce_loss = -1.0 * fluid.layers.reduce_sum(
+            teacher * fluid.layers.log(student), dim=1)
+        ce_loss = fluid.layers.sequence_pool(ce_loss, "sum")
+        return ce_loss
+
+    def kl_div(student, teacher):
+        # token-level KL divergence KL(teacher || student), summed per sequence
+        ce_loss = fluid.layers.reduce_sum(
+            teacher * (fluid.layers.log(teacher) - fluid.layers.log(student)),
+            dim=1)
+        ce_loss = fluid.layers.sequence_pool(ce_loss, "sum")
+        return ce_loss
+
+    def pred(student, teacher, t=1.0):
+        # distillation loss: cross entropy between the teacher's softened
+        # distribution and the student's log-softmax at temperature t
+        return fluid.layers.reduce_mean(
+            -1.0 * fluid.layers.softmax(teacher) * log_softmax(student / t))
+
+    def normalize(alpha):
+        """normalize each row of alpha to sum to 1, alpha shape: (-1, tag_num)
+        """
+        tag_num = alpha.shape[1]
+        sum_alpha = fluid.layers.reduce_sum(alpha, dim=1)
+        
sum_alpha = fluid.layers.unsqueeze(sum_alpha, axes=[1]) + sum_alpha = fluid.layers.expand(sum_alpha, [1, tag_num]) + norm_alpha = alpha / sum_alpha + return norm_alpha + + def _net_conf(word, target=None): + """ + Configure the network + """ + word_embedding = fluid.embedding( + input=word, + size=[vocab_size, word_emb_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + learning_rate=emb_lr, + name="word_emb", + initializer=fluid.initializer.Uniform( + low=-init_bound, high=init_bound))) + + input_feature = word_embedding + for i in range(bigru_num): + bigru_output = _bigru_layer(input_feature) + input_feature = bigru_output + + emission = fluid.layers.fc( + size=num_labels, + input=bigru_output, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-init_bound, high=init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4))) + + if target is not None: + crf_cost = fluid.layers.linear_chain_crf( + input=emission, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=crf_lr)) + if teacher_crf_decode is not None: + teacher_cost = pred(student=emission, teacher=teacher_crf_decode,t=1.0) + else: + teacher_cost = 0 + print('no teacher emission') + crf_avg_cost = fluid.layers.mean(x=crf_cost) + alpha, beta = 0.5, 0.5 + print("alpha * crf_avg_cost + beta * teacher_cost: ", alpha, beta) + avg_cost = alpha * crf_avg_cost+ beta * teacher_cost + crf_decode = fluid.layers.crf_decoding( + input=emission, param_attr=fluid.ParamAttr(name='crfw')) + return avg_cost, crf_avg_cost, teacher_cost, crf_decode + + else: + size = emission.shape[1] + fluid.layers.create_parameter( + shape=[size + 2, size], dtype=emission.dtype, name='crfw') + crf_decode = fluid.layers.crf_decoding( + input=emission, param_attr=fluid.ParamAttr(name='crfw')) + + return crf_decode + + if for_infer: + return _net_conf(word) + + else: + # assert target != None, "target is necessary for training" + return _net_conf(word, target) diff --git a/demo/pantheon/lexical_anlysis/models/transformer_encoder.py b/demo/pantheon/lexical_anlysis/models/transformer_encoder.py new file mode 100755 index 00000000..77908896 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/models/transformer_encoder.py @@ -0,0 +1,342 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Transformer encoder.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from functools import partial + +import paddle.fluid as fluid +import paddle.fluid.layers as layers + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None, + param_initializer=None, + name='multi_head_att'): + """ + Multi-Head Attention. 
Note that attn_bias is added to the logit before
+    computing softmax activation to mask certain selected positions so that
+    they will not be considered in attention weights.
+    """
+    keys = queries if keys is None else keys
+    values = keys if values is None else values
+
+    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
+        raise ValueError(
+            "Inputs: queries, keys and values should all be 3-D tensors.")
+
+    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
+        """
+        Add linear projection to queries, keys, and values.
+        """
+        q = layers.fc(input=queries,
+                      size=d_key * n_head,
+                      num_flatten_dims=2,
+                      param_attr=fluid.ParamAttr(
+                          name=name + '_query_fc.w_0',
+                          initializer=param_initializer),
+                      bias_attr=name + '_query_fc.b_0')
+        k = layers.fc(input=keys,
+                      size=d_key * n_head,
+                      num_flatten_dims=2,
+                      param_attr=fluid.ParamAttr(
+                          name=name + '_key_fc.w_0',
+                          initializer=param_initializer),
+                      bias_attr=name + '_key_fc.b_0')
+        v = layers.fc(input=values,
+                      size=d_value * n_head,
+                      num_flatten_dims=2,
+                      param_attr=fluid.ParamAttr(
+                          name=name + '_value_fc.w_0',
+                          initializer=param_initializer),
+                      bias_attr=name + '_value_fc.b_0')
+        return q, k, v
+
+    def __split_heads(x, n_head):
+        """
+        Reshape the last dimension of input tensor x so that it becomes two
+        dimensions and then transpose. Specifically, transform a tensor with
+        shape [bs, max_sequence_length, n_head * hidden_dim] into a tensor
+        with shape [bs, n_head, max_sequence_length, hidden_dim].
+        """
+        hidden_size = x.shape[-1]
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
+        reshaped = layers.reshape(
+            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
+
+        # permute the dimensions into:
+        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
+        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+
+    def __combine_heads(x):
+        """
+        Transpose and then reshape the last two dimensions of input tensor x
+        so that they become one dimension, which is the reverse of __split_heads.
+        """
+        if len(x.shape) == 3:
+            return x
+        if len(x.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+
+        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
+        return layers.reshape(
+            x=trans_x,
+            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+            inplace=True)
+
+    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
+        """
+        Scaled Dot-Product Attention
+        """
+        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
+        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+        if attn_bias:
+            product += attn_bias
+        weights = layers.softmax(product)
+        if dropout_rate:
+            weights = layers.dropout(
+                weights,
+                dropout_prob=dropout_rate,
+                dropout_implementation="upscale_in_train",
+                is_test=False)
+        out = layers.matmul(weights, v)
+        return out
+
+    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
+
+    if cache is not None:  # use cache and concat time steps
+        # Since the inplace reshape in __split_heads changes the shape of k and
+        # v, which is the cache input for next time step, reshape the cache
+        # input from the previous time step first.
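The `__split_heads`/`__combine_heads` pair above is pure shape bookkeeping and can be sanity-checked outside the framework. A minimal NumPy sketch of the round trip, with hypothetical sizes not taken from any config in this patch:

```python
import numpy as np

# Hypothetical sizes: batch 2, sequence length 4, 3 heads of width 5.
bs, seq_len, n_head, head_dim = 2, 4, 3, 5
x = np.random.rand(bs, seq_len, n_head * head_dim).astype("float32")

# __split_heads: [bs, seq, n_head * d] -> [bs, n_head, seq, d]
split = x.reshape(bs, seq_len, n_head, head_dim).transpose(0, 2, 1, 3)

# __combine_heads: the exact inverse, back to [bs, seq, n_head * d]
combined = split.transpose(0, 2, 1, 3).reshape(bs, seq_len, n_head * head_dim)

assert np.allclose(x, combined)  # the round trip is lossless
```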
+        k = cache["k"] = layers.concat(
+            [layers.reshape(
+                cache["k"], shape=[0, 0, d_model]), k], axis=1)
+        v = cache["v"] = layers.concat(
+            [layers.reshape(
+                cache["v"], shape=[0, 0, d_model]), v], axis=1)
+
+    q = __split_heads(q, n_head)
+    k = __split_heads(k, n_head)
+    v = __split_heads(v, n_head)
+
+    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
+                                                  dropout_rate)
+
+    out = __combine_heads(ctx_multiheads)
+
+    # Project back to the model size.
+    proj_out = layers.fc(input=out,
+                         size=d_model,
+                         num_flatten_dims=2,
+                         param_attr=fluid.ParamAttr(
+                             name=name + '_output_fc.w_0',
+                             initializer=param_initializer),
+                         bias_attr=name + '_output_fc.b_0')
+    return proj_out
+
+
+def positionwise_feed_forward(x,
+                              d_inner_hid,
+                              d_hid,
+                              dropout_rate,
+                              hidden_act,
+                              param_initializer=None,
+                              name='ffn'):
+    """
+    Position-wise Feed-Forward Networks.
+    This module consists of two linear transformations with a ReLU activation
+    in between, which is applied to each position separately and identically.
+    """
+    hidden = layers.fc(input=x,
+                       size=d_inner_hid,
+                       num_flatten_dims=2,
+                       act=hidden_act,
+                       param_attr=fluid.ParamAttr(
+                           name=name + '_fc_0.w_0',
+                           initializer=param_initializer),
+                       bias_attr=name + '_fc_0.b_0')
+    if dropout_rate:
+        hidden = layers.dropout(
+            hidden,
+            dropout_prob=dropout_rate,
+            dropout_implementation="upscale_in_train",
+            is_test=False)
+    out = layers.fc(input=hidden,
+                    size=d_hid,
+                    num_flatten_dims=2,
+                    param_attr=fluid.ParamAttr(
+                        name=name + '_fc_1.w_0', initializer=param_initializer),
+                    bias_attr=name + '_fc_1.b_0')
+    return out
+
+
+def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
+                           name=''):
+    """
+    Add residual connection, layer normalization and dropout to the out tensor
+    optionally according to the value of process_cmd.
+    This will be used before or after multi-head attention and position-wise
+    feed-forward networks.
+    """
+    for cmd in process_cmd:
+        if cmd == "a":  # add residual connection
+            out = out + prev_out if prev_out else out
+        elif cmd == "n":  # add layer normalization
+            out_dtype = out.dtype
+            if out_dtype == fluid.core.VarDesc.VarType.FP16:
+                out = layers.cast(x=out, dtype="float32")
+            out = layers.layer_norm(
+                out,
+                begin_norm_axis=len(out.shape) - 1,
+                param_attr=fluid.ParamAttr(
+                    name=name + '_layer_norm_scale',
+                    initializer=fluid.initializer.Constant(1.)),
+                bias_attr=fluid.ParamAttr(
+                    name=name + '_layer_norm_bias',
+                    initializer=fluid.initializer.Constant(0.)))
+            if out_dtype == fluid.core.VarDesc.VarType.FP16:
+                out = layers.cast(x=out, dtype="float16")
+        elif cmd == "d":  # add dropout
+            if dropout_rate:
+                out = layers.dropout(
+                    out,
+                    dropout_prob=dropout_rate,
+                    dropout_implementation="upscale_in_train",
+                    is_test=False)
+    return out
+
+
+pre_process_layer = partial(pre_post_process_layer, None)
+post_process_layer = pre_post_process_layer
+
+
+def encoder_layer(enc_input,
+                  attn_bias,
+                  n_head,
+                  d_key,
+                  d_value,
+                  d_model,
+                  d_inner_hid,
+                  prepostprocess_dropout,
+                  attention_dropout,
+                  relu_dropout,
+                  hidden_act,
+                  preprocess_cmd="n",
+                  postprocess_cmd="da",
+                  param_initializer=None,
+                  name=''):
+    """The encoder layers that can be stacked to form a deep encoder.
+    This module consists of a multi-head (self) attention followed by
+    position-wise feed-forward networks, with both components accompanied
+    by post_process_layer to add residual connection, layer normalization
+    and dropout.
+ """ + attn_output = multi_head_attention( + pre_process_layer( + enc_input, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_att'), + None, + None, + attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + param_initializer=param_initializer, + name=name + '_multi_head_att') + attn_output = post_process_layer( + enc_input, + attn_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_att') + ffd_output = positionwise_feed_forward( + pre_process_layer( + attn_output, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_ffn'), + d_inner_hid, + d_model, + relu_dropout, + hidden_act, + param_initializer=param_initializer, + name=name + '_ffn') + return post_process_layer( + attn_output, + ffd_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_ffn') + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + name=''): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. + """ + for i in range(n_layer): + enc_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd, + postprocess_cmd, + param_initializer=param_initializer, + name=name + '_layer_' + str(i)) + enc_input = enc_output + enc_output = pre_process_layer( + enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") + + return enc_output diff --git a/demo/pantheon/lexical_anlysis/preprocess/__init__.py b/demo/pantheon/lexical_anlysis/preprocess/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/demo/pantheon/lexical_anlysis/preprocess/ernie/__init__.py b/demo/pantheon/lexical_anlysis/preprocess/ernie/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py b/demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py new file mode 100644 index 00000000..b3a8a0d7 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py @@ -0,0 +1,392 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This module provides reader for classification and sequence labing +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import namedtuple +import csv +import json + +import numpy as np + +from preprocess.ernie import tokenization +from preprocess.padding import pad_batch_data +import io + +def csv_reader(fd, delimiter='\t'): + def gen(): + for i in fd: + slots = i.rstrip('\n').split(delimiter) + if len(slots) == 1: + yield slots, + else: + yield slots + return gen() + +class BaseReader(object): + """BaseReader for classify and sequence labeling task""" + + def __init__(self, + vocab_path, + label_map_config=None, + max_seq_len=512, + do_lower_case=True, + in_tokens=False, + random_seed=None): + self.max_seq_len = max_seq_len + self.tokenizer = tokenization.FullTokenizer( + vocab_file=vocab_path, do_lower_case=do_lower_case) + self.vocab = self.tokenizer.vocab + self.pad_id = self.vocab["[PAD]"] + self.cls_id = self.vocab["[CLS]"] + self.sep_id = self.vocab["[SEP]"] + self.in_tokens = in_tokens + + np.random.seed(random_seed) + + self.current_example = 0 + self.current_epoch = 0 + self.num_examples = 0 + + if label_map_config: + with open(label_map_config) as f: + self.label_map = json.load(f) + else: + self.label_map = None + + def get_train_progress(self): + """Gets progress for training phase.""" + return self.current_example, self.current_epoch + + def _read_tsv(self, input_file, quotechar=None): + """Reads a tab separated value file.""" + with io.open(input_file, "r", encoding="utf8") as f: + reader = csv_reader(f, delimiter="\t") + headers = next(reader) + Example = namedtuple('Example', headers) + + examples = [] + for line in reader: + example = Example(*line) + examples.append(example) + return examples + + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + def _convert_example_to_record(self, example, max_seq_length, tokenizer): + """Converts a single `Example` into a single `Record`.""" + + text_a = tokenization.convert_to_unicode(example.text_a) + tokens_a = tokenizer.tokenize(text_a) + tokens_b = None + if "text_b" in example._fields: + text_b = tokenization.convert_to_unicode(example.text_b) + tokens_b = tokenizer.tokenize(text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT/ERNIE is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . 
[SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + text_type_ids = [] + tokens.append("[CLS]") + text_type_ids.append(0) + for token in tokens_a: + tokens.append(token) + text_type_ids.append(0) + tokens.append("[SEP]") + text_type_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + text_type_ids.append(1) + tokens.append("[SEP]") + text_type_ids.append(1) + + token_ids = tokenizer.convert_tokens_to_ids(tokens) + position_ids = list(range(len(token_ids))) + + if self.label_map: + label_id = self.label_map[example.label] + else: + label_id = example.label + + Record = namedtuple( + 'Record', + ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid']) + + qid = None + if "qid" in example._fields: + qid = example.qid + + record = Record( + token_ids=token_ids, + text_type_ids=text_type_ids, + position_ids=position_ids, + label_id=label_id, + qid=qid) + return record + + def _prepare_batch_data(self, examples, batch_size, phase=None): + """generate batch records""" + batch_records, max_len = [], 0 + for index, example in enumerate(examples): + if phase == "train": + self.current_example = index + record = self._convert_example_to_record(example, self.max_seq_len, + self.tokenizer) + max_len = max(max_len, len(record.token_ids)) + if self.in_tokens: + to_append = (len(batch_records) + 1) * max_len <= batch_size + else: + to_append = len(batch_records) < batch_size + if to_append: + batch_records.append(record) + else: + yield self._pad_batch_records(batch_records) + batch_records, max_len = [record], len(record.token_ids) + + if batch_records: + yield self._pad_batch_records(batch_records) + + def get_num_examples(self, input_file): + """return total number of examples""" + examples = self._read_tsv(input_file) + return len(examples) + + def data_generator(self, + input_file, + batch_size, + epoch, + shuffle=True, + phase=None): + """return generator which yields batch data for pyreader""" + examples = self._read_tsv(input_file) + + def _wrapper(): + for epoch_index in range(epoch): + if phase == "train": + self.current_example = 0 + self.current_epoch = epoch_index + if shuffle: + np.random.shuffle(examples) + + for batch_data in self._prepare_batch_data( + examples, batch_size, phase=phase): + yield batch_data + + return _wrapper + + +class ClassifyReader(BaseReader): + """ClassifyReader""" + + def _read_tsv(self, input_file, quotechar=None): + """Reads a tab separated value file.""" + with io.open(input_file, "r", encoding="utf8") as f: + reader = csv_reader(f, delimiter="\t") + headers = next(reader) + text_indices = [ + index for index, h in enumerate(headers) if h != "label" + ] + Example = namedtuple('Example', headers) + + examples = [] + for line in reader: + for index, text in enumerate(line): + if index in text_indices: + line[index] = text.replace(' ', '') + example = Example(*line) + examples.append(example) + return 
examples
+
+    def _pad_batch_records(self, batch_records):
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [record.text_type_ids for record in batch_records]
+        batch_position_ids = [record.position_ids for record in batch_records]
+        batch_labels = [record.label_id for record in batch_records]
+        batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])
+
+        # padding
+        padded_token_ids, input_mask, seq_lens = pad_batch_data(
+            batch_token_ids,
+            pad_idx=self.pad_id,
+            return_input_mask=True,
+            return_seq_lens=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids, pad_idx=self.pad_id)
+
+        return_list = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            input_mask, batch_labels, seq_lens
+        ]
+
+        return return_list
+
+
+class SequenceLabelReader(BaseReader):
+    """SequenceLabelReader"""
+
+    def _pad_batch_records(self, batch_records):
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [record.text_type_ids for record in batch_records]
+        batch_position_ids = [record.position_ids for record in batch_records]
+        batch_label_ids = [record.label_ids for record in batch_records]
+
+        # padding
+        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
+            batch_token_ids,
+            pad_idx=self.pad_id,
+            return_input_mask=True,
+            return_seq_lens=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids, pad_idx=self.pad_id)
+        padded_label_ids = pad_batch_data(
+            batch_label_ids, pad_idx=len(self.label_map) - 1)
+
+        return_list = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            input_mask, padded_label_ids, batch_seq_lens
+        ]
+        return return_list
+
+    def _reseg_token_label(self, tokens, labels, tokenizer):
+        assert len(tokens) == len(labels)
+        ret_tokens = []
+        ret_labels = []
+        for token, label in zip(tokens, labels):
+            sub_token = tokenizer.tokenize(token)
+            if len(sub_token) == 0:
+                continue
+            ret_tokens.extend(sub_token)
+            ret_labels.append(label)
+            if len(sub_token) < 2:
+                continue
+            sub_label = label
+            if label.startswith("B-"):
+                sub_label = "I-" + label[2:]
+            ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+        assert len(ret_tokens) == len(ret_labels)
+        return ret_tokens, ret_labels
+
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        # Tokens and labels in the TSV are joined by the "\002" control
+        # character, the same separator the student-side reader splits on.
+        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\002")
+        labels = tokenization.convert_to_unicode(example.label).split(u"\002")
+        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
+
+        if len(tokens) > max_seq_length - 2:
+            tokens = tokens[0:(max_seq_length - 2)]
+            labels = labels[0:(max_seq_length - 2)]
+
+        tokens = ["[CLS]"] + tokens + ["[SEP]"]
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        position_ids = list(range(len(token_ids)))
+        text_type_ids = [0] * len(token_ids)
+        no_entity_id = len(self.label_map) - 1
+        labels = [
+            label if label in self.label_map else u"O" for label in labels
+        ]
+        label_ids = [no_entity_id] + [
+            self.label_map[label] for label in labels
+        ] + [no_entity_id]
+
+        Record = namedtuple(
+            'Record',
+            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
+        record = Record(
+            token_ids=token_ids,
+            text_type_ids=text_type_ids,
+            position_ids=position_ids,
+            label_ids=label_ids)
+        return record
+
+
+class ExtractEmbeddingReader(BaseReader):
+
"""ExtractEmbeddingReader""" + + def _pad_batch_records(self, batch_records): + batch_token_ids = [record.token_ids for record in batch_records] + batch_text_type_ids = [record.text_type_ids for record in batch_records] + batch_position_ids = [record.position_ids for record in batch_records] + + # padding + padded_token_ids, input_mask, seq_lens = pad_batch_data( + batch_token_ids, + pad_idx=self.pad_id, + return_input_mask=True, + return_seq_lens=True) + padded_text_type_ids = pad_batch_data( + batch_text_type_ids, pad_idx=self.pad_id) + padded_position_ids = pad_batch_data( + batch_position_ids, pad_idx=self.pad_id) + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, + input_mask, seq_lens + ] + + return return_list + + +if __name__ == '__main__': + pass diff --git a/demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py b/demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py new file mode 100644 index 00000000..2a06a581 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py @@ -0,0 +1,370 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import unicodedata +import six +import io + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
+    if six.PY3:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, bytes):
+            return text.decode("utf-8", "ignore")
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    elif six.PY2:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, unicode):
+            return text.encode("utf-8")
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    else:
+        raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    fin = io.open(vocab_file, encoding="utf8")
+    for num, line in enumerate(fin):
+        items = convert_to_unicode(line.strip()).split("\t")
+        if len(items) > 2:
+            break
+        token = items[0]
+        index = items[1] if len(items) == 2 else num
+        token = token.strip()
+        vocab[token] = int(index)
+    return vocab
+
+
+def convert_by_vocab(vocab, items):
+    """Converts a sequence of [tokens|ids] using the vocab."""
+    output = []
+    for item in items:
+        output.append(vocab[item])
+    return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+    return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+    return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class FullTokenizer(object):
+    """Runs end-to-end tokenization."""
+
+    def __init__(self, vocab_file, do_lower_case=True):
+        self.vocab = load_vocab(vocab_file)
+        self.inv_vocab = {v: k for k, v in self.vocab.items()}
+        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+    def tokenize(self, text):
+        split_tokens = []
+        for token in self.basic_tokenizer.tokenize(text):
+            for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                split_tokens.append(sub_token)
+
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        return convert_by_vocab(self.vocab, tokens)
+
+    def convert_ids_to_tokens(self, ids):
+        return convert_by_vocab(self.inv_vocab, ids)
+
+
+class CharTokenizer(object):
+    """Runs end-to-end tokenization."""
+
+    def __init__(self, vocab_file, do_lower_case=True):
+        self.vocab = load_vocab(vocab_file)
+        self.inv_vocab = {v: k for k, v in self.vocab.items()}
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+    def tokenize(self, text):
+        split_tokens = []
+        for token in text.lower().split(" "):
+            for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                split_tokens.append(sub_token)
+
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        return convert_by_vocab(self.vocab, tokens)
+
+    def convert_ids_to_tokens(self, ids):
+        return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+    def __init__(self, do_lower_case=True):
+        """Constructs a BasicTokenizer.
+
+        Args:
+            do_lower_case: Whether to lower case the input.
+        """
+        self.do_lower_case = do_lower_case
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text."""
+        text = convert_to_unicode(text)
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia).
+        text = self._tokenize_chinese_chars(text)
+
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether cp is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+            (cp >= 0x3400 and cp <= 0x4DBF) or  #
+            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+            (cp >= 0x2B820 and cp <= 0x2CEAF) or
+            (cp >= 0xF900 and cp <= 0xFAFF) or  #
+            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+
+        For example:
+            input = "unaffable"
+            output = ["un", "##aff", "##able"]
+
+        Args:
+            text: A single token or whitespace separated tokens.
+                This should have already been passed through `BasicTokenizer`.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        text = convert_to_unicode(text)
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+        (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False
diff --git a/demo/pantheon/lexical_anlysis/preprocess/padding.py b/demo/pantheon/lexical_anlysis/preprocess/padding.py
new file mode 100644
index 00000000..82171e68
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/preprocess/padding.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Mask, padding and batching.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def pad_batch_data(insts,
+                   pad_idx=0,
+                   return_pos=False,
+                   return_input_mask=False,
+                   return_max_len=False,
+                   return_num_token=False,
+                   return_seq_lens=False):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and input mask.
+ """ + return_list = [] + max_len = max(len(inst) for inst in insts) + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and make no effect on parameter gradients. + + inst_data = np.array( + [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] + + # position data + if return_pos: + inst_pos = np.array([ + list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) + for inst in insts + ]) + + return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] + + if return_input_mask: + # This is used to avoid attention on paddings. + input_mask_data = np.array([[1] * len(inst) + [0] * + (max_len - len(inst)) for inst in insts]) + input_mask_data = np.expand_dims(input_mask_data, axis=-1) + return_list += [input_mask_data.astype("float32")] + + if return_max_len: + return_list += [max_len] + + if return_num_token: + num_token = 0 + for inst in insts: + num_token += len(inst) + return_list += [num_token] + + if return_seq_lens: + seq_lens = np.array([len(inst) for inst in insts]) + return_list += [seq_lens.astype("int64").reshape([-1])] + + return return_list if len(return_list) > 1 else return_list[0] + + +if __name__ == "__main__": + pass diff --git a/demo/pantheon/lexical_anlysis/reader.py b/demo/pantheon/lexical_anlysis/reader.py new file mode 100644 index 00000000..11958919 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/reader.py @@ -0,0 +1,208 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The file_reader converts raw corpus to input. 
+""" + +import os +import argparse +import __future__ +import io +import glob +from paddleslim.pantheon import Student +import random +import numpy as np +import six + +def load_kv_dict(dict_path, + reverse=False, + delimiter="\t", + key_func=None, + value_func=None): + """ + Load key-value dict from file + """ + result_dict = {} + for line in io.open(dict_path, "r", encoding='utf8'): + terms = line.strip("\n").split(delimiter) + if len(terms) != 2: + continue + if reverse: + value, key = terms + else: + key, value = terms + if key in result_dict: + raise KeyError("key duplicated with [%s]" % (key)) + if key_func: + key = key_func(key) + if value_func: + value = value_func(value) + result_dict[key] = value + return result_dict + + +class Dataset(object): + """data reader""" + + def __init__(self, args, mode="train"): + # read dict + self.word2id_dict = load_kv_dict( + args.word_dict_path, reverse=True, value_func=int) + self.id2word_dict = load_kv_dict(args.word_dict_path) + self.label2id_dict = load_kv_dict( + args.label_dict_path, reverse=True, value_func=int) + self.id2label_dict = load_kv_dict(args.label_dict_path) + self.word_replace_dict = load_kv_dict(args.word_rep_dict_path) + self._student = Student() + self._student.register_teacher(in_address=args.in_address) + self._student.start() + self._know_desc = self._student.get_knowledge_desc() + self._know_data_generator = self._student.get_knowledge_generator(batch_size=1, drop_last=False)() + self._train_shuffle_buf_size = args.traindata_shuffle_buffer + + @property + def vocab_size(self): + """vocabuary size""" + return max(self.word2id_dict.values()) + 1 + + @property + def num_labels(self): + """num_labels""" + return max(self.label2id_dict.values()) + 1 + + def get_num_examples(self, filename): + """num of line of file""" + return sum(1 for line in io.open(filename, "r", encoding='utf8')) + + def word_to_ids(self, words): + """convert word to word index""" + word_ids = [] + for word in words: + word = self.word_replace_dict.get(word, word) + if word not in self.word2id_dict: + word = "OOV" + word_id = self.word2id_dict[word] + word_ids.append(word_id) + + return word_ids + + def label_to_ids(self, labels): + """convert label to label index""" + label_ids = [] + for label in labels: + if label not in self.label2id_dict: + label = "O" + label_id = self.label2id_dict[label] + label_ids.append(label_id) + return label_ids + + def file_reader(self, filename, max_seq_len=126, mode="train"): + """ + yield (word_idx, target_idx, teacher_emission) one by one from file, + or yield (word_idx, ) in `infer` mode + """ + + def wrapper(): + invalid_samples = 0 + fread = io.open(filename, "r", encoding="utf-8") + if mode == "infer": + for line in fread: + words = line.strip() + word_ids = self.word_to_ids(words) + yield (word_ids[0:max_seq_len], ) + elif mode == "test": + headline = next(fread) + headline = headline.strip().split('\t') + assert len(headline) == 2 and headline[ + 0] == "text_a" and headline[1] == "label" + for line in fread: + words, labels = line.strip("\n").split("\t") + if len(words) < 1: + continue + word_ids = self.word_to_ids(words.split("\002")) + label_ids = self.label_to_ids(labels.split("\002")) + assert len(word_ids) == len(label_ids) + yield word_ids[0:max_seq_len], label_ids[0:max_seq_len] + else: + headline = next(fread) + headline = headline.strip().split('\t') + assert len(headline) == 2 and headline[ + 0] == "text_a" and headline[1] == "label" + buf = [] + for line in fread: + words, labels = 
line.strip("\n").split("\t") + if len(words) < 1: + continue + word_ids = self.word_to_ids(words.split("\002")) + label_ids = self.label_to_ids(labels.split("\002")) + if six.PY2: + know_data = self._know_data_generator.next() + else: + know_data = self._know_data_generator.__next__() + teacher_crf_decode = know_data["crf_decode"] + + if len(teacher_crf_decode.shape) == 1: + teacher_crf_decode = np.reshape(teacher_crf_decode, [-1, 1]) + teacher_seq_len = know_data["seq_lens"] + assert len(word_ids) == len(label_ids) + + real_len = len(word_ids) if len(word_ids) < max_seq_len else max_seq_len + if real_len == teacher_seq_len[0] - 2: + teacher_crf_decode_range = teacher_crf_decode[0][1:teacher_seq_len[0]-1] + teacher_crf_decode_range = np.reshape(teacher_crf_decode_range, [-1, 1]) + buf.append([word_ids[0:max_seq_len], label_ids[0:max_seq_len], teacher_crf_decode_range]) + #buf.append([word_ids[0:max_seq_len], label_ids[0:max_seq_len], teacher_crf_decode[0][1:teacher_seq_len[0]-1]]) + if len(buf) > self._train_shuffle_buf_size: + buf_ids = range(len(buf)) + random.shuffle(buf_ids) + for idx in buf_ids: + yield buf[idx] + buf = [] + else: + invalid_samples += 1 + if len(buf) > 0: + buf_ids = list(range(len(buf))) + random.shuffle(buf_ids) + for idx in buf_ids: + yield buf[idx] + + print("invalid samples in one epoch: {}".format(invalid_samples)) + fread.close() + return wrapper + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(__doc__) + parser.add_argument( + "--word_dict_path", + type=str, + default="./conf/word.dic", + help="word dict") + parser.add_argument( + "--label_dict_path", + type=str, + default="./conf/tag.dic", + help="label dict") + parser.add_argument( + "--word_rep_dict_path", + type=str, + default="./conf/q2b.dic", + help="word replace dict") + args = parser.parse_args() + dataset = Dataset(args) + # data_generator = dataset.file_reader("data/train.tsv") + #for word_idx, target_idx in data_generator(): + # print(word_idx, target_idx) + # print(len(word_idx), len(target_idx)) + # break diff --git a/demo/pantheon/lexical_anlysis/run_student.sh b/demo/pantheon/lexical_anlysis/run_student.sh new file mode 100644 index 00000000..a4b0a241 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/run_student.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +export CUDA_VISIBLE_DEVICES=5,6 +python -u train_student.py \ + --train_data ./data/train.tsv \ + --test_data ./data/test.tsv \ + --model_save_dir ./teacher_ernie_init_lac_1gru_emb128 \ + --validation_steps 1000 \ + --save_steps 1000 \ + --print_steps 100 \ + --batch_size 32 \ + --epoch 10 \ + --traindata_shuffle_buffer 20000 \ + --word_emb_dim 128 \ + --grnn_hidden_dim 128 \ + --bigru_num 1 \ + --base_learning_rate 1e-3 \ + --emb_learning_rate 2 \ + --crf_learning_rate 0.2 \ + --word_dict_path ./conf/word.dic \ + --label_dict_path ./conf/tag.dic \ + --word_rep_dict_path ./conf/q2b.dic \ + --enable_ce false \ + --use_cuda true \ + --in_address "127.0.0.1:5002" + diff --git a/demo/pantheon/lexical_anlysis/run_teacher.sh b/demo/pantheon/lexical_anlysis/run_teacher.sh new file mode 100755 index 00000000..d0acc194 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/run_teacher.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +export FLAGS_sync_nccl_allreduce=0 +export FLAGS_eager_delete_tensor_gb=1 +export FLAGS_fraction_of_gpu_memory_to_use=0.99 + +export CUDA_VISIBLE_DEVICES=5,6 # which GPU to use +ERNIE_FINETUNED_MODEL_PATH=./model_finetuned +DATA_PATH=./data/ + +python -u teacher_ernie.py \ + --ernie_config_path "conf/ernie_config.json" \ + 
--init_checkpoint "${ERNIE_FINETUNED_MODEL_PATH}" \ + --init_bound 0.1 \ + --vocab_path "conf/vocab.txt" \ + --batch_size 32 \ + --random_seed 0 \ + --num_labels 57 \ + --max_seq_len 128 \ + --test_data "${DATA_PATH}/train.tsv" \ + --label_map_config "./conf/label_map.json" \ + --do_lower_case true \ + --use_cuda true \ + --out_port=5002 + diff --git a/demo/pantheon/lexical_anlysis/teacher_ernie.py b/demo/pantheon/lexical_anlysis/teacher_ernie.py new file mode 100644 index 00000000..9235fda1 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/teacher_ernie.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Baidu's open-source Lexical Analysis tool for Chinese, including: + 1. Word Segmentation, + 2. Part-of-Speech Tagging + 3. Named Entity Recognition +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import argparse +import numpy as np +import multiprocessing +import sys +from collections import namedtuple +from paddleslim.pantheon import Teacher +import paddle.fluid as fluid + +import creator +import model_utils +print('model representation') +from models.representation.ernie import ErnieConfig +print('model check') +from models.model_check import check_cuda +from models.model_check import check_version + + + +def do_eval(args): + # init executor + if args.use_cuda: + place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) + else: + place = fluid.CPUPlace() + print('ernie config') + ernie_config = ErnieConfig(args.ernie_config_path) + ernie_config.print_config() + test_program = fluid.Program() + print('test program') + with fluid.program_guard(test_program, fluid.default_startup_program()): + with fluid.unique_name.guard(): + test_ret = creator.create_ernie_model(args, ernie_config) + test_program = test_program.clone(for_test=True) + #print('create pyreader') + pyreader = creator.create_pyreader( + args, + file_name=args.test_data, + feed_list=[ret.name for ret in test_ret['feed_list']], + model="ernie", + place=place, + return_reader=True, + mode='test') + + #data_inter = reader.data_generator(args.test_data, args.batch_size, 1, shuffle=False, phase="train") + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + # load model + if not args.init_checkpoint: + raise ValueError( + "args 'init_checkpoint' should be set if only doing test or infer!") + model_utils.init_checkpoint(exe, args.init_checkpoint, test_program) + + teacher = Teacher(out_path=None, out_port=int(args.out_port)) + teacher.start() + print('run teacher......') + + test_ret["chunk_evaluator"].reset() + + reader_config = {"batch_generator": pyreader} + + teacher.start_knowledge_service( + feed_list=[test_ret["words"].name, test_ret["sent_ids"].name, test_ret["pos_ids"].name, test_ret["input_mask"].name, test_ret["labels"].name, test_ret["seq_lens"].name], + 
schema={"crf_decode":test_ret["crf_decode"],"seq_lens":test_ret["seq_lens"]}, + program=test_program, + reader_config=reader_config, + exe=exe, + times=10) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(__doc__) + model_utils.load_yaml(parser, './conf/ernie_args.yaml') + + # config for pantheon teacher + parser.add_argument('--out_path', type=str, default=None, help="The path to dump knowledge for offline mode.") + parser.add_argument('--out_port', type=str, default=None, help="The IP port number to send out knowledge for \ + online mode, should be unique when launching multiple teachers in \ + the same node.") + + args = parser.parse_args() + check_cuda(args.use_cuda) + check_version() + model_utils.print_arguments(args) + do_eval(args) diff --git a/demo/pantheon/lexical_anlysis/train_student.py b/demo/pantheon/lexical_anlysis/train_student.py new file mode 100644 index 00000000..5a553431 --- /dev/null +++ b/demo/pantheon/lexical_anlysis/train_student.py @@ -0,0 +1,208 @@ +# -*- coding: UTF-8 -*- +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import math +import time +import random +import argparse +import multiprocessing + +import numpy as np +import paddle +import paddle.fluid as fluid + +import reader +import model_utils +import creator +from eval import test_process +from models.model_check import check_cuda +from models.model_check import check_version + +# the function to train model +def do_train(args): + # init executor + if args.use_cuda: + place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) + dev_count = fluid.core.get_cuda_device_count() + else: + dev_count = min(multiprocessing.cpu_count(), args.cpu_num) + if (dev_count < args.cpu_num): + print( + "WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. 
" + "Change the cpu_num from %d to %d" % + (dev_count, args.cpu_num, dev_count)) + os.environ['CPU_NUM'] = str(dev_count) + place = fluid.CPUPlace() + + train_program = fluid.Program() + test_program = fluid.Program() + startup_program = fluid.Program() + + dataset = reader.Dataset(args) + with fluid.program_guard(train_program, startup_program): + #train_program.random_seed = args.random_seed + startup_program.random_seed = args.random_seed + + with fluid.unique_name.guard(): + train_ret = creator.create_model( + args, dataset.vocab_size, dataset.num_labels, mode='train') + + optimizer = fluid.optimizer.Adam( + learning_rate=args.base_learning_rate) + optimizer.minimize(train_ret["avg_cost"]) + + with fluid.program_guard(test_program, startup_program): + with fluid.unique_name.guard(): + test_ret = creator.create_model( + args, dataset.vocab_size, dataset.num_labels, mode='test') + + test_program = test_program.clone(for_test=True) + + exe = fluid.Executor(place) + exe.run(startup_program) + + if args.init_checkpoint: + model_utils.init_checkpoint(exe, args.init_checkpoint, train_program) + if dev_count > 1: + device = "GPU" if args.use_cuda else "CPU" + print("%d %s are used to train model" % (dev_count, device)) + # multi cpu/gpu config + exec_strategy = fluid.ExecutionStrategy() + + build_strategy = fluid.compiler.BuildStrategy() + + compiled_prog = fluid.compiler.CompiledProgram( + train_program).with_data_parallel( + loss_name=train_ret['avg_cost'].name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + else: + compiled_prog = fluid.compiler.CompiledProgram(train_program) + + # start training + num_train_examples = dataset.get_num_examples(args.train_data) + max_train_steps = args.epoch * num_train_examples // args.batch_size + print("Num train examples: %d" % num_train_examples) + print("Max train steps: %d" % max_train_steps) + + train_generator = creator.create_lexnet_data_generator(args, + reader=dataset, + file_name=args.train_data, + place=place, + mode='train') + test_generator = creator.create_lexnet_data_generator(args, + reader=dataset, + file_name=args.test_data, + place=place, + mode='test') + + train_reader, test_reader = train_ret['pyreader'], test_ret['pyreader'] + train_reader.set_batch_generator(train_generator, places=place) + test_reader.set_batch_generator(test_generator, places=place) + + ce_info = [] + step = 0 + ce_time = 0 + train_reader.start() + while True: + try: + # this is for minimizing the fetching op, saving the training speed. 
+ if step % args.print_steps == 0: + fetch_list = [ + train_ret["avg_cost"], train_ret["precision"], + train_ret["recall"], train_ret["f1_score"], + train_ret["crf_avg_cost"], train_ret["teacher_cost"] + ] + else: + fetch_list = [] + + start_time = time.time() + outputs = exe.run( + program=compiled_prog, + fetch_list=fetch_list) + + end_time = time.time() + if step % args.print_steps == 0: + avg_cost, precision, recall, f1_score, crf_avg_cost, teacher_cost = [ + np.mean(x) for x in outputs + ] + print("Data loader queue size: %d " % train_reader.queue.size()) + print( + "[train] step = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, crf_avg_cost: %.5f, teacher_cost: %.5f, elapsed time %.5f" + % (step, avg_cost, precision, recall, f1_score, crf_avg_cost, teacher_cost, + end_time - start_time)) + + if step % args.validation_steps == 0: + test_process(exe, test_program, test_reader, test_ret) + + ce_time += end_time - start_time + ce_info.append([ce_time, avg_cost, precision, recall, f1_score]) + + # save checkpoints + if step % args.save_steps == 0 and step != 0: + save_path = os.path.join(args.model_save_dir, + "step_" + str(step)) + fluid.io.save_persistables(exe, save_path, train_program) + step += 1 + except fluid.core.EOFException: + train_reader.reset() + break + + if args.enable_ce: + card_num = get_cards() + ce_cost = 0 + ce_f1 = 0 + ce_p = 0 + ce_r = 0 + ce_time = 0 + try: + ce_time = ce_info[-2][0] + ce_cost = ce_info[-2][1] + ce_p = ce_info[-2][2] + ce_r = ce_info[-2][3] + ce_f1 = ce_info[-2][4] + except: + print("ce info error") + print("kpis\teach_step_duration_card%s\t%s" % (card_num, ce_time)) + print("kpis\ttrain_cost_card%s\t%f" % (card_num, ce_cost)) + print("kpis\ttrain_precision_card%s\t%f" % (card_num, ce_p)) + print("kpis\ttrain_recall_card%s\t%f" % (card_num, ce_r)) + print("kpis\ttrain_f1_card%s\t%f" % (card_num, ce_f1)) + + +def get_cards(): + num = 0 + cards = os.environ.get('CUDA_VISIBLE_DEVICES', '') + if cards != '': + num = len(cards.split(",")) + return num + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(__doc__) + model_utils.load_yaml(parser, 'conf/args.yaml') + + # config for pantheon student + parser.add_argument('--in_path', type=str, default=None, help="The path of dumped knowledge from teacher for offline mode.") + parser.add_argument('--in_address', type=str, default=None, help="The IP port number to receive knowledge from teacher for \ + online mode") + + args = parser.parse_args() + check_cuda(args.use_cuda) + check_version() + do_train(args) -- GitLab
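For readers tracing how the two halves of this demo meet, a condensed sketch of the Pantheon wiring that run_teacher.sh and run_student.sh exercise; program, executor and reader setup is elided, and the port mirrors the scripts:

```python
# Condensed wiring sketch (setup elided; see teacher_ernie.py and reader.py).
from paddleslim.pantheon import Teacher, Student

# Teacher side (teacher_ernie.py): serve CRF decodes on a port.
teacher = Teacher(out_path=None, out_port=5002)
teacher.start()
# teacher.start_knowledge_service(
#     feed_list=...,                                # input variable names
#     schema={"crf_decode": ..., "seq_lens": ...},  # knowledge to publish
#     program=..., reader_config=..., exe=..., times=10)

# Student side (reader.py): connect and pull knowledge batch by batch.
student = Student()
student.register_teacher(in_address="127.0.0.1:5002")
student.start()
knowledge = student.get_knowledge_generator(batch_size=1, drop_last=False)()
batch = next(knowledge)  # dict holding "crf_decode" and "seq_lens" arrays
```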