diff --git a/examples/sequence_tagging/README.md b/examples/sequence_tagging/README.md index 0bcb9ff859a0ab593abcc6769ec671b15581f6c9..898f3abbcbc6bbee447b258c554ef4cde98143e4 100644 --- a/examples/sequence_tagging/README.md +++ b/examples/sequence_tagging/README.md @@ -6,7 +6,7 @@ Sequence Tagging,是一个序列标注模型,模型可用于实现,分词 |模型|Precision|Recall|F1-score| |:-:|:-:|:-:|:-:| -|Lexical Analysis|88.26%|89.20%|88.73%| +|Lexical Analysis|89.57%|89.96%|89.76%| ## 2. 快速开始 @@ -22,7 +22,7 @@ Sequence Tagging,是一个序列标注模型,模型可用于实现,分词 克隆工具集代码库到本地 ```bash git clone https://github.com/PaddlePaddle/hapi.git - cd hapi/sequence_tagging + cd hapi/examples/sequence_tagging ``` #### 3. 环境依赖 @@ -70,7 +70,7 @@ python -u train.py \ --dynamic False # --device: 使用gpu设备还是cpu设备 -# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False +# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为False, 动态图设置为True ``` GPU上多卡训练 @@ -84,7 +84,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 train.py \ --dynamic False # --device: 使用gpu设备还是cpu设备 -# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False +# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为False, 动态图设置为True ``` CPU上训练 @@ -95,7 +95,7 @@ python -u train.py \ --dynamic False # --device: 使用gpu设备还是cpu设备 -# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False +# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为False, 动态图设置为True ``` ### 模型预测 @@ -105,15 +105,13 @@ python -u train.py \ python predict.py \ --init_from_checkpoint model_baseline/params \ --output_file predict.result \ - --mode predict \ --device cpu \ --dynamic False # --init_from_checkpoint: 初始化模型 # --output_file: 预测结果文件 # --device: 使用gpu还是cpu设备 -# --mode: 开启模式, 设置为train时,进行训练,设置为predict时进行预测 -# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False +# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为False, 动态图设置为True ``` ### 模型评估 @@ -123,14 +121,12 @@ python predict.py \ # baseline model python eval.py \ --init_from_checkpoint ./model_baseline/params \ - --mode predict \ --device cpu \ --dynamic False # 
--init_from_checkpoint: 初始化模型 # --device: 使用gpu还是cpu设备 -# --mode: 开启模式, 设置为train时,进行训练,设置为predict时进行预测 -# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False +# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为False, 动态图设置为True ``` @@ -168,7 +164,7 @@ Overall Architecture of GRU-CRF-MODEL 训练使用的数据可以由用户根据实际的应用场景,自己组织数据。除了第一行是 `text_a\tlabel` 固定的开头,后面的每行数据都是由两列组成,以制表符分隔,第一列是 utf-8 编码的中文文本,以 `\002` 分割,第二列是对应每个字的标注,以 `\002` 分隔。我们采用 IOB2 标注体系,即以 X-B 作为类型为 X 的词的开始,以 X-I 作为类型为 X 的词的持续,以 O 表示不关注的字(实际上,在词性、专名联合标注中,不存在 O )。示例如下: ```text -除\002了\002他\002续\002任\002十\002二\002届\002政\002协\002委\002员\002,\002马\002化\002腾\002,\002雷\002军\002,\002李\002彦\002宏\002也\002被\002推\002选\002为\002新\002一\002届\002全\002国\002人\002大\002代\002表\002或\002全\002国\002政\002协\002委\002员 p-B\002p-I\002r-B\002v-B\002v-I\002m-B\002m-I\002m-I\002ORG-B\002ORG-I\002n-B\002n-I\002w-B\002PER-B\002PER-I\002PER-I\002w-B\002PER-B\002PER-I\002w-B\002PER-B\002PER-I\002PER-I\002d-B\002p-B\002v-B\002v-I\002v-B\002a-B\002m-B\002m-I\002ORG-B\002ORG-I\002ORG-I\002ORG-I\002n-B\002n-I\002c-B\002n-B\002n-I\002ORG-B\002ORG-I\002n-B\002n-I +除\002了\002他\002续\002任\002十\002二\002届\002政\002协\002委\002员\002,\002马\002化\002腾\002,\002雷\002军\002,\002李\002彦\002宏\002也\002被\002推\002选\002为\002新\002一\002届\002全\002国\002人\002大\002代\002表\002或\002全\002国\002政\002协\002委\002员 p-B\002p-I\002r-B\002v-B\002v-I\002m-B\002m-I\002m-I\002ORG-B\002ORG-I\002n-B\002n-I\002w-B\002PER-B\002PER-I\002PER-I\002w-B\002PER-B\002PER-I\002w-B\002PER-B\002PER-I\002PER-I\002d-B\002p-B\002v-B\002v-I\002v-B\002a-B\002m-B\002m-I\002ORG-B\002ORG-I\002ORG-I\002ORG-I\002n-B\002n-I\002c-B\002n-B\002n-I\002ORG-B\002ORG-I\002n-B\002n-I ``` + 我们随同代码一并发布了完全版的模型和相关的依赖数据。但是,由于模型的训练数据过于庞大,我们没有发布训练数据,仅在`data`目录下放置少数样本用以示例输入数据格式。 @@ -196,6 +192,7 @@ Overall Architecture of GRU-CRF-MODEL ├── eval.py # 词法分析评估的脚本 ├── downloads.py # 用于下载数据和模型的脚本 ├── downloads.sh # 用于下载数据和模型的脚本 +├── sequence_tagging.yaml # 模型训练、预测、评估相关配置参数 └──reader.py # 文件读取相关函数 ``` @@ -207,11 +204,11 @@ Overall Architecture of 
GRU-CRF-MODEL ```text @article{jiao2018LAC, - title={Chinese Lexical Analysis with Deep Bi-GRU-CRF Network}, - author={Jiao, Zhenyu and Sun, Shuqi and Sun, Ke}, - journal={arXiv preprint arXiv:1807.01882}, - year={2018}, - url={https://arxiv.org/abs/1807.01882} + title={Chinese Lexical Analysis with Deep Bi-GRU-CRF Network}, + author={Jiao, Zhenyu and Sun, Shuqi and Sun, Ke}, + journal={arXiv preprint arXiv:1807.01882}, + year={2018}, + url={https://arxiv.org/abs/1807.01882} } ``` ### 如何贡献代码 diff --git a/examples/sequence_tagging/downloads.py b/examples/sequence_tagging/downloads.py index b61c3e779cca3f3900d8e0bc0eb209f8fa2f9389..e89f2c83af7c70f3d4e961253a3f57cac9fc2f7c 100644 --- a/examples/sequence_tagging/downloads.py +++ b/examples/sequence_tagging/downloads.py @@ -35,7 +35,7 @@ FILE_INFO = { }, 'MODEL': { 'name': 'sequence_tagging_dy.tar.gz', - 'md5': "1125d374c03c8218b6e47325dcf607e3" + 'md5': "6ba37ceea8f1f764ba1fe227295a6a3b" }, } diff --git a/examples/sequence_tagging/eval.py b/examples/sequence_tagging/eval.py index ff3e7b9865064289f73b19756d4c1b5a271e11d2..b1e617bdc45dbad1a5f966a399c2168dbf02bb54 100644 --- a/examples/sequence_tagging/eval.py +++ b/examples/sequence_tagging/eval.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
def main(args):
    """
    Evaluate a trained sequence tagging model on the test file.

    Selects the device, optionally switches to dygraph mode, declares the
    model inputs/labels, builds the test data loader, restores weights from
    ``args.init_from_checkpoint`` and runs ``model.evaluate``.

    Args:
        args: parsed configuration; this function reads ``device``,
            ``dynamic``, ``init_from_checkpoint`` and ``batch_size`` from it
            and forwards it to the dataset, loader and model constructors.
    """
    device_place = set_device(args.device)
    if args.dynamic:
        fluid.enable_dygraph(device_place)

    # Input/label descriptors, in the order the network consumes them.
    words_spec = Input([None, None], 'int64', name='words')
    length_spec = Input([None], 'int64', name='length')
    target_spec = Input([None, None], 'int64', name='target')
    input_specs = [words_spec, length_spec, target_spec]
    label_specs = [Input([None, None], 'int64', name='labels')]

    # The standalone dataset is only used for its vocabulary/label sizes;
    # the loader builds its own dataset for the "test" phase.
    lac_dataset = LacDataset(args)
    test_loader = LacDataLoader(args, device_place, phase="test")

    n_labels = lac_dataset.num_labels
    model = SeqTagging(args, lac_dataset.vocab_size, n_labels, mode="test")
    model.mode = "test"

    model.prepare(
        metrics=ChunkEval(n_labels),
        inputs=input_specs,
        labels=label_specs,
        device=device_place)
    model.load(args.init_from_checkpoint, skip_mismatch=True)

    model.evaluate(test_loader.dataloader, batch_size=args.batch_size)
def main(args):
    """
    Run prediction over ``args.predict_file`` and write the predicted tags.

    For every input sequence one line is written to ``args.output_file``:
    the predicted tag names joined by the ``\\002`` control character.

    Args:
        args: parsed configuration; this function reads ``device``,
            ``dynamic``, ``init_from_checkpoint`` and ``output_file`` from it
            and forwards it to the dataset, loader and model constructors.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import io`` being present.
    import io

    place = set_device(args.device)
    if args.dynamic:
        fluid.enable_dygraph(place)

    inputs = [
        Input([None, None], 'int64', name='words'),
        Input([None], 'int64', name='length'),
    ]

    # The standalone dataset provides vocabulary sizes and the id -> label
    # mapping; the loader builds its own dataset for the "predict" phase.
    dataset = LacDataset(args)
    predict_dataset = LacDataLoader(args, place, phase="predict")

    vocab_size = dataset.vocab_size
    num_labels = dataset.num_labels
    model = SeqTagging(args, vocab_size, num_labels, mode="predict")

    model.mode = "test"
    model.prepare(inputs=inputs)
    model.load(args.init_from_checkpoint, skip_mismatch=True)

    # Text mode with an explicit encoding: the original opened the file with
    # "wb" and wrote ``str`` objects, which raises TypeError on Python 3, and
    # never closed the handle. ``newline="\n"`` keeps the bytes identical to
    # what binary mode produced (no platform newline translation).
    with io.open(
            args.output_file, "w", encoding="utf-8", newline="\n") as f:
        for data in predict_dataset.dataloader:
            # The loader may wrap the batch in a single-element list.
            input_data = data[0] if len(data) == 1 else data
            results, length = model.test_batch(inputs=flatten(input_data))
            for decoded, word_len in zip(results, length):
                # Drop padding positions before mapping ids to tag names.
                word_ids = decoded[:word_len]
                tags = [
                    dataset.id2label_dict[str(tag_id)] for tag_id in word_ids
                ]
                f.write(u"\002".join(tags) + u"\n")
headline.strip().split('\t') - assert len(headline) == 2 and headline[0] == "text_a" and headline[ - 1] == "label" - buf = [] - for line in fread: - words, labels = line.strip("\n").split("\t") - if len(words) < 1: - continue - word_ids = self.word_to_ids(words.split("\002")) - label_ids = self.label_to_ids(labels.split("\002")) - assert len(word_ids) == len(label_ids) - words_len = np.int64(len(word_ids)) - - word_ids = word_ids[0:max_seq_len] - words_len = np.int64(len(word_ids)) - word_ids += [0 for _ in range(max_seq_len - words_len)] - label_ids = label_ids[0:max_seq_len] - label_ids += [0 for _ in range(max_seq_len - words_len)] - assert len(word_ids) == len(label_ids) - yield word_ids, label_ids, words_len - elif mode == "test": - headline = next(fread) + self.phase = phase + with io.open(filename, "r", encoding="utf8") as fr: + if phase in ["train", "test"]: + headline = next(fr) headline = headline.strip().split('\t') - assert len(headline) == 2 and headline[0] == "text_a" and headline[ - 1] == "label" - buf = [] - for line in fread: - words, labels = line.strip("\n").split("\t") - if len(words) < 1: - continue - word_ids = self.word_to_ids(words.split("\002")) - label_ids = self.label_to_ids(labels.split("\002")) - assert len(word_ids) == len(label_ids) - words_len = np.int64(len(word_ids)) - yield word_ids, label_ids, words_len - else: - for line in fread: - words = line.strip("\n").split('\t')[0] - if words == u"text_a": + assert len(headline) == 2 and headline[ + 0] == "text_a" and headline[1] == "label" + + for line in fr: + line_str = line.strip("\n") + if len(line_str) < 1 and len(line_str.split('\t')) < 2: continue - if "\002" not in words: - word_ids = self.word_to_ids(words) - else: - word_ids = self.word_to_ids(words.split("\002")) - words_len = np.int64(len(word_ids)) - yield word_ids, words_len - fread.close() + self.examples.append(line_str) + else: + for idx, line in enumerate(fr): + words = line.strip("\n").split("\t")[0] + 
def create_lexnet_data_generator(args, insts, phase="train"):
    """
    Collate a list of samples into padded ``int64`` numpy batches.

    Args:
        args: configuration object; only ``args.max_seq_len`` is read, and
            only for ``phase == "train"``.
        insts: list of samples. For "train"/"test" each sample is a
            ``(word_ids, label_ids)`` pair; for any other phase each sample
            is a plain list of word ids.
        phase: "train" pads/truncates every sequence to ``args.max_seq_len``;
            "test" and every other value pad to the longest sequence in the
            batch.

    Returns:
        ``[words, lengths, labels, labels]`` for "train"/"test" (the label
        array appears twice, as in the original), otherwise
        ``[words, lengths]``. ``lengths`` holds the sequence lengths after
        truncation.
    """

    def padding_data(max_len, batch_data, if_len=False):
        """Truncate/zero-pad each row to ``max_len``; optionally return lengths."""
        padded_rows = []
        lengths = []
        for row in batch_data:
            # Slice into a fresh list so the caller's sample is never mutated.
            row = list(row[:max_len])
            if if_len:
                lengths.append(len(row))
            row.extend([0] * (max_len - len(row)))
            padded_rows.append(row)
        # Explicit dtype: the model inputs are declared as 'int64', while the
        # numpy default integer type is platform dependent.
        padded = np.array(padded_rows, dtype="int64")
        if if_len:
            return padded, np.array(lengths, dtype="int64")
        return padded

    def longest(sequences):
        """Length of the longest sequence, 0 for an empty batch."""
        return max([len(seq) for seq in sequences] or [0])

    if phase in ("train", "test"):
        batch_words = [inst[0] for inst in insts]
        batch_labels = [inst[1] for inst in insts]
        if phase == "train":
            max_len = args.max_seq_len
        else:
            max_len = longest(batch_words)
        padding_batch_words, padding_lens = padding_data(
            max_len, batch_words, if_len=True)
        padding_batch_labels = padding_data(max_len, batch_labels)
        return [
            padding_batch_words, padding_lens, padding_batch_labels,
            padding_batch_labels
        ]

    # Prediction: samples carry word ids only.
    batch_words = insts
    padding_batch_words, padding_lens = padding_data(
        longest(batch_words), batch_words, if_len=True)
    return [padding_batch_words, padding_lens]


class LacDataLoader(object):
    """
    Bundle the dataset, batch sampler and ``DataLoader`` for one phase.

    After construction the attributes ``dataset``, ``sampler`` and
    ``dataloader`` are available.

    Args:
        args: configuration object; reads ``train_file``, ``test_file`` or
            ``predict_file`` (depending on ``phase``) and ``batch_size``.
        place: device place(s) forwarded to ``DataLoader``.
        phase: one of "train", "test", "predict".
        shuffle: forwarded to the batch sampler.
        num_workers: forwarded to ``DataLoader``.
        drop_last: forwarded to the batch sampler.
    """

    def __init__(self,
                 args,
                 place,
                 phase="train",
                 shuffle=False,
                 num_workers=0,
                 drop_last=False):
        assert phase in [
            "train", "test", "predict"
        ], "phase should be in [train, test, predict], but get %s" % phase

        if phase == "train":
            file_name = args.train_file
        elif phase == "test":
            file_name = args.test_file
        elif phase == "predict":
            file_name = args.predict_file

        self.dataset = LacDataset(args)
        self.dataset.file_reader(file_name, phase=phase)

        # Training uses the distributed sampler; the other phases use the
        # plain batch sampler.
        if phase == "train":
            self.sampler = DistributedBatchSampler(
                dataset=self.dataset,
                batch_size=args.batch_size,
                shuffle=shuffle,
                drop_last=drop_last)
        else:
            self.sampler = BatchSampler(
                dataset=self.dataset,
                batch_size=args.batch_size,
                shuffle=shuffle,
                drop_last=drop_last)

        self.dataloader = DataLoader(
            dataset=self.dataset,
            batch_sampler=self.sampler,
            places=place,
            collate_fn=partial(
                create_lexnet_data_generator, args, phase=phase),
            num_workers=num_workers,
            return_list=True)
b/examples/sequence_tagging/sequence_tagging.yaml index feb0ce13c20aac64af5ddd85980de32c86b7a1d0..9ab4942397bf060e6b48016938f7797ee4bc5711 100644 --- a/examples/sequence_tagging/sequence_tagging.yaml +++ b/examples/sequence_tagging/sequence_tagging.yaml @@ -1,7 +1,7 @@ word_dict_path: "./conf/word.dic" label_dict_path: "./conf/tag.dic" word_rep_dict_path: "./conf/q2b.dic" -device: "cpu" +device: "gpu" dynamic: True epoch: 10 base_learning_rate: 0.001 @@ -14,7 +14,7 @@ batch_size: 300 max_seq_len: 126 num_devices: 1 save_dir: "model" -init_from_checkpoint: "model_baseline/params" +init_from_checkpoint: "" init_from_pretrain_model: "" save_freq: 1 eval_freq: 1 @@ -22,4 +22,3 @@ output_file: "predict.result" test_file: "./data/test.tsv" train_file: "./data/train.tsv" predict_file: "./data/infer.tsv" -mode: "train" diff --git a/examples/sequence_tagging/train.py b/examples/sequence_tagging/train.py index 947bf370e9de22ddde4127c22431baf1a8b0248d..7d5a9337d3b0da6f116262f1b30def68b828e00b 100644 --- a/examples/sequence_tagging/train.py +++ b/examples/sequence_tagging/train.py @@ -28,21 +28,23 @@ import numpy as np work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.join(work_dir, "../")) - from hapi.metrics import Metric from hapi.model import Model, Input, Loss, set_device from hapi.text.text import SequenceTagging from utils.check import check_gpu, check_version from utils.configure import PDConfig -from reader import LacDataset, create_lexnet_data_generator, create_dataloader +from reader import LacDataset, LacDataLoader import paddle.fluid as fluid from paddle.fluid.optimizer import AdamOptimizer +__all__ = ["SeqTagging", "LacLoss", "ChunkEval"] + class SeqTagging(Model): - def __init__(self, args, vocab_size, num_labels, length=None): + def __init__(self, args, vocab_size, num_labels, length=None, + mode="train"): super(SeqTagging, self).__init__() """ define the lexical analysis network structure @@ -53,7 +55,7 @@ class 
SeqTagging(Model): for infer: return the prediction otherwise: return the prediction """ - self.mode_type = args.mode + self.mode_type = mode self.word_emb_dim = args.word_emb_dim self.vocab_size = vocab_size self.num_labels = num_labels @@ -65,19 +67,19 @@ class SeqTagging(Model): self.bigru_num = args.bigru_num self.batch_size = args.batch_size self.init_bound = 0.1 - self.length=length + self.length = length self.sequence_tagging = SequenceTagging( - vocab_size=self.vocab_size, - num_labels=self.num_labels, - batch_size=self.batch_size, - word_emb_dim=self.word_emb_dim, - grnn_hidden_dim=self.grnn_hidden_dim, - emb_learning_rate=self.emb_lr, - crf_learning_rate=self.crf_lr, - bigru_num=self.bigru_num, - init_bound=self.init_bound, - length=self.length) + vocab_size=self.vocab_size, + num_labels=self.num_labels, + batch_size=self.batch_size, + word_emb_dim=self.word_emb_dim, + grnn_hidden_dim=self.grnn_hidden_dim, + emb_learning_rate=self.emb_lr, + crf_learning_rate=self.crf_lr, + bigru_num=self.bigru_num, + init_bound=self.init_bound, + length=self.length) def forward(self, *inputs): """ @@ -85,10 +87,10 @@ class SeqTagging(Model): """ word = inputs[0] lengths = inputs[1] - if self.mode_type == "train" or self.mode_type == "test": + if self.mode_type == "train" or self.mode_type == "test": target = inputs[2] outputs = self.sequence_tagging(word, lengths, target) - else: + else: outputs = self.sequence_tagging(word, lengths) return outputs @@ -156,7 +158,7 @@ class ChunkEval(Metric): int(math.ceil((num_labels - 1) / 2.0)), "IOB") self.reset() - def add_metric_op(self, *args): + def add_metric_op(self, *args): crf_decode = args[0] lengths = args[2] label = args[3] @@ -207,30 +209,25 @@ def main(args): place = set_device(args.device) fluid.enable_dygraph(place) if args.dynamic else None - inputs = [Input([None, None], 'int64', name='words'), - Input([None], 'int64', name='length'), - Input([None, None], 'int64', name='target')] + inputs = [ + Input( + [None, None], 
'int64', name='words'), Input( + [None], 'int64', name='length'), Input( + [None, None], 'int64', name='target') + ] labels = [Input([None, None], 'int64', name='labels')] - feed_list = None if args.dynamic else [x.forward() for x in inputs + labels] - dataset = LacDataset(args) - train_path = args.train_file - test_path = args.test_file - - train_generator = create_lexnet_data_generator( - args, reader=dataset, file_name=train_path, place=place, mode="train") - test_generator = create_lexnet_data_generator( - args, reader=dataset, file_name=test_path, place=place, mode="test") + feed_list = None if args.dynamic else [ + x.forward() for x in inputs + labels + ] - train_dataset = create_dataloader( - train_generator, place, feed_list=feed_list) - test_dataset = create_dataloader( - test_generator, place, feed_list=feed_list) + dataset = LacDataset(args) + train_dataset = LacDataLoader(args, place, phase="train") vocab_size = dataset.vocab_size num_labels = dataset.num_labels - model = SeqTagging(args, vocab_size, num_labels) + model = SeqTagging(args, vocab_size, num_labels, mode="train") optim = AdamOptimizer( learning_rate=args.base_learning_rate, @@ -250,8 +247,7 @@ def main(args): if args.init_from_pretrain_model: model.load(args.init_from_pretrain_model, reset_optimizer=True) - model.fit(train_dataset, - test_dataset, + model.fit(train_dataset.dataloader, epochs=args.epoch, batch_size=args.batch_size, eval_freq=args.eval_freq, @@ -263,7 +259,7 @@ if __name__ == '__main__': args = PDConfig(yaml_file="sequence_tagging.yaml") args.build() args.Print() - + use_gpu = True if args.device == "gpu" else False check_gpu(use_gpu) check_version() diff --git a/examples/sequence_tagging/utils/configure.py b/examples/sequence_tagging/utils/configure.py index 67e601282fee572518435eaed38a4ed8e26fc5f9..17dfaa53d8b44a68a2847c4bc1a1934384bb5f82 100644 --- a/examples/sequence_tagging/utils/configure.py +++ b/examples/sequence_tagging/utils/configure.py @@ -195,13 +195,19 @@ class 
PDConfig(object): "Whether to perform predicting.") self.default_g.add_arg("do_eval", bool, False, "Whether to perform evaluating.") - self.default_g.add_arg("do_save_inference_model", bool, False, - "Whether to perform model saving for inference.") + self.default_g.add_arg( + "do_save_inference_model", bool, False, + "Whether to perform model saving for inference.") # NOTE: args for profiler - self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)") - self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)") - self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)") + self.default_g.add_arg( + "is_profiler", int, 0, + "the switch of profiler tools. (used for benchmark)") + self.default_g.add_arg( + "profiler_path", str, './', + "the profiler output file path. (used for benchmark)") + self.default_g.add_arg("max_iter", int, 0, + "the max train batch num.(used for benchmark)") self.parser = parser diff --git a/examples/sequence_tagging/utils/metrics.py b/examples/sequence_tagging/utils/metrics.py index a7e01b91e1cd7e7cee71f570aef1d722b0c4770b..2b6422388b7729bc2b820bfb55d15f1dee56c006 100644 --- a/examples/sequence_tagging/utils/metrics.py +++ b/examples/sequence_tagging/utils/metrics.py @@ -23,7 +23,7 @@ import paddle.fluid as fluid __all__ = ['chunk_count', "build_chunk"] -def build_chunk(data_list, id2label_dict): +def build_chunk(data_list, id2label_dict): """ Assembly entity """ @@ -31,29 +31,29 @@ def build_chunk(data_list, id2label_dict): ner_dict = {} ner_str = "" ner_start = 0 - for i in range(len(tag_list)): + for i in range(len(tag_list)): tag = tag_list[i] - if tag == u"O": - if i != 0: + if tag == u"O": + if i != 0: key = "%d_%d" % (ner_start, i - 1) ner_dict[key] = ner_str ner_start = i - ner_str = tag - elif tag.endswith(u"B"): - if i != 0: + ner_str = tag + elif tag.endswith(u"B"): + if i != 0: key = "%d_%d" % (ner_start, i - 1) 
ner_dict[key] = ner_str ner_start = i ner_str = tag.split('-')[0] - elif tag.endswith(u"I"): - if tag.split('-')[0] != ner_str: - if i != 0: + elif tag.endswith(u"I"): + if tag.split('-')[0] != ner_str: + if i != 0: key = "%d_%d" % (ner_start, i - 1) ner_dict[key] = ner_str ner_start = i ner_str = tag.split('-')[0] return ner_dict - + def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict): """ @@ -62,15 +62,14 @@ def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict): num_infer_chunks, num_label_chunks, num_correct_chunks = 0, 0, 0 assert infer_numpy.shape[0] == label_numpy.shape[0] - for i in range(infer_numpy.shape[0]): - infer_list = infer_numpy[i][: seq_len[i]] - label_list = label_numpy[i][: seq_len[i]] + for i in range(infer_numpy.shape[0]): + infer_list = infer_numpy[i][:seq_len[i]] + label_list = label_numpy[i][:seq_len[i]] infer_dict = build_chunk(infer_list, id2label_dict) num_infer_chunks += len(infer_dict) label_dict = build_chunk(label_list, id2label_dict) num_label_chunks += len(label_dict) - for key in infer_dict: - if key in label_dict and label_dict[key] == infer_dict[key]: + for key in infer_dict: + if key in label_dict and label_dict[key] == infer_dict[key]: num_correct_chunks += 1 return num_infer_chunks, num_label_chunks, num_correct_chunks - diff --git a/hapi/text/text.py b/hapi/text/text.py index ee74c516437a366e1dd91cde236346ecf2e1b787..2b99f81f58dda36d59275790eb9acf767552de6b 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -49,7 +49,7 @@ __all__ = [ 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', 'TransformerDecoder', 'TransformerBeamSearchDecoder', 'Linear_chain_crf', - 'Crf_decoding', 'SequenceTagging' + 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer' ] @@ -763,7 +763,7 @@ class BasicGRUCell(RNNCell): c = self._activation(candidate) new_hidden = u * pre_hidden + (1 - u) * c - return new_hidden + return new_hidden, new_hidden 
class GRUEncoderLayer(Layer):
    """
    Stack of GRU layers, optionally bidirectional.

    Every layer wraps a ``BasicGRUCell`` in a batch-major ``RNN``
    (``time_major=False``). With ``is_bidirection=True`` a second RNN with
    ``is_reverse=True`` is run over the same layer input and both outputs are
    concatenated on the last axis before being fed to the next layer.

    Args:
        input_dim: feature size of the input to the first layer.
        grnn_hidden_dim: hidden size of every GRU cell.
        init_bound: weights are initialised uniformly in
            ``[-init_bound, init_bound]``.
        num_layers: number of stacked layers.
        h_0: stored on the instance; not used by ``forward`` in this class.
        is_bidirection: whether to add the reversed RNN per layer.
    """

    def __init__(self,
                 input_dim,
                 grnn_hidden_dim,
                 init_bound,
                 num_layers=1,
                 h_0=None,
                 is_bidirection=False):
        super(GRUEncoderLayer, self).__init__()
        self.h_0 = h_0
        self.num_layers = num_layers
        self.is_bidirection = is_bidirection
        self.gru_list = []
        self.gru_r_list = []

        # Layers after the first consume the previous layer's output, whose
        # width is the hidden size (doubled by the concat when
        # bidirectional). The original used ``input_dim * 2`` here, which is
        # only correct when input_dim == grnn_hidden_dim and
        # is_bidirection=True.
        if is_bidirection:
            stacked_input_dim = grnn_hidden_dim * 2
        else:
            stacked_input_dim = grnn_hidden_dim

        # Forward cells are created for all layers before any reverse cell,
        # preserving the original construction order.
        for i in range(num_layers):
            # NOTE(review): assigning the cell to ``self`` presumably also
            # registers it as a sublayer; kept as in the original so that
            # parameter names / saved checkpoints are unaffected - confirm.
            self.basic_gru_cell = self._build_cell(
                input_dim if i == 0 else stacked_input_dim, grnn_hidden_dim,
                init_bound)
            self.gru_list.append(
                self.add_sublayer(
                    "gru_%d" % i,
                    RNN(self.basic_gru_cell,
                        is_reverse=False,
                        time_major=False)))
        if self.is_bidirection:
            for i in range(num_layers):
                self.basic_gru_cell_r = self._build_cell(
                    input_dim if i == 0 else stacked_input_dim,
                    grnn_hidden_dim, init_bound)
                self.gru_r_list.append(
                    self.add_sublayer(
                        "gru_r_%d" % i,
                        RNN(self.basic_gru_cell_r,
                            is_reverse=True,
                            time_major=False)))

    @staticmethod
    def _build_cell(input_size, hidden_size, init_bound):
        """Create one ``BasicGRUCell`` with uniform init and L2 decay (1e-4)."""
        return BasicGRUCell(
            input_size=input_size,
            hidden_size=hidden_size,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.UniformInitializer(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

    def forward(self, input_feature):
        """
        Run the input through all stacked layers.

        Args:
            input_feature: input to the first layer.

        Returns:
            Output of the last layer; the input itself when ``num_layers``
            is 0 (the original raised ``UnboundLocalError`` in that case).
        """
        out = input_feature
        for i in range(self.num_layers):
            pre_gru, pre_state = self.gru_list[i](input_feature)
            if self.is_bidirection:
                gru_r, r_state = self.gru_r_list[i](input_feature)
                out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
            else:
                out = pre_gru
            # The layer output becomes the next layer's input.
            input_feature = out
        return out
range(self.bigru_num): - if i == 0: - self.bigru_units.append( - self.add_sublayer( - "bigru_units%d" % i, - BiGRU( - self.grnn_hidden_dim, - self.grnn_hidden_dim, - self.init_bound, - h_0=h_0))) - else: - self.bigru_units.append( - self.add_sublayer( - "bigru_units%d" % i, - BiGRU( - self.grnn_hidden_dim * 2, - self.grnn_hidden_dim, - self.init_bound, - h_0=h_0))) + self.gru_encoder = GRUEncoderLayer( + input_dim=self.grnn_hidden_dim, + grnn_hidden_dim=self.grnn_hidden_dim, + init_bound=self.init_bound, + num_layers=self.bigru_num, + h_0=h_0, + is_bidirection=True) self.fc = Linear( input_dim=self.grnn_hidden_dim * 2, @@ -1837,10 +1882,7 @@ class SequenceTagging(fluid.dygraph.Layer): word_embed = self.word_embedding(word) input_feature = word_embed - for i in range(self.bigru_num): - bigru_output = self.bigru_units[i](input_feature) - input_feature = bigru_output - + bigru_output = self.gru_encoder(input_feature) emission = self.fc(bigru_output) if target is not None: