diff --git a/PaddleNLP/lexical_analysis/README.md b/PaddleNLP/lexical_analysis/README.md index e35c9d58c68922a329c6f6675f0d39e47259ffea..3a5034a50568a4ddc91d5fb39cb3a9c21d582c1a 100644 --- a/PaddleNLP/lexical_analysis/README.md +++ b/PaddleNLP/lexical_analysis/README.md @@ -18,6 +18,8 @@ Lexical Analysis of Chinese,简称 LAC,是一个联合的词法分析模型 本项目依赖 PaddlePaddle 1.3.2 及以上版本,安装请参考官网 [快速安装](http://www.paddlepaddle.org/paddle#quick-start)。 +> Warning: GPU 和 CPU 版本的 PaddlePaddle 分别是 paddlepaddle-gpu 和 paddlepaddle,请安装时注意区别。 + #### 2. 克隆代码 克隆工具集代码库到本地 ```bash diff --git a/PaddleNLP/lexical_analysis/reader.py b/PaddleNLP/lexical_analysis/reader.py index e7099eed440c054811d92a6a8dd97cbe9a906412..4655c5ebd282e3c7be43f702d7917cb23be1b931 100644 --- a/PaddleNLP/lexical_analysis/reader.py +++ b/PaddleNLP/lexical_analysis/reader.py @@ -93,7 +93,7 @@ class Dataset(object): for line in fread: words = line.strip("\n").split("\002") word_ids = self.word_to_ids(words) - yield word_ids[0:max_seq_len] + yield word_ids[0:max_seq_len], [0 for _ in word_ids][0: max_seq_len] else: assert len(headline) == 2 and headline[0] == "text_a" and headline[1] == "label" for line in fread: diff --git a/PaddleNLP/lexical_analysis/run_ernie_sequence_labeling.py b/PaddleNLP/lexical_analysis/run_ernie_sequence_labeling.py index 135c6b27fc058f20584572b3ae1c2276dc31e176..96092dc29e9dc81f1b853d735ca18fe80b52b2fb 100644 --- a/PaddleNLP/lexical_analysis/run_ernie_sequence_labeling.py +++ b/PaddleNLP/lexical_analysis/run_ernie_sequence_labeling.py @@ -75,6 +75,10 @@ run_type_g.add_arg("do_infer", bool, True, "Whether to perform inference.") args = parser.parse_args() # yapf: enable. 
+sys.path.append('../models/') +from model_check import check_cuda +check_cuda(args.use_cuda) + def ernie_pyreader(args, pyreader_name): """define standard ernie pyreader""" pyreader = fluid.layers.py_reader( diff --git a/PaddleNLP/lexical_analysis/run_sequence_labeling.py b/PaddleNLP/lexical_analysis/run_sequence_labeling.py index 55265b99259a4bb387979a890f4c80e9d4b273d0..6f20cfe34a4044ffa376f07d63efcebdc5a6ffb7 100644 --- a/PaddleNLP/lexical_analysis/run_sequence_labeling.py +++ b/PaddleNLP/lexical_analysis/run_sequence_labeling.py @@ -25,7 +25,6 @@ import utils sys.path.append("../") from models.sequence_labeling import nets - # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -71,6 +70,10 @@ parser.add_argument('--enable_ce', action='store_true', help='If set, run the ta args = parser.parse_args() # yapf: enable. +sys.path.append('../models/') +from model_check import check_cuda +check_cuda(args.use_cuda) + print(args) diff --git a/PaddleNLP/lexical_analysis/utils.py b/PaddleNLP/lexical_analysis/utils.py index 1ef20b22eb17e3081f9d39f7b30ee751d3c504a0..e3085fe0e788f8979d6e0dddc8d56c85446b8c3f 100644 --- a/PaddleNLP/lexical_analysis/utils.py +++ b/PaddleNLP/lexical_analysis/utils.py @@ -81,13 +81,27 @@ def parse_result(words, crf_decode, dataset): for sent_index in range(batch_size): sent_out_str = "" sent_len = offset_list[sent_index + 1] - offset_list[sent_index] + last_word = "" + last_tag = "" for tag_index in range(sent_len): # iterate every word in sent index = tag_index + offset_list[sent_index] cur_word_id = str(words[index][0]) cur_tag_id = str(crf_decode[index][0]) cur_word = dataset.id2word_dict[cur_word_id] cur_tag = dataset.id2label_dict[cur_tag_id] - sent_out_str += cur_word + u"/" + cur_tag + u" " + if last_word == "": + last_word = cur_word + last_tag = cur_tag[:-2] + elif cur_tag.endswith("-B") or cur_tag == "O": + sent_out_str += last_word + u"/" + last_tag + u" " + last_word = cur_word + last_tag = cur_tag[:-2] + elif 
cur_tag.endswith("-I"): +                last_word += cur_word +            else: +                raise ValueError("invalid tag: %s" % (cur_tag)) +            if last_word != "": +                sent_out_str += last_word + u"/" + last_tag + u" " sent_out_str = to_str(sent_out_str.strip()) batch_out_str.append(sent_out_str) return batch_out_str