From 21127772b9a7055edebfb3511c6702eb82718f30 Mon Sep 17 00:00:00 2001
From: zhanghan17
Date: Mon, 19 Aug 2019 19:03:38 +0800
Subject: [PATCH] Support IO, IOB, IOE and IOBES chunk schemes for sequence
 labeling; fix the English MRC reader

---
 finetune/sequence_label.py                 | 38 ++++++++++-------
 finetune_args.py                           |  1 +
 reader/task_reader.py                      | 48 +++++++++++++---------
 run_sequence_labeling.py                   |  2 ++
 script/zh_task/ernie_base/run_msra_ner.sh  |  1 +
 script/zh_task/ernie_large/run_msra_ner.sh |  1 +
 tokenization.py                            |  7 +++-
 7 files changed, 63 insertions(+), 35 deletions(-)

diff --git a/finetune/sequence_label.py b/finetune/sequence_label.py
index 9d39790..0b3f9d4 100644
--- a/finetune/sequence_label.py
+++ b/finetune/sequence_label.py
@@ -68,11 +68,19 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         bias_attr=fluid.ParamAttr(
             name="cls_seq_label_out_b",
             initializer=fluid.initializer.Constant(0.)))
 
+    infers = fluid.layers.argmax(logits, axis=2)
     ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
-    ret_infers = fluid.layers.reshape(
-        x=fluid.layers.argmax(
-            logits, axis=2), shape=[-1, 1])
+    ret_infers = fluid.layers.reshape(x=infers, shape=[-1, 1])
+
+    lod_labels = fluid.layers.sequence_unpad(labels, seq_lens)
+    lod_infers = fluid.layers.sequence_unpad(infers, seq_lens)
+
+    (_, _, _, num_infer, num_label, num_correct) = fluid.layers.chunk_eval(
+        input=lod_infers,
+        label=lod_labels,
+        chunk_scheme=args.chunk_scheme,
+        num_chunk_types=((args.num_labels-1)//(len(args.chunk_scheme)-1)))
 
     labels = fluid.layers.flatten(labels, axis=2)
     ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
@@ -92,6 +100,9 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         "probs": probs,
         "labels": ret_labels,
         "infers": ret_infers,
+        "num_infer": num_infer,
+        "num_label": num_label,
+        "num_correct": num_correct,
         "seq_lens": seq_lens
     }
 
@@ -212,8 +223,8 @@ def evaluate(exe,
              eval_phase,
              dev_count=1):
     fetch_list = [
-        graph_vars["labels"].name, graph_vars["infers"].name,
-        graph_vars["seq_lens"].name
+        graph_vars["num_infer"].name, graph_vars["num_label"].name,
+        graph_vars["num_correct"].name
     ]
 
     if eval_phase == "train":
@@ -221,9 +232,10 @@ def evaluate(exe,
         if "learning_rate" in graph_vars:
             fetch_list.append(graph_vars["learning_rate"].name)
         outputs = exe.run(fetch_list=fetch_list)
-        np_labels, np_infers, np_lens, np_loss = outputs[:4]
-        num_label, num_infer, num_correct = chunk_eval(
-            np_labels, np_infers, np_lens, tag_num, dev_count)
+        np_num_infer, np_num_label, np_num_correct, np_loss = outputs[:4]
+        num_label = np.sum(np_num_label)
+        num_infer = np.sum(np_num_infer)
+        num_correct = np.sum(np_num_correct)
         precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
         rets = {
             "precision": precision,
@@ -241,13 +253,11 @@ def evaluate(exe,
         pyreader.start()
         while True:
             try:
-                np_labels, np_infers, np_lens = exe.run(program=program,
+                np_num_infer, np_num_label, np_num_correct = exe.run(program=program,
                                                 fetch_list=fetch_list)
-                label_num, infer_num, correct_num = chunk_eval(
-                    np_labels, np_infers, np_lens, tag_num, dev_count)
-                total_infer += infer_num
-                total_label += label_num
-                total_correct += correct_num
+                total_infer += np.sum(np_num_infer)
+                total_label += np.sum(np_num_label)
+                total_correct += np.sum(np_num_correct)
 
             except fluid.core.EOFException:
                 pyreader.reset()
diff --git a/finetune_args.py b/finetune_args.py
index 1b96c17..98b5968 100644
--- a/finetune_args.py
+++ b/finetune_args.py
@@ -82,6 +82,7 @@ data_g.add_arg("doc_stride", int, 128, "When splitting up a long document
                "into chunks, how much stride to take between chunks.")
 data_g.add_arg("n_best_size", int, 20, "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+data_g.add_arg("chunk_scheme", str, "IOB", "Chunk scheme for sequence labeling: IO, IOB, IOE or IOBES.", choices=["IO", "IOB", "IOE", "IOBES"])
 
 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
diff --git a/reader/task_reader.py b/reader/task_reader.py
index 2a6507e..a68522c 100644
--- a/reader/task_reader.py
+++ b/reader/task_reader.py
@@ -347,13 +347,23 @@ class SequenceLabelReader(BaseReader):
             if len(sub_token) == 0:
                 continue
             ret_tokens.extend(sub_token)
-            ret_labels.append(label)
-            if len(sub_token) < 2:
+            if len(sub_token) == 1:
+                ret_labels.append(label)
                 continue
-            sub_label = label
-            if label.startswith("B-"):
-                sub_label = "I-" + label[2:]
-            ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+            if label == "O" or label.startswith("I-"):
+                ret_labels.extend([label] * len(sub_token))
+            elif label.startswith("B-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
+            elif label.startswith("S-"):
+                b_label = "B-" + label[2:]
+                e_label = "E-" + label[2:]
+                i_label = "I-" + label[2:]
+                ret_labels.extend([b_label] + [i_label] * (len(sub_token) - 2) + [e_label])
+            elif label.startswith("E-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])
 
         assert len(ret_tokens) == len(ret_labels)
         return ret_tokens, ret_labels
@@ -451,6 +461,15 @@ class MRCReader(BaseReader):
         self.current_epoch = 0
         self.num_examples = 0
 
+        self.Example = namedtuple('Example',
+            ['qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
+             'start_position', 'end_position'])
+        self.Feature = namedtuple("Feature", ["unique_id", "example_index", "doc_span_index",
+            "tokens", "token_to_orig_map", "token_is_max_context",
+            "token_ids", "position_ids", "text_type_ids",
+            "start_position", "end_position"])
+        self.DocSpan = namedtuple("DocSpan", ["start", "length"])
+
     def _read_json(self, input_file, is_training):
         examples = []
         with open(input_file, "r") as f:
@@ -495,12 +514,7 @@ class MRCReader(BaseReader):
                 doc_tokens = tokenization.tokenize_chinese_chars(
                     paragraph_text)
 
-                Example = namedtuple('Example', [
-                    'qas_id', 'question_text', 'doc_tokens',
-                    'orig_answer_text', 'start_position', 'end_position'
-                ])
-
-                example = Example(
+                example = self.Example(
                     qas_id=qas_id,
                     question_text=question_text,
                     doc_tokens=doc_tokens,
@@ -544,11 +558,6 @@ class MRCReader(BaseReader):
 
     def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
                                     is_training):
-        Feature = namedtuple("Feature", [
-            "unique_id", "example_index", "doc_span_index", "tokens",
-            "token_to_orig_map", "token_is_max_context", "token_ids",
-            "position_ids", "text_type_ids", "start_position", "end_position"
-        ])
         features = []
         unique_id = 1000000000
 
@@ -581,14 +590,13 @@ class MRCReader(BaseReader):
                 tokenizer, example.orig_answer_text)
 
             max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-            _DocSpan = namedtuple("DocSpan", ["start", "length"])
             doc_spans = []
             start_offset = 0
             while start_offset < len(all_doc_tokens):
                 length = len(all_doc_tokens) - start_offset
                 if length > max_tokens_for_doc:
                     length = max_tokens_for_doc
-                doc_spans.append(_DocSpan(start=start_offset, length=length))
+                doc_spans.append(self.DocSpan(start=start_offset, length=length))
                 if start_offset + length == len(all_doc_tokens):
                     break
                 start_offset += min(length, self.doc_stride)
@@ -638,7 +646,7 @@ class MRCReader(BaseReader):
                 start_position = tok_start_position - doc_start + doc_offset
                 end_position = tok_end_position - doc_start + doc_offset
 
-            feature = Feature(
+            feature = self.Feature(
                 unique_id=unique_id,
                 example_index=example_index,
                 doc_span_index=doc_span_index,
diff --git a/run_sequence_labeling.py b/run_sequence_labeling.py
index f026fb3..499af1c 100644
--- a/run_sequence_labeling.py
+++ b/run_sequence_labeling.py
@@ -109,12 +109,14 @@ def main(args):
             scheduler=args.lr_scheduler,
             use_fp16=args.use_fp16)
 
+        """
         fluid.memory_optimize(
             input_program=train_program,
             skip_opt_set=[
                 graph_vars["loss"].name, graph_vars["labels"].name,
                 graph_vars["infers"].name, graph_vars["seq_lens"].name
             ])
+        """
 
         if args.verbose:
             if args.in_tokens:
diff --git a/script/zh_task/ernie_base/run_msra_ner.sh b/script/zh_task/ernie_base/run_msra_ner.sh
index 4232b6c..9a25a5a 100644
--- a/script/zh_task/ernie_base/run_msra_ner.sh
+++ b/script/zh_task/ernie_base/run_msra_ner.sh
@@ -12,6 +12,7 @@ python -u run_sequence_labeling.py \
    --batch_size 16 \
    --init_pretraining_params ${MODEL_PATH}/params \
    --num_labels 7 \
+   --chunk_scheme "IOB" \
    --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
    --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
    --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
diff --git a/script/zh_task/ernie_large/run_msra_ner.sh b/script/zh_task/ernie_large/run_msra_ner.sh
index 58bb7aa..2a2ff24 100644
--- a/script/zh_task/ernie_large/run_msra_ner.sh
+++ b/script/zh_task/ernie_large/run_msra_ner.sh
@@ -11,6 +11,7 @@ python -u run_sequence_labeling.py \
    --batch_size 16 \
    --init_pretraining_params ${MODEL_PATH}/params \
    --num_labels 7 \
+   --chunk_scheme "IOB" \
    --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
    --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
    --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
diff --git a/tokenization.py b/tokenization.py
index b2b4dc0..132fcde 100644
--- a/tokenization.py
+++ b/tokenization.py
@@ -395,11 +395,16 @@ def tokenize_chinese_chars(text):
 
         return False
 
+    def _is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+            return True
+        return False
+
     output = []
     buff = ""
     for char in text:
         cp = ord(char)
-        if _is_chinese_char(cp):
+        if _is_chinese_char(cp) or _is_whitespace(char):
             if buff != "":
                 output.append(buff)
                 buff = ""
-- 
GitLab
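
Reviewer note (not part of the patch): the two pieces of scheme logic this commit adds can be sanity-checked without Paddle. The sketch below is a minimal, standalone re-implementation of (a) the num_chunk_types value passed to fluid.layers.chunk_eval and (b) the sub-token label expansion added to SequenceLabelReader._reseg_token_label. The function names and the assertions are illustrative only; they are not part of the ERNIE codebase.

    def num_chunk_types(num_labels, chunk_scheme):
        # Besides the single "O" label, each entity type contributes
        # len(scheme) - 1 labels: IO -> 1 (I-X), IOB/IOE -> 2, IOBES -> 4.
        return (num_labels - 1) // (len(chunk_scheme) - 1)

    # MSRA NER as configured in the shell scripts: 7 labels, IOB, 3 entity types.
    assert num_chunk_types(7, "IOB") == 3
    # The same 3 types under IOBES would need 4 * 3 + 1 = 13 labels.
    assert num_chunk_types(13, "IOBES") == 3

    def expand_label(label, n_sub_tokens):
        # Spread one word-level label over the word's sub-tokens, mirroring
        # the branches added to _reseg_token_label in this patch.
        if n_sub_tokens == 1 or label == "O" or label.startswith("I-"):
            return [label] * n_sub_tokens
        suffix = label[2:]
        if label.startswith("B-"):
            return [label] + ["I-" + suffix] * (n_sub_tokens - 1)
        if label.startswith("S-"):
            # A single-token chunk that tokenizes into several sub-tokens
            # becomes a B- ... I- ... E- span.
            return (["B-" + suffix] + ["I-" + suffix] * (n_sub_tokens - 2)
                    + ["E-" + suffix])
        if label.startswith("E-"):
            return ["I-" + suffix] * (n_sub_tokens - 1) + [label]
        raise ValueError("unexpected label: " + label)

    assert expand_label("S-PER", 3) == ["B-PER", "I-PER", "E-PER"]
    assert expand_label("B-LOC", 2) == ["B-LOC", "I-LOC"]
    assert expand_label("E-ORG", 3) == ["I-ORG", "I-ORG", "E-ORG"]

One design implication worth noting: the num_chunk_types formula assumes the label map holds exactly one "O" label plus a complete tag set per entity type; a label map that deviates from that layout would mis-size chunk_eval even though training itself would still run.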