Commit 21127772 authored by zhanghan17

support IO, IOB, IOE and IOBES chunk schemes for sequence labeling & fix English MRC tokenization

Parent: 5c8c3e3e
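For context (an editor's note, not part of the diff): the four chunk schemes differ only in how token-level tags mark chunk boundaries. A minimal illustration of how a three-token LOC entity would be tagged under each scheme:

```python
# Illustration only: how the 3-token span "New York City" (type LOC)
# is tagged under each of the four supported chunk schemes.
tokens = ["New", "York", "City"]

tagging = {
    "IO":    ["I-LOC", "I-LOC", "I-LOC"],  # inside only
    "IOB":   ["B-LOC", "I-LOC", "I-LOC"],  # begin + inside
    "IOE":   ["I-LOC", "I-LOC", "E-LOC"],  # inside + end
    "IOBES": ["B-LOC", "I-LOC", "E-LOC"],  # begin/inside/end (+ S- for single tokens)
}

for scheme, tags in tagging.items():
    print(scheme, list(zip(tokens, tags)))
```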
@@ -68,11 +68,19 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         bias_attr=fluid.ParamAttr(
             name="cls_seq_label_out_b",
             initializer=fluid.initializer.Constant(0.)))

+    infers = fluid.layers.argmax(logits, axis=2)
     ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
-    ret_infers = fluid.layers.reshape(
-        x=fluid.layers.argmax(
-            logits, axis=2), shape=[-1, 1])
+    ret_infers = fluid.layers.reshape(x=infers, shape=[-1, 1])
+    lod_labels = fluid.layers.sequence_unpad(labels, seq_lens)
+    lod_infers = fluid.layers.sequence_unpad(infers, seq_lens)
+
+    (_, _, _, num_infer, num_label, num_correct) = fluid.layers.chunk_eval(
+        input=lod_infers,
+        label=lod_labels,
+        chunk_scheme=args.chunk_scheme,
+        num_chunk_types=((args.num_labels - 1) // (len(args.chunk_scheme) - 1)))

     labels = fluid.layers.flatten(labels, axis=2)
     ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
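A note on the `num_chunk_types` expression above (the concrete numbers are an illustration, assuming the MSRA NER setup used by the run scripts further down): each scheme spends `len(scheme) - 1` tags per entity type and shares a single `O` tag, so the entity-type count can be recovered from the label count:

```python
# Each scheme has len(scheme) - 1 tags per entity type (O is shared), so
#   num_labels = num_chunk_types * (len(scheme) - 1) + 1
# and therefore:
#   num_chunk_types = (num_labels - 1) // (len(scheme) - 1)
def num_chunk_types(num_labels, chunk_scheme):
    return (num_labels - 1) // (len(chunk_scheme) - 1)

# MSRA NER has 3 entity types (PER, LOC, ORG):
assert num_chunk_types(7, "IOB") == 3     # O + {B-, I-} x 3 = 7 labels
assert num_chunk_types(7, "IOE") == 3     # O + {I-, E-} x 3 = 7 labels
assert num_chunk_types(4, "IO") == 3      # O + {I-} x 3 = 4 labels
assert num_chunk_types(13, "IOBES") == 3  # O + {B-, I-, E-, S-} x 3 = 13 labels
```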
@@ -92,6 +100,9 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         "probs": probs,
         "labels": ret_labels,
         "infers": ret_infers,
+        "num_infer": num_infer,
+        "num_label": num_label,
+        "num_correct": num_correct,
         "seq_lens": seq_lens
     }
@@ -212,8 +223,8 @@ def evaluate(exe,
              eval_phase,
              dev_count=1):
     fetch_list = [
-        graph_vars["labels"].name, graph_vars["infers"].name,
-        graph_vars["seq_lens"].name
+        graph_vars["num_infer"].name, graph_vars["num_label"].name,
+        graph_vars["num_correct"].name
     ]

     if eval_phase == "train":
@@ -221,9 +232,10 @@ def evaluate(exe,
         if "learning_rate" in graph_vars:
             fetch_list.append(graph_vars["learning_rate"].name)
         outputs = exe.run(fetch_list=fetch_list)
-        np_labels, np_infers, np_lens, np_loss = outputs[:4]
-        num_label, num_infer, num_correct = chunk_eval(
-            np_labels, np_infers, np_lens, tag_num, dev_count)
+        np_num_infer, np_num_label, np_num_correct, np_loss = outputs[:4]
+        num_label = np.sum(np_num_label)
+        num_infer = np.sum(np_num_infer)
+        num_correct = np.sum(np_num_correct)
         precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
         rets = {
             "precision": precision,
@@ -241,13 +253,11 @@ def evaluate(exe,
         pyreader.start()
         while True:
             try:
-                np_labels, np_infers, np_lens = exe.run(program=program,
+                np_num_infer, np_num_label, np_num_correct = exe.run(program=program,
                                                         fetch_list=fetch_list)
-                label_num, infer_num, correct_num = chunk_eval(
-                    np_labels, np_infers, np_lens, tag_num, dev_count)
-                total_infer += infer_num
-                total_label += label_num
-                total_correct += correct_num
+                total_infer += np.sum(np_num_infer)
+                total_label += np.sum(np_num_label)
+                total_correct += np.sum(np_num_correct)
             except fluid.core.EOFException:
                 pyreader.reset()
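`calculate_f1` itself is not shown in this diff; a minimal sketch of what it plausibly computes from the three counters (micro-averaged precision, recall and F1), not the repo's exact code:

```python
# A sketch, assuming calculate_f1 derives micro-averaged chunk metrics
# from the num_label / num_infer / num_correct counters of chunk_eval.
def calculate_f1(num_label, num_infer, num_correct):
    precision = num_correct / num_infer if num_infer else 0.0
    recall = num_correct / num_label if num_label else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if num_correct else 0.0)
    return precision, recall, f1

print(calculate_f1(num_label=10, num_infer=8, num_correct=6))
# -> (0.75, 0.6, 0.666...)
```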
......
@@ -82,6 +82,7 @@ data_g.add_arg("doc_stride", int, 128,
                "When splitting up a long document into chunks, how much stride to take between chunks.")
 data_g.add_arg("n_best_size", int, 20,
                "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+data_g.add_arg("chunk_scheme", type=str, default="IOB", choices=["IO", "IOB", "IOE", "IOBES"], help="Chunk scheme for sequence labeling: IO, IOB, IOE or IOBES.")

 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
......
@@ -347,13 +347,23 @@ class SequenceLabelReader(BaseReader):
             if len(sub_token) == 0:
                 continue
             ret_tokens.extend(sub_token)
-            ret_labels.append(label)
-            if len(sub_token) < 2:
+            if len(sub_token) == 1:
+                ret_labels.append(label)
                 continue

-            sub_label = label
-            if label.startswith("B-"):
-                sub_label = "I-" + label[2:]
-            ret_labels.extend([sub_label] * (len(sub_token) - 1))
+            if label == "O" or label.startswith("I-"):
+                ret_labels.extend([label] * len(sub_token))
+            elif label.startswith("B-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
+            elif label.startswith("S-"):
+                b_label = "B-" + label[2:]
+                e_label = "E-" + label[2:]
+                i_label = "I-" + label[2:]
+                ret_labels.extend([b_label] + [i_label] * (len(sub_token) - 2) + [e_label])
+            elif label.startswith("E-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])

         assert len(ret_tokens) == len(ret_labels)
         return ret_tokens, ret_labels
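To see what the new branches do, here is the same expansion logic pulled out into a standalone function (a simplified sketch of the loop body above, using a hypothetical WordPiece split of "Washington"):

```python
def expand_label(label, n):
    """Spread one token-level tag over n sub-tokens (sketch of the diff's logic)."""
    if n == 1:
        return [label]
    if label == "O" or label.startswith("I-"):
        return [label] * n
    if label.startswith("B-"):       # begin stays first, the rest become inside
        return [label] + ["I-" + label[2:]] * (n - 1)
    if label.startswith("S-"):       # single becomes begin ... inside ... end
        t = label[2:]
        return ["B-" + t] + ["I-" + t] * (n - 2) + ["E-" + t]
    if label.startswith("E-"):       # end stays last, the rest become inside
        return ["I-" + label[2:]] * (n - 1) + [label]
    raise ValueError("unexpected label: " + label)  # the diff has no fallback; added for safety

# "Washington" -> ["Wash", "##ing", "##ton"] (hypothetical subword split)
assert expand_label("S-LOC", 3) == ["B-LOC", "I-LOC", "E-LOC"]
assert expand_label("B-LOC", 3) == ["B-LOC", "I-LOC", "I-LOC"]
assert expand_label("E-LOC", 3) == ["I-LOC", "I-LOC", "E-LOC"]
```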
@@ -451,6 +461,15 @@ class MRCReader(BaseReader):
         self.current_epoch = 0
         self.num_examples = 0

+        self.Example = namedtuple('Example',
+            ['qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
+             'start_position', 'end_position'])
+        self.Feature = namedtuple("Feature", ["unique_id", "example_index", "doc_span_index",
+            "tokens", "token_to_orig_map", "token_is_max_context",
+            "token_ids", "position_ids", "text_type_ids",
+            "start_position", "end_position"])
+        self.DocSpan = namedtuple("DocSpan", ["start", "length"])

     def _read_json(self, input_file, is_training):
         examples = []
         with open(input_file, "r") as f:
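The `Example`/`Feature`/`DocSpan` classes are hoisted into `__init__` here, presumably so each is built once per reader rather than rebuilt on every call: `namedtuple()` creates a brand-new class object each time it runs. A small demonstration of that behavior:

```python
from collections import namedtuple

# Calling namedtuple() twice yields two distinct class objects, so
# repeatedly defining it inside a method pays the class-building cost
# on every call and produces instances of different types.
A = namedtuple("DocSpan", ["start", "length"])
B = namedtuple("DocSpan", ["start", "length"])

assert A is not B                          # distinct classes, same name
assert A(0, 4) == B(0, 4)                  # still compare equal as tuples
assert type(A(0, 4)) is not type(B(0, 4))  # but are not the same type
```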
@@ -495,12 +514,7 @@ class MRCReader(BaseReader):
                 doc_tokens = tokenization.tokenize_chinese_chars(
                     paragraph_text)

-                Example = namedtuple('Example', [
-                    'qas_id', 'question_text', 'doc_tokens',
-                    'orig_answer_text', 'start_position', 'end_position'
-                ])
-                example = Example(
+                example = self.Example(
                     qas_id=qas_id,
                     question_text=question_text,
                     doc_tokens=doc_tokens,
@@ -544,11 +558,6 @@ class MRCReader(BaseReader):
     def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
                                     is_training):
-        Feature = namedtuple("Feature", [
-            "unique_id", "example_index", "doc_span_index", "tokens",
-            "token_to_orig_map", "token_is_max_context", "token_ids",
-            "position_ids", "text_type_ids", "start_position", "end_position"
-        ])
         features = []
         unique_id = 1000000000
@@ -581,14 +590,13 @@ class MRCReader(BaseReader):
                     tokenizer, example.orig_answer_text)

             max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-            _DocSpan = namedtuple("DocSpan", ["start", "length"])
             doc_spans = []
             start_offset = 0
             while start_offset < len(all_doc_tokens):
                 length = len(all_doc_tokens) - start_offset
                 if length > max_tokens_for_doc:
                     length = max_tokens_for_doc
-                doc_spans.append(_DocSpan(start=start_offset, length=length))
+                doc_spans.append(self.DocSpan(start=start_offset, length=length))
                 if start_offset + length == len(all_doc_tokens):
                     break
                 start_offset += min(length, self.doc_stride)
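For reference, here is the sliding-window loop above run standalone on toy sizes (hypothetical numbers, just to show how `doc_stride` produces overlapping spans):

```python
from collections import namedtuple

DocSpan = namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    # Same loop as in the diff: fixed-size windows that slide by doc_stride
    # and stop once the final window reaches the end of the document.
    doc_spans, start_offset = [], 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

# 10 tokens, windows of 4, stride 3 -> overlapping spans:
assert make_doc_spans(10, 4, 3) == [DocSpan(0, 4), DocSpan(3, 4), DocSpan(6, 4)]
```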
@@ -638,7 +646,7 @@ class MRCReader(BaseReader):
                 start_position = tok_start_position - doc_start + doc_offset
                 end_position = tok_end_position - doc_start + doc_offset

-            feature = Feature(
+            feature = self.Feature(
                 unique_id=unique_id,
                 example_index=example_index,
                 doc_span_index=doc_span_index,
......
@@ -109,12 +109,14 @@ def main(args):
             scheduler=args.lr_scheduler,
             use_fp16=args.use_fp16)

+        """
         fluid.memory_optimize(
             input_program=train_program,
             skip_opt_set=[
                 graph_vars["loss"].name, graph_vars["labels"].name,
                 graph_vars["infers"].name, graph_vars["seq_lens"].name
             ])
+        """

     if args.verbose:
         if args.in_tokens:
......
@@ -12,6 +12,7 @@ python -u run_sequence_labeling.py \
     --batch_size 16 \
     --init_pretraining_params ${MODEL_PATH}/params \
     --num_labels 7 \
+    --chunk_scheme "IOB" \
     --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
     --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
     --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
......
@@ -11,6 +11,7 @@ python -u run_sequence_labeling.py \
     --batch_size 16 \
     --init_pretraining_params ${MODEL_PATH}/params \
     --num_labels 7 \
+    --chunk_scheme "IOB" \
     --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
     --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
     --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
......
@@ -395,11 +395,16 @@ def tokenize_chinese_chars(text):
         return False

+    def _is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+            return True
+        return False
+
     output = []
     buff = ""
     for char in text:
         cp = ord(char)
-        if _is_chinese_char(cp):
+        if _is_chinese_char(cp) or _is_whitespace(char):
             if buff != "":
                 output.append(buff)
                 buff = ""
......
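This last hunk is the "fix en mrc" part: `tokenize_chinese_chars` previously only split around CJK characters, so a purely English passage came back as one giant token and MRC answer spans could not be aligned. A sketch of the behavior after the change (assuming the elided tail of the loop appends each Chinese character as its own token and that whitespace acts only as a separator):

```python
def split_mixed_text(text):
    # Sketch of the patched tokenize_chinese_chars: flush the buffer on
    # Chinese characters AND whitespace; keep each Chinese char as a token.
    def _is_chinese_char(cp):
        return 0x4E00 <= cp <= 0x9FFF  # simplified: CJK Unified Ideographs only

    def _is_whitespace(c):
        return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

    output, buff = [], ""
    for char in text:
        if _is_chinese_char(ord(char)) or _is_whitespace(char):
            if buff:
                output.append(buff)
                buff = ""
            if not _is_whitespace(char):  # assumed: whitespace itself is dropped
                output.append(char)
        else:
            buff += char
    if buff:
        output.append(buff)
    return output

# Before the fix, "Obama was born in" would stay one token; now:
assert split_mixed_text("Obama was born in 夏威夷") == \
    ["Obama", "was", "born", "in", "夏", "威", "夷"]
```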