Commit 21127772 authored by zhanghan17

support IO, IOB, IOE, IOBES for sequence labeling & fix en mrc

Parent 5c8c3e3e
@@ -68,11 +68,19 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         bias_attr=fluid.ParamAttr(
             name="cls_seq_label_out_b",
             initializer=fluid.initializer.Constant(0.)))

+    infers = fluid.layers.argmax(logits, axis=2)
     ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
-    ret_infers = fluid.layers.reshape(
-        x=fluid.layers.argmax(
-            logits, axis=2), shape=[-1, 1])
+    ret_infers = fluid.layers.reshape(x=infers, shape=[-1, 1])
+
+    lod_labels = fluid.layers.sequence_unpad(labels, seq_lens)
+    lod_infers = fluid.layers.sequence_unpad(infers, seq_lens)
+
+    (_, _, _, num_infer, num_label, num_correct) = fluid.layers.chunk_eval(
+        input=lod_infers,
+        label=lod_labels,
+        chunk_scheme=args.chunk_scheme,
+        num_chunk_types=((args.num_labels - 1) // (len(args.chunk_scheme) - 1)))

     labels = fluid.layers.flatten(labels, axis=2)
     ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
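For orientation: chunk_eval needs num_chunk_types, which the new code derives from the label count and the length of the scheme string. A minimal sketch of that arithmetic (the 3-entity-type example is an assumption that matches the --num_labels 7 used in the MSRA NER scripts later in this diff):

# Sketch: one label id is reserved for "O"; each entity type then uses
# len(chunk_scheme) - 1 tag positions (1 for IO, 2 for IOB/IOE, 4 for IOBES).
def num_chunk_types(num_labels, chunk_scheme):
    return (num_labels - 1) // (len(chunk_scheme) - 1)

print(num_chunk_types(7, "IOB"))    # 3 types, e.g. B-X/I-X for 3 entities + O
print(num_chunk_types(13, "IOBES")) # 3 types, B-/I-/E-/S- per entity + O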
@@ -92,6 +100,9 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         "probs": probs,
         "labels": ret_labels,
         "infers": ret_infers,
+        "num_infer": num_infer,
+        "num_label": num_label,
+        "num_correct": num_correct,
         "seq_lens": seq_lens
     }
@@ -212,8 +223,8 @@ def evaluate(exe,
              eval_phase,
              dev_count=1):
     fetch_list = [
-        graph_vars["labels"].name, graph_vars["infers"].name,
-        graph_vars["seq_lens"].name
+        graph_vars["num_infer"].name, graph_vars["num_label"].name,
+        graph_vars["num_correct"].name
     ]

     if eval_phase == "train":
@@ -221,9 +232,10 @@ def evaluate(exe,
         if "learning_rate" in graph_vars:
             fetch_list.append(graph_vars["learning_rate"].name)
         outputs = exe.run(fetch_list=fetch_list)
-        np_labels, np_infers, np_lens, np_loss = outputs[:4]
-        num_label, num_infer, num_correct = chunk_eval(
-            np_labels, np_infers, np_lens, tag_num, dev_count)
+        np_num_infer, np_num_label, np_num_correct, np_loss = outputs[:4]
+        num_label = np.sum(np_num_label)
+        num_infer = np.sum(np_num_infer)
+        num_correct = np.sum(np_num_correct)
         precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
         rets = {
             "precision": precision,
@@ -241,13 +253,11 @@ def evaluate(exe,
         pyreader.start()
         while True:
             try:
-                np_labels, np_infers, np_lens = exe.run(program=program,
+                np_num_infer, np_num_label, np_num_correct = exe.run(program=program,
                                                         fetch_list=fetch_list)
-                label_num, infer_num, correct_num = chunk_eval(
-                    np_labels, np_infers, np_lens, tag_num, dev_count)
-                total_infer += infer_num
-                total_label += label_num
-                total_correct += correct_num
+                total_infer += np.sum(np_num_infer)
+                total_label += np.sum(np_num_label)
+                total_correct += np.sum(np_num_correct)
             except fluid.core.EOFException:
                 pyreader.reset()
...
@@ -82,6 +82,7 @@ data_g.add_arg("doc_stride", int, 128,
                "When splitting up a long document into chunks, how much stride to take between chunks.")
 data_g.add_arg("n_best_size", int, 20,
                "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+data_g.add_arg("chunk_scheme", type=str, default="IOB", choices=["IO", "IOB", "IOE", "IOBES"], help="chunk scheme")

 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
...
@@ -347,13 +347,23 @@ class SequenceLabelReader(BaseReader):
             if len(sub_token) == 0:
                 continue
             ret_tokens.extend(sub_token)
-            ret_labels.append(label)
-            if len(sub_token) < 2:
+            if len(sub_token) == 1:
+                ret_labels.append(label)
                 continue
-            sub_label = label
-            if label.startswith("B-"):
-                sub_label = "I-" + label[2:]
-            ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+            if label == "O" or label.startswith("I-"):
+                ret_labels.extend([label] * len(sub_token))
+            elif label.startswith("B-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
+            elif label.startswith("S-"):
+                b_label = "B-" + label[2:]
+                e_label = "E-" + label[2:]
+                i_label = "I-" + label[2:]
+                ret_labels.extend([b_label] + [i_label] * (len(sub_token) - 2) + [e_label])
+            elif label.startswith("E-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])

         assert len(ret_tokens) == len(ret_labels)
         return ret_tokens, ret_labels
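To make the new branches concrete, here is a standalone restatement of the sub-token label expansion with a small IOBES example (the entity type "PER" and the 3-piece tokenizer split are made up for illustration):

# Standalone sketch of the expansion above: a word labeled "S-PER" that the
# wordpiece tokenizer splits into three pieces becomes B-PER, I-PER, E-PER,
# so chunk boundaries stay consistent after re-segmentation.
def expand_label(label, n_sub_tokens):
    if n_sub_tokens == 1 or label == "O" or label.startswith("I-"):
        return [label] * n_sub_tokens
    suffix = label[2:]
    if label.startswith("B-"):
        return [label] + ["I-" + suffix] * (n_sub_tokens - 1)
    if label.startswith("S-"):
        return ["B-" + suffix] + ["I-" + suffix] * (n_sub_tokens - 2) + ["E-" + suffix]
    if label.startswith("E-"):
        return ["I-" + suffix] * (n_sub_tokens - 1) + [label]
    return [label] * n_sub_tokens

print(expand_label("S-PER", 3))  # ['B-PER', 'I-PER', 'E-PER']
print(expand_label("B-PER", 2))  # ['B-PER', 'I-PER']
print(expand_label("O", 4))      # ['O', 'O', 'O', 'O']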
@@ -451,6 +461,15 @@ class MRCReader(BaseReader):
         self.current_epoch = 0
         self.num_examples = 0

+        self.Example = namedtuple('Example',
+                ['qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
+                 'start_position', 'end_position'])
+        self.Feature = namedtuple("Feature", ["unique_id", "example_index", "doc_span_index",
+                "tokens", "token_to_orig_map", "token_is_max_context",
+                "token_ids", "position_ids", "text_type_ids",
+                "start_position", "end_position"])
+        self.DocSpan = namedtuple("DocSpan", ["start", "length"])
+
     def _read_json(self, input_file, is_training):
         examples = []
         with open(input_file, "r") as f:
@@ -495,12 +514,7 @@ class MRCReader(BaseReader):
                     doc_tokens = tokenization.tokenize_chinese_chars(
                         paragraph_text)

-                    Example = namedtuple('Example', [
-                        'qas_id', 'question_text', 'doc_tokens',
-                        'orig_answer_text', 'start_position', 'end_position'
-                    ])
-                    example = Example(
+                    example = self.Example(
                         qas_id=qas_id,
                         question_text=question_text,
                         doc_tokens=doc_tokens,
@@ -544,11 +558,6 @@ class MRCReader(BaseReader):
     def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
                                     is_training):
-        Feature = namedtuple("Feature", [
-            "unique_id", "example_index", "doc_span_index", "tokens",
-            "token_to_orig_map", "token_is_max_context", "token_ids",
-            "position_ids", "text_type_ids", "start_position", "end_position"
-        ])
         features = []
         unique_id = 1000000000
@@ -581,14 +590,13 @@ class MRCReader(BaseReader):
                 tokenizer, example.orig_answer_text)

             max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-            _DocSpan = namedtuple("DocSpan", ["start", "length"])
             doc_spans = []
             start_offset = 0
             while start_offset < len(all_doc_tokens):
                 length = len(all_doc_tokens) - start_offset
                 if length > max_tokens_for_doc:
                     length = max_tokens_for_doc
-                doc_spans.append(_DocSpan(start=start_offset, length=length))
+                doc_spans.append(self.DocSpan(start=start_offset, length=length))
                 if start_offset + length == len(all_doc_tokens):
                     break
                 start_offset += min(length, self.doc_stride)
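The sliding-window logic itself is unchanged; only the namedtuple is now shared via self.DocSpan. As a quick illustration of how doc_stride produces overlapping document spans, a self-contained sketch with arbitrary sizes:

from collections import namedtuple

# Illustration of the doc-span window above; the sizes are arbitrary.
DocSpan = namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    doc_spans, start_offset = [], 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

print(make_doc_spans(num_doc_tokens=300, max_tokens_for_doc=128, doc_stride=128))
# [DocSpan(start=0, length=128), DocSpan(start=128, length=128), DocSpan(start=256, length=44)]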
@@ -638,7 +646,7 @@ class MRCReader(BaseReader):
                 start_position = tok_start_position - doc_start + doc_offset
                 end_position = tok_end_position - doc_start + doc_offset

-            feature = Feature(
+            feature = self.Feature(
                 unique_id=unique_id,
                 example_index=example_index,
                 doc_span_index=doc_span_index,
...
@@ -109,12 +109,14 @@ def main(args):
             scheduler=args.lr_scheduler,
             use_fp16=args.use_fp16)

+        """
         fluid.memory_optimize(
             input_program=train_program,
             skip_opt_set=[
                 graph_vars["loss"].name, graph_vars["labels"].name,
                 graph_vars["infers"].name, graph_vars["seq_lens"].name
             ])
+        """

         if args.verbose:
             if args.in_tokens:
...
@@ -12,6 +12,7 @@ python -u run_sequence_labeling.py \
    --batch_size 16 \
    --init_pretraining_params ${MODEL_PATH}/params \
    --num_labels 7 \
+   --chunk_scheme "IOB" \
    --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
    --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
    --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
...
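For orientation, --num_labels 7 together with --chunk_scheme "IOB" implies three entity types plus "O". A hypothetical label_map.json of that shape (the real file ships with the task data; its entity names and ids may differ):

import json

# Hypothetical label map for 3 entity types under IOB; names and ids are placeholders.
label_map = {
    "B-PER": 0, "I-PER": 1,
    "B-LOC": 2, "I-LOC": 3,
    "B-ORG": 4, "I-ORG": 5,
    "O": 6,
}
print(json.dumps(label_map, indent=2))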
@@ -11,6 +11,7 @@ python -u run_sequence_labeling.py \
    --batch_size 16 \
    --init_pretraining_params ${MODEL_PATH}/params \
    --num_labels 7 \
+   --chunk_scheme "IOB" \
    --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
    --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
    --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
...
@@ -395,11 +395,16 @@ def tokenize_chinese_chars(text):
         return False

+    def _is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+            return True
+        return False
+
     output = []
     buff = ""
     for char in text:
         cp = ord(char)
-        if _is_chinese_char(cp):
+        if _is_chinese_char(cp) or _is_whitespace(char):
             if buff != "":
                 output.append(buff)
                 buff = ""
...
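This tokenization change is presumably the "fix en mrc" part of the commit: with whitespace now treated as a split boundary, runs of ASCII text no longer stay glued together in a single buffer chunk. A standalone approximation of the loop (assuming the boundary character is emitted and the buffer flushed, as is done for Chinese characters; not the module's exact code):

# Approximation of tokenize_chinese_chars after the change (assumption: the
# boundary character itself is appended to the output, as for Chinese characters).
def split_on_chinese_or_space(text):
    def _is_chinese_char(cp):
        return 0x4E00 <= cp <= 0x9FFF  # simplified; the real check covers more CJK blocks
    def _is_whitespace(c):
        return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F
    output, buff = [], ""
    for char in text:
        if _is_chinese_char(ord(char)) or _is_whitespace(char):
            if buff:
                output.append(buff)
                buff = ""
            output.append(char)
        else:
            buff += char
    if buff:
        output.append(buff)
    return output

print(split_on_chinese_or_space("New York 纽约"))
# ['New', ' ', 'York', ' ', '纽', '约'] -- English words no longer stick together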