Commit 21127772 authored by zhanghan17

support IO, IOB, IOE and IOBES chunk schemes for sequence labeling & fix English MRC tokenization

Parent: 5c8c3e3e
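For context (an editor's note, not part of the diff): the four chunk schemes differ only in how token-level tags mark chunk boundaries. A minimal illustration of how a three-token LOC entity would be tagged under each scheme:

```python
# Illustration only: how the 3-token span "New York City" (type LOC)
# is tagged under each of the four supported chunk schemes.
tokens = ["New", "York", "City"]

tagging = {
    "IO":    ["I-LOC", "I-LOC", "I-LOC"],  # inside only
    "IOB":   ["B-LOC", "I-LOC", "I-LOC"],  # begin + inside
    "IOE":   ["I-LOC", "I-LOC", "E-LOC"],  # inside + end
    "IOBES": ["B-LOC", "I-LOC", "E-LOC"],  # begin/inside/end (+ S- for single tokens)
}

for scheme, tags in tagging.items():
    print(scheme, list(zip(tokens, tags)))
```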
@@ -68,11 +68,19 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         bias_attr=fluid.ParamAttr(
             name="cls_seq_label_out_b",
             initializer=fluid.initializer.Constant(0.)))

+    infers = fluid.layers.argmax(logits, axis=2)
     ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
-    ret_infers = fluid.layers.reshape(
-        x=fluid.layers.argmax(
-            logits, axis=2), shape=[-1, 1])
+    ret_infers = fluid.layers.reshape(x=infers, shape=[-1, 1])
+    lod_labels = fluid.layers.sequence_unpad(labels, seq_lens)
+    lod_infers = fluid.layers.sequence_unpad(infers, seq_lens)
+
+    (_, _, _, num_infer, num_label, num_correct) = fluid.layers.chunk_eval(
+        input=lod_infers,
+        label=lod_labels,
+        chunk_scheme=args.chunk_scheme,
+        num_chunk_types=((args.num_labels - 1) // (len(args.chunk_scheme) - 1)))

     labels = fluid.layers.flatten(labels, axis=2)
     ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
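A note on the `num_chunk_types` expression above (the concrete numbers are an illustration, assuming the MSRA NER setup used by the run scripts further down): each scheme spends `len(scheme) - 1` tags per entity type and shares a single `O` tag, so the entity-type count can be recovered from the label count:

```python
# Each scheme has len(scheme) - 1 tags per entity type (O is shared), so
#   num_labels = num_chunk_types * (len(scheme) - 1) + 1
# and therefore:
#   num_chunk_types = (num_labels - 1) // (len(scheme) - 1)
def num_chunk_types(num_labels, chunk_scheme):
    return (num_labels - 1) // (len(chunk_scheme) - 1)

# MSRA NER has 3 entity types (PER, LOC, ORG):
assert num_chunk_types(7, "IOB") == 3     # O + {B-, I-} x 3 = 7 labels
assert num_chunk_types(7, "IOE") == 3     # O + {I-, E-} x 3 = 7 labels
assert num_chunk_types(4, "IO") == 3      # O + {I-} x 3 = 4 labels
assert num_chunk_types(13, "IOBES") == 3  # O + {B-, I-, E-, S-} x 3 = 13 labels
```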
@@ -92,6 +100,9 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         "probs": probs,
         "labels": ret_labels,
         "infers": ret_infers,
+        "num_infer": num_infer,
+        "num_label": num_label,
+        "num_correct": num_correct,
         "seq_lens": seq_lens
     }
@@ -212,8 +223,8 @@ def evaluate(exe,
              eval_phase,
              dev_count=1):
     fetch_list = [
-        graph_vars["labels"].name, graph_vars["infers"].name,
-        graph_vars["seq_lens"].name
+        graph_vars["num_infer"].name, graph_vars["num_label"].name,
+        graph_vars["num_correct"].name
     ]

     if eval_phase == "train":
@@ -221,9 +232,10 @@ def evaluate(exe,
         if "learning_rate" in graph_vars:
             fetch_list.append(graph_vars["learning_rate"].name)
         outputs = exe.run(fetch_list=fetch_list)
-        np_labels, np_infers, np_lens, np_loss = outputs[:4]
-        num_label, num_infer, num_correct = chunk_eval(
-            np_labels, np_infers, np_lens, tag_num, dev_count)
+        np_num_infer, np_num_label, np_num_correct, np_loss = outputs[:4]
+        num_label = np.sum(np_num_label)
+        num_infer = np.sum(np_num_infer)
+        num_correct = np.sum(np_num_correct)
         precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
         rets = {
             "precision": precision,
@@ -241,13 +253,11 @@ def evaluate(exe,
         pyreader.start()
         while True:
             try:
-                np_labels, np_infers, np_lens = exe.run(program=program,
+                np_num_infer, np_num_label, np_num_correct = exe.run(program=program,
                                                         fetch_list=fetch_list)
-                label_num, infer_num, correct_num = chunk_eval(
-                    np_labels, np_infers, np_lens, tag_num, dev_count)
-                total_infer += infer_num
-                total_label += label_num
-                total_correct += correct_num
+                total_infer += np.sum(np_num_infer)
+                total_label += np.sum(np_num_label)
+                total_correct += np.sum(np_num_correct)
             except fluid.core.EOFException:
                 pyreader.reset()
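`calculate_f1` itself is not shown in this diff; a minimal sketch of what it plausibly computes from the three counters (micro-averaged precision, recall and F1), not the repo's exact code:

```python
# A sketch, assuming calculate_f1 derives micro-averaged chunk metrics
# from the num_label / num_infer / num_correct counters of chunk_eval.
def calculate_f1(num_label, num_infer, num_correct):
    precision = num_correct / num_infer if num_infer else 0.0
    recall = num_correct / num_label if num_label else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if num_correct else 0.0)
    return precision, recall, f1

print(calculate_f1(num_label=10, num_infer=8, num_correct=6))
# -> (0.75, 0.6, 0.666...)
```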
......
@@ -82,6 +82,7 @@ data_g.add_arg("doc_stride", int, 128,
                "When splitting up a long document into chunks, how much stride to take between chunks.")
 data_g.add_arg("n_best_size", int, 20,
                "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+data_g.add_arg("chunk_scheme", type=str, default="IOB", choices=["IO", "IOB", "IOE", "IOBES"], help="Chunk scheme for sequence labeling: IO, IOB, IOE or IOBES.")

 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
......
@@ -347,13 +347,23 @@ class SequenceLabelReader(BaseReader):
             if len(sub_token) == 0:
                 continue
             ret_tokens.extend(sub_token)
-            ret_labels.append(label)
-            if len(sub_token) < 2:
+            if len(sub_token) == 1:
+                ret_labels.append(label)
                 continue

-            sub_label = label
-            if label.startswith("B-"):
-                sub_label = "I-" + label[2:]
-            ret_labels.extend([sub_label] * (len(sub_token) - 1))
+            if label == "O" or label.startswith("I-"):
+                ret_labels.extend([label] * len(sub_token))
+            elif label.startswith("B-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
+            elif label.startswith("S-"):
+                b_label = "B-" + label[2:]
+                e_label = "E-" + label[2:]
+                i_label = "I-" + label[2:]
+                ret_labels.extend([b_label] + [i_label] * (len(sub_token) - 2) + [e_label])
+            elif label.startswith("E-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])

         assert len(ret_tokens) == len(ret_labels)
         return ret_tokens, ret_labels
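To see what the new branches do, here is the same expansion logic pulled out into a standalone function (a simplified sketch of the loop body above, using a hypothetical WordPiece split of "Washington"):

```python
def expand_label(label, n):
    """Spread one token-level tag over n sub-tokens (sketch of the diff's logic)."""
    if n == 1:
        return [label]
    if label == "O" or label.startswith("I-"):
        return [label] * n
    if label.startswith("B-"):       # begin stays first, the rest become inside
        return [label] + ["I-" + label[2:]] * (n - 1)
    if label.startswith("S-"):       # single becomes begin ... inside ... end
        t = label[2:]
        return ["B-" + t] + ["I-" + t] * (n - 2) + ["E-" + t]
    if label.startswith("E-"):       # end stays last, the rest become inside
        return ["I-" + label[2:]] * (n - 1) + [label]
    raise ValueError("unexpected label: " + label)  # the diff has no fallback; added for safety

# "Washington" -> ["Wash", "##ing", "##ton"] (hypothetical subword split)
assert expand_label("S-LOC", 3) == ["B-LOC", "I-LOC", "E-LOC"]
assert expand_label("B-LOC", 3) == ["B-LOC", "I-LOC", "I-LOC"]
assert expand_label("E-LOC", 3) == ["I-LOC", "I-LOC", "E-LOC"]
```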
@@ -451,6 +461,15 @@ class MRCReader(BaseReader):
         self.current_epoch = 0
         self.num_examples = 0

+        self.Example = namedtuple('Example',
+            ['qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
+             'start_position', 'end_position'])
+        self.Feature = namedtuple("Feature", ["unique_id", "example_index", "doc_span_index",
+            "tokens", "token_to_orig_map", "token_is_max_context",
+            "token_ids", "position_ids", "text_type_ids",
+            "start_position", "end_position"])
+        self.DocSpan = namedtuple("DocSpan", ["start", "length"])

     def _read_json(self, input_file, is_training):
         examples = []
         with open(input_file, "r") as f:
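The `Example`/`Feature`/`DocSpan` classes are hoisted into `__init__` here, presumably so each is built once per reader rather than rebuilt on every call: `namedtuple()` creates a brand-new class object each time it runs. A small demonstration of that behavior:

```python
from collections import namedtuple

# Calling namedtuple() twice yields two distinct class objects, so
# repeatedly defining it inside a method pays the class-building cost
# on every call and produces instances of different types.
A = namedtuple("DocSpan", ["start", "length"])
B = namedtuple("DocSpan", ["start", "length"])

assert A is not B                          # distinct classes, same name
assert A(0, 4) == B(0, 4)                  # still compare equal as tuples
assert type(A(0, 4)) is not type(B(0, 4))  # but are not the same type
```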
@@ -495,12 +514,7 @@ class MRCReader(BaseReader):
                 doc_tokens = tokenization.tokenize_chinese_chars(
                     paragraph_text)

-                Example = namedtuple('Example', [
-                    'qas_id', 'question_text', 'doc_tokens',
-                    'orig_answer_text', 'start_position', 'end_position'
-                ])
-                example = Example(
+                example = self.Example(
                     qas_id=qas_id,
                     question_text=question_text,
                     doc_tokens=doc_tokens,
@@ -544,11 +558,6 @@ class MRCReader(BaseReader):
     def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
                                     is_training):
-        Feature = namedtuple("Feature", [
-            "unique_id", "example_index", "doc_span_index", "tokens",
-            "token_to_orig_map", "token_is_max_context", "token_ids",
-            "position_ids", "text_type_ids", "start_position", "end_position"
-        ])
         features = []
         unique_id = 1000000000
@@ -581,14 +590,13 @@ class MRCReader(BaseReader):
                     tokenizer, example.orig_answer_text)

             max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-            _DocSpan = namedtuple("DocSpan", ["start", "length"])
             doc_spans = []
             start_offset = 0
             while start_offset < len(all_doc_tokens):
                 length = len(all_doc_tokens) - start_offset
                 if length > max_tokens_for_doc:
                     length = max_tokens_for_doc
-                doc_spans.append(_DocSpan(start=start_offset, length=length))
+                doc_spans.append(self.DocSpan(start=start_offset, length=length))
                 if start_offset + length == len(all_doc_tokens):
                     break
                 start_offset += min(length, self.doc_stride)
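For reference, here is the sliding-window loop above run standalone on toy sizes (hypothetical numbers, just to show how `doc_stride` produces overlapping spans):

```python
from collections import namedtuple

DocSpan = namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    # Same loop as in the diff: fixed-size windows that slide by doc_stride
    # and stop once the final window reaches the end of the document.
    doc_spans, start_offset = [], 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

# 10 tokens, windows of 4, stride 3 -> overlapping spans:
assert make_doc_spans(10, 4, 3) == [DocSpan(0, 4), DocSpan(3, 4), DocSpan(6, 4)]
```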
@@ -638,7 +646,7 @@ class MRCReader(BaseReader):
                 start_position = tok_start_position - doc_start + doc_offset
                 end_position = tok_end_position - doc_start + doc_offset

-            feature = Feature(
+            feature = self.Feature(
                 unique_id=unique_id,
                 example_index=example_index,
                 doc_span_index=doc_span_index,
......
@@ -109,12 +109,14 @@ def main(args):
             scheduler=args.lr_scheduler,
             use_fp16=args.use_fp16)

+        """
         fluid.memory_optimize(
             input_program=train_program,
             skip_opt_set=[
                 graph_vars["loss"].name, graph_vars["labels"].name,
                 graph_vars["infers"].name, graph_vars["seq_lens"].name
             ])
+        """

     if args.verbose:
         if args.in_tokens:
......
@@ -12,6 +12,7 @@ python -u run_sequence_labeling.py \
     --batch_size 16 \
     --init_pretraining_params ${MODEL_PATH}/params \
     --num_labels 7 \
+    --chunk_scheme "IOB" \
     --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
     --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
     --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
......
@@ -11,6 +11,7 @@ python -u run_sequence_labeling.py \
     --batch_size 16 \
     --init_pretraining_params ${MODEL_PATH}/params \
     --num_labels 7 \
+    --chunk_scheme "IOB" \
     --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
     --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
     --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
......
@@ -395,11 +395,16 @@ def tokenize_chinese_chars(text):
         return False

+    def _is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+            return True
+        return False
+
     output = []
     buff = ""
     for char in text:
         cp = ord(char)
-        if _is_chinese_char(cp):
+        if _is_chinese_char(cp) or _is_whitespace(char):
             if buff != "":
                 output.append(buff)
                 buff = ""
......
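This last hunk is the "fix en mrc" part: `tokenize_chinese_chars` previously only split around CJK characters, so a purely English passage came back as one giant token and MRC answer spans could not be aligned. A sketch of the behavior after the change (assuming the elided tail of the loop appends each Chinese character as its own token and that whitespace acts only as a separator):

```python
def split_mixed_text(text):
    # Sketch of the patched tokenize_chinese_chars: flush the buffer on
    # Chinese characters AND whitespace; keep each Chinese char as a token.
    def _is_chinese_char(cp):
        return 0x4E00 <= cp <= 0x9FFF  # simplified: CJK Unified Ideographs only

    def _is_whitespace(c):
        return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

    output, buff = [], ""
    for char in text:
        if _is_chinese_char(ord(char)) or _is_whitespace(char):
            if buff:
                output.append(buff)
                buff = ""
            if not _is_whitespace(char):  # assumed: whitespace itself is dropped
                output.append(char)
        else:
            buff += char
    if buff:
        output.append(buff)
    return output

# Before the fix, "Obama was born in" would stay one token; now:
assert split_mixed_text("Obama was born in 夏威夷") == \
    ["Obama", "was", "born", "in", "夏", "威", "夷"]
```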