sequence_label.py 8.5 KB
Newer Older
T
tianxin04 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import argparse
import numpy as np
import multiprocessing

import paddle
import paddle.fluid as fluid

C
cclauss 已提交
28 29
from six.moves import xrange

T
tianxin04 已提交
30 31
from model.ernie import ErnieModel

Y
Yibing Liu 已提交
32 33

def create_model(args, pyreader_name, ernie_config, is_prediction=False):
    """Build the ERNIE token-classification graph and its input reader.

    Args:
        args: Options object; this function reads ``max_seq_len``,
            ``num_labels``, ``use_fp16`` and ``loss_scaling`` from it.
        pyreader_name: Name given to the created ``py_reader``.
        ernie_config: Configuration forwarded to ``ErnieModel``.
        is_prediction: Accepted for interface compatibility; it is not read
            anywhere in this function.

    Returns:
        A ``(pyreader, graph_vars)`` pair. ``graph_vars`` maps ``"loss"``,
        ``"probs"``, ``"labels"``, ``"infers"`` and ``"seq_lens"`` to graph
        variables, each of which is marked persistable.
    """
    # Six per-token inputs share one shape; the seventh slot is one value per
    # example. The slot order must match the unpacking of read_file() below.
    token_shape = [-1, args.max_seq_len, 1]
    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=[list(token_shape) for _ in range(6)] + [[-1, 1]],
        dtypes=[
            'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'
        ],
        lod_levels=[0] * 7,
        name=pyreader_name,
        use_double_buffer=True)

    (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels,
     seq_lens) = fluid.layers.read_file(pyreader)

    ernie = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        task_ids=task_ids,
        input_mask=input_mask,
        config=ernie_config,
        use_fp16=args.use_fp16)

    # NOTE: layers are created in the same order as before so that any
    # automatically generated variable names stay unchanged.
    enc_out = ernie.get_sequence_output()
    enc_out = fluid.layers.dropout(
        x=enc_out, dropout_prob=0.1, dropout_implementation="upscale_in_train")

    weight_attr = fluid.ParamAttr(
        name="cls_seq_label_out_w",
        initializer=fluid.initializer.TruncatedNormal(scale=0.02))
    bias_attr = fluid.ParamAttr(
        name="cls_seq_label_out_b",
        initializer=fluid.initializer.Constant(0.))
    logits = fluid.layers.fc(
        input=enc_out,
        size=args.num_labels,
        num_flatten_dims=2,
        param_attr=weight_attr,
        bias_attr=bias_attr)

    # Column-vector views of the gold tags and of the argmax predictions,
    # exposed for evaluation.
    ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
    best_tags = fluid.layers.argmax(logits, axis=2)
    ret_infers = fluid.layers.reshape(x=best_tags, shape=[-1, 1])

    labels = fluid.layers.flatten(labels, axis=2)
    flat_logits = fluid.layers.flatten(logits, axis=2)
    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=flat_logits, label=labels, return_softmax=True)

    # Multiply the per-position loss by the input mask before averaging.
    input_mask = fluid.layers.flatten(input_mask, axis=2)
    ce_loss = ce_loss * input_mask
    loss = fluid.layers.mean(x=ce_loss)

    if args.use_fp16 and args.loss_scaling > 1.0:
        loss *= args.loss_scaling

    graph_vars = {
        "loss": loss,
        "probs": probs,
        "labels": ret_labels,
        "infers": ret_infers,
        "seq_lens": seq_lens
    }

    for var in graph_vars.values():
        var.persistable = True

    return pyreader, graph_vars


Y
Yibing Liu 已提交
104
def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
T
tianxin04 已提交
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
    def extract_bio_chunk(seq):
        chunks = []
        cur_chunk = None
        null_index = tag_num - 1
        for index in xrange(len(seq)):
            tag = seq[index]
            tag_type = tag // 2
            tag_pos = tag % 2

            if tag == null_index:
                if cur_chunk is not None:
                    chunks.append(cur_chunk)
                    cur_chunk = None
                continue

            if tag_pos == 0:
                if cur_chunk is not None:
                    chunks.append(cur_chunk)
                    cur_chunk = {}
Y
Yibing Liu 已提交
124
                cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
T
tianxin04 已提交
125 126 127

            else:
                if cur_chunk is None:
Y
Yibing Liu 已提交
128
                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
T
tianxin04 已提交
129 130 131
                    continue

                if cur_chunk["type"] == tag_type:
Y
Yibing Liu 已提交
132
                    cur_chunk["en"] = index + 1
T
tianxin04 已提交
133 134
                else:
                    chunks.append(cur_chunk)
Y
Yibing Liu 已提交
135
                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
T
tianxin04 已提交
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165

        if cur_chunk is not None:
            chunks.append(cur_chunk)
        return chunks

    null_index = tag_num - 1
    num_label = 0
    num_infer = 0
    num_correct = 0
    labels = np_labels.reshape([-1]).astype(np.int32).tolist()
    infers = np_infers.reshape([-1]).astype(np.int32).tolist()
    all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist()

    base_index = 0
    for dev_index in xrange(dev_count):
        lens = all_lens[dev_index]
        max_len = 0
        for l in lens:
            max_len = max(max_len, l)

        for i in xrange(len(lens)):
            seq_st = base_index + i * max_len + 1
            seq_en = seq_st + (lens[i] - 2)
            infer_chunks = extract_bio_chunk(infers[seq_st:seq_en])
            label_chunks = extract_bio_chunk(labels[seq_st:seq_en])
            num_infer += len(infer_chunks)
            num_label += len(label_chunks)

            infer_index = 0
            label_index = 0
Y
Yibing Liu 已提交
166 167 168 169
            while label_index < len(label_chunks) \
                   and infer_index < len(infer_chunks):
                if infer_chunks[infer_index]["st"] \
                    < label_chunks[label_index]["st"]:
T
tianxin04 已提交
170
                    infer_index += 1
Y
Yibing Liu 已提交
171 172
                elif infer_chunks[infer_index]["st"] \
                    > label_chunks[label_index]["st"]:
T
tianxin04 已提交
173 174
                    label_index += 1
                else:
Y
Yibing Liu 已提交
175 176 177 178
                    if infer_chunks[infer_index]["en"] \
                        == label_chunks[label_index]["en"] \
                        and infer_chunks[infer_index]["type"] \
                        == label_chunks[label_index]["type"]:
T
tianxin04 已提交
179 180 181 182 183 184 185 186 187
                        num_correct += 1

                    infer_index += 1
                    label_index += 1

        base_index += max_len * len(lens)

    return num_label, num_infer, num_correct

Y
Yibing Liu 已提交
188

T
tianxin04 已提交
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
def calculate_f1(num_label, num_infer, num_correct):
    """Turn chunk counts into ``(precision, recall, f1)``.

    Args:
        num_label: Number of gold chunks.
        num_infer: Number of predicted chunks.
        num_correct: Number of predicted chunks that match a gold chunk.

    Returns:
        A ``(precision, recall, f1)`` tuple. Precision is 0.0 when there are
        no predictions, recall is 0.0 when there are no gold chunks, and F1
        is 0.0 when nothing was predicted correctly.
    """
    precision = num_correct * 1.0 / num_infer if num_infer != 0 else 0.0
    recall = num_correct * 1.0 / num_label if num_label != 0 else 0.0

    # With no correct chunk both ratios are zero, so the harmonic mean is
    # fixed at 0.0 instead of dividing by zero.
    if num_correct == 0:
        return precision, recall, 0.0

    harmonic = 2 * precision * recall / (precision + recall)
    return precision, recall, harmonic

Y
Yibing Liu 已提交
206 207 208 209 210 211 212 213 214 215 216 217

def evaluate(exe,
             program,
             pyreader,
             graph_vars,
             tag_num,
             eval_phase,
             dev_count=1):
    """Compute chunk-level precision/recall/F1 for one step or a whole pass.

    Args:
        exe: Executor used to run the graph.
        program: Program passed to ``exe.run`` for non-train phases.
        pyreader: Reader that is started, drained and reset in non-train
            phases.
        graph_vars: Dict holding the ``"labels"``, ``"infers"``,
            ``"seq_lens"`` and ``"loss"`` variables, and optionally
            ``"learning_rate"``.
        tag_num: Number of tag ids, forwarded to ``chunk_eval``.
        eval_phase: ``"train"`` evaluates a single step and returns the
            metrics; any other value drains ``pyreader`` and prints them.
        dev_count: Number of devices, forwarded to ``chunk_eval``.

    Returns:
        For ``eval_phase == "train"``, a dict with ``"precision"``,
        ``"recall"``, ``"f1"``, ``"loss"`` and, when a learning rate
        variable is present, ``"lr"``. Otherwise ``None``; the metrics are
        printed instead.
    """
    fetch_list = [
        graph_vars[key].name for key in ("labels", "infers", "seq_lens")
    ]

    if eval_phase == "train":
        with_lr = "learning_rate" in graph_vars
        fetch_list.append(graph_vars["loss"].name)
        if with_lr:
            fetch_list.append(graph_vars["learning_rate"].name)

        # No program is passed here, so the executor runs its default one.
        outputs = exe.run(fetch_list=fetch_list)
        np_labels, np_infers, np_lens, np_loss = outputs[:4]
        counts = chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count)
        precision, recall, f1 = calculate_f1(*counts)

        rets = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "loss": np.mean(np_loss)
        }
        if with_lr:
            # The learning rate, when fetched, is the fifth output.
            rets["lr"] = float(outputs[4][0])
        return rets

    total_label, total_infer, total_correct = 0.0, 0.0, 0.0
    time_begin = time.time()
    pyreader.start()
    while True:
        try:
            np_labels, np_infers, np_lens = exe.run(program=program,
                                                    fetch_list=fetch_list)
            label_num, infer_num, correct_num = chunk_eval(
                np_labels, np_infers, np_lens, tag_num, dev_count)
            total_infer += infer_num
            total_label += label_num
            total_correct += correct_num
        except fluid.core.EOFException:
            # The reader is exhausted: reset it and stop looping.
            pyreader.reset()
            break

    precision, recall, f1 = calculate_f1(total_label, total_infer,
                                         total_correct)
    elapsed = time.time() - time_begin

    print(
        "[%s evaluation] f1: %f, precision: %f, recall: %f, elapsed time: %f s"
        % (eval_phase, f1, precision, recall, elapsed))