From 445310c03df3f27f7fae2384cd81dc3ce80d5881 Mon Sep 17 00:00:00 2001
From: Zeyu Chen
Date: Thu, 4 Apr 2019 14:43:40 +0800
Subject: [PATCH] add evaluation code

---
 paddlehub/finetune/evaluate.py | 230 +++++++++++++++++++++++++++++++
 1 file changed, 230 insertions(+)
 create mode 100644 paddlehub/finetune/evaluate.py

diff --git a/paddlehub/finetune/evaluate.py b/paddlehub/finetune/evaluate.py
new file mode 100644
index 00000000..31f16f30
--- /dev/null
+++ b/paddlehub/finetune/evaluate.py
@@ -0,0 +1,230 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import multiprocessing
+import os
+import time
+
+import numpy as np
+import paddle.fluid as fluid
+
+from paddlehub.common.logger import logger
+
+
+def _get_running_device_info(config):
+    # Helper assumed by the evaluation functions below: the standard
+    # PaddlePaddle resolution of execution place and device count.
+    if config.use_cuda:
+        place = fluid.CUDAPlace(0)
+        dev_count = fluid.core.get_cuda_device_count()
+    else:
+        place = fluid.CPUPlace()
+        dev_count = int(
+            os.environ.get("CPU_NUM", multiprocessing.cpu_count()))
+    return place, dev_count
+
+
+def evaluate_cls_task(task, data_reader, feed_list, phase="test", config=None):
+    logger.info("Evaluation on {} dataset start".format(phase))
+    inference_program = task.inference_program()
+    loss = task.variable("loss")
+    accuracy = task.variable("accuracy")
+    batch_size = config.batch_size
+    place, dev_count = _get_running_device_info(config)
+    exe = fluid.Executor(place=place)
+    with fluid.program_guard(inference_program):
+        data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
+        num_eval_examples = acc_sum = loss_sum = 0
+        test_reader = data_reader.data_generator(
+            batch_size=batch_size, phase=phase)
+        eval_time_begin = time.time()
+        eval_step = 0
+        for batch in test_reader():
+            num_batch_examples = len(batch)
+            eval_step += 1
+            loss_v, accuracy_v = exe.run(
+                feed=data_feeder.feed(batch),
+                fetch_list=[loss.name, accuracy.name])
+            # Weight each batch by its size so the final averages stay
+            # exact even when the last batch is smaller.
+            num_eval_examples += num_batch_examples
+            acc_sum += accuracy_v * num_batch_examples
+            loss_sum += loss_v * num_batch_examples
+        eval_time_used = time.time() - eval_time_begin
+
+    avg_loss = loss_sum / num_eval_examples
+    avg_acc = acc_sum / num_eval_examples
+    eval_speed = eval_step / eval_time_used
+    logger.info(
+        "[%s dataset evaluation result] loss=%.5f acc=%.5f [step/sec: %.2f]" %
+        (phase, avg_loss, avg_acc, eval_speed))
+
+    return avg_loss, avg_acc, eval_speed
+
+
+def evaluate_seq_labeling_task(task,
+                               data_reader,
+                               feed_list,
+                               phase="test",
+                               config=None):
+    fetch_list = [
+        task.variable("labels").name,
+        task.variable("infers").name,
+        task.variable("seq_len").name,
+        task.variable("loss").name
+    ]
+    logger.info("Evaluation on {} dataset start".format(phase))
+    inference_program = task.inference_program()
+    batch_size = config.batch_size
+    place, dev_count = _get_running_device_info(config)
+    exe = fluid.Executor(place=place)
+    num_labels = len(data_reader.get_labels())
+    with fluid.program_guard(inference_program):
+        data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
+        test_reader = data_reader.data_generator(
+            batch_size=batch_size, phase=phase)
+        eval_time_begin = time.time()
+        eval_step = 0
+        total_label, total_infer, total_correct = 0.0, 0.0, 0.0
+        for batch in test_reader():
+            eval_step += 1
+            np_labels, np_infers, np_lens, _ = exe.run(
+                feed=data_feeder.feed(batch),
+                fetch_list=fetch_list)
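+            # Convert the fetched token-level labels/predictions into
+            # chunk-level counts so F1 is measured over whole entities.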
+            label_num, infer_num, correct_num = chunk_eval(
+                np_labels, np_infers, np_lens, num_labels, dev_count)
+            total_infer += infer_num
+            total_label += label_num
+            total_correct += correct_num
+
+    precision, recall, f1 = calculate_f1(total_label, total_infer,
+                                         total_correct)
+    eval_time_used = time.time() - eval_time_begin
+    eval_speed = eval_step / eval_time_used
+    logger.info(
+        "[%s evaluation] F1-Score=%f, precision=%f, recall=%f [step/sec: %.2f]"
+        % (phase, f1, precision, recall, eval_speed))
+
+
+# Sequence labeling evaluation helpers
+def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
+    def extract_bio_chunk(seq):
+        # Decode a BIO tag id sequence into chunks. Tag ids are laid out
+        # as [B-0, I-0, B-1, I-1, ..., O]: tag // 2 is the chunk type,
+        # tag % 2 distinguishes B (0) from I (1), and the last id
+        # (tag_num - 1) is the "O" (outside) tag.
+        chunks = []
+        cur_chunk = None
+        null_index = tag_num - 1
+        for index in range(len(seq)):
+            tag = seq[index]
+            tag_type = tag // 2
+            tag_pos = tag % 2
+
+            if tag == null_index:
+                # An "O" tag closes any open chunk.
+                if cur_chunk is not None:
+                    chunks.append(cur_chunk)
+                    cur_chunk = None
+                continue
+
+            if tag_pos == 0:
+                # A "B-" tag always starts a new chunk.
+                if cur_chunk is not None:
+                    chunks.append(cur_chunk)
+                cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+            else:
+                # An "I-" tag extends the current chunk when the type
+                # matches; otherwise it implicitly starts a new one.
+                if cur_chunk is None:
+                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+                    continue
+
+                if cur_chunk["type"] == tag_type:
+                    cur_chunk["en"] = index + 1
+                else:
+                    chunks.append(cur_chunk)
+                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+
+        if cur_chunk is not None:
+            chunks.append(cur_chunk)
+        return chunks
+
+    num_label = 0
+    num_infer = 0
+    num_correct = 0
+    labels = np_labels.reshape([-1]).astype(np.int32).tolist()
+    infers = np_infers.reshape([-1]).astype(np.int32).tolist()
+    all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist()
+
+    base_index = 0
+    for dev_index in range(dev_count):
+        lens = all_lens[dev_index]
+        max_len = max(lens) if lens else 0
+
+        for i in range(len(lens)):
+            # Skip the [CLS]/[SEP] positions at both ends of the sequence.
+            seq_st = base_index + i * max_len + 1
+            seq_en = seq_st + (lens[i] - 2)
+            infer_chunks = extract_bio_chunk(infers[seq_st:seq_en])
+            label_chunks = extract_bio_chunk(labels[seq_st:seq_en])
+            num_infer += len(infer_chunks)
+            num_label += len(label_chunks)
+
+            # Merge-walk both chunk lists; a prediction is correct only
+            # when its span and type both match a gold chunk.
+            infer_index = 0
+            label_index = 0
+            while label_index < len(label_chunks) \
+                    and infer_index < len(infer_chunks):
+                if infer_chunks[infer_index]["st"] \
+                        < label_chunks[label_index]["st"]:
+                    infer_index += 1
+                elif infer_chunks[infer_index]["st"] \
+                        > label_chunks[label_index]["st"]:
+                    label_index += 1
+                else:
+                    if infer_chunks[infer_index]["en"] \
+                            == label_chunks[label_index]["en"] \
+                            and infer_chunks[infer_index]["type"] \
+                            == label_chunks[label_index]["type"]:
+                        num_correct += 1
+
+                    infer_index += 1
+                    label_index += 1
+
+        base_index += max_len * len(lens)
+
+    return num_label, num_infer, num_correct
+
+
+def calculate_f1(num_label, num_infer, num_correct):
+    # Guard against zero denominators when nothing is predicted or labeled.
+    if num_infer == 0:
+        precision = 0.0
+    else:
+        precision = num_correct * 1.0 / num_infer
+
+    if num_label == 0:
+        recall = 0.0
+    else:
+        recall = num_correct * 1.0 / num_label
+
+    if num_correct == 0:
+        f1 = 0.0
+    else:
+        f1 = 2 * precision * recall / (precision + recall)
+    return precision, recall, f1
--
GitLab
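
The helpers above reduce to precision = num_correct / num_infer, recall = num_correct / num_label, and F1 = 2 * P * R / (P + R), computed over chunks rather than tokens. A minimal, runnable sketch of how chunk_eval and calculate_f1 compose once the patch is applied; the tag scheme is an assumption for illustration (tag_num = 5 mapping to B-PER=0, I-PER=1, B-LOC=2, I-LOC=3, O=4), and the toy sequences are padded with O at both ends because chunk_eval skips the first and last position of every sequence:

    import numpy as np

    from paddlehub.finetune.evaluate import calculate_f1, chunk_eval

    # Assumed toy scheme: tag_num=5 -> B-PER=0, I-PER=1, B-LOC=2, I-LOC=3, O=4.
    np_labels = np.array([4, 0, 1, 4, 2, 4])  # gold: one PER chunk, one LOC chunk
    np_infers = np.array([4, 0, 1, 4, 0, 4])  # PER predicted correctly, LOC mistyped
    np_lens = np.array([6])                   # a single sequence of 6 tokens

    num_label, num_infer, num_correct = chunk_eval(
        np_labels, np_infers, np_lens, tag_num=5, dev_count=1)
    print(num_label, num_infer, num_correct)  # -> 2 2 1

    precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
    print("P=%.2f R=%.2f F1=%.2f" % (precision, recall, f1))  # -> 0.50 0.50 0.50

Both predicted chunks have the right spans, but the second has the wrong type, so only one of the two counts as correct, giving 0.5 for all three metrics.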