diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3d0d996464910cf51dfe6e206e93f26fc63cd2b --- /dev/null +++ b/paddle/operators/chunk_eval_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/chunk_eval_op.h" + +namespace paddle { +namespace operators { + +class ChunkEvalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Inference"), + "Input(Inference) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Precision"), + "Output(Precision) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Recall"), + "Output(Recall) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("F1-Score"), + "Output(F1-Score) of ChunkEvalOp should not be null."); + + auto inference_dim = ctx->GetInputDim("Inference"); + auto label_dim = ctx->GetInputDim("Label"); + + PADDLE_ENFORCE(inference_dim == label_dim, + "Inference's shape must be the same as Label's shape."); + + ctx->SetOutputDim("Precision", {1}); + ctx->SetOutputDim("Recall", {1}); + ctx->SetOutputDim("F1-Score", {1}); + } + + protected: + framework::DataType IndicateDataType( + const framework::ExecutionContext &ctx) const override { + return framework::DataType::FP32; + } +}; + +class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ChunkEvalOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Inference", + "(Tensor, default: Tensor). Predictions from the network."); + AddInput("Label", + "(Tensor, default: Tensor). The true tag sequences."); + AddOutput("Precision", + "(float). The evaluated precision (called positive predictive " + "value) of chunks on the given mini-batch."); + AddOutput("Recall", + "(float). The evaluated recall (true positive rate or " + "sensitivity) of chunks on the given mini-batch."); + AddOutput("F1-Score", + "(float). The evaluated F1-Score on the given mini-batch."); + AddAttr("num_chunk_types", + "(int). The number of chunk type. See below for details."); + AddAttr( + "chunk_scheme", + "(string, default IOB). The labeling scheme indicating " + "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below " + "for details.") + .SetDefault("IOB"); + AddAttr>("excluded_chunk_types", + "(list) A list including chunk type ids " + "indicating chunk types that are not counted. " + "See below for details.") + .SetDefault(std::vector{}); + AddComment(R"DOC( +For some basics of chunking, please refer to +‘Chunking with Support Vector Mechines ’. + + +CheckEvalOp computes the precision, recall, and F1-score of chunk detection, +and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. +Here is a NER example of labeling for these tagging schemes: + + Li Ming works at Agricultural Bank of China in Beijing. + IO: I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC + IOB: B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC + IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC + IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC + +There are three chunk types(named entity types) including PER(person), ORG(orgnazation) +and LOC(LOCATION), and we can see that the labels have the form -. + +Since the calculations actually use label ids rather than labels, extra attention +should be paid when mapping labels to ids to make CheckEvalOp work. The key point +is that the listed equations are satisfied by ids. + + tag_type = label % num_tag_type + chunk_type = label / num_tag_type + +where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type` +is the num of chunk types, and `tag_type` get its value from the following table. + + Scheme Begin Inside End Single + plain 0 - - - + IOB 0 1 - - + IOE - 0 1 - + IOBES 0 1 2 3 + +Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG, +PER and LOC. To satisfy the above equations, the label map can be like this: + + B-ORG 0 + I-ORG 1 + B-PER 2 + I-PER 3 + B-LOC 4 + I-LOC 5 + O 6 + +It’s not hard to verify the equations noting that the num of chunk types +is 3 and the num of tag types in IOB scheme is 2. For example, the label +id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of +I-LOC is 2, which consistent with the results from the equations. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp, + ops::ChunkEvalOpMaker); +REGISTER_OP_CPU_KERNEL(chunk_eval, + ops::ChunkEvalKernel); diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h new file mode 100644 index 0000000000000000000000000000000000000000..81aa07817b673b2ff85a35a51cc43742b7ad7fed --- /dev/null +++ b/paddle/operators/chunk_eval_op.h @@ -0,0 +1,219 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class ChunkEvalKernel : public framework::OpKernel { + public: + struct Segment { + int begin; + int end; + int type; + bool operator==(const Segment& y) const { + return begin == y.begin && end == y.end && type == y.type; + } + }; + + void GetSegments(const int* label, int length, std::vector& segments, + int num_chunk_types, int num_tag_types, int other_chunk_type, + int tag_begin, int tag_inside, int tag_end, + int tag_single) const { + segments.clear(); + segments.reserve(length); + int chunk_start = 0; + bool in_chunk = false; + int tag = -1; + int type = other_chunk_type; + for (int i = 0; i < length; ++i) { + int prev_tag = tag; + int prev_type = type; + PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types); + tag = label[i] % num_tag_types; + type = label[i] / num_tag_types; + if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + Segment segment{ + chunk_start, // begin + i - 1, // end + prev_type, + }; + segments.push_back(segment); + in_chunk = false; + } + if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + chunk_start = i; + in_chunk = true; + } + } + if (in_chunk) { + Segment segment{ + chunk_start, // begin + length - 1, // end + type, + }; + segments.push_back(segment); + } + } + + bool ChunkEnd(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return false; + if (type == other_chunk_type) return true; + if (type != prev_type) return true; + if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single; + if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single; + if (prev_tag == tag_end) return true; + if (prev_tag == tag_single) return true; + return false; + } + + bool ChunkBegin(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return type != other_chunk_type; + if (type == other_chunk_type) return false; + if (type != prev_type) return true; + if (tag == tag_begin) return true; + if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_single) return true; + return false; + } + + void Compute(const framework::ExecutionContext& context) const override { + // initialize to parse configurations + int num_chunk_types, num_tag_types; + int other_chunk_type; + int tag_begin, tag_inside, tag_end, tag_single; + std::vector label_segments; + std::vector output_segments; + std::set excluded_chunk_types; + int64_t num_output_segments = 0; + int64_t num_label_segments = 0; + int64_t num_correct = 0; + if (context.Attr("chunk_scheme") == "IOB") { + num_tag_types = 2; + tag_begin = 0; + tag_inside = 1; + tag_end = -1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOE") { + num_tag_types = 2; + tag_begin = -1; + tag_inside = 0; + tag_end = 1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOBES") { + num_tag_types = 4; + tag_begin = 0; + tag_inside = 1; + tag_end = 2; + tag_single = 3; + } else if (context.Attr("chunk_scheme") == "plain") { + num_tag_types = 1; + tag_begin = -1; + tag_inside = -1; + tag_end = -1; + tag_single = -1; + } else { + PADDLE_THROW("Unknown chunk scheme."); + } + other_chunk_type = num_chunk_types = context.Attr("num_chunk_types"); + excluded_chunk_types.insert( + context.Attr>("excluded_chunk_types").begin(), + context.Attr>("excluded_chunk_types").end()); + + auto* inference = context.Input("Inference"); + auto* label = context.Input("Label"); + auto* precision = context.Output("Precision"); + auto* recall = context.Output("Recall"); + auto* f1 = context.Output("F1-Score"); + + const int* inference_data = inference->data(); + const int* label_data = label->data(); + T* precision_data = precision->mutable_data(context.GetPlace()); + T* racall_data = recall->mutable_data(context.GetPlace()); + T* f1_data = f1->mutable_data(context.GetPlace()); + + auto lod = label->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE(lod == inference->lod(), + "LoD must be same between Inference and Label."); + int num_sequences = lod[0].size() - 1; + for (int i = 0; i < num_sequences; ++i) { + int seq_length = lod[0][i + 1] - lod[0][i]; + EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length, + output_segments, label_segments, num_output_segments, + num_label_segments, num_correct, num_chunk_types, + num_tag_types, other_chunk_type, tag_begin, tag_inside, + tag_end, tag_single, excluded_chunk_types); + } + *precision_data = !num_output_segments ? 0 : static_cast(num_correct) / + num_output_segments; + *racall_data = !num_label_segments ? 0 : static_cast(num_correct) / + num_label_segments; + *f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*racall_data) / + ((*precision_data) + (*racall_data)); + } + + void EvalOneSeq(const int* output, const int* label, int length, + std::vector& output_segments, + std::vector& label_segments, + int64_t& num_output_segments, int64_t& num_label_segments, + int64_t& num_correct, int num_chunk_types, int num_tag_types, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single, + const std::set& excluded_chunk_types) const { + GetSegments(output, length, output_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + GetSegments(label, length, label_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + size_t i = 0, j = 0; + while (i < output_segments.size() && j < label_segments.size()) { + if (output_segments[i] == label_segments[j] && + excluded_chunk_types.count(output_segments[i].type) != 1) { + ++num_correct; + } + if (output_segments[i].end < label_segments[j].end) { + ++i; + } else if (output_segments[i].end > label_segments[j].end) { + ++j; + } else { + ++i; + ++j; + } + } + for (auto& segment : label_segments) { + if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments; + } + for (auto& segment : output_segments) { + if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/framework/tests/test_chunk_eval_op.py b/python/paddle/v2/framework/tests/test_chunk_eval_op.py new file mode 100644 index 0000000000000000000000000000000000000000..48673296a67716c4de804da533f0fd2567f10e2e --- /dev/null +++ b/python/paddle/v2/framework/tests/test_chunk_eval_op.py @@ -0,0 +1,179 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class Segment(object): + def __init__(self, chunk_type, start_idx, end_idx): + self.chunk_type = chunk_type + self.start_idx = start_idx + self.end_idx = end_idx + + def __str__(self): + return '(Segment: %s, %s, %s)' % (self.chunk_type, self.start_idx, + self.end_idx) + + __repr__ = __str__ + + +class TestChunkEvalOp(OpTest): + num_sequences = 5 + batch_size = 50 + + def parse_scheme(self): + if self.scheme == 'IOB': + self.num_tag_types = 2 + elif self.scheme == 'IOE': + self.num_tag_types = 2 + + def fill_with_chunks(self, data, chunks): + for chunk in chunks: + if self.scheme == 'IOB': + data[chunk.start_idx] = chunk.chunk_type * self.num_tag_types + data[chunk.start_idx + 1: + chunk.end_idx] = chunk.chunk_type * self.num_tag_types + ( + self.num_tag_types - 1) + data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + ( + self.num_tag_types - 1 + ) if chunk.start_idx < chunk.end_idx else data[chunk.start_idx] + elif self.scheme == 'IOE': + data[chunk.start_idx: + chunk.end_idx] = chunk.chunk_type * self.num_tag_types + data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + ( + self.num_tag_types - 1) + + def rand_chunks(self, starts, num_chunks): + if num_chunks < 0: + num_chunks = np.random.randint(starts[-1]) + chunks = [] + # generate chunk beginnings + chunk_begins = sorted( + np.random.choice( + range(starts[-1]), num_chunks, replace=False)) + seq_chunk_begins = [] + begin_idx = 0 + # divide chunks into sequences + for i in range(len(starts) - 1): + tmp_chunk_begins = [] + while begin_idx < len(chunk_begins) and chunk_begins[ + begin_idx] < starts[i + 1]: + tmp_chunk_begins.append(chunk_begins[begin_idx]) + begin_idx += 1 + seq_chunk_begins.append(tmp_chunk_begins) + # generate chunk ends + chunk_ends = [] + for i in range(len(seq_chunk_begins)): + for j in range(len(seq_chunk_begins[i])): + low = seq_chunk_begins[i][j] + high = seq_chunk_begins[i][j + 1] if j < len(seq_chunk_begins[ + i]) - 1 else starts[i + 1] + chunk_ends.append(np.random.randint(low, high)) + # generate chunks + for chunk_pos in zip(chunk_begins, chunk_ends): + chunk_type = np.random.randint(self.num_chunk_types) + chunks.append(Segment(chunk_type, *chunk_pos)) + return chunks + + def gen_chunks(self, infer, label, starts): + chunks = self.rand_chunks(starts, + self.num_infer_chunks + self.num_label_chunks + - self.num_correct_chunks) + correct_chunks = np.random.choice( + range(len(chunks)), self.num_correct_chunks, replace=False) + infer_chunks = np.random.choice( + [x for x in range(len(chunks)) if x not in correct_chunks], + self.num_infer_chunks - self.num_correct_chunks, + replace=False) + infer_chunks = sorted(correct_chunks.tolist() + infer_chunks.tolist()) + label_chunks = np.random.choice( + [x for x in range(len(chunks)) if x not in infer_chunks], + self.num_label_chunks - self.num_correct_chunks, + replace=False) + label_chunks = sorted(correct_chunks.tolist() + label_chunks.tolist()) + self.fill_with_chunks(infer, [chunks[idx] for idx in infer_chunks]) + self.fill_with_chunks(label, [chunks[idx] for idx in label_chunks]) + # exclude types in excluded_chunk_types + if len(self.excluded_chunk_types) > 0: + for idx in correct_chunks: + if chunks[idx].chunk_type in self.excluded_chunk_types: + self.num_correct_chunks -= 1 + for idx in infer_chunks: + if chunks[idx].chunk_type in self.excluded_chunk_types: + self.num_infer_chunks -= 1 + for idx in label_chunks: + if chunks[idx].chunk_type in self.excluded_chunk_types: + self.num_label_chunks -= 1 + return self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks + + def set_confs(self): + # Use the IOB scheme and labels with 2 chunk types + self.scheme = 'IOB' + self.num_chunk_types = 2 + self.excluded_chunk_types = [] + self.other_chunk_type = self.num_chunk_types + self.attrs = { + 'num_chunk_types': self.num_chunk_types, + 'chunk_scheme': self.scheme, + 'excluded_chunk_types': self.excluded_chunk_types + } + self.parse_scheme() + self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9 + + def set_data(self): + infer = np.zeros((self.batch_size, )).astype('int32') + infer.fill(self.num_chunk_types * self.num_tag_types) + label = np.copy(infer) + starts = np.random.choice( + range(1, self.batch_size), self.num_sequences - 1, + replace=False).tolist() + starts.extend([0, self.batch_size]) + starts = sorted(starts) + self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks( + infer, label, starts) + self.inputs = { + 'Inference': (infer, [starts]), + 'Label': (label, [starts]) + } + precision = float( + self.num_correct_chunks + ) / self.num_infer_chunks if self.num_infer_chunks else 0 + recall = float(self.num_correct_chunks + ) / self.num_label_chunks if self.num_label_chunks else 0 + f1 = float(2 * precision * recall) / ( + precision + recall) if self.num_correct_chunks else 0 + self.outputs = { + 'Precision': np.asarray( + [precision], dtype='float32'), + 'Recall': np.asarray( + [recall], dtype='float32'), + 'F1-Score': np.asarray( + [f1], dtype='float32') + } + + def setUp(self): + self.op_type = 'chunk_eval' + self.set_confs() + self.set_data() + + def test_check_output(self): + self.check_output() + + +class TestChunkEvalOpWithExclude(TestChunkEvalOp): + def set_confs(self): + # Use the IOE scheme and labels with 3 chunk types + self.scheme = 'IOE' + self.num_chunk_types = 3 + self.excluded_chunk_types = [1] + self.other_chunk_type = self.num_chunk_types + self.attrs = { + 'num_chunk_types': self.num_chunk_types, + 'chunk_scheme': self.scheme, + 'excluded_chunk_types': self.excluded_chunk_types + } + self.parse_scheme() + self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 15, 18, 20 + + +if __name__ == '__main__': + unittest.main()