Unverified commit aa340671, authored by qingqing01, committed by GitHub

Merge pull request #5016 from guoshengCS/add-ChunkEvalOp

Add chunk eval op
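paddle/operators/chunk_eval_op.cc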
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/chunk_eval_op.h"
namespace paddle {
namespace operators {
class ChunkEvalOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Inference"),
"Input(Inference) of ChunkEvalOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"),
"Input(Label) of ChunkEvalOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Precision"),
"Output(Precision) of ChunkEvalOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Recall"),
"Output(Recall) of ChunkEvalOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
"Output(F1-Score) of ChunkEvalOp should not be null.");
auto inference_dim = ctx->GetInputDim("Inference");
auto label_dim = ctx->GetInputDim("Label");
PADDLE_ENFORCE(inference_dim == label_dim,
"Inference's shape must be the same as Label's shape.");
ctx->SetOutputDim("Precision", {1});
ctx->SetOutputDim("Recall", {1});
ctx->SetOutputDim("F1-Score", {1});
}
protected:
framework::DataType IndicateDataType(
const framework::ExecutionContext &ctx) const override {
return framework::DataType::FP32;
}
};
class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ChunkEvalOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Inference",
"(Tensor, default: Tensor<int>). Predictions from the network.");
AddInput("Label",
"(Tensor, default: Tensor<int>). The true tag sequences.");
AddOutput("Precision",
"(float). The evaluated precision (called positive predictive "
"value) of chunks on the given mini-batch.");
AddOutput("Recall",
"(float). The evaluated recall (true positive rate or "
"sensitivity) of chunks on the given mini-batch.");
AddOutput("F1-Score",
"(float). The evaluated F1-Score on the given mini-batch.");
AddAttr<int>("num_chunk_types",
"(int). The number of chunk type. See below for details.");
AddAttr<std::string>(
"chunk_scheme",
"(string, default IOB). The labeling scheme indicating "
"how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
"for details.")
.SetDefault("IOB");
AddAttr<std::vector<int>>("excluded_chunk_types",
"(list<int>) A list including chunk type ids "
"indicating chunk types that are not counted. "
"See below for details.")
.SetDefault(std::vector<int>{});
AddComment(R"DOC(
For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
Here is a NER example of labeling for these tagging schemes:
Li Ming works at Agricultural Bank of China in Beijing.
IO: I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC
IOB: B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC
IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC
IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC
There are three chunk types (named entity types) here: PER (person), ORG (organization)
and LOC (location), and we can see that the labels have the form <tag type>-<chunk type>.
Since the calculations actually use label ids rather than labels, extra attention
should be paid when mapping labels to ids to make ChunkEvalOp work. The key point
is that the listed equations are satisfied by the label ids.
tag_type = label % num_tag_type
chunk_type = label / num_tag_type
where `num_tag_type` is the number of tag types in the tagging scheme, `num_chunk_type`
is the number of chunk types, and `tag_type` gets its value from the following table.
Scheme  Begin  Inside  End  Single
plain    0      -       -    -
IOB      0      1       -    -
IOE      -      0       1    -
IOBES    0      1       2    3
Still taking NER as an example, assume the tagging scheme is IOB and the chunk types are
ORG, PER and LOC. To satisfy the above equations, the label map can be like this:
B-ORG 0
I-ORG 1
B-PER 2
I-PER 3
B-LOC 4
I-LOC 5
O 6
It's not hard to verify the equations noting that the number of chunk types
is 3 and the number of tag types in the IOB scheme is 2. For example, the label
id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
I-LOC is 2, which is consistent with the results from the equations.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp,
ops::ChunkEvalOpMaker);
REGISTER_OP_CPU_KERNEL(chunk_eval,
ops::ChunkEvalKernel<paddle::platform::CPUPlace, float>);
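paddle/operators/chunk_eval_op.h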
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <set>
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename Place, typename T>
class ChunkEvalKernel : public framework::OpKernel<T> {
public:
struct Segment {
int begin;
int end;
int type;
bool operator==(const Segment& y) const {
return begin == y.begin && end == y.end && type == y.type;
}
};
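// Scans a label-id sequence of the given length and collects the chunks it
// encodes as (begin, end, type) segments, following the tag layout of the
// configured tagging scheme.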
void GetSegments(const int* label, int length, std::vector<Segment>& segments,
int num_chunk_types, int num_tag_types, int other_chunk_type,
int tag_begin, int tag_inside, int tag_end,
int tag_single) const {
segments.clear();
segments.reserve(length);
int chunk_start = 0;
bool in_chunk = false;
int tag = -1;
int type = other_chunk_type;
for (int i = 0; i < length; ++i) {
int prev_tag = tag;
int prev_type = type;
PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types);
tag = label[i] % num_tag_types;
type = label[i] / num_tag_types;
if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type,
tag_begin, tag_inside, tag_end, tag_single)) {
Segment segment{
chunk_start, // begin
i - 1, // end
prev_type,
};
segments.push_back(segment);
in_chunk = false;
}
if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
tag_begin, tag_inside, tag_end, tag_single)) {
chunk_start = i;
in_chunk = true;
}
}
if (in_chunk) {
Segment segment{
chunk_start, // begin
length - 1, // end
type,
};
segments.push_back(segment);
}
}
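// Returns true if the transition from (prev_tag, prev_type) to (tag, type)
// closes the chunk that was open at the previous position.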
bool ChunkEnd(int prev_tag, int prev_type, int tag, int type,
int other_chunk_type, int tag_begin, int tag_inside,
int tag_end, int tag_single) const {
if (prev_type == other_chunk_type) return false;
if (type == other_chunk_type) return true;
if (type != prev_type) return true;
if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single;
if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single;
if (prev_tag == tag_end) return true;
if (prev_tag == tag_single) return true;
return false;
}
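// Returns true if the transition from (prev_tag, prev_type) to (tag, type)
// opens a new chunk at the current position.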
bool ChunkBegin(int prev_tag, int prev_type, int tag, int type,
int other_chunk_type, int tag_begin, int tag_inside,
int tag_end, int tag_single) const {
if (prev_type == other_chunk_type) return type != other_chunk_type;
if (type == other_chunk_type) return false;
if (type != prev_type) return true;
if (tag == tag_begin) return true;
if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single;
if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single;
if (tag == tag_single) return true;
return false;
}
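// Parses the tagging scheme, extracts chunks from the inference and label
// sequences of every instance in the mini-batch, and reduces the matched,
// predicted and labeled chunk counts into precision, recall and F1-score.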
void Compute(const framework::ExecutionContext& context) const override {
// initialize to parse configurations
int num_chunk_types, num_tag_types;
int other_chunk_type;
int tag_begin, tag_inside, tag_end, tag_single;
std::vector<Segment> label_segments;
std::vector<Segment> output_segments;
std::set<int> excluded_chunk_types;
int64_t num_output_segments = 0;
int64_t num_label_segments = 0;
int64_t num_correct = 0;
if (context.Attr<std::string>("chunk_scheme") == "IOB") {
num_tag_types = 2;
tag_begin = 0;
tag_inside = 1;
tag_end = -1;
tag_single = -1;
} else if (context.Attr<std::string>("chunk_scheme") == "IOE") {
num_tag_types = 2;
tag_begin = -1;
tag_inside = 0;
tag_end = 1;
tag_single = -1;
} else if (context.Attr<std::string>("chunk_scheme") == "IOBES") {
num_tag_types = 4;
tag_begin = 0;
tag_inside = 1;
tag_end = 2;
tag_single = 3;
} else if (context.Attr<std::string>("chunk_scheme") == "plain") {
num_tag_types = 1;
tag_begin = -1;
tag_inside = -1;
tag_end = -1;
tag_single = -1;
} else {
PADDLE_THROW("Unknown chunk scheme.");
}
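// Chunk type id num_chunk_types (i.e. label id num_chunk_types * num_tag_types)
// is reserved for the outside tag O.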
other_chunk_type = num_chunk_types = context.Attr<int>("num_chunk_types");
excluded_chunk_types.insert(
context.Attr<std::vector<int>>("excluded_chunk_types").begin(),
context.Attr<std::vector<int>>("excluded_chunk_types").end());
auto* inference = context.Input<LoDTensor>("Inference");
auto* label = context.Input<LoDTensor>("Label");
auto* precision = context.Output<Tensor>("Precision");
auto* recall = context.Output<Tensor>("Recall");
auto* f1 = context.Output<Tensor>("F1-Score");
const int* inference_data = inference->data<int>();
const int* label_data = label->data<int>();
T* precision_data = precision->mutable_data<T>(context.GetPlace());
T* recall_data = recall->mutable_data<T>(context.GetPlace());
T* f1_data = f1->mutable_data<T>(context.GetPlace());
auto lod = label->lod();
PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only one-level sequences are supported now.");
PADDLE_ENFORCE(lod == inference->lod(),
"LoD must be the same for Inference and Label.");
int num_sequences = lod[0].size() - 1;
for (int i = 0; i < num_sequences; ++i) {
int seq_length = lod[0][i + 1] - lod[0][i];
EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
output_segments, label_segments, num_output_segments,
num_label_segments, num_correct, num_chunk_types,
num_tag_types, other_chunk_type, tag_begin, tag_inside,
tag_end, tag_single, excluded_chunk_types);
}
*precision_data = !num_output_segments ? 0 : static_cast<T>(num_correct) /
num_output_segments;
*recall_data = !num_label_segments ? 0 : static_cast<T>(num_correct) /
num_label_segments;
*f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*recall_data) /
((*precision_data) + (*recall_data));
}
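// Evaluates one sequence: extracts chunks from the prediction and the label,
// counts exact matches (same begin, end and type), and accumulates the
// per-batch counters, skipping chunk types listed in excluded_chunk_types.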
void EvalOneSeq(const int* output, const int* label, int length,
std::vector<Segment>& output_segments,
std::vector<Segment>& label_segments,
int64_t& num_output_segments, int64_t& num_label_segments,
int64_t& num_correct, int num_chunk_types, int num_tag_types,
int other_chunk_type, int tag_begin, int tag_inside,
int tag_end, int tag_single,
const std::set<int>& excluded_chunk_types) const {
GetSegments(output, length, output_segments, num_chunk_types, num_tag_types,
other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
size_t i = 0, j = 0;
while (i < output_segments.size() && j < label_segments.size()) {
if (output_segments[i] == label_segments[j] &&
excluded_chunk_types.count(output_segments[i].type) != 1) {
++num_correct;
}
if (output_segments[i].end < label_segments[j].end) {
++i;
} else if (output_segments[i].end > label_segments[j].end) {
++j;
} else {
++i;
++j;
}
}
for (auto& segment : label_segments) {
if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments;
}
for (auto& segment : output_segments) {
if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments;
}
}
};
} // namespace operators
} // namespace paddle
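test_chunk_eval_op.py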
import unittest
import numpy as np
from op_test import OpTest
class Segment(object):
def __init__(self, chunk_type, start_idx, end_idx):
self.chunk_type = chunk_type
self.start_idx = start_idx
self.end_idx = end_idx
def __str__(self):
return '(Segment: %s, %s, %s)' % (self.chunk_type, self.start_idx,
self.end_idx)
__repr__ = __str__
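# Illustrative helper (not part of the original test and not used by the cases
# below): a minimal pure-Python sketch of how an IOB-encoded label-id sequence
# maps back to chunks, assuming the id convention documented for the op
# (tag = label % num_tag_types, chunk_type = label // num_tag_types, and the id
# num_chunk_types * num_tag_types standing for the outside tag O). The helper
# name `extract_iob_chunks` is ours, chosen only for this sketch.
def extract_iob_chunks(labels, num_chunk_types, num_tag_types=2):
    other = num_chunk_types * num_tag_types  # label id of the O tag
    chunks, start, cur_type = [], None, None
    for i, label in enumerate(labels):
        is_outside = label == other
        tag = None if is_outside else label % num_tag_types  # 0: B, 1: I
        chunk_type = None if is_outside else label // num_tag_types
        # A running chunk ends right before an O, a B tag, or a type change.
        if start is not None and (is_outside or tag == 0 or
                                  chunk_type != cur_type):
            chunks.append(Segment(cur_type, start, i - 1))
            start = None
        # Any non-O position that is not inside a chunk starts a new one.
        if not is_outside and start is None:
            start, cur_type = i, chunk_type
    if start is not None:
        chunks.append(Segment(cur_type, start, len(labels) - 1))
    return chunks
# e.g. with the label map from the op comment, extract_iob_chunks([2, 3, 6, 6, 0, 1], 3)
# yields a PER chunk over positions [0, 1] and an ORG chunk over [4, 5].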
class TestChunkEvalOp(OpTest):
num_sequences = 5
batch_size = 50
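# Only the IOB and IOE schemes are exercised by these tests.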
def parse_scheme(self):
if self.scheme == 'IOB':
self.num_tag_types = 2
elif self.scheme == 'IOE':
self.num_tag_types = 2
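# Write the label ids encoding the given chunks into `data`, following the
# tag/type id layout of the configured scheme.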
def fill_with_chunks(self, data, chunks):
for chunk in chunks:
if self.scheme == 'IOB':
data[chunk.start_idx] = chunk.chunk_type * self.num_tag_types
data[chunk.start_idx + 1:
chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
self.num_tag_types - 1)
data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
self.num_tag_types - 1
) if chunk.start_idx < chunk.end_idx else data[chunk.start_idx]
elif self.scheme == 'IOE':
data[chunk.start_idx:
chunk.end_idx] = chunk.chunk_type * self.num_tag_types
data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
self.num_tag_types - 1)
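# Randomly place `num_chunks` non-overlapping chunks inside the sequences
# delimited by `starts` and return them as Segment objects.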
def rand_chunks(self, starts, num_chunks):
if num_chunks < 0:
num_chunks = np.random.randint(starts[-1])
chunks = []
# generate chunk beginnings
chunk_begins = sorted(
np.random.choice(
range(starts[-1]), num_chunks, replace=False))
seq_chunk_begins = []
begin_idx = 0
# divide chunks into sequences
for i in range(len(starts) - 1):
tmp_chunk_begins = []
while begin_idx < len(chunk_begins) and chunk_begins[
begin_idx] < starts[i + 1]:
tmp_chunk_begins.append(chunk_begins[begin_idx])
begin_idx += 1
seq_chunk_begins.append(tmp_chunk_begins)
# generate chunk ends
chunk_ends = []
for i in range(len(seq_chunk_begins)):
for j in range(len(seq_chunk_begins[i])):
low = seq_chunk_begins[i][j]
high = seq_chunk_begins[i][j + 1] if j < len(seq_chunk_begins[
i]) - 1 else starts[i + 1]
chunk_ends.append(np.random.randint(low, high))
# generate chunks
for chunk_pos in zip(chunk_begins, chunk_ends):
chunk_type = np.random.randint(self.num_chunk_types)
chunks.append(Segment(chunk_type, *chunk_pos))
return chunks
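# Distribute randomly generated chunks between `infer` and `label` so that
# exactly num_correct_chunks appear in both, write the corresponding label
# ids, and return the chunk counts after dropping excluded chunk types.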
def gen_chunks(self, infer, label, starts):
chunks = self.rand_chunks(starts,
self.num_infer_chunks + self.num_label_chunks
- self.num_correct_chunks)
correct_chunks = np.random.choice(
range(len(chunks)), self.num_correct_chunks, replace=False)
infer_chunks = np.random.choice(
[x for x in range(len(chunks)) if x not in correct_chunks],
self.num_infer_chunks - self.num_correct_chunks,
replace=False)
infer_chunks = sorted(correct_chunks.tolist() + infer_chunks.tolist())
label_chunks = np.random.choice(
[x for x in range(len(chunks)) if x not in infer_chunks],
self.num_label_chunks - self.num_correct_chunks,
replace=False)
label_chunks = sorted(correct_chunks.tolist() + label_chunks.tolist())
self.fill_with_chunks(infer, [chunks[idx] for idx in infer_chunks])
self.fill_with_chunks(label, [chunks[idx] for idx in label_chunks])
# exclude types in excluded_chunk_types
if len(self.excluded_chunk_types) > 0:
for idx in correct_chunks:
if chunks[idx].chunk_type in self.excluded_chunk_types:
self.num_correct_chunks -= 1
for idx in infer_chunks:
if chunks[idx].chunk_type in self.excluded_chunk_types:
self.num_infer_chunks -= 1
for idx in label_chunks:
if chunks[idx].chunk_type in self.excluded_chunk_types:
self.num_label_chunks -= 1
return self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks
def set_confs(self):
# Use the IOB scheme and labels with 2 chunk types
self.scheme = 'IOB'
self.num_chunk_types = 2
self.excluded_chunk_types = []
self.other_chunk_type = self.num_chunk_types
self.attrs = {
'num_chunk_types': self.num_chunk_types,
'chunk_scheme': self.scheme,
'excluded_chunk_types': self.excluded_chunk_types
}
self.parse_scheme()
self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9
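# Build the LoD-style inputs from random chunks and compute the expected
# precision, recall and F1-score for them.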
def set_data(self):
infer = np.zeros((self.batch_size, )).astype('int32')
infer.fill(self.num_chunk_types * self.num_tag_types)
label = np.copy(infer)
starts = np.random.choice(
range(1, self.batch_size), self.num_sequences - 1,
replace=False).tolist()
starts.extend([0, self.batch_size])
starts = sorted(starts)
self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
infer, label, starts)
self.inputs = {
'Inference': (infer, [starts]),
'Label': (label, [starts])
}
precision = float(
self.num_correct_chunks
) / self.num_infer_chunks if self.num_infer_chunks else 0
recall = float(self.num_correct_chunks
) / self.num_label_chunks if self.num_label_chunks else 0
f1 = float(2 * precision * recall) / (
precision + recall) if self.num_correct_chunks else 0
self.outputs = {
'Precision': np.asarray(
[precision], dtype='float32'),
'Recall': np.asarray(
[recall], dtype='float32'),
'F1-Score': np.asarray(
[f1], dtype='float32')
}
def setUp(self):
self.op_type = 'chunk_eval'
self.set_confs()
self.set_data()
def test_check_output(self):
self.check_output()
class TestChunkEvalOpWithExclude(TestChunkEvalOp):
def set_confs(self):
# Use the IOE scheme and labels with 3 chunk types
self.scheme = 'IOE'
self.num_chunk_types = 3
self.excluded_chunk_types = [1]
self.other_chunk_type = self.num_chunk_types
self.attrs = {
'num_chunk_types': self.num_chunk_types,
'chunk_scheme': self.scheme,
'excluded_chunk_types': self.excluded_chunk_types
}
self.parse_scheme()
self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 15, 18, 20
if __name__ == '__main__':
unittest.main()