#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
from op_test import OpTest
import numpy as np


class Segment:
    """A chunk described by its type and the positions it spans.

    The constructor stores its three arguments unchanged:

    * ``chunk_type`` -- identifier of the chunk's type.
    * ``start_idx`` -- position of the first element of the chunk.
    * ``end_idx`` -- position of the last element of the chunk (callers in
      this file write to ``data[end_idx]``, i.e. treat it as inclusive).
    """

    def __init__(self, chunk_type, start_idx, end_idx):
        self.chunk_type = chunk_type
        self.start_idx = start_idx
        self.end_idx = end_idx

    def __str__(self):
        """Return ``'(Segment: <chunk_type>, <start_idx>, <end_idx>)'``."""
        return '(Segment: %s, %s, %s)' % (
            self.chunk_type,
            self.start_idx,
            self.end_idx,
        )

    # Use the same readable form when segments are shown inside containers.
    __repr__ = __str__


class TestChunkEvalOp(OpTest):
    """Check the ``chunk_eval`` op on randomly generated tag sequences.

    ``setUp`` configures the op (``set_confs``) and then builds random
    inference/label tag arrays whose numbers of inferred, labeled and
    correctly inferred chunks are known by construction (``set_data``), so
    the expected precision, recall and F1 can be computed directly.
    Subclasses customize the test by overriding ``set_confs`` and/or
    ``set_input``.
    """

    # Number of sequences the flat batch of tags is divided into.
    num_sequences = 5
    # Total number of tag positions over all sequences.
    batch_size = 50

    def parse_scheme(self):
        """Set ``self.num_tag_types`` for the configured ``self.scheme``.

        Both schemes handled here ('IOB' and 'IOE') use two tag types per
        chunk type. Any other scheme leaves ``num_tag_types`` untouched.
        """
        if self.scheme in ('IOB', 'IOE'):
            self.num_tag_types = 2

    def fill_with_chunks(self, data, chunks):
        """Write the tag ids of every chunk in ``chunks`` into ``data``.

        Args:
            data: 1-D tag array, modified in place.
            chunks: iterable of ``Segment``; ``end_idx`` is inclusive.

        Each chunk type owns ``num_tag_types`` consecutive tag ids starting
        at ``chunk_type * num_tag_types``. Under 'IOB' the first position of
        a chunk gets the first id and the remaining positions get the last
        id; under 'IOE' the final position gets the last id and the
        preceding positions get the first id. Other schemes write nothing.
        """
        for chunk in chunks:
            first_tag = chunk.chunk_type * self.num_tag_types
            last_tag = first_tag + (self.num_tag_types - 1)
            if self.scheme == 'IOB':
                data[chunk.start_idx] = first_tag
                # The slice excludes end_idx, which is handled just below.
                data[chunk.start_idx + 1 : chunk.end_idx] = last_tag
                # A single-position chunk keeps the tag written at its start.
                data[chunk.end_idx] = (
                    last_tag
                    if chunk.start_idx < chunk.end_idx
                    else data[chunk.start_idx]
                )
            elif self.scheme == 'IOE':
                data[chunk.start_idx : chunk.end_idx] = first_tag
                data[chunk.end_idx] = last_tag

    def rand_chunks(self, starts, num_chunks):
        """Generate ``num_chunks`` random non-overlapping chunks.

        Args:
            starts: sorted sequence boundaries; ``starts[i]`` and
                ``starts[i + 1]`` delimit sequence ``i`` and ``starts[-1]``
                is the total number of positions.
            num_chunks: number of chunks to create; a negative value means
                "pick a random count below ``starts[-1]``".

        Returns:
            A list of ``Segment`` ordered by start position. Every chunk
            ends before the next chunk begins and before its own sequence
            ends, so chunks never overlap or cross a sequence boundary.
        """
        if num_chunks < 0:
            num_chunks = np.random.randint(starts[-1])
        chunks = []
        # generate chunk beginnings
        chunk_begins = sorted(
            np.random.choice(list(range(starts[-1])), num_chunks, replace=False)
        )
        seq_chunk_begins = []
        begin_idx = 0
        # divide chunks into sequences
        for i in range(len(starts) - 1):
            tmp_chunk_begins = []
            while (
                begin_idx < len(chunk_begins)
                and chunk_begins[begin_idx] < starts[i + 1]
            ):
                tmp_chunk_begins.append(chunk_begins[begin_idx])
                begin_idx += 1
            seq_chunk_begins.append(tmp_chunk_begins)
        # generate chunk ends
        chunk_ends = []
        for i, begins in enumerate(seq_chunk_begins):
            for j, low in enumerate(begins):
                # The end is drawn from [low, high): strictly before the
                # next chunk's start, or before the end of the sequence.
                high = begins[j + 1] if j < len(begins) - 1 else starts[i + 1]
                chunk_ends.append(np.random.randint(low, high))
        # generate chunks
        for chunk_pos in zip(chunk_begins, chunk_ends):
            chunk_type = np.random.randint(self.num_chunk_types)
            chunks.append(Segment(chunk_type, *chunk_pos))
        return chunks

    def gen_chunks(self, infer, label, starts):
        """Fill ``infer`` and ``label`` with chunks matching the configured counts.

        A pool of ``num_infer_chunks + num_label_chunks - num_correct_chunks``
        random chunks is created. ``num_correct_chunks`` of them are written
        to both arrays, and the rest are split between inference-only and
        label-only chunks.

        Args:
            infer: 1-D inference tag array, modified in place.
            label: 1-D label tag array, modified in place.
            starts: sorted sequence boundaries (see ``rand_chunks``).

        Returns:
            ``(num_correct_chunks, num_infer_chunks, num_label_chunks)``
            after discounting chunks whose type is listed in
            ``self.excluded_chunk_types``. The three attributes on ``self``
            are decremented in place as well.
        """
        chunks = self.rand_chunks(
            starts,
            self.num_infer_chunks
            + self.num_label_chunks
            - self.num_correct_chunks,
        )
        correct_chunks = np.random.choice(
            list(range(len(chunks))), self.num_correct_chunks, replace=False
        )
        infer_chunks = np.random.choice(
            [x for x in range(len(chunks)) if x not in correct_chunks],
            self.num_infer_chunks - self.num_correct_chunks,
            replace=False,
        )
        infer_chunks = sorted(correct_chunks.tolist() + infer_chunks.tolist())
        # Label-only chunks come from whatever inference did not use.
        label_chunks = np.random.choice(
            [x for x in range(len(chunks)) if x not in infer_chunks],
            self.num_label_chunks - self.num_correct_chunks,
            replace=False,
        )
        label_chunks = sorted(correct_chunks.tolist() + label_chunks.tolist())
        self.fill_with_chunks(infer, [chunks[idx] for idx in infer_chunks])
        self.fill_with_chunks(label, [chunks[idx] for idx in label_chunks])
        # exclude types in excluded_chunk_types
        if len(self.excluded_chunk_types) > 0:
            for idx in correct_chunks:
                if chunks[idx].chunk_type in self.excluded_chunk_types:
                    self.num_correct_chunks -= 1
            for idx in infer_chunks:
                if chunks[idx].chunk_type in self.excluded_chunk_types:
                    self.num_infer_chunks -= 1
            for idx in label_chunks:
                if chunks[idx].chunk_type in self.excluded_chunk_types:
                    self.num_label_chunks -= 1
        return (
            self.num_correct_chunks,
            self.num_infer_chunks,
            self.num_label_chunks,
        )

    def set_confs(self):
        """Configure scheme, chunk types, op attributes and target counts."""
        # Use the IOB scheme and labels with 2 chunk types
        self.scheme = 'IOB'
        self.num_chunk_types = 2
        self.excluded_chunk_types = []
        self.other_chunk_type = self.num_chunk_types
        self.attrs = {
            'num_chunk_types': self.num_chunk_types,
            'chunk_scheme': self.scheme,
            'excluded_chunk_types': self.excluded_chunk_types,
        }
        self.parse_scheme()
        (
            self.num_correct_chunks,
            self.num_infer_chunks,
            self.num_label_chunks,
        ) = (4, 5, 9)

    def set_data(self):
        """Build random inputs and the matching expected outputs.

        Sets ``self.inputs`` (through ``set_input``) and ``self.outputs``,
        and replaces the three chunk counters on ``self`` with the values
        returned by ``gen_chunks``.
        """
        infer = np.zeros((self.batch_size,)).astype('int64')
        # Start from a tag id one past the largest id that
        # fill_with_chunks can write for any chunk type.
        infer.fill(self.num_chunk_types * self.num_tag_types)
        label = np.copy(infer)
        # Draw the interior sequence boundaries, then add both ends.
        starts = np.random.choice(
            list(range(1, self.batch_size)),
            self.num_sequences - 1,
            replace=False,
        ).tolist()
        starts.extend([0, self.batch_size])
        starts = sorted(starts)
        (
            self.num_correct_chunks,
            self.num_infer_chunks,
            self.num_label_chunks,
        ) = self.gen_chunks(infer, label, starts)
        # Sequence lengths are the gaps between consecutive boundaries.
        lod = [starts[i + 1] - starts[i] for i in range(len(starts) - 1)]
        self.set_input(infer, label, lod)
        precision = (
            float(self.num_correct_chunks) / self.num_infer_chunks
            if self.num_infer_chunks
            else 0
        )
        recall = (
            float(self.num_correct_chunks) / self.num_label_chunks
            if self.num_label_chunks
            else 0
        )
        # Guard on the correct count: with no correct chunks both precision
        # and recall are 0 and the F1 denominator would be 0.
        f1 = (
            float(2 * precision * recall) / (precision + recall)
            if self.num_correct_chunks
            else 0
        )
        self.outputs = {
            'Precision': np.asarray([precision], dtype='float32'),
            'Recall': np.asarray([recall], dtype='float32'),
            'F1-Score': np.asarray([f1], dtype='float32'),
            'NumInferChunks': np.asarray(
                [self.num_infer_chunks], dtype='int64'
            ),
            'NumLabelChunks': np.asarray(
                [self.num_label_chunks], dtype='int64'
            ),
            'NumCorrectChunks': np.asarray(
                [self.num_correct_chunks], dtype='int64'
            ),
        }

    def set_input(self, infer, label, lod):
        """Store the flat tag arrays, each paired with the sequence lengths."""
        self.inputs = {'Inference': (infer, [lod]), 'Label': (label, [lod])}

    def setUp(self):
        """Select the op under test and prepare its configuration and data."""
        self.op_type = 'chunk_eval'
        self.set_confs()
        self.set_data()

    def test_check_output(self):
        """Compare the op's outputs with the expected ``self.outputs``."""
        self.check_output()


class TestChunkEvalOpWithExclude(TestChunkEvalOp):
    """Variant using the IOE scheme, 3 chunk types and an excluded type."""

    def set_confs(self):
        """Configure the IOE scheme with chunk type 1 excluded."""
        # Use the IOE scheme and labels with 3 chunk types
        self.scheme = 'IOE'
        self.num_chunk_types = 3
        self.excluded_chunk_types = [1]
        self.other_chunk_type = self.num_chunk_types
        self.attrs = {
            'num_chunk_types': self.num_chunk_types,
            'chunk_scheme': self.scheme,
            'excluded_chunk_types': self.excluded_chunk_types,
        }
        self.parse_scheme()
        # Target numbers of correct, inferred and labeled chunks before
        # chunks of excluded types are discounted.
        (
            self.num_correct_chunks,
            self.num_infer_chunks,
            self.num_label_chunks,
        ) = (15, 18, 20)


class TestChunkEvalOpWithTensorInput(TestChunkEvalOp):
    """Variant feeding padded tensors plus explicit sequence lengths."""

    def set_input(self, infer, label, lod):
        """Store padded inputs together with a ``SeqLength`` array.

        Args:
            infer: flat 1-D inference tag array.
            label: flat 1-D label tag array.
            lod: list of sequence lengths, in order.

        Each sequence is cut out of the flat arrays and right-padded with
        -1 up to the longest sequence length. The rows are stacked and
        given a trailing axis of size 1, so ``Inference`` and ``Label`` are
        int64 arrays of shape ``(len(lod), max(lod), 1)``.
        """
        max_len = np.max(lod)
        pad_infer = []
        pad_label = []
        start = 0
        for seq_len in lod:
            end = seq_len + start
            # Pad only on the right, up to the longest sequence.
            pad_width = (0, max_len - seq_len)
            pad_infer.append(
                np.pad(
                    infer[start:end],
                    pad_width,
                    'constant',
                    constant_values=(-1,),
                )
            )
            pad_label.append(
                np.pad(
                    label[start:end],
                    pad_width,
                    'constant',
                    constant_values=(-1,),
                )
            )
            start = end

        pad_infer = np.expand_dims(np.array(pad_infer, dtype='int64'), 2)
        pad_label = np.expand_dims(np.array(pad_label, dtype='int64'), 2)
        lod = np.array(lod, dtype='int64')
        self.inputs = {
            'Inference': pad_infer,
            'Label': pad_label,
            'SeqLength': lod,
        }


# Run the test cases above when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()