test_chunk_eval_op.py 11.3 KB
Newer Older
1
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

G
guosheng 已提交
15 16
import unittest
import numpy as np
17
from op_test import OpTest
18 19 20
import numpy as np
from paddle.fluid import Program, program_guard
from paddle import fluid
G
guosheng 已提交
21 22


23
class Segment:
    """A chunk described by its type and its first and last positions.

    The three constructor arguments are stored unchanged as attributes.
    """

    def __init__(self, chunk_type, start_idx, end_idx):
        """Store the chunk type and the start/end indices of the chunk."""
        self.chunk_type = chunk_type
        self.start_idx = start_idx
        self.end_idx = end_idx

    def __str__(self):
        """Return ``'(Segment: <chunk_type>, <start_idx>, <end_idx>)'``."""
        return '(Segment: %s, %s, %s)' % (
            self.chunk_type,
            self.start_idx,
            self.end_idx,
        )

    # Use the same text in containers and debugger output.
    __repr__ = __str__


class TestChunkEvalOp(OpTest):
    """Checks the ``chunk_eval`` op against randomly generated chunk data.

    ``setUp`` builds flat inference and label tag arrays of ``batch_size``
    positions split into ``num_sequences`` sequences, writes a configured
    number of inferred, labelled and shared ("correct") chunks into them, and
    stores the expected precision, recall, F1 and chunk counts in
    ``self.outputs``. Subclasses override ``set_confs`` to change the
    configuration or ``set_input`` to change how the inputs are laid out.
    """

    # Number of sequences the flat batch is split into.
    num_sequences = 5
    # Total number of tag positions across all sequences.
    batch_size = 50

    def parse_scheme(self):
        """Set ``self.num_tag_types`` for the configured tagging scheme.

        Only 'IOB' and 'IOE' are recognised (two tag types each); for any
        other scheme ``num_tag_types`` is left untouched.
        """
        if self.scheme in ('IOB', 'IOE'):
            self.num_tag_types = 2

    def fill_with_chunks(self, data, chunks):
        """Write the tags of every chunk in ``chunks`` into ``data`` in place.

        Args:
            data: 1-D tag array indexed by position.
            chunks: iterable of ``Segment``; ``end_idx`` is inclusive.
        """
        for chunk in chunks:
            # Lowest and highest tag value belonging to this chunk type.
            first_tag = chunk.chunk_type * self.num_tag_types
            last_tag = first_tag + (self.num_tag_types - 1)
            if self.scheme == 'IOB':
                # The first position gets the lowest tag, all following
                # positions (including the last one) get the highest tag.
                data[chunk.start_idx] = first_tag
                data[chunk.start_idx + 1 : chunk.end_idx] = last_tag
                # A single-position chunk keeps the tag written at its start.
                data[chunk.end_idx] = (
                    last_tag
                    if chunk.start_idx < chunk.end_idx
                    else data[chunk.start_idx]
                )
            elif self.scheme == 'IOE':
                # Every position but the last gets the lowest tag, the last
                # position gets the highest tag.
                data[chunk.start_idx : chunk.end_idx] = first_tag
                data[chunk.end_idx] = last_tag

    def rand_chunks(self, starts, num_chunks):
        """Generate ``num_chunks`` random, non-overlapping chunks.

        Args:
            starts: sorted list of sequence boundaries; sequence ``i`` covers
                positions ``starts[i]`` up to (excluding) ``starts[i + 1]``.
            num_chunks: number of chunks to create; a negative value picks a
                random count below ``starts[-1]``.

        Returns:
            A list of ``Segment`` ordered by start position. Each chunk ends
            before the next chunk begins and before its sequence ends.
        """
        if num_chunks < 0:
            num_chunks = np.random.randint(starts[-1])
        # generate chunk beginnings
        chunk_begins = sorted(
            np.random.choice(list(range(starts[-1])), num_chunks, replace=False)
        )
        # divide chunks into sequences
        seq_chunk_begins = []
        begin_idx = 0
        for seq_end in starts[1:]:
            seq_begins = []
            while (
                begin_idx < len(chunk_begins)
                and chunk_begins[begin_idx] < seq_end
            ):
                seq_begins.append(chunk_begins[begin_idx])
                begin_idx += 1
            seq_chunk_begins.append(seq_begins)
        # generate chunk ends
        chunk_ends = []
        for seq_begins, seq_end in zip(seq_chunk_begins, starts[1:]):
            # The exclusive upper bound of a chunk end is the next chunk's
            # begin, or the end of the sequence for the last chunk.
            upper_bounds = seq_begins[1:] + [seq_end]
            for low, high in zip(seq_begins, upper_bounds):
                chunk_ends.append(np.random.randint(low, high))
        # generate chunks
        return [
            Segment(np.random.randint(self.num_chunk_types), begin, end)
            for begin, end in zip(chunk_begins, chunk_ends)
        ]

    def gen_chunks(self, infer, label, starts):
        """Fill ``infer`` and ``label`` with chunks and return the counts.

        A pool of random chunks is split into chunks shared by both arrays
        (correct), chunks only in ``infer`` and chunks only in ``label``.
        Chunks whose type is in ``self.excluded_chunk_types`` are still
        written to the arrays but subtracted from the counts.

        Args:
            infer: 1-D inference tag array, modified in place.
            label: 1-D label tag array, modified in place.
            starts: sorted list of sequence boundaries.

        Returns:
            Tuple ``(num_correct_chunks, num_infer_chunks, num_label_chunks)``
            after exclusion; the same values are stored on ``self``.
        """
        chunks = self.rand_chunks(
            starts,
            self.num_infer_chunks
            + self.num_label_chunks
            - self.num_correct_chunks,
        )
        chunk_ids = list(range(len(chunks)))
        correct_chunks = np.random.choice(
            chunk_ids, self.num_correct_chunks, replace=False
        )
        correct_ids = set(correct_chunks.tolist())
        infer_only = np.random.choice(
            [x for x in chunk_ids if x not in correct_ids],
            self.num_infer_chunks - self.num_correct_chunks,
            replace=False,
        )
        infer_chunks = sorted(correct_chunks.tolist() + infer_only.tolist())
        infer_ids = set(infer_chunks)
        # Label-only chunks come from what neither "correct" nor "infer" took.
        label_only = np.random.choice(
            [x for x in chunk_ids if x not in infer_ids],
            self.num_label_chunks - self.num_correct_chunks,
            replace=False,
        )
        label_chunks = sorted(correct_chunks.tolist() + label_only.tolist())
        self.fill_with_chunks(infer, [chunks[idx] for idx in infer_chunks])
        self.fill_with_chunks(label, [chunks[idx] for idx in label_chunks])
        # exclude types in excluded_chunk_types
        if len(self.excluded_chunk_types) > 0:

            def count_excluded(ids):
                return sum(
                    chunks[idx].chunk_type in self.excluded_chunk_types
                    for idx in ids
                )

            self.num_correct_chunks -= count_excluded(correct_chunks)
            self.num_infer_chunks -= count_excluded(infer_chunks)
            self.num_label_chunks -= count_excluded(label_chunks)
        return (
            self.num_correct_chunks,
            self.num_infer_chunks,
            self.num_label_chunks,
        )

    def set_confs(self):
        """Configure the scheme, op attributes and target chunk counts."""
        # Use the IOB scheme and labels with 2 chunk types
        self.scheme = 'IOB'
        self.num_chunk_types = 2
        self.excluded_chunk_types = []
        self.other_chunk_type = self.num_chunk_types
        self.attrs = {
            'num_chunk_types': self.num_chunk_types,
            'chunk_scheme': self.scheme,
            'excluded_chunk_types': self.excluded_chunk_types,
        }
        self.parse_scheme()
        (
            self.num_correct_chunks,
            self.num_infer_chunks,
            self.num_label_chunks,
        ) = (4, 5, 9)

    def set_data(self):
        """Generate random inputs and the matching expected outputs."""
        infer = np.zeros((self.batch_size,)).astype('int64')
        # Initial value of every position; it is one above the largest tag
        # fill_with_chunks can write (presumably the "outside" tag -- confirm
        # against the op).
        infer.fill(self.num_chunk_types * self.num_tag_types)
        label = np.copy(infer)
        # Random interior sequence boundaries plus the two outer ones.
        starts = np.random.choice(
            list(range(1, self.batch_size)),
            self.num_sequences - 1,
            replace=False,
        ).tolist()
        starts.extend([0, self.batch_size])
        starts = sorted(starts)
        (
            self.num_correct_chunks,
            self.num_infer_chunks,
            self.num_label_chunks,
        ) = self.gen_chunks(infer, label, starts)
        # Length of each sequence.
        lod = [seq_end - seq_begin for seq_begin, seq_end in zip(starts, starts[1:])]
        self.set_input(infer, label, lod)
        # Each metric falls back to 0 when its denominator would be 0.
        precision = (
            float(self.num_correct_chunks) / self.num_infer_chunks
            if self.num_infer_chunks
            else 0
        )
        recall = (
            float(self.num_correct_chunks) / self.num_label_chunks
            if self.num_label_chunks
            else 0
        )
        f1 = (
            float(2 * precision * recall) / (precision + recall)
            if self.num_correct_chunks
            else 0
        )
        self.outputs = {
            'Precision': np.asarray([precision], dtype='float32'),
            'Recall': np.asarray([recall], dtype='float32'),
            'F1-Score': np.asarray([f1], dtype='float32'),
            'NumInferChunks': np.asarray(
                [self.num_infer_chunks], dtype='int64'
            ),
            'NumLabelChunks': np.asarray(
                [self.num_label_chunks], dtype='int64'
            ),
            'NumCorrectChunks': np.asarray(
                [self.num_correct_chunks], dtype='int64'
            ),
        }

    def set_input(self, infer, label, lod):
        """Store the flat tag arrays, each paired with the sequence lengths."""
        self.inputs = {'Inference': (infer, [lod]), 'Label': (label, [lod])}

    def setUp(self):
        """Select the op under test and build its configuration and data."""
        self.op_type = 'chunk_eval'
        self.set_confs()
        self.set_data()

    def test_check_output(self):
        """Compare the op's outputs with the expected values."""
        self.check_output()


class TestChunkEvalOpWithExclude(TestChunkEvalOp):
    """Variant of the base test with the IOE scheme and an excluded type."""

    def set_confs(self):
        """Configure the IOE scheme, 3 chunk types and exclude type 1."""
        # Use the IOE scheme and labels with 3 chunk types
        self.scheme = 'IOE'
        self.num_chunk_types = 3
        self.excluded_chunk_types = [1]
        self.other_chunk_type = self.num_chunk_types
        self.attrs = {
            'num_chunk_types': self.num_chunk_types,
            'chunk_scheme': self.scheme,
            'excluded_chunk_types': self.excluded_chunk_types,
        }
        self.parse_scheme()
        # Target counts before chunks of the excluded type are subtracted.
        (
            self.num_correct_chunks,
            self.num_infer_chunks,
            self.num_label_chunks,
        ) = (15, 18, 20)
G
guosheng 已提交
248 249


250 251 252 253 254 255 256 257 258
class TestChunkEvalOpWithTensorInput(TestChunkEvalOp):
    """Variant that feeds padded arrays plus an explicit ``SeqLength`` input."""

    def set_input(self, infer, label, lod):
        """Store the inputs as padded arrays with per-sequence lengths.

        Args:
            infer: flat 1-D inference tag array.
            label: flat 1-D label tag array.
            lod: list with the length of each sequence.

        ``self.inputs`` receives 'Inference' and 'Label' with shape
        ``(len(lod), max(lod), 1)``, padded with -1 after each sequence, and
        'SeqLength' holding ``lod`` as an int64 array.
        """
        max_len = np.max(lod)

        def pad_sequences(flat):
            # Cut `flat` into its sequences and right-pad each to max_len.
            rows = []
            start = 0
            for seq_len in lod:
                end = seq_len + start
                rows.append(
                    np.pad(
                        flat[start:end],
                        (0, max_len - seq_len),
                        'constant',
                        constant_values=(-1,),
                    )
                )
                start = end
            # Add a trailing axis of size 1.
            return np.expand_dims(np.array(rows, dtype='int64'), 2)

        pad_infer = pad_sequences(infer)
        pad_label = pad_sequences(label)
        lod = np.array(lod, dtype='int64')
        self.inputs = {
            'Inference': pad_infer,
            'Label': pad_label,
            'SeqLength': lod,
        }


286 287 288 289 290 291 292
class TestChunkEvalOpError(unittest.TestCase):
    """Checks that ``fluid.layers.chunk_eval`` rejects invalid arguments."""

    def test_errors(self):
        """Each invalid call below is expected to raise ``TypeError``."""
        with program_guard(Program(), Program()):

            def test_input():
                # `input` is a numpy array instead of a fluid variable.
                # The shape must be a single tuple: np.random.random takes
                # only one `size` argument, so `random(1, 1)` would raise
                # TypeError itself and chunk_eval would never be called.
                input_data = np.random.random((1, 1)).astype("int64")
                label_data = np.random.random(1).astype("int64")
                fluid.layers.chunk_eval(
                    input=input_data,
                    label=label_data,
                    chunk_scheme="IOB",
                    num_chunk_types=3,
                )

            self.assertRaises(TypeError, test_input)

            def test_label():
                # `label` is a numpy array instead of a fluid variable.
                input_ = fluid.data(
                    name="input", shape=[None, 1], dtype="int64"
                )
                label_data = np.random.random(1).astype("int64")
                fluid.layers.chunk_eval(
                    input=input_,
                    label=label_data,
                    chunk_scheme="IOB",
                    num_chunk_types=3,
                )

            self.assertRaises(TypeError, test_label)

            def test_type():
                # `input` is declared as int32 while `label` is int64.
                in_data = fluid.data(
                    name="input_", shape=[None, 1], dtype="int32"
                )
                label = fluid.data(name="label_", shape=[1], dtype="int64")
                fluid.layers.chunk_eval(
                    input=in_data,
                    label=label,
                    chunk_scheme="IOB",
                    num_chunk_types=3,
                )

            self.assertRaises(TypeError, test_type)


G
guosheng 已提交
331 332
# Run the test cases of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()