# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import tempfile
import time
import unittest

import numpy as np

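# Pin the test to a single visible GPU; this must be set before `import paddle`
# so that it takes effect when CUDA is initialized.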
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import paddle
from paddle import _legacy_C_ops, fluid
from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import _non_static_mode
from paddle.jit.api import to_static
from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX

SEED = 2020

# Add InputSpec to make the unittest run faster.
input_specs = [
    paddle.static.InputSpec([None, None], 'int64'),
    paddle.static.InputSpec([None, None], 'int64'),
    paddle.static.InputSpec([None], 'int64'),
]
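# The three specs correspond to `word`, `target` and `length` in
# `LexNet.forward`; the `None` dims allow variable batch size and sequence length.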


class DynamicGRU(paddle.nn.Layer):
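    """A single-direction GRU that manually unrolls `GRUCell` over the time
    axis, optionally iterating the sequence in reverse."""
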
    def __init__(
        self,
        size,
        h_0=None,
        param_attr=None,
        bias_attr=None,
        is_reverse=False,
        gate_activation='sigmoid',
        candidate_activation='tanh',
        origin_mode=False,
        init_size=None,
    ):
        super().__init__()

        self.gru_unit = paddle.nn.GRUCell(
            size * 3,
            size,
        )

        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        # Use `to_variable` to make a copy of the global `h_0`, which is created
        # outside `DynamicGRU`, so that it is not modified in place: `h_0` is
        # shared by the other `DynamicGRU` instances.
        hidden = to_variable(self.h_0)
        hidden.stop_gradient = True

        res = []
        for i in range(inputs.shape[1]):
            if self.is_reverse:
                j = paddle.shape(inputs)[1] - 1 - i
            else:
                j = i

            # input_ = inputs[:, j:j+1, :]  # original code
            input_ = paddle.slice(inputs, axes=[1], starts=[j], ends=[j + 1])
            input_ = paddle.reshape(input_, [-1, input_.shape[2]])
            hidden, reset, gate = self.gru_unit(input_, hidden)
            hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]])
            res.append(hidden_)

        if self.is_reverse:
            res = res[::-1]
        res = paddle.concat(res, axis=1)
        return res


class BiGRU(paddle.nn.Layer):
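    """A bidirectional GRU: a forward and a reversed `DynamicGRU` whose
    outputs are concatenated along the feature axis."""
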
    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
        super().__init__()

        self.pre_gru = paddle.nn.Linear(
            in_features=input_dim,
            out_features=grnn_hidden_dim * 3,
            weight_attr=fluid.ParamAttr(
                initializer=paddle.nn.initializer.Uniform(
                    low=-init_bound, high=init_bound
                ),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4
                ),
            ),
        )

        self.gru = DynamicGRU(
            size=grnn_hidden_dim,
            h_0=h_0,
            param_attr=fluid.ParamAttr(
                initializer=paddle.nn.initializer.Uniform(
                    low=-init_bound, high=init_bound
                ),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4
                ),
            ),
        )

        self.pre_gru_r = paddle.nn.Linear(
            in_features=input_dim,
            out_features=grnn_hidden_dim * 3,
            weight_attr=fluid.ParamAttr(
                initializer=paddle.nn.initializer.Uniform(
                    low=-init_bound, high=init_bound
                ),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4
                ),
            ),
        )

        self.gru_r = DynamicGRU(
            size=grnn_hidden_dim,
            is_reverse=True,
            h_0=h_0,
            param_attr=fluid.ParamAttr(
                initializer=paddle.nn.initializer.Uniform(
                    low=-init_bound, high=init_bound
                ),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4
                ),
            ),
        )

    def forward(self, input_feature):
        res_pre_gru = self.pre_gru(input_feature)
        res_gru = self.gru(res_pre_gru)

        res_pre_gru_r = self.pre_gru_r(input_feature)
        res_gru_r = self.gru_r(res_pre_gru_r)

        bi_merge = paddle.concat([res_gru, res_gru_r], axis=-1)
        return bi_merge


class LinearChainCRF(paddle.nn.Layer):
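    """Linear-chain CRF loss layer returning the sequence log-likelihood:
    it calls the legacy op directly in dygraph mode and appends the
    `linear_chain_crf` op to the program in static mode."""
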
    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
        super().__init__()

        self._param_attr = param_attr
        self._dtype = dtype
        self._size = size
        self._is_test = is_test
        self._transition = self.create_parameter(
            attr=self._param_attr,
            shape=[self._size + 2, self._size],
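            # The two extra rows hold the start and end transition weights.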
            dtype=self._dtype,
        )

    @property
    def weight(self):
        return self._transition

    @weight.setter
    def weight(self, value):
        self._transition = value

    def forward(self, input, label, length=None):
        if _non_static_mode():
            _, _, _, log_likelihood = _legacy_C_ops.linear_chain_crf(
                input, self._transition, label, length, "is_test", self._is_test
            )
            return log_likelihood

        alpha = self._helper.create_variable_for_type_inference(
            dtype=self._dtype
        )
        emission_exps = self._helper.create_variable_for_type_inference(
            dtype=self._dtype
        )
        transition_exps = self._helper.create_variable_for_type_inference(
            dtype=self._dtype
        )
        log_likelihood = self._helper.create_variable_for_type_inference(
            dtype=self._dtype
        )
        this_inputs = {
            "Emission": [input],
            "Transition": self._transition,
            "Label": [label],
        }
        if length is not None:
            this_inputs['Length'] = [length]
        self._helper.append_op(
            type='linear_chain_crf',
            inputs=this_inputs,
            outputs={
                "Alpha": [alpha],
                "EmissionExps": [emission_exps],
                "TransitionExps": transition_exps,
                "LogLikelihood": log_likelihood,
            },
            attrs={
                "is_test": self._is_test,
            },
        )
        return log_likelihood


class CRFDecoding(paddle.nn.Layer):
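    """Viterbi decoding layer backed by the `crf_decoding` op; returns the
    most likely tag path for each sequence."""
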
    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
        super().__init__()

        self._dtype = dtype
        self._size = size
        self._is_test = is_test
        self._param_attr = param_attr
        self._transition = self.create_parameter(
            attr=self._param_attr,
            shape=[self._size + 2, self._size],
            dtype=self._dtype,
        )

    @property
    def weight(self):
        return self._transition

    @weight.setter
    def weight(self, value):
        self._transition = value

    def forward(self, input, label=None, length=None):
        if _non_static_mode():
            return _legacy_C_ops.crf_decoding(
                input, self._transition, label, length, "is_test", self._is_test
            )

        viterbi_path = self._helper.create_variable_for_type_inference(
            dtype=self._dtype
        )
        this_inputs = {
            "Emission": [input],
            "Transition": self._transition,
            "Label": label,
        }
        if length is not None:
            this_inputs['Length'] = [length]
        self._helper.append_op(
            type='crf_decoding',
            inputs=this_inputs,
            outputs={"ViterbiPath": [viterbi_path]},
            attrs={
                "is_test": self._is_test,
            },
        )
        return viterbi_path


class ChunkEval(paddle.nn.Layer):
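    """Chunk-level precision/recall/F1 evaluation backed by the `chunk_eval`
    op (used here with the IOB tagging scheme)."""
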
    def __init__(
        self, num_chunk_types, chunk_scheme, excluded_chunk_types=None
    ):
        super().__init__()
        self.num_chunk_types = num_chunk_types
        self.chunk_scheme = chunk_scheme
        self.excluded_chunk_types = excluded_chunk_types

    def forward(self, input, label, seq_length=None):
        if _non_static_mode():
            return _legacy_C_ops.chunk_eval(
                input,
                label,
                seq_length,
                "num_chunk_types",
                self.num_chunk_types,
                "chunk_scheme",
                self.chunk_scheme,
                "excluded_chunk_types",
                self.excluded_chunk_types or [],
            )

        precision = self._helper.create_variable_for_type_inference(
            dtype="float32"
        )
        recall = self._helper.create_variable_for_type_inference(
            dtype="float32"
        )
        f1_score = self._helper.create_variable_for_type_inference(
            dtype="float32"
        )
        num_infer_chunks = self._helper.create_variable_for_type_inference(
            dtype="int64"
        )
        num_label_chunks = self._helper.create_variable_for_type_inference(
            dtype="int64"
        )
        num_correct_chunks = self._helper.create_variable_for_type_inference(
            dtype="int64"
        )

        this_input = {"Inference": [input], "Label": [label]}
        if seq_length is not None:
            this_input["SeqLength"] = [seq_length]

        self._helper.append_op(
            type='chunk_eval',
            inputs=this_input,
            outputs={
                "Precision": [precision],
                "Recall": [recall],
                "F1-Score": [f1_score],
                "NumInferChunks": [num_infer_chunks],
                "NumLabelChunks": [num_label_chunks],
                "NumCorrectChunks": [num_correct_chunks],
            },
            attrs={
                "num_chunk_types": self.num_chunk_types,
                "chunk_scheme": self.chunk_scheme,
                "excluded_chunk_types": self.excluded_chunk_types or [],
            },
        )
        return (
            precision,
            recall,
            f1_score,
            num_infer_chunks,
            num_label_chunks,
            num_correct_chunks,
        )


class LexNet(paddle.nn.Layer):
    def __init__(self, args, length=None):
        super().__init__()
        """
        define the lexical analysis network structure
        word: stores the input of the model
        for_infer: a boolean value, indicating if the model to be created is for training or predicting.

        return:
            for infer: return the prediction
            otherwise: return the prediction
        """
        self.word_emb_dim = args.word_emb_dim
        self.vocab_size = args.vocab_size
        self.num_labels = args.num_labels
        self.grnn_hidden_dim = args.grnn_hidden_dim
        self.emb_lr = (
            args.emb_learning_rate if 'emb_learning_rate' in dir(args) else 1.0
        )
        self.crf_lr = (
            args.crf_learning_rate if 'crf_learning_rate' in dir(args) else 1.0
        )
        self.bigru_num = args.bigru_num
        self.init_bound = 0.1

        self.word_embedding = paddle.nn.Embedding(
            self.vocab_size,
            self.word_emb_dim,
            weight_attr=fluid.ParamAttr(
                learning_rate=self.emb_lr,
                name="word_emb",
                initializer=paddle.nn.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound
                ),
            ),
        )

        h_0 = np.zeros((args.batch_size, self.grnn_hidden_dim), dtype="float32")
        h_0 = to_variable(h_0)
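        # A zero initial hidden state shared by all BiGRU layers; DynamicGRU copies it before use.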

        self.bigru_units = []
        for i in range(self.bigru_num):
            if i == 0:
                self.bigru_units.append(
                    self.add_sublayer(
                        "bigru_units%d" % i,
                        BiGRU(
                            self.grnn_hidden_dim,
                            self.grnn_hidden_dim,
                            self.init_bound,
                            h_0=h_0,
                        ),
                    )
                )
            else:
                self.bigru_units.append(
                    self.add_sublayer(
                        "bigru_units%d" % i,
                        BiGRU(
                            self.grnn_hidden_dim * 2,
                            self.grnn_hidden_dim,
                            self.init_bound,
                            h_0=h_0,
                        ),
                    )
                )

        self.fc = paddle.nn.Linear(
            in_features=self.grnn_hidden_dim * 2,
            out_features=self.num_labels,
            weight_attr=fluid.ParamAttr(
                initializer=paddle.nn.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound
                ),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4
                ),
            ),
        )

        self.linear_chain_crf = LinearChainCRF(
            param_attr=fluid.ParamAttr(
                name='linear_chain_crfw', learning_rate=self.crf_lr
            ),
            size=self.num_labels,
        )

        self.crf_decoding = CRFDecoding(
            param_attr=fluid.ParamAttr(name='crfw', learning_rate=self.crf_lr),
            size=self.num_labels,
        )
        # Share the transition weights between the CRF loss and the decoder.
        self.crf_decoding.weight = self.linear_chain_crf.weight

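    # `input_spec` fixes the input signature so program translation happens only once.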
    @to_static(input_spec=input_specs)
    def forward(self, word, target, length=None):
        """
        Run the network: return the average CRF cost and the decoded tag path.
        """
        word_embed = self.word_embedding(word)
        input_feature = word_embed

        for i in range(self.bigru_num):
            bigru_output = self.bigru_units[i](input_feature)
            input_feature = bigru_output

        emission = self.fc(bigru_output)

        crf_cost = self.linear_chain_crf(
            input=emission, label=target, length=length
        )
        avg_cost = paddle.mean(x=crf_cost)
        crf_decode = self.crf_decoding(input=emission, length=length)
        return avg_cost, crf_decode


class Args:
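    # Hyper-parameters, kept small so the unittest runs quickly.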
    epoch = 1
    batch_size = 4
    vocab_size = 100
    num_labels = 10
    word_emb_dim = 128
    grnn_hidden_dim = 128
    base_learning_rate = 0.01
    bigru_num = 2
    print_steps = 1


def get_random_input_data(batch_size, vocab_size, num_labels, max_seq_len=64):
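    """Build a reader that yields `iter_num` batches of
    (word_ids, label_ids, seq_len), right-padded to the longest sequence in
    each batch (capped at `max_seq_len`)."""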
    local_random = np.random.RandomState(SEED)
    padding_id = np.int64(0)
    iter_num = 5

    def __reader__():
        batch, init_lens = [], []
        for i in range(iter_num * batch_size):
            cur_len = local_random.randint(3, max_seq_len)
            word_ids = (
                local_random.randint(0, vocab_size, [cur_len])
                .astype('int64')
                .tolist()
            )
            label_ids = (
                local_random.randint(0, num_labels, [cur_len])
                .astype('int64')
                .tolist()
            )
            batch.append((word_ids, label_ids))
            init_lens.append(cur_len)
            if len(batch) == batch_size:
                batch_max_len = min(max(init_lens), max_seq_len)
                new_batch = []
                for words_len, (word_ids, label_ids) in zip(init_lens, batch):
                    word_ids = word_ids[0:batch_max_len]
                    words_len = np.int64(len(word_ids))
                    word_ids += [
                        padding_id for _ in range(batch_max_len - words_len)
                    ]
                    label_ids = label_ids[0:batch_max_len]
                    label_ids += [
                        padding_id for _ in range(batch_max_len - words_len)
                    ]
                    assert len(word_ids) == len(label_ids)
                    new_batch.append((word_ids, label_ids, words_len))
                yield new_batch
                batch, init_lens = [], []

    return __reader__


def create_dataloader(reader, place):
    data_loader = fluid.io.DataLoader.from_generator(
        capacity=16, use_double_buffer=True, iterable=True
    )

    data_loader.set_sample_list_generator(reader, places=place)

    return data_loader


class TestLACModel(unittest.TestCase):
    def setUp(self):
        self.args = Args()
        self.place = (
            fluid.CUDAPlace(0)
            if fluid.is_compiled_with_cuda()
            else fluid.CPUPlace()
        )
        self.temp_dir = tempfile.TemporaryDirectory()
        self.model_save_dir = os.path.join(self.temp_dir.name, 'inference')
        self.model_save_prefix = os.path.join(self.model_save_dir, 'lac')
        self.model_filename = "lac" + INFER_MODEL_SUFFIX
        self.params_filename = "lac" + INFER_PARAMS_SUFFIX
        self.dy_param_path = os.path.join(self.temp_dir.name, 'lac_dy_param')

    def train(self, args, to_static):
        paddle.jit.enable_to_static(to_static)
        place = (
            fluid.CUDAPlace(0)
            if fluid.is_compiled_with_cuda()
            else fluid.CPUPlace()
        )
        with fluid.dygraph.guard(place):
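            # Seed both dygraph and the translated program so the two modes are comparable.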
            paddle.seed(SEED)
            paddle.framework.random._manual_program_seed(SEED)

            reader = get_random_input_data(
                args.batch_size, args.vocab_size, args.num_labels
            )
            train_loader = create_dataloader(reader, place)

            model = LexNet(args)
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=args.base_learning_rate,
                parameter_list=model.parameters(),
            )
            chunk_eval = ChunkEval(
                int(math.ceil((args.num_labels - 1) / 2.0)), "IOB"
            )

            step = 0
            chunk_evaluator = fluid.metrics.ChunkEvaluator()
            chunk_evaluator.reset()

            loss_data = []
            for epoch_id in range(args.epoch):
                for batch in train_loader():
                    words, targets, length = batch
                    start_time = time.time()
                    avg_cost, crf_decode = model(words, targets, length)
                    loss_data.append(avg_cost.numpy()[0])

                    # backward and optimization
                    avg_cost.backward()
                    optimizer.minimize(avg_cost)
                    model.clear_gradients()
                    end_time = time.time()

                    if step % args.print_steps == 0:
                        (
                            precision,
                            recall,
                            f1_score,
                            num_infer_chunks,
                            num_label_chunks,
                            num_correct_chunks,
                        ) = chunk_eval(
                            input=crf_decode, label=targets, seq_length=length
                        )
                        outputs = [avg_cost, precision, recall, f1_score]
                        avg_cost, precision, recall, f1_score = (
                            np.mean(x.numpy()) for x in outputs
                        )

                        print(
                            "[train] step = %d, loss = %f, P: %f, R: %f, F1: %f, elapsed time %f"
                            % (
                                step,
                                avg_cost,
                                precision,
                                recall,
                                f1_score,
                                end_time - start_time,
                            )
                        )

                    step += 1
            # save inference model
            if to_static:
                paddle.jit.save(
                    layer=model,
                    path=self.model_save_prefix,
                    input_spec=[input_specs[0], input_specs[-1]],
                    output_spec=[crf_decode],
                )
            else:
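                # Pure dygraph has no Program to export, so only the parameters are saved.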
                paddle.save(
                    model.state_dict(), self.dy_param_path + '.pdparams'
                )

            return np.array(loss_data)

    def test_train(self):
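        """Train once with to_static enabled and once in pure dygraph; the
        per-step losses must match within tolerance."""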
        st_out = self.train(self.args, to_static=True)
        dy_out = self.train(self.args, to_static=False)
        np.testing.assert_allclose(
            dy_out,
            st_out,
            rtol=1e-05,
            err_msg='dygraph output:\n{},\nstatic output:\n {}.'.format(
                dy_out, st_out
            ),
        )
        # Prediction needs a trained model, so `verify_predict` is placed at the end of `test_train`.
        # self.verify_predict()

    def verify_predict(self):
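        """Check that dygraph, loaded-inference-model and jit-loaded predictions agree."""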
        reader = get_random_input_data(
            self.args.batch_size, self.args.vocab_size, self.args.num_labels
        )
        for batch in reader():
            batch = [np.vstack(var) for var in zip(*batch)]
            dy_pre = self.predict_dygraph(batch)
            st_pre = self.predict_static(batch)
            dy_jit_pre = self.predict_dygraph_jit(batch)
            np.testing.assert_allclose(dy_pre, st_pre, rtol=1e-05)
            np.testing.assert_allclose(dy_jit_pre, st_pre, rtol=1e-05)

    def predict_dygraph(self, batch):
        words, targets, length = batch
        paddle.jit.enable_to_static(False)
        with fluid.dygraph.guard(self.place):
            model = LexNet(self.args)
            # load dygraph trained parameters
            model_dict = paddle.load(self.dy_param_path + ".pdparams")
            model.set_dict(model_dict)
            model.eval()

            _, pred_res = model(
                to_variable(words), to_variable(targets), to_variable(length)
            )

            return pred_res.numpy()

    def predict_static(self, batch):
        """
        The LAC model contains `h_0`, created in `__init__`, which is required for inference.
        Load the saved inference model and check that it works for prediction.
        """
        paddle.enable_static()
        exe = fluid.Executor(self.place)
        # load inference model
        [
            inference_program,
            feed_target_names,
            fetch_targets,
        ] = fluid.io.load_inference_model(
            self.model_save_dir,
            executor=exe,
            model_filename=self.model_filename,
            params_filename=self.params_filename,
        )

        words, targets, length = batch
        pred_res = exe.run(
            inference_program,
            feed={feed_target_names[0]: words, feed_target_names[1]: length},
            fetch_list=fetch_targets,
        )
        return pred_res[0]

    def predict_dygraph_jit(self, batch):
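        """Load the model saved by `paddle.jit.save` and run it in dygraph mode."""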
        words, targets, length = batch
        with fluid.dygraph.guard(self.place):
            model = paddle.jit.load(self.model_save_prefix)
            model.eval()

            pred_res = model(to_variable(words), to_variable(length))

            return pred_res.numpy()


if __name__ == "__main__":
    unittest.main()