test_lac.py 22.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
16 17
import os
import tempfile
18 19 20
import time
import unittest

21
import numpy as np
22

23 24
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

L
Leo Chen 已提交
25
import paddle
26
from paddle import _legacy_C_ops, fluid
27
from paddle.fluid.dygraph import to_variable
28
from paddle.fluid.framework import _non_static_mode
H
hjyp 已提交
29
from paddle.jit.api import to_static
30
from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
31 32 33

SEED = 2020

34 35 36 37
# Add InputSpec to make unittest run faster.
input_specs = [
    paddle.static.InputSpec([None, None], 'int64'),
    paddle.static.InputSpec([None, None], 'int64'),
38
    paddle.static.InputSpec([None], 'int64'),
39
]
40 41


42
class DynamicGRU(paddle.nn.Layer):
43 44 45 46 47 48 49 50 51 52 53 54
    def __init__(
        self,
        size,
        h_0=None,
        param_attr=None,
        bias_attr=None,
        is_reverse=False,
        gate_activation='sigmoid',
        candidate_activation='tanh',
        origin_mode=False,
        init_size=None,
    ):
55
        super().__init__()
56

W
wangzhen38 已提交
57
        self.gru_unit = paddle.nn.GRUCell(
58
            size * 3,
W
wangzhen38 已提交
59
            size,
60
        )
61 62 63 64 65 66 67 68 69 70 71 72 73 74

        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        # Use `to_variable` to create a copy of global h_0 created not in `DynamicGRU`,
        # to avoid modify it because `h_0` is both used in other `DynamicGRU`.
        hidden = to_variable(self.h_0)
        hidden.stop_gradient = True

        res = []
        for i in range(inputs.shape[1]):
            if self.is_reverse:
2
201716010711 已提交
75
                j = paddle.shape(inputs)[1] - 1 - i
76
            else:
77 78 79
                j = i

            # input_ = inputs[:, j:j+1, :]  # original code
2
201716010711 已提交
80
            input_ = paddle.slice(inputs, axes=[1], starts=[j], ends=[j + 1])
81
            input_ = paddle.reshape(input_, [-1, input_.shape[2]])
82
            hidden, reset, gate = self.gru_unit(input_, hidden)
83
            hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]])
84 85 86 87
            res.append(hidden_)

        if self.is_reverse:
            res = res[::-1]
88
        res = paddle.concat(res, axis=1)
89 90 91
        return res


92
class BiGRU(paddle.nn.Layer):
93
    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
94
        super().__init__()
95

96 97 98 99
        self.pre_gru = paddle.nn.Linear(
            in_features=input_dim,
            out_features=grnn_hidden_dim * 3,
            weight_attr=fluid.ParamAttr(
100
                initializer=paddle.nn.initializer.Uniform(
101 102
                    low=-init_bound, high=init_bound
                ),
103
                regularizer=paddle.regularizer.L2Decay(coeff=1e-4),
104 105
            ),
        )
106 107 108 109 110

        self.gru = DynamicGRU(
            size=grnn_hidden_dim,
            h_0=h_0,
            param_attr=fluid.ParamAttr(
111
                initializer=paddle.nn.initializer.Uniform(
112 113
                    low=-init_bound, high=init_bound
                ),
114
                regularizer=paddle.regularizer.L2Decay(coeff=1e-4),
115 116
            ),
        )
117

118 119 120 121
        self.pre_gru_r = paddle.nn.Linear(
            in_features=input_dim,
            out_features=grnn_hidden_dim * 3,
            weight_attr=fluid.ParamAttr(
122
                initializer=paddle.nn.initializer.Uniform(
123 124
                    low=-init_bound, high=init_bound
                ),
125
                regularizer=paddle.regularizer.L2Decay(coeff=1e-4),
126 127
            ),
        )
128 129 130 131 132 133

        self.gru_r = DynamicGRU(
            size=grnn_hidden_dim,
            is_reverse=True,
            h_0=h_0,
            param_attr=fluid.ParamAttr(
134
                initializer=paddle.nn.initializer.Uniform(
135 136
                    low=-init_bound, high=init_bound
                ),
137
                regularizer=paddle.regularizer.L2Decay(coeff=1e-4),
138 139
            ),
        )
140 141 142 143 144 145 146 147

    def forward(self, input_feature):
        res_pre_gru = self.pre_gru(input_feature)
        res_gru = self.gru(res_pre_gru)

        res_pre_gru_r = self.pre_gru_r(input_feature)
        res_gru_r = self.gru_r(res_pre_gru_r)

148
        bi_merge = paddle.concat([res_gru, res_gru_r], axis=-1)
149 150 151
        return bi_merge


152
class LinearChainCRF(paddle.nn.Layer):
153
    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
154
        super().__init__()
155 156 157 158 159 160 161 162

        self._param_attr = param_attr
        self._dtype = dtype
        self._size = size
        self._is_test = is_test
        self._transition = self.create_parameter(
            attr=self._param_attr,
            shape=[self._size + 2, self._size],
163 164
            dtype=self._dtype,
        )
165 166 167 168 169 170 171 172 173 174

    @property
    def weight(self):
        return self._transition

    @weight.setter
    def weight(self, value):
        self._transition = value

    def forward(self, input, label, length=None):
175
        if _non_static_mode():
176
            _, _, _, log_likelihood = _legacy_C_ops.linear_chain_crf(
177 178
                input, self._transition, label, length, "is_test", self._is_test
            )
179
            return log_likelihood
180 181

        alpha = self._helper.create_variable_for_type_inference(
182 183
            dtype=self._dtype
        )
184
        emission_exps = self._helper.create_variable_for_type_inference(
185 186
            dtype=self._dtype
        )
187
        transition_exps = self._helper.create_variable_for_type_inference(
188 189
            dtype=self._dtype
        )
190
        log_likelihood = self._helper.create_variable_for_type_inference(
191 192
            dtype=self._dtype
        )
193 194 195
        this_inputs = {
            "Emission": [input],
            "Transition": self._transition,
196
            "Label": [label],
197 198 199
        }
        if length is not None:
            this_inputs['Length'] = [length]
200 201 202 203 204 205 206 207 208 209 210 211 212
        self._helper.append_op(
            type='linear_chain_crf',
            inputs=this_inputs,
            outputs={
                "Alpha": [alpha],
                "EmissionExps": [emission_exps],
                "TransitionExps": transition_exps,
                "LogLikelihood": log_likelihood,
            },
            attrs={
                "is_test": self._is_test,
            },
        )
213 214 215
        return log_likelihood


216
class CRFDecoding(paddle.nn.Layer):
217
    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
218
        super().__init__()
219 220 221 222 223 224 225 226

        self._dtype = dtype
        self._size = size
        self._is_test = is_test
        self._param_attr = param_attr
        self._transition = self.create_parameter(
            attr=self._param_attr,
            shape=[self._size + 2, self._size],
227 228
            dtype=self._dtype,
        )
229 230 231 232 233 234 235 236 237 238

    @property
    def weight(self):
        return self._transition

    @weight.setter
    def weight(self, value):
        self._transition = value

    def forward(self, input, label=None, length=None):
239
        if _non_static_mode():
240 241 242
            return _legacy_C_ops.crf_decoding(
                input, self._transition, label, length, "is_test", self._is_test
            )
243 244

        viterbi_path = self._helper.create_variable_for_type_inference(
245 246
            dtype=self._dtype
        )
247 248 249
        this_inputs = {
            "Emission": [input],
            "Transition": self._transition,
250
            "Label": label,
251 252 253
        }
        if length is not None:
            this_inputs['Length'] = [length]
254 255 256 257 258 259 260 261
        self._helper.append_op(
            type='crf_decoding',
            inputs=this_inputs,
            outputs={"ViterbiPath": [viterbi_path]},
            attrs={
                "is_test": self._is_test,
            },
        )
262 263 264
        return viterbi_path


265
class ChunkEval(paddle.nn.Layer):
266 267 268
    def __init__(
        self, num_chunk_types, chunk_scheme, excluded_chunk_types=None
    ):
269
        super().__init__()
270 271 272 273 274
        self.num_chunk_types = num_chunk_types
        self.chunk_scheme = chunk_scheme
        self.excluded_chunk_types = excluded_chunk_types

    def forward(self, input, label, seq_length=None):
275
        if _non_static_mode():
276 277 278 279 280 281 282 283 284 285 286
            return _legacy_C_ops.chunk_eval(
                input,
                label,
                seq_length,
                "num_chunk_types",
                self.num_chunk_types,
                "chunk_scheme",
                self.chunk_scheme,
                "excluded_chunk_types",
                self.excluded_chunk_types or [],
            )
287 288

        precision = self._helper.create_variable_for_type_inference(
289 290
            dtype="float32"
        )
291
        recall = self._helper.create_variable_for_type_inference(
292 293
            dtype="float32"
        )
294
        f1_score = self._helper.create_variable_for_type_inference(
295 296
            dtype="float32"
        )
297
        num_infer_chunks = self._helper.create_variable_for_type_inference(
298 299
            dtype="int64"
        )
300
        num_label_chunks = self._helper.create_variable_for_type_inference(
301 302
            dtype="int64"
        )
303
        num_correct_chunks = self._helper.create_variable_for_type_inference(
304 305
            dtype="int64"
        )
306 307 308 309 310

        this_input = {"Inference": [input], "Label": [label]}
        if seq_length is not None:
            this_input["SeqLength"] = [seq_length]

311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
        self._helper.append_op(
            type='chunk_eval',
            inputs=this_input,
            outputs={
                "Precision": [precision],
                "Recall": [recall],
                "F1-Score": [f1_score],
                "NumInferChunks": [num_infer_chunks],
                "NumLabelChunks": [num_label_chunks],
                "NumCorrectChunks": [num_correct_chunks],
            },
            attrs={
                "num_chunk_types": self.num_chunk_types,
                "chunk_scheme": self.chunk_scheme,
                "excluded_chunk_types": self.excluded_chunk_types or [],
            },
        )
        return (
            precision,
            recall,
            f1_score,
            num_infer_chunks,
            num_label_chunks,
            num_correct_chunks,
        )
336 337


338
class LexNet(paddle.nn.Layer):
339
    def __init__(self, args, length=None):
340
        super().__init__()
341 342 343 344 345 346 347 348 349 350 351 352 353
        """
        define the lexical analysis network structure
        word: stores the input of the model
        for_infer: a boolean value, indicating if the model to be created is for training or predicting.

        return:
            for infer: return the prediction
            otherwise: return the prediction
        """
        self.word_emb_dim = args.word_emb_dim
        self.vocab_size = args.vocab_size
        self.num_labels = args.num_labels
        self.grnn_hidden_dim = args.grnn_hidden_dim
354 355 356 357 358 359
        self.emb_lr = (
            args.emb_learning_rate if 'emb_learning_rate' in dir(args) else 1.0
        )
        self.crf_lr = (
            args.emb_learning_rate if 'crf_learning_rate' in dir(args) else 1.0
        )
360 361 362
        self.bigru_num = args.bigru_num
        self.init_bound = 0.1

363 364 365 366
        self.word_embedding = paddle.nn.Embedding(
            self.vocab_size,
            self.word_emb_dim,
            weight_attr=fluid.ParamAttr(
367 368
                learning_rate=self.emb_lr,
                name="word_emb",
369
                initializer=paddle.nn.initializer.Uniform(
370 371 372 373
                    low=-self.init_bound, high=self.init_bound
                ),
            ),
        )
374 375 376 377 378 379 380 381 382 383

        h_0 = np.zeros((args.batch_size, self.grnn_hidden_dim), dtype="float32")
        h_0 = to_variable(h_0)

        self.bigru_units = []
        for i in range(self.bigru_num):
            if i == 0:
                self.bigru_units.append(
                    self.add_sublayer(
                        "bigru_units%d" % i,
384 385 386 387 388 389 390 391
                        BiGRU(
                            self.grnn_hidden_dim,
                            self.grnn_hidden_dim,
                            self.init_bound,
                            h_0=h_0,
                        ),
                    )
                )
392 393 394 395
            else:
                self.bigru_units.append(
                    self.add_sublayer(
                        "bigru_units%d" % i,
396 397 398 399 400 401 402 403 404
                        BiGRU(
                            self.grnn_hidden_dim * 2,
                            self.grnn_hidden_dim,
                            self.init_bound,
                            h_0=h_0,
                        ),
                    )
                )

405 406 407 408
        self.fc = paddle.nn.Linear(
            in_features=self.grnn_hidden_dim * 2,
            out_features=self.num_labels,
            weight_attr=fluid.ParamAttr(
409
                initializer=paddle.nn.initializer.Uniform(
410 411
                    low=-self.init_bound, high=self.init_bound
                ),
412
                regularizer=paddle.regularizer.L2Decay(coeff=1e-4),
413 414 415 416 417 418 419 420 421 422 423 424 425 426
            ),
        )

        self.linear_chain_crf = LinearChainCRF(
            param_attr=fluid.ParamAttr(
                name='linear_chain_crfw', learning_rate=self.crf_lr
            ),
            size=self.num_labels,
        )

        self.crf_decoding = CRFDecoding(
            param_attr=fluid.ParamAttr(name='crfw', learning_rate=self.crf_lr),
            size=self.num_labels,
        )
427 428 429
        # share weight
        self.crf_decoding.weight = self.linear_chain_crf.weight

H
hjyp 已提交
430
    @to_static(input_spec=input_specs)
431 432 433 434 435 436 437 438 439 440 441 442 443
    def forward(self, word, target, length=None):
        """
        Configure the network
        """
        word_embed = self.word_embedding(word)
        input_feature = word_embed

        for i in range(self.bigru_num):
            bigru_output = self.bigru_units[i](input_feature)
            input_feature = bigru_output

        emission = self.fc(bigru_output)

444 445 446
        crf_cost = self.linear_chain_crf(
            input=emission, label=target, length=length
        )
447
        avg_cost = paddle.mean(x=crf_cost)
448 449 450 451
        crf_decode = self.crf_decoding(input=emission, length=length)
        return avg_cost, crf_decode


452
class Args:
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472
    epoch = 1
    batch_size = 4
    vocab_size = 100
    num_labels = 10
    word_emb_dim = 128
    grnn_hidden_dim = 128
    base_learning_rate = 0.01
    bigru_num = 2
    print_steps = 1


def get_random_input_data(batch_size, vocab_size, num_labels, max_seq_len=64):
    local_random = np.random.RandomState(SEED)
    padding_id = np.int64(0)
    iter_num = 5

    def __reader__():
        batch, init_lens = [], []
        for i in range(iter_num * batch_size):
            cur_len = local_random.randint(3, max_seq_len)
473 474 475 476 477 478 479 480 481 482
            word_ids = (
                local_random.randint(0, vocab_size, [cur_len])
                .astype('int64')
                .tolist()
            )
            label_ids = (
                local_random.randint(0, num_labels, [cur_len])
                .astype('int64')
                .tolist()
            )
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
            batch.append((word_ids, label_ids))
            init_lens.append(cur_len)
            if len(batch) == batch_size:
                batch_max_len = min(max(init_lens), max_seq_len)
                new_batch = []
                for words_len, (word_ids, label_ids) in zip(init_lens, batch):
                    word_ids = word_ids[0:batch_max_len]
                    words_len = np.int64(len(word_ids))
                    word_ids += [
                        padding_id for _ in range(batch_max_len - words_len)
                    ]
                    label_ids = label_ids[0:batch_max_len]
                    label_ids += [
                        padding_id for _ in range(batch_max_len - words_len)
                    ]
                    assert len(word_ids) == len(label_ids)
                    new_batch.append((word_ids, label_ids, words_len))
                yield new_batch
                batch, init_lens = [], []

    return __reader__


def create_dataloader(reader, place):
507 508 509
    data_loader = fluid.io.DataLoader.from_generator(
        capacity=16, use_double_buffer=True, iterable=True
    )
510 511 512 513 514 515 516 517 518

    data_loader.set_sample_list_generator(reader, places=place)

    return data_loader


class TestLACModel(unittest.TestCase):
    def setUp(self):
        self.args = Args()
519 520 521 522 523
        self.place = (
            fluid.CUDAPlace(0)
            if fluid.is_compiled_with_cuda()
            else fluid.CPUPlace()
        )
524 525 526 527 528 529 530 531
        self.temp_dir = tempfile.TemporaryDirectory()
        self.model_save_dir = os.path.join(self.temp_dir.name, 'inference')
        self.model_save_prefix = os.path.join(self.model_save_dir, 'lac')
        self.model_filename = "lac" + INFER_MODEL_SUFFIX
        self.params_filename = "lac" + INFER_PARAMS_SUFFIX
        self.dy_param_path = os.path.join(self.temp_dir.name, 'lac_dy_param')

    def train(self, args, to_static):
R
Ryan 已提交
532
        paddle.jit.enable_to_static(to_static)
533 534 535 536 537
        place = (
            fluid.CUDAPlace(0)
            if fluid.is_compiled_with_cuda()
            else fluid.CPUPlace()
        )
538 539 540 541
        with fluid.dygraph.guard(place):
            paddle.seed(SEED)
            paddle.framework.random._manual_program_seed(SEED)

542 543 544
            reader = get_random_input_data(
                args.batch_size, args.vocab_size, args.num_labels
            )
545 546 547 548 549
            train_loader = create_dataloader(reader, place)

            model = LexNet(args)
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=args.base_learning_rate,
550 551 552 553 554
                parameter_list=model.parameters(),
            )
            chunk_eval = ChunkEval(
                int(math.ceil((args.num_labels - 1) / 2.0)), "IOB"
            )
555 556 557 558 559 560 561 562 563

            step = 0

            loss_data = []
            for epoch_id in range(args.epoch):
                for batch in train_loader():
                    words, targets, length = batch
                    start_time = time.time()
                    avg_cost, crf_decode = model(words, targets, length)
564
                    loss_data.append(float(avg_cost))
565 566 567 568 569 570 571 572

                    # backward and optimization
                    avg_cost.backward()
                    optimizer.minimize(avg_cost)
                    model.clear_gradients()
                    end_time = time.time()

                    if step % args.print_steps == 0:
573 574 575 576 577 578 579 580 581 582
                        (
                            precision,
                            recall,
                            f1_score,
                            num_infer_chunks,
                            num_label_chunks,
                            num_correct_chunks,
                        ) = chunk_eval(
                            input=crf_decode, label=targets, seq_length=length
                        )
583
                        outputs = [avg_cost, precision, recall, f1_score]
584
                        avg_cost, precision, recall, f1_score = (
585
                            np.mean(x.numpy()) for x in outputs
586
                        )
587 588 589

                        print(
                            "[train] step = %d, loss = %f, P: %f, R: %f, F1: %f, elapsed time %f"
590 591 592 593 594 595 596 597 598
                            % (
                                step,
                                avg_cost,
                                precision,
                                recall,
                                f1_score,
                                end_time - start_time,
                            )
                        )
599 600 601 602

                    step += 1
            # save inference model
            if to_static:
603
                paddle.jit.save(
604 605 606
                    layer=model,
                    path=self.model_save_prefix,
                    input_spec=[input_specs[0], input_specs[-1]],
607 608
                    output_spec=[crf_decode],
                )
609
            else:
610 611
                paddle.save(
                    model.state_dict(), self.dy_param_path + '.pdparams'
612
                )
613

614
            return np.array(loss_data)
615 616

    def test_train(self):
617 618
        st_out = self.train(self.args, to_static=True)
        dy_out = self.train(self.args, to_static=False)
619 620 621 622 623
        np.testing.assert_allclose(
            dy_out,
            st_out,
            rtol=1e-05,
            err_msg='dygraph output:\n{},\nstatic output:\n {}.'.format(
624 625 626
                dy_out, st_out
            ),
        )
627
        # Prediction needs trained models, so put `test_predict` at last of `test_train`
628
        # self.verify_predict()
629 630

    def verify_predict(self):
631 632 633
        reader = get_random_input_data(
            self.args.batch_size, self.args.vocab_size, self.args.num_labels
        )
634 635 636 637
        for batch in reader():
            batch = [np.vstack(var) for var in zip(*batch)]
            dy_pre = self.predict_dygraph(batch)
            st_pre = self.predict_static(batch)
638
            dy_jit_pre = self.predict_dygraph_jit(batch)
639 640
            np.testing.assert_allclose(dy_pre, st_pre, rtol=1e-05)
            np.testing.assert_allclose(dy_jit_pre, st_pre, rtol=1e-05)
641 642 643

    def predict_dygraph(self, batch):
        words, targets, length = batch
R
Ryan 已提交
644
        paddle.jit.enable_to_static(False)
645 646 647
        with fluid.dygraph.guard(self.place):
            model = LexNet(self.args)
            # load dygraph trained parameters
648
            model_dict = paddle.load(self.dy_param_path + ".pdparams")
649 650 651
            model.set_dict(model_dict)
            model.eval()

652 653 654
            _, pred_res = model(
                to_variable(words), to_variable(targets), to_variable(length)
            )
655 656 657 658 659 660 661 662

            return pred_res.numpy()

    def predict_static(self, batch):
        """
        LAC model contains h_0 created in `__init__` that is necessary for inferring.
        Load inference model to test it's ok for prediction.
        """
663
        paddle.enable_static()
664 665
        exe = fluid.Executor(self.place)
        # load inference model
666 667 668 669 670 671 672 673 674 675
        [
            inference_program,
            feed_target_names,
            fetch_targets,
        ] = fluid.io.load_inference_model(
            self.model_save_dir,
            executor=exe,
            model_filename=self.model_filename,
            params_filename=self.params_filename,
        )
676 677

        words, targets, length = batch
678 679 680 681 682
        pred_res = exe.run(
            inference_program,
            feed={feed_target_names[0]: words, feed_target_names[1]: length},
            fetch_list=fetch_targets,
        )
683 684
        return pred_res[0]

685 686 687
    def predict_dygraph_jit(self, batch):
        words, targets, length = batch
        with fluid.dygraph.guard(self.place):
688
            model = paddle.jit.load(self.model_save_prefix)
689 690 691 692 693 694
            model.eval()

            pred_res = model(to_variable(words), to_variable(length))

            return pred_res.numpy()

695 696

if __name__ == "__main__":
697
    unittest.main()