test_roberta_large.py 3.1 KB
Newer Older
W
wuzewu 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from unittest import TestCase, main
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import numpy as np
import paddlehub as hub


class RoBERTaChnLargeTestCase(TestCase):
    """Unit tests for the `chinese-roberta-wwm-ext-large` PaddleHub module.

    Covers embedding extraction (shape, batch-size invariance, CPU/GPU
    consistency) and the module's vocabulary/parameter-layer accessors.
    """

    def setUp(self):
        # Load the pretrained module fresh for every test. The fixture holds
        # two samples: a text pair and a single text.
        self.module = hub.Module(name='chinese-roberta-wwm-ext-large')
        self.test_text = [[
            '飞桨(PaddlePaddle)是国内开源产业级深度学习平台', 'PaddleHub是飞桨生态的预训练模型应用工具'
        ], ["飞浆PaddleHub"]]

    def test_get_embedding(self):
        """Embeddings must have the expected shapes, be independent of
        batch_size, and agree between CPU and GPU execution."""
        # test batch_size
        results = self.module.get_embedding(
            texts=self.test_text, use_gpu=False, batch_size=1)
        results_2 = self.module.get_embedding(
            texts=self.test_text, use_gpu=False, batch_size=10)
        # 2 sample results
        self.assertEqual(len(results), 2)
        self.assertEqual(len(results_2), 2)
        # sequence embedding and token embedding results per sample
        self.assertEqual(len(results[0]), 2)
        self.assertEqual(len(results_2[0]), 2)
        # sequence embedding shape (hidden size of the large model is 1024)
        self.assertEqual(results[0][0].shape, (1024, ))
        self.assertEqual(results_2[0][0].shape, (1024, ))
        # token embedding shape, max_seq_len is 512
        self.assertEqual(results[0][1].shape, (512, 1024))
        self.assertEqual(results_2[0][1].shape, (512, 1024))

        # test gpu: GPU embeddings must match CPU embeddings element-wise.
        # BUG FIX: the original asserted `(diff < 1e-6).all` — a bound method
        # object, which is always truthy — so the comparison never ran.
        # It must be *called*: `.all()`.
        results_3 = self.module.get_embedding(
            texts=self.test_text, use_gpu=True, batch_size=1)
        for sample_idx in range(2):
            for part_idx in range(2):  # 0: sequence emb, 1: token emb
                diff = np.abs(results[sample_idx][part_idx] -
                              results_3[sample_idx][part_idx])
                self.assertTrue((diff < 1e-6).all())

    def test_get_params_layer(self):
        """The large model exposes parameters grouped into 24 layers (0-23)."""
        self.module.context()
        layers = self.module.get_params_layer()
        layers = list(set(layers.values()))
        true_layers = list(range(24))
        self.assertEqual(layers, true_layers)

    def test_get_spm_path(self):
        """RoBERTa uses a word-piece vocab, so there is no sentencepiece model."""
        self.assertEqual(self.module.get_spm_path(), None)

    def test_get_word_dict_path(self):
        """No separate word dict is shipped with this module."""
        self.assertEqual(self.module.get_word_dict_path(), None)

    def test_get_vocab_path(self):
        """Vocab file must live at <module_dir>/assets/vocab.txt."""
        vocab_path = self.module.get_vocab_path()
        true_vocab_path = os.path.join(self.module.directory, "assets",
                                       "vocab.txt")
        self.assertEqual(vocab_path, true_vocab_path)


# Run the test suite when this file is executed as a script.
if __name__ == '__main__':
    main()