# test_faster_tokenizer_op.py

#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import io
import os
import unittest

import numpy as np
import paddle
import paddle.nn as nn
from paddle.dataset.common import DATA_HOME
from paddle.fluid.framework import core, _non_static_mode, _test_eager_guard
from paddle.fluid.layer_helper import LayerHelper
from paddle import _C_ops

import sys
import tempfile

sys.path.append("./tokenizer")
from tokenizer.bert_tokenizer import BertTokenizer


def to_string_tensor(string_values, name):
    """
    Build a tensor whose value is a list of strings.
    NOTICE: The value is held in the CPU place.

    Args:
        string_values(list[string]): The strings to store in the tensor.
        name(string): The name of the tensor.

    Returns:
        The created tensor, with ``string_values`` set as its string list.
    """
    var_type = core.VarDesc.VarType
    string_tensor = paddle.Tensor(var_type.STRING, [], name, var_type.STRINGS,
                                  False)
    string_tensor.value().set_string_list(string_values)
    return string_tensor


def to_map_tensor(string_dict, name):
    """
    Build a tensor whose value is a map from string keys to int values.
    NOTICE: The value is held in the CPU place.

    Args:
        string_dict(dict): The mapping to store in the tensor.
        name(string): The name of the tensor.

    Returns:
        The created tensor, with ``string_dict`` set as its vocab.
    """
    var_type = core.VarDesc.VarType
    vocab_tensor = paddle.Tensor(var_type.RAW, [], name, var_type.VOCAB, True)
    vocab_tensor.value().set_vocab(string_dict)
    return vocab_tensor


class FasterTokenizer(nn.Layer):
    """Layer that invokes the ``faster_tokenizer`` operator.

    The vocabulary passed to the constructor is converted with
    ``to_map_tensor`` and registered as a persistable buffer named ``vocab``.
    """

    def __init__(self, vocab_dict):
        super(FasterTokenizer, self).__init__()
        self.register_buffer("vocab",
                             to_map_tensor(vocab_dict, "vocab"),
                             persistable=True)

    def forward(self,
                text,
                text_pair=None,
                do_lower_case=True,
                max_seq_len=-1,
                is_split_into_words=False,
                pad_to_max_seq_len=False):
        """Tokenize ``text`` (and optionally ``text_pair``) with the op.

        Args:
            text: Input forwarded to the op as ``Text``.
            text_pair: Optional input forwarded as ``TextPair``; left out of
                the static-graph op inputs when it is None.
            do_lower_case: Forwarded as the op attribute of the same name.
            max_seq_len: Forwarded as the op attribute of the same name.
            is_split_into_words: Forwarded as the op attribute of the same
                name.
            pad_to_max_seq_len: Forwarded as the op attribute of the same
                name.

        Returns:
            A pair ``(input_ids, seg_ids)`` produced by the op.
        """
        # Dynamic (non-static) mode: call the C++ op directly.
        if _non_static_mode():
            ids, segments = _C_ops.faster_tokenizer(
                self.vocab, text, text_pair, "do_lower_case", do_lower_case,
                "max_seq_len", max_seq_len, "pad_to_max_seq_len",
                pad_to_max_seq_len, "is_split_into_words", is_split_into_words)
            return ids, segments

        # Static mode: append the op to the program being built.
        helper = LayerHelper("faster_tokenizer")
        input_ids = helper.create_variable_for_type_inference(dtype="int64")
        seg_ids = helper.create_variable_for_type_inference(dtype="int64")

        op_inputs = {'Vocab': self.vocab, 'Text': text}
        if text_pair is not None:
            # The pair input is only wired in when the caller supplied one.
            op_inputs['TextPair'] = text_pair

        helper.append_op(type='faster_tokenizer',
                         inputs=op_inputs,
                         outputs={
                             'InputIds': input_ids,
                             'SegmentIds': seg_ids
                         },
                         attrs={
                             "do_lower_case": do_lower_case,
                             "max_seq_len": max_seq_len,
                             "pad_to_max_seq_len": pad_to_max_seq_len,
                             "is_split_into_words": is_split_into_words,
                         })
        return input_ids, seg_ids


class Predictor(object):
    """Thin wrapper around a Paddle inference predictor loaded from disk.

    Expects ``inference.pdmodel`` and ``inference.pdiparams`` inside
    ``model_dir`` and raises ``ValueError`` if either one is missing.
    """

    def __init__(self, model_dir):
        model_file = os.path.join(model_dir, "inference.pdmodel")
        if not os.path.exists(model_file):
            raise ValueError("not find model file path {}".format(model_file))

        params_file = os.path.join(model_dir, "inference.pdiparams")
        if not os.path.exists(params_file):
            raise ValueError("not find params file path {}".format(params_file))

        config = paddle.inference.Config(model_file, params_file)
        # The faster_tokenizer op only supports CPU.
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(10)
        config.switch_use_feed_fetch_ops(False)

        self.predictor = paddle.inference.create_predictor(config)
        self.input_handles = list(
            map(self.predictor.get_input_handle,
                self.predictor.get_input_names()))
        self.output_handles = list(
            map(self.predictor.get_output_handle,
                self.predictor.get_output_names()))

    def predict(self, data):
        """Feed ``data`` to the first input, run, and fetch two outputs.

        Returns:
            A tuple ``(input_ids, token_type_ids)`` copied from the first and
            second output handles respectively.
        """
        self.input_handles[0].copy_from_cpu(data)
        self.predictor.run()
        return tuple(self.output_handles[idx].copy_to_cpu() for idx in (0, 1))


class TestBertTokenizerOp(unittest.TestCase):
    """Compare the ``faster_tokenizer`` op with the Python ``BertTokenizer``.

    Each ``run_*`` method is executed twice by its ``test_*`` wrapper: once
    inside ``_test_eager_guard()`` and once outside of it.
    """

    def setUp(self):
        # Everything written by a test lives under a per-test temp directory.
        self.temp_dir = tempfile.TemporaryDirectory()
        self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
        self.save_path = os.path.join(self.temp_dir.name, "fast_tokenizer")
        self.param_path = os.path.join(self.save_path, "model.pdparams")
        self.inference_path = os.path.join(self.save_path, "inference")

    def tearDown(self):
        self.temp_dir.cleanup()

    def init_data(self):
        """Create the op layer plus the single-sample and batch fixtures."""
        self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab)
        self.text = [
            '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。'
            '酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，'
            '还算丰富。 服务吗，一般'
        ]
        self.text_pair = ['非常不错，服务很好，位于市中心区，交通方便，不过价格也高！']
        self.text_tensor = to_string_tensor(self.text, "text")
        self.text_pair_tensor = to_string_tensor(self.text_pair, "text_pair")
        self.texts = [
            '很好的地理位置，一蹋糊涂的服务，萧条的酒店。',
            ' 选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，'
            '但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
            'Test bert tokenizer. The first text.'
        ]
        self.text_pairs = [
            '非常不错，服务很好，位于市中心区，交通方便，不过价格也高！', '房间太小。其他的都一般。。。。。。。。。',
            'Test bert tokenizer. The second text.'
        ]
        self.texts_tensor = to_string_tensor(self.texts, "texts")
        self.text_pairs_tensor = to_string_tensor(self.text_pairs, "text_pairs")

    def _assert_ids_match(self, input_ids, token_type_ids, py_input_ids,
                          py_token_type_ids):
        """Assert that op outputs equal the Python tokenizer outputs.

        With integer ids, ``rtol=0, atol=0.01`` effectively demands an exact
        match.
        """
        np.testing.assert_allclose(input_ids, py_input_ids, rtol=0, atol=0.01)
        np.testing.assert_allclose(token_type_ids,
                                   py_token_type_ids,
                                   rtol=0,
                                   atol=0.01)

    @staticmethod
    def _stack_encoded(encoded_inputs, batch_size):
        """Turn per-sample encodings into ``[batch_size, -1]`` arrays.

        Args:
            encoded_inputs: Iterable of dicts with ``input_ids`` and
                ``token_type_ids`` entries, one per input text.
            batch_size: Number of rows of the returned arrays.

        Returns:
            A tuple ``(input_ids, token_type_ids)`` of numpy arrays.
        """
        py_input_ids = np.array([item["input_ids"] for item in encoded_inputs
                                 ]).reshape([batch_size, -1])
        py_token_type_ids = np.array(
            [item["token_type_ids"]
             for item in encoded_inputs]).reshape([batch_size, -1])
        return py_input_ids, py_token_type_ids

    def _check_op_matches_python(self,
                                 texts,
                                 text_tensor,
                                 text_pairs=None,
                                 text_pair_tensor=None):
        """Tokenize with both implementations and compare the results.

        Uses the ``max_seq_len``, ``pad_to_max_seq_len`` and
        ``is_split_into_words`` attributes currently set on the test case.

        Args:
            texts: List of strings given to the Python tokenizer.
            text_tensor: String tensor with the same texts, given to the op.
            text_pairs: Optional list of paired strings for the Python
                tokenizer; when None no pair is passed to either side.
            text_pair_tensor: String tensor matching ``text_pairs``.
        """
        shared = {
            "max_seq_len": self.max_seq_len,
            "pad_to_max_seq_len": self.pad_to_max_seq_len,
            "is_split_into_words": self.is_split_into_words,
        }
        op_kwargs = dict(shared,
                         text=text_tensor,
                         do_lower_case=self.bert_tokenizer.do_lower_case)
        py_kwargs = dict(shared, text=texts)
        if text_pairs is not None:
            op_kwargs["text_pair"] = text_pair_tensor
            py_kwargs["text_pair"] = text_pairs

        input_ids, token_type_ids = self.faster_tokenizer(**op_kwargs)
        encoded_inputs = self.bert_tokenizer(**py_kwargs)
        py_input_ids, py_token_type_ids = self._stack_encoded(
            encoded_inputs, len(texts))
        self._assert_ids_match(input_ids.numpy(), token_type_ids.numpy(),
                               py_input_ids, py_token_type_ids)

    def run_padding(self):
        """Compare outputs when sequences are padded to ``max_seq_len``."""
        self.init_data()
        self.max_seq_len = 128
        self.pad_to_max_seq_len = True
        self.is_split_into_words = False

        # case 1: only one text (batch_size = 1)
        self._check_op_matches_python(self.text, self.text_tensor)

        # case 2: only one text and one text_pair (batch_size = 1)
        self._check_op_matches_python(self.text, self.text_tensor,
                                      self.text_pair, self.text_pair_tensor)

        # case 3: only texts (batch_size = 3)
        self._check_op_matches_python(self.texts, self.texts_tensor)

        # case 4: texts and text pairs (batch_size = 3)
        self._check_op_matches_python(self.texts, self.texts_tensor,
                                      self.text_pairs, self.text_pairs_tensor)

    def test_padding(self):
        with _test_eager_guard():
            self.run_padding()
        self.run_padding()

    def run_no_padding(self):
        """Compare outputs for single samples without padding."""
        self.init_data()
        self.max_seq_len = 128
        self.pad_to_max_seq_len = False
        self.is_split_into_words = False

        # case 1: only one text (batch_size = 1)
        self._check_op_matches_python(self.text, self.text_tensor)

        # case 2: only one text and one text_pair (batch_size = 1)
        self._check_op_matches_python(self.text, self.text_tensor,
                                      self.text_pair, self.text_pair_tensor)

    def test_no_padding(self):
        with _test_eager_guard():
            self.run_no_padding()
        self.run_no_padding()

    def run_is_split_into_words(self):
        """Compare outputs when ``is_split_into_words`` is enabled."""
        self.init_data()
        self.is_split_into_words = True

        input_ids, token_type_ids = self.faster_tokenizer(
            self.text_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        # The Python tokenizer receives the text split into characters and
        # returns a single encoding here rather than one per sample.
        encoded_inputs = self.bert_tokenizer(
            list(self.text[0]), is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs["token_type_ids"]).reshape(
            [1, -1])
        self._assert_ids_match(input_ids, token_type_ids, py_input_ids,
                               py_token_type_ids)

    def test_is_split_into_words(self):
        with _test_eager_guard():
            self.run_is_split_into_words()
        self.run_is_split_into_words()

    def test_inference(self):
        """Export the layer with ``paddle.jit.save`` and run it via Predictor."""
        self.init_data()
        os.makedirs(self.save_path, exist_ok=True)
        # Round-trip the state dict through disk before exporting.
        paddle.save(self.faster_tokenizer.state_dict(), self.param_path)
        state_dict = paddle.load(self.param_path)
        self.faster_tokenizer.set_dict(state_dict)

        static_model = paddle.jit.to_static(
            self.faster_tokenizer,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None], dtype=core.VarDesc.VarType.STRINGS),  # texts
            ])
        # Save in static graph model.
        paddle.jit.save(static_model, self.inference_path)
        predictor = Predictor(self.save_path)
        input_ids, token_type_ids = predictor.predict(self.text)

        encoded_inputs = self.bert_tokenizer(self.text)
        py_input_ids, py_token_type_ids = self._stack_encoded(
            encoded_inputs, len(self.text))
        self._assert_ids_match(input_ids, token_type_ids, py_input_ids,
                               py_token_type_ids)

    def test_feed_string_var(self):
        """Feed a list of strings to a STRINGS data variable in static mode."""
        self.init_data()
        paddle.enable_static()
        try:
            # Declares the feed target "x" in the default main program.
            paddle.static.data(name="x",
                               shape=[-1],
                               dtype=core.VarDesc.VarType.STRINGS)
            exe = paddle.static.Executor(paddle.framework.CPUPlace())
            exe.run(paddle.static.default_main_program(),
                    feed={'x': self.text})
        finally:
            # Always leave static mode, even on failure, so that a broken run
            # cannot switch the remaining tests into static mode.
            paddle.disable_static()

# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()