Commit 558bab5d authored by Chen Chen, committed by A. Unique TensorFlower

Add sentence piece tokenizer in tokenization.py

PiperOrigin-RevId: 284624714
Parent 9cae3c4f
# coding=utf-8
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,6 +30,10 @@ import unicodedata
import six
import tensorflow as tf

import sentencepiece as spm

SPIECE_UNDERLINE = u"▁".encode("utf-8")


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
  """Checks whether the casing config is consistent with the checkpoint name."""
@@ -366,7 +371,7 @@ class WordpieceTokenizer(object):
def _is_whitespace(char):
  """Checks whether `chars` is a whitespace character."""
  # \t, \n, and \r are technically control characters but we treat them
  # as whitespace since they are generally considered as such.
  if char == " " or char == "\t" or char == "\n" or char == "\r":
    return True
@@ -402,3 +407,128 @@ def _is_punctuation(char):
  if cat.startswith("P"):
    return True
  return False


def preprocess_text(inputs, remove_space=True, lower=False):
  """Preprocesses data by removing extra spaces and normalizing the text.

  This method is used together with the sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/master/albert/tokenization.py

  Args:
    inputs: The input text.
    remove_space: Whether to remove the extra spaces.
    lower: Whether to lowercase the text.

  Returns:
    The preprocessed text.
  """
  outputs = inputs
  if remove_space:
    outputs = " ".join(inputs.strip().split())
  if six.PY2 and isinstance(outputs, str):
    try:
      outputs = six.ensure_text(outputs, "utf-8")
    except UnicodeDecodeError:
      outputs = six.ensure_text(outputs, "latin-1")

  outputs = unicodedata.normalize("NFKD", outputs)
  outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
  if lower:
    outputs = outputs.lower()

  return outputs
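
For illustration only (not part of this change), a minimal sketch of what the preprocessing does on a made-up input string: runs of whitespace are collapsed, NFKD normalization strips combining accents, and lowercasing is optional.

example = preprocess_text(u"  Héllo   World! ", lower=True)
assert example == u"hello world!"  # whitespace collapsed, accent stripped, lowercased
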
def encode_pieces(sp_model, text, sample=False):
  """Segments text into pieces.

  This method is used together with the sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/master/albert/tokenization.py

  Args:
    sp_model: A spm.SentencePieceProcessor object.
    text: The input text to be segmented.
    sample: Whether to randomly sample a segmentation output or return a
      deterministic one.

  Returns:
    A list of token pieces.
  """
  if not sample:
    pieces = sp_model.EncodeAsPieces(text)
  else:
    pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)
  new_pieces = []
  for piece in pieces:
    piece = printable_text(piece)
    if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
      cur_pieces = sp_model.EncodeAsPieces(
          six.ensure_binary(piece[:-1]).replace(SPIECE_UNDERLINE, b""))
      if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
        if len(cur_pieces[0]) == 1:
          cur_pieces = cur_pieces[1:]
        else:
          cur_pieces[0] = cur_pieces[0][1:]
      cur_pieces.append(piece[-1])
      new_pieces.extend(cur_pieces)
    else:
      new_pieces.append(piece)

  return new_pieces
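
A usage sketch, not part of the diff, assuming a trained SentencePiece model at a hypothetical path "sp.model": the default call returns one deterministic segmentation, while sample=True draws one of several possible segmentations (useful for subword regularization during training).

sp = spm.SentencePieceProcessor()
sp.Load("sp.model")  # hypothetical path to a trained SentencePiece model
text = preprocess_text("New York is large.")
print(encode_pieces(sp, text))               # deterministic segmentation
print(encode_pieces(sp, text, sample=True))  # one sampled segmentation
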
def encode_ids(sp_model, text, sample=False):
  """Segments text and returns token ids.

  This method is used together with the sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/master/albert/tokenization.py

  Args:
    sp_model: A spm.SentencePieceProcessor object.
    text: The input text to be segmented.
    sample: Whether to randomly sample a segmentation output or return a
      deterministic one.

  Returns:
    A list of token ids.
  """
  pieces = encode_pieces(sp_model, text, sample=sample)
  ids = [sp_model.PieceToId(piece) for piece in pieces]
  return ids


class FullSentencePieceTokenizer(object):
  """Runs end-to-end sentence piece tokenization.

  The interface of this class is intended to stay the same as that of the
  `FullTokenizer` class above for easier usage.
  """

  def __init__(self, sp_model_file):
    """Inits FullSentencePieceTokenizer.

    Args:
      sp_model_file: The path to the sentence piece model file.
    """
    self._sp_model = spm.SentencePieceProcessor()
    self._sp_model.Load(sp_model_file)
    self.vocab = {
        self._sp_model.IdToPiece(i): i
        for i in six.moves.range(self._sp_model.GetPieceSize())
    }

  def tokenize(self, text):
    """Tokenizes text into pieces."""
    return encode_pieces(self._sp_model, text)

  def convert_tokens_to_ids(self, tokens):
    """Converts a list of tokens to a list of ids."""
    return [self._sp_model.PieceToId(printable_text(token)) for token in tokens]

  def convert_ids_to_tokens(self, ids):
    """Converts a list of ids to a list of tokens."""
    return [self._sp_model.IdToPiece(id_) for id_ in ids]
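
For illustration only (not part of the diff), a minimal end-to-end sketch using the same hypothetical "sp.model" file; it mirrors how the existing `FullTokenizer` is typically driven.

tokenizer = FullSentencePieceTokenizer("sp.model")  # hypothetical model path
tokens = tokenizer.tokenize("Hello world!")
ids = tokenizer.convert_tokens_to_ids(tokens)
# Round-trips back to the same pieces for in-vocabulary tokens.
assert tokenizer.convert_ids_to_tokens(ids) == tokens
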