"""
NLP data processing; tokenizes text and creates vocab indexes
I have directly copied and paste part of OF THE TRANSFORMS.PY FASTAI LIBRARY.
I only need the Tokenizer and the Vocab classes which are both in this module.
This way I avoid the numerous fastai dependencies.
Credit for the code here to Jeremy Howard and the fastai team
"""
import html
import os
import pickle
import re
from collections import Counter, defaultdict
from concurrent.futures.process import ProcessPoolExecutor

import spacy
from spacy.symbols import ORTH

from ..wdtypes import *
def partition(a: Collection, sz: int) -> List[Collection]:
    "Split iterables `a` in equal parts of size `sz`"
    chunks = []
    for start in range(0, len(a), sz):
        chunks.append(a[start : start + sz])  # type: ignore
    return chunks
def partition_by_cores(a: Collection, n_cpus: int) -> List[Collection]:
    "Split data in `a` equally among `n_cpus` cores"
    # chunk size rounds up so no element is left out
    chunk_size = len(a) // n_cpus + 1
    return partition(a, chunk_size)
def ifnone(a: Any, b: Any) -> Any:
    "`a` if `a` is not None, otherwise `b`."
    if a is None:
        return b
    return a
def num_cpus() -> Optional[int]:
    "Get number of cpus"
    # os.sched_getaffinity does not exist on every platform (e.g. macOS,
    # Windows); fall back to the total cpu count there
    if hasattr(os, "sched_getaffinity"):
        return len(os.sched_getaffinity(0))
    return os.cpu_count()
# Default number of cpus for multiprocessing, capped at 16
_default_cpus = min(16, num_cpus())
# Module-wide defaults namespace (mirrors fastai's `defaults`); SimpleNamespace
# is expected to come from the star import of ..wdtypes — TODO confirm
defaults = SimpleNamespace(
    cpus=_default_cpus, cmap="viridis", return_fig=False, silent=False
)
# Public API of this module
__all__ = [
    "BaseTokenizer",
    "SpacyTokenizer",
    "Tokenizer",
    "Vocab",
    "fix_html",
    "replace_all_caps",
    "replace_rep",
    "replace_wrep",
    "rm_useless_spaces",
    "spec_add_spaces",
    "BOS",
    "EOS",
    "FLD",
    "UNK",
    "PAD",
    "TK_MAJ",
    "TK_UP",
    "TK_REP",  # was accidentally listed twice; duplicate removed
    "TK_WREP",
    "deal_caps",
]
# fastai special tokens: beginning/end of sentence, field separator,
# unknown token, and padding
BOS, EOS, FLD, UNK, PAD = "xxbos", "xxeos", "xxfld", "xxunk", "xxpad"
# Markers emitted by the post-rules: Capitalized word, ALL-CAPS word,
# character repetition, word repetition
TK_MAJ, TK_UP, TK_REP, TK_WREP = "xxmaj", "xxup", "xxrep", "xxwrep"
# Tokens always placed at the start of a Vocab (see Vocab.create)
defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_MAJ, TK_UP, TK_REP, TK_WREP]
class BaseTokenizer:
    """Basic class for a tokenizer function."""

    def __init__(self, lang: str):
        # language code; unused by the base implementation but part of the
        # tokenizer contract (see SpacyTokenizer)
        self.lang = lang

    def tokenizer(self, t: str) -> List[str]:
        """Naive tokenization: split ``t`` on single spaces."""
        return t.split(" ")

    def add_special_cases(self, toks: Collection[str]):
        """No-op hook; subclasses may register special-cased tokens."""
        pass
class SpacyTokenizer(BaseTokenizer):
    """Wrapper around a spacy tokenizer to make it a :obj:`BaseTokenizer`.

    Parameters
    ----------
    lang: str
        Language of the text to be tokenized
    """

    def __init__(self, lang: str):
        # keep the base-class contract: store ``self.lang`` as well
        super().__init__(lang)
        self.tok = spacy.blank(lang, disable=["parser", "tagger", "ner"])

    def tokenizer(self, t: str) -> List[str]:
        """Runs ``Spacy``'s ``tokenizer``

        Parameters
        ----------
        t: str
            text to be tokenized
        """
        # loop variable renamed so it no longer shadows the argument ``t``
        return [tok.text for tok in self.tok.tokenizer(t)]

    def add_special_cases(self, toks: Collection[str]):
        """Runs ``Spacy``'s ``add_special_case`` method

        Parameters
        ----------
        toks: Collection[str]
            `List`, `Tuple`, `Set` or `Dictionary` with special cases
            to add to the tokenizer
        """
        for w in toks:
            self.tok.tokenizer.add_special_case(w, [{ORTH: w}])
def spec_add_spaces(t: str) -> str:
    "Add spaces around / and # in `t`. \n"
    # pad every '/', '#' or newline with one space on each side
    special_char = re.compile(r"([/#\n])")
    return special_char.sub(r" \1 ", t)
def rm_useless_spaces(t: str) -> str:
    "Remove multiple spaces in `t`."
    multi_space = re.compile(" {2,}")
    return multi_space.sub(" ", t)
def replace_rep(t: str) -> str:
    "Replace repetitions at the character level in `t`."
    # a non-space char followed by 3+ copies of itself (4+ in total)
    rep_pattern = re.compile(r"(\S)(\1{3,})")

    def _expand(match: Match[str]) -> str:
        char, extra = match.groups()
        return f" {TK_REP} {len(extra)+1} {char} "

    return rep_pattern.sub(_expand, t)
def replace_wrep(t: str) -> str:
    "Replace word repetitions in `t`."
    # a word (plus trailing non-word chars) repeated 3+ more times
    wrep_pattern = re.compile(r"(\b\w+\W+)(\1{3,})")

    def _expand(match: Match[str]) -> str:
        word, extra = match.groups()
        return f" {TK_WREP} {len(extra.split())+1} {word} "

    return wrep_pattern.sub(_expand, t)
def fix_html(x: str) -> str:
    "List of replacements from html strings in `x`."
    re1 = re.compile(r" +")
    # restored the two literals garbled in this copy: "<br />" (was an
    # unterminated string split across lines) and "<unk>" (was "", which
    # would have inserted UNK between every character)
    x = (
        x.replace("#39;", "'")
        .replace("amp;", "&")
        .replace("#146;", "'")
        .replace("nbsp;", " ")
        .replace("#36;", "$")
        .replace("\\n", "\n")
        .replace("quot;", "'")
        .replace("<br />", "\n")
        .replace('\\"', '"')
        .replace("<unk>", UNK)
        .replace(" @.@ ", ".")
        .replace(" @-@ ", "-")
        .replace(" @,@ ", ",")
        .replace("\\", " \\ ")
    )
    return re1.sub(" ", html.unescape(x))
def replace_all_caps(x: Collection[str]) -> Collection[str]:
    "Replace tokens in ALL CAPS in `x` by their lower version and add `TK_UP` before."
    out: List[str] = []
    for token in x:
        if len(token) > 1 and token.isupper():
            # mark the token, then emit its lowercase form
            out.extend([TK_UP, token.lower()])
        else:
            out.append(token)
    return out
def deal_caps(x: Collection[str]) -> Collection[str]:
    "Replace all Capitalized tokens in `x` by their lower version and add `TK_MAJ` before."
    out: List[str] = []
    for token in x:
        if not token:
            # drop empty tokens entirely
            continue
        # only Title-case words (first upper, rest lower) get the marker;
        # every surviving token is lowercased
        if len(token) > 1 and token[0].isupper() and token[1:].islower():
            out.append(TK_MAJ)
        out.append(token.lower())
    return out
# Default preprocessing rules, applied in this order to the raw string
# before tokenization
defaults.text_pre_rules = [
    fix_html,
    replace_rep,
    replace_wrep,
    spec_add_spaces,
    rm_useless_spaces,
]
# Default postprocessing rules, applied in this order to the token list
defaults.text_post_rules = [replace_all_caps, deal_caps]
class Tokenizer:
    """Class to combine a series of rules and a tokenizer function to tokenize
    text with multiprocessing.

    Parameters
    ----------
    tok_func: Callable, Default = SpacyTokenizer
        Tokenizer Object. See :class:`pytorch_widedeep.utils.fastai_transforms.SpacyTokenizer`
    lang: str, Default = "en",
        Text's Language
    pre_rules: ListRules, Default = None,
        Custom type, see :obj:`pytorch_widedeep.wdtypes`. Preprocessing Rules
    post_rules: ListRules, Default = None,
        Custom type, see :obj:`pytorch_widedeep.wdtypes`. Postprocessing Rules
    special_cases: Collection[str], Default= None,
        special cases to be added to the tokenizer via ``Spacy``'s
        ``add_special_case`` method
    n_cpus: int, Default = None
        number of CPUs to used during the tokenization process
    """

    def __init__(
        self,
        tok_func: Callable = SpacyTokenizer,
        lang: str = "en",
        pre_rules: ListRules = None,
        post_rules: ListRules = None,
        special_cases: Collection[str] = None,
        n_cpus: Optional[int] = None,
    ):
        self.tok_func, self.lang = tok_func, lang
        self.pre_rules = ifnone(pre_rules, defaults.text_pre_rules)
        self.post_rules = ifnone(post_rules, defaults.text_post_rules)
        # NOTE: ``special_cases`` used to be assigned twice (the first
        # assignment was immediately overwritten); only the defaulted
        # assignment is kept
        self.special_cases = ifnone(special_cases, defaults.text_spec_tok)
        self.n_cpus = ifnone(n_cpus, defaults.cpus)

    def __repr__(self) -> str:
        res = f"Tokenizer {self.tok_func.__name__} in {self.lang} with the following rules:\n"
        for rule in self.pre_rules:
            res += f" - {rule.__name__}\n"
        for rule in self.post_rules:
            res += f" - {rule.__name__}\n"
        return res

    def process_text(self, t: str, tok: BaseTokenizer) -> List[str]:
        """Process and tokenize one text ``t`` with tokenizer ``tok``.

        Parameters
        ----------
        t: str
            text to be processed and tokenized
        tok: BaseTokenizer
            Instance of :obj:`BaseTokenizer`
        """
        # pre-rules transform the raw string; post-rules transform tokens
        for rule in self.pre_rules:
            t = rule(t)
        toks = tok.tokenizer(t)
        for rule in self.post_rules:
            toks = rule(toks)
        return toks

    def _process_all_1(self, texts: Collection[str]) -> List[List[str]]:
        """Process a list of ``texts`` in one process."""
        # build a fresh tokenizer per call so each worker process gets its own
        tok = self.tok_func(self.lang)
        if self.special_cases:
            tok.add_special_cases(self.special_cases)
        return [self.process_text(str(t), tok) for t in texts]

    def process_all(self, texts: Collection[str]) -> List[List[str]]:
        r"""Process a list of texts. Parallel execution of ``process_text``.

        Examples
        --------
        >>> from pytorch_widedeep.utils import Tokenizer
        >>> texts = ['Machine learning is great', 'but building stuff is even better']
        >>> tok = Tokenizer()
        >>> tok.process_all(texts)
        [['xxmaj', 'machine', 'learning', 'is', 'great'],
        ['but', 'building', 'stuff', 'is', 'even', 'better']]

        .. note:: Note the token ``TK_MAJ`` (`xxmaj`), used to indicate the
            next word begins with a capital in the original text. For more
            details of special tokens please see the ``fastai`` `docs
            `_.
        """
        if self.n_cpus <= 1:
            return self._process_all_1(texts)
        with ProcessPoolExecutor(self.n_cpus) as e:
            # sum(..., []) flattens the per-chunk results into one list
            return sum(
                e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus)), []
            )
class Vocab:
    """Contains the correspondence between numbers and tokens.

    Parameters
    ----------
    itos: Collection[str]
        `index to str`. Collection of str that are the tokens of the vocabulary

    Attributes
    ----------
    stoi: defaultdict
        `str to index`. Dictionary containing the tokens of the vocabulary and
        their corresponding index. Unknown tokens map to 0 via
        ``defaultdict(int)``.
    """

    def __init__(self, itos: Collection[str]):
        self.itos = itos
        self.stoi = defaultdict(int, {v: k for k, v in enumerate(self.itos)})

    def numericalize(self, t: Collection[str]) -> List[int]:
        """Convert a list of str (or tokens) ``t`` to their ids."""
        return [self.stoi[w] for w in t]

    def textify(self, nums: Collection[int], sep=" ") -> List[str]:
        """Convert a list of ``nums`` (or indexes) to their tokens.

        Returns a single joined ``str`` when ``sep`` is not None, otherwise
        a list of tokens.
        """
        tokens = [self.itos[i] for i in nums]
        return sep.join(tokens) if sep is not None else tokens  # type: ignore

    def __getstate__(self):
        return {"itos": self.itos}

    def __setstate__(self, state: dict):
        # rebuild the reverse mapping, which is not pickled
        self.itos = state["itos"]
        self.stoi = defaultdict(int, {v: k for k, v in enumerate(self.itos)})

    def save(self, path):
        """Save the attribute ``self.itos`` in ``path``"""
        # context manager guarantees the file handle is closed (the previous
        # implementation leaked it)
        with open(path, "wb") as f:
            pickle.dump(self.itos, f)

    @classmethod
    def create(cls, tokens: Tokens, max_vocab: int, min_freq: int) -> "Vocab":
        r"""Create a vocabulary object from a set of tokens.

        Parameters
        ----------
        tokens: Tokens
            Custom type, see :obj:`pytorch_widedeep.wdtypes`. Collection of
            collection of str (e.g. list of tokenized sentences)
        max_vocab: int
            maximum vocabulary size
        min_freq: int
            minimum frequency that a token has to appear to be part of the
            vocabulary

        Examples
        --------
        >>> from pytorch_widedeep.utils import Tokenizer, Vocab
        >>> texts = ['Machine learning is great', 'but building stuff is even better']
        >>> tokens = Tokenizer().process_all(texts)
        >>> vocab = Vocab.create(tokens, max_vocab=18, min_freq=1)
        >>> print(vocab.itos)
        ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxmaj', 'xxup', 'xxrep', 'xxwrep',
        'is', 'machine', 'learning', 'great', 'but', 'building', 'stuff', 'even', 'better']
        >>> vocab.numericalize(['machine', 'learning', 'is', 'great'])
        [10, 11, 9, 12]
        >>> vocab.textify([10, 11, 9, 12])
        'machine learning is great'

        .. note:: Note the many special tokens that ``fastai``'s' tokenizer
            adds. These are particularly useful when building Language models and/or in
            classification/Regression tasks. Please see the ``fastai``
            `docs `_.
        """
        freq = Counter(p for o in tokens for p in o)
        # most frequent tokens first, filtered by minimum frequency
        itos = [o for o, c in freq.most_common(max_vocab) if c >= min_freq]
        # force the special tokens to occupy the first positions, in order
        for o in reversed(defaults.text_spec_tok):
            if o in itos:
                itos.remove(o)
            itos.insert(0, o)
        itos = itos[:max_vocab]
        if (
            len(itos) < max_vocab
        ):  # Make sure vocab size is a multiple of 8 for fast mixed precision training
            while len(itos) % 8 != 0:
                itos.append("xxfake")
        return cls(itos)

    @classmethod
    def load(cls, path):
        """Load an instance of :obj:`Vocab` contained in ``path``"""
        # context manager guarantees the file handle is closed (the previous
        # implementation leaked it)
        with open(path, "rb") as f:
            itos = pickle.load(f)
        return cls(itos)