"""
NLP data processing; tokenizes text and creates vocab indexes

I have directly copied and pasted part of the transforms.py module of the
fastai library. I only need the Tokenizer and the Vocab classes, which are
both in this module. This way I avoid the numerous fastai dependencies.

Credit for the code here goes to Jeremy Howard and the fastai team
"""

import os
import re
import html
import pickle
from collections import Counter, defaultdict
from concurrent.futures.process import ProcessPoolExecutor

import spacy
from spacy.symbols import ORTH

from ..wdtypes import *


def partition(a: Collection, sz: int) -> List[Collection]:
    "Split iterables `a` in equal parts of size `sz`"
    return [a[i : i + sz] for i in range(0, len(a), sz)]  # type: ignore


def partition_by_cores(a: Collection, n_cpus: int) -> List[Collection]:
    "Split data in `a` equally among `n_cpus` cores"
    return partition(a, len(a) // n_cpus + 1)
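
# Illustrative examples (comments only, not executed):
#   >>> partition([1, 2, 3, 4, 5], 2)
#   [[1, 2], [3, 4], [5]]
#   >>> partition_by_cores([1, 2, 3, 4, 5], n_cpus=2)
#   [[1, 2, 3], [4, 5]]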


def ifnone(a: Any, b: Any) -> Any:
    "`a` if `a` is not None, otherwise `b`."
    return b if a is None else a


def num_cpus() -> Optional[int]:
    "Get number of cpus"
    try:
        return len(os.sched_getaffinity(0))
    except AttributeError:
        return os.cpu_count()


_default_cpus = min(16, num_cpus() or 1)  # num_cpus() may return None on some platforms
defaults = SimpleNamespace(
    cpus=_default_cpus, cmap="viridis", return_fig=False, silent=False
)

__all__ = [
    "BaseTokenizer",
    "SpacyTokenizer",
    "Tokenizer",
    "Vocab",
    "fix_html",
    "replace_all_caps",
    "replace_rep",
    "replace_wrep",
    "rm_useless_spaces",
    "spec_add_spaces",
    "BOS",
    "EOS",
    "FLD",
    "UNK",
    "PAD",
    "TK_MAJ",
    "TK_UP",
    "TK_REP",
    "TK_REP",
    "TK_WREP",
    "deal_caps",
]

BOS, EOS, FLD, UNK, PAD = "xxbos", "xxeos", "xxfld", "xxunk", "xxpad"
TK_MAJ, TK_UP, TK_REP, TK_WREP = "xxmaj", "xxup", "xxrep", "xxwrep"
defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_MAJ, TK_UP, TK_REP, TK_WREP]


class BaseTokenizer:
    """Basic class for a tokenizer function."""

    def __init__(self, lang: str):
        self.lang = lang

    def tokenizer(self, t: str) -> List[str]:
        return t.split(" ")
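
    # Illustrative behaviour (comment only, not executed): the default
    # ``tokenizer`` simply splits on spaces
    #   >>> BaseTokenizer("en").tokenizer("machine learning is great")
    #   ['machine', 'learning', 'is', 'great']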

    def add_special_cases(self, toks: Collection[str]):
        pass


class SpacyTokenizer(BaseTokenizer):
    """Wrapper around a spacy tokenizer to make it a :obj:`BaseTokenizer`.

    Parameters
    ----------
    lang: str
        Language of the text to be tokenized
    """

    def __init__(self, lang: str):
        self.tok = spacy.blank(lang, disable=["parser", "tagger", "ner"])

    def tokenizer(self, t: str):
        """Runs ``Spacy``'s ``tokenizer``

        Parameters
        ----------
        t: str
            text to be tokenized
        """
        return [token.text for token in self.tok.tokenizer(t)]

    def add_special_cases(self, toks: Collection[str]):
        """Runs ``Spacy``'s ``add_special_case`` method

        Parameters
        ----------
        toks: Collection[str]
            `List`, `Tuple`, `Set` or `Dictionary` with special cases
            to add to the tokenizer
        """
        for w in toks:
            self.tok.tokenizer.add_special_case(w, [{ORTH: w}])
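
# Illustrative use (comment only, not executed; assumes a spaCy version that
# accepts the ``spacy.blank(..., disable=...)`` call above):
#   >>> SpacyTokenizer("en").tokenizer("Machine learning is great!")
#   ['Machine', 'learning', 'is', 'great', '!']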


def spec_add_spaces(t: str) -> str:
    "Add spaces around / and # in `t`. \n"
    return re.sub(r"([/#\n])", r" \1 ", t)


def rm_useless_spaces(t: str) -> str:
    "Remove multiple spaces in `t`."
    return re.sub(" {2,}", " ", t)


def replace_rep(t: str) -> str:
    "Replace repetitions at the character level in `t`."

    def _replace_rep(m: Match[str]) -> str:
        c, cc = m.groups()
        return f" {TK_REP} {len(cc)+1} {c} "

    re_rep = re.compile(r"(\S)(\1{3,})")
    return re_rep.sub(_replace_rep, t)
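
# Illustrative example (comment only, not executed): a character repeated four
# or more times is replaced by ``TK_REP``, the repetition count and the character
#   >>> replace_rep("coooool")
#   'c xxrep 5 o l'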


def replace_wrep(t: str) -> str:
    "Replace word repetitions in `t`."

    def _replace_wrep(m: Match[str]) -> str:
        c, cc = m.groups()
        return f" {TK_WREP} {len(cc.split())+1} {c} "

    re_wrep = re.compile(r"(\b\w+\W+)(\1{3,})")
    return re_wrep.sub(_replace_wrep, t)
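
# Illustrative example (comment only, not executed): a word repeated four or
# more times is replaced by ``TK_WREP``, the repetition count and the word
# (the extra whitespace is cleaned up later by ``rm_useless_spaces``)
#   >>> rm_useless_spaces(replace_wrep("yes yes yes yes I agree"))
#   ' xxwrep 4 yes I agree'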


def fix_html(x: str) -> str:
    "List of replacements from html strings in `x`."
    re1 = re.compile(r"  +")
    x = (
        x.replace("#39;", "'")
        .replace("amp;", "&")
        .replace("#146;", "'")
        .replace("nbsp;", " ")
        .replace("#36;", "$")
        .replace("\\n", "\n")
        .replace("quot;", "'")
        .replace("<br />", "\n")
        .replace('\\"', '"')
        .replace("<unk>", UNK)
        .replace(" @.@ ", ".")
        .replace(" @-@ ", "-")
        .replace(" @,@ ", ",")
        .replace("\\", " \\ ")
    )
    return re1.sub(" ", html.unescape(x))
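
# Illustrative example (comment only, not executed): ``<br />`` becomes a
# newline and HTML entities are unescaped
#   >>> fix_html("3 &lt; 5 <br />new line")
#   '3 < 5 \nnew line'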


def replace_all_caps(x: Collection[str]) -> Collection[str]:
    "Replace tokens in ALL CAPS in `x` by their lower version and add `TK_UP` before."
    res = []
    for t in x:
        if t.isupper() and len(t) > 1:
            res.append(TK_UP)
            res.append(t.lower())
        else:
            res.append(t)
    return res
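
# Illustrative example (comment only, not executed):
#   >>> replace_all_caps(["I", "AM", "SHOUTING", "now"])
#   ['I', 'xxup', 'am', 'xxup', 'shouting', 'now']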


def deal_caps(x: Collection[str]) -> Collection[str]:
    "Replace all Capitalized tokens in `x` by their lower version and add `TK_MAJ` before."
    res = []
    for t in x:
        if t == "":
            continue
        if t[0].isupper() and len(t) > 1 and t[1:].islower():
            res.append(TK_MAJ)
        res.append(t.lower())
    return res
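
# Illustrative example (comment only, not executed):
#   >>> deal_caps(["Machine", "learning", "is", "great"])
#   ['xxmaj', 'machine', 'learning', 'is', 'great']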


defaults.text_pre_rules = [
    fix_html,
    replace_rep,
    replace_wrep,
    spec_add_spaces,
    rm_useless_spaces,
]
defaults.text_post_rules = [replace_all_caps, deal_caps]
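
# These rules run in the order listed above: the pre rules operate on the raw
# string before tokenization and the post rules on the resulting list of tokens
# (see ``Tokenizer.process_text`` below)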


class Tokenizer:
    """Class to combine a series of rules and a tokenizer function to tokenize
    text with multiprocessing.

    Parameters
    ----------
    tok_func: Callable, Default = SpacyTokenizer
        Tokenizer Object. See :class:`pytorch_widedeep.utils.fastai_transforms.SpacyTokenizer`
    lang: str, Default = "en"
        Language of the text to be tokenized
    pre_rules: ListRules, Default = None
        Custom type, see :obj:`pytorch_widedeep.wdtypes`. Preprocessing Rules
    post_rules: ListRules, Default = None
        Custom type, see :obj:`pytorch_widedeep.wdtypes`. Postprocessing Rules
    special_cases: Collection[str], Default = None
        special cases to be added to the tokenizer via ``Spacy``'s
        ``add_special_case`` method
    n_cpus: int, Default = None
        number of CPUs to use during the tokenization process
    """

    def __init__(
        self,
        tok_func: Callable = SpacyTokenizer,
        lang: str = "en",
        pre_rules: ListRules = None,
        post_rules: ListRules = None,
        special_cases: Collection[str] = None,
        n_cpus: int = None,
    ):
        self.tok_func, self.lang, self.special_cases = tok_func, lang, special_cases
        self.pre_rules = ifnone(pre_rules, defaults.text_pre_rules)
        self.post_rules = ifnone(post_rules, defaults.text_post_rules)
        self.special_cases = (
            special_cases if special_cases is not None else defaults.text_spec_tok
        )
        self.n_cpus = ifnone(n_cpus, defaults.cpus)

    def __repr__(self) -> str:
        res = f"Tokenizer {self.tok_func.__name__} in {self.lang} with the following rules:\n"
        for rule in self.pre_rules:
            res += f" - {rule.__name__}\n"
        for rule in self.post_rules:
            res += f" - {rule.__name__}\n"
        return res

    def process_text(self, t: str, tok: BaseTokenizer) -> List[str]:
        """Process and tokenize one text ``t`` with tokenizer ``tok``.

        Parameters
        ----------
        t: str
            text to be processed and tokenized
        tok: BaseTokenizer
            Instance of :obj:`BaseTokenizer`
        """
        for rule in self.pre_rules:
            t = rule(t)
        toks = tok.tokenizer(t)
        for rule in self.post_rules:
            toks = rule(toks)
        return toks

    def _process_all_1(self, texts: Collection[str]) -> List[List[str]]:
        """Process a list of ``texts`` in one process."""

        tok = self.tok_func(self.lang)
        if self.special_cases:
            tok.add_special_cases(self.special_cases)
        return [self.process_text(str(t), tok) for t in texts]

    def process_all(self, texts: Collection[str]) -> List[List[str]]:
        r"""Process a list of texts. Parallel execution of ``process_text``.

        Examples
        --------
        >>> from pytorch_widedeep.utils import Tokenizer
        >>> texts = ['Machine learning is great', 'but building stuff is even better']
        >>> tok = Tokenizer()
        >>> tok.process_all(texts)
            [['xxmaj', 'machine', 'learning', 'is', 'great'],
            ['but', 'building', 'stuff', 'is', 'even', 'better']]

        .. note:: Note the token ``TK_MAJ`` (`xxmaj`), used to indicate the
            next word begins with a capital in the original text. For more
            details of special tokens please see the ``fastai`` `docs
            <https://docs.fast.ai/text.transform.html#Tokenizer>`_.
        """

        if self.n_cpus <= 1:
            return self._process_all_1(texts)
        with ProcessPoolExecutor(self.n_cpus) as e:
            return sum(
                e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus)), []
            )


class Vocab:
    """Contains the correspondence between numbers and tokens.

    Parameters
    ----------
    itos: Collection[str]
        `index to str`. Collection of str that are the tokens of the vocabulary

    Attributes
    ----------
    stoi: defaultdict
        `str to index`. Dictionary containing the tokens of the vocabulary and
        their corresponding index
    """

    def __init__(self, itos: Collection[str]):
        self.itos = itos
        self.stoi = defaultdict(int, {v: k for k, v in enumerate(self.itos)})

    def numericalize(self, t: Collection[str]) -> List[int]:
        """Convert a list of str (or tokens) ``t`` to their ids."""
        return [self.stoi[w] for w in t]

    def textify(self, nums: Collection[int], sep=" ") -> List[str]:
        """Convert a list of ``nums`` (or indexes) to their tokens."""
        return sep.join([self.itos[i] for i in nums]) if sep is not None else [self.itos[i] for i in nums]  # type: ignore

    def __getstate__(self):
        return {"itos": self.itos}

    def __setstate__(self, state: dict):
        self.itos = state["itos"]
        self.stoi = defaultdict(int, {v: k for k, v in enumerate(self.itos)})

    def save(self, path):
        """Save the  attribute ``self.itos`` in ``path``"""
        pickle.dump(self.itos, open(path, "wb"))

    @classmethod
    def create(cls, tokens: Tokens, max_vocab: int, min_freq: int) -> "Vocab":
        r"""Create a vocabulary object from a set of tokens.

        Parameters
        ----------
        tokens: Tokens
            Custom type, see :obj:`pytorch_widedeep.wdtypes`. Collection of
            collections of str (e.g. a list of tokenized sentences)
        max_vocab: int
            maximum vocabulary size
        min_freq: int
            minimum frequency that a token has to appear to be part of the
            vocabulary

        Examples
        --------
        >>> from pytorch_widedeep.utils import Tokenizer, Vocab
        >>> texts = ['Machine learning is great', 'but building stuff is even better']
        >>> tokens = Tokenizer().process_all(texts)
        >>> vocab = Vocab.create(tokens, max_vocab=18, min_freq=1)
        >>> print(vocab.itos)
        ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxmaj', 'xxup', 'xxrep', 'xxwrep',
        'is', 'machine', 'learning', 'great', 'but', 'building', 'stuff', 'even', 'better']
        >>> vocab.numericalize(['machine', 'learning', 'is', 'great'])
        [10, 11, 9, 12]
        >>> vocab.textify([10, 11, 9, 12])
        'machine learning is great'

        .. note:: Note the many special tokens that ``fastai``'s tokenizer
            adds. These are particularly useful when building language models and/or in
            classification/regression tasks. Please see the ``fastai``
            `docs <https://docs.fast.ai/text.transform.html#Tokenizer>`_.
        """
        freq = Counter(p for o in tokens for p in o)
        itos = [o for o, c in freq.most_common(max_vocab) if c >= min_freq]
        for o in reversed(defaults.text_spec_tok):
            if o in itos:
                itos.remove(o)
            itos.insert(0, o)
        itos = itos[:max_vocab]
        if (
            len(itos) < max_vocab
        ):  # Make sure vocab size is a multiple of 8 for fast mixed precision training
            while len(itos) % 8 != 0:
                itos.append("xxfake")
        return cls(itos)

    @classmethod
    def load(cls, path):
        """Load an intance of :obj:`Vocab` contained in ``path``"""
        itos = pickle.load(open(path, "rb"))
        return cls(itos)
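
# Illustrative round trip (comment only, not executed; "vocab.pkl" is just an
# example path):
#   >>> tokens = Tokenizer().process_all(["Machine learning is great"])
#   >>> vocab = Vocab.create(tokens, max_vocab=100, min_freq=1)
#   >>> vocab.save("vocab.pkl")
#   >>> vocab = Vocab.load("vocab.pkl")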