diff --git a/PaddleNLP/examples/language_model/gpt2/data.py b/PaddleNLP/examples/language_model/gpt2/data.py
index cab6164475cd59499f42a69e66e49911080c1a70..ea40371565a81ac49c3ed6f013307fb0cba7d7d9 100644
--- a/PaddleNLP/examples/language_model/gpt2/data.py
+++ b/PaddleNLP/examples/language_model/gpt2/data.py
@@ -208,7 +208,7 @@ class GPT2Dataset(paddle.io.Dataset):
         # -INF mask value as default
         attention_mask = (attention_mask - 1.0) * 1e9
         # Bool mask of attention
-        # attention_mask = attention_mask.astype("float32")
+        attention_mask = attention_mask.astype("float32")
         return [tokens, loss_mask, attention_mask, position_ids, labels]
 
     def _get_single_sample_from_idx(self, doc_index_f, doc_index_l, offset_f,
diff --git a/PaddleNLP/paddlenlp/transformers/gpt2/modeling.py b/PaddleNLP/paddlenlp/transformers/gpt2/modeling.py
index f9a29866f949ee5ebd5750d0e4ee15ea86c444e2..396cc1991dc6c2ce10f33073f10a42b69143a2a9 100644
--- a/PaddleNLP/paddlenlp/transformers/gpt2/modeling.py
+++ b/PaddleNLP/paddlenlp/transformers/gpt2/modeling.py
@@ -13,16 +13,17 @@
 # limitations under the License.
 
 import collections
+import math
+
 import numpy as np
 import paddle
 import paddle.nn as nn
-import paddle.tensor as tensor
 import paddle.nn.functional as F
-import math
+import paddle.tensor as tensor
 from paddle.fluid import layers
+from paddle.nn.layer.transformer import _convert_param_attr_to_list
 
 from .. import PretrainedModel, register_base_model
-from paddle.nn.layer.transformer import _convert_param_attr_to_list
 
 __all__ = [
     'GPT2Model',
diff --git a/PaddleNLP/paddlenlp/transformers/gpt2/tokenizer.py b/PaddleNLP/paddlenlp/transformers/gpt2/tokenizer.py
index fec6a3d6d6437352af95258283beb924098fd673..aacc4b50150e59f701716532a8324694b8eb617a 100644
--- a/PaddleNLP/paddlenlp/transformers/gpt2/tokenizer.py
+++ b/PaddleNLP/paddlenlp/transformers/gpt2/tokenizer.py
@@ -13,14 +13,13 @@
 # limitations under the License.
 
 import os
-import regex as re
-import unicodedata
+from functools import lru_cache
+from collections import namedtuple
+
 import json
-import sentencepiece
 import jieba
+from paddle.utils import try_import
 
-from functools import lru_cache
-from collections import namedtuple
 from .. import PretrainedTokenizer
 from ..tokenizer_utils import convert_to_unicode, whitespace_tokenize,\
     _is_whitespace, _is_control, _is_punctuation
@@ -122,7 +121,8 @@ class GPT2ChineseTokenizer(PretrainedTokenizer):
         self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v: k for k, v in self.encoder.items()}
-        self.sp = sentencepiece.SentencePieceProcessor(model_file=model_file)
+        mod = try_import("sentencepiece")
+        self.sp = mod.SentencePieceProcessor(model_file=model_file)
         self.translator = str.maketrans(" \n", "\u2582\u2583")
 
     def tokenize(self, text):
@@ -220,7 +220,7 @@ class GPT2Tokenizer(PretrainedTokenizer):
         bpe_merges = [tuple(merge.split()) for merge in bpe_data]
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
         self.cache = {}
-
+        re = try_import("regex")
         self.pat = re.compile(
             r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
         )
@@ -295,6 +295,7 @@ def tokenize(self, text):
         """ Tokenize a string. """
         bpe_tokens = []
+        re = try_import("regex")
         for token in re.findall(self.pat, text):
             token = ''.join(self.byte_encoder[b]
                             for b in token.encode('utf-8'))
             bpe_tokens.extend(