diff --git a/PaddleNLP/paddlenlp/data/vocab.py b/PaddleNLP/paddlenlp/data/vocab.py index 8d94712909ea5e8b14c7643e0a7fb3776dc748de..d83083b331b02e30ac6571f6e769968edddc55a3 100644 --- a/PaddleNLP/paddlenlp/data/vocab.py +++ b/PaddleNLP/paddlenlp/data/vocab.py @@ -36,14 +36,14 @@ class Vocab(object): between tokens and indices to be used. If provided, adjust the tokens and indices mapping according to it. If None, counter must be provided. Default: None. - unk_token (str): special token for unknow token. If no need, it also - could be None. Default: '<unk>'. - pad_token (str): special token for padding token. If no need, it also - could be None. Default: '<pad>'. - bos_token (str): special token for bos token. If no need, it also - could be None. Default: '<bos>. - eos_token (str): special token for eos token. If no need, it also - could be None. Default: '<eos>'. + unk_token (str): special token for unknown token '<unk>'. If no need, it also + could be None. Default: None. + pad_token (str): special token for padding token '<pad>'. If no need, it also + could be None. Default: None. + bos_token (str): special token for bos token '<bos>'. If no need, it also + could be None. Default: None. + eos_token (str): special token for eos token '<eos>'. If no need, it also + could be None. Default: None. **kwargs (dict): Keyword arguments ending with `_token`. It can be used to specify further special tokens that will be exposed as attribute of the vocabulary and associated with an index. 
@@ -54,10 +54,10 @@ class Vocab(object): max_size=None, min_freq=1, token_to_idx=None, - unk_token='<unk>', - pad_token='<pad>', - bos_token='<bos>', - eos_token='<eos>', + unk_token=None, + pad_token=None, + bos_token=None, + eos_token=None, **kwargs): # Handle special tokens combs = (('unk_token', unk_token), ('pad_token', pad_token), @@ -317,10 +317,10 @@ class Vocab(object): max_size=None, min_freq=1, token_to_idx=None, - unk_token='<unk>', - pad_token='<pad>', - bos_token='<bos>', - eos_token='<eos>', + unk_token=None, + pad_token=None, + bos_token=None, + eos_token=None, **kwargs): """ Building vocab accoring to given iterator and other information. Iterate @@ -333,14 +333,14 @@ class Vocab(object): between tokens and indices to be used. If provided, adjust the tokens and indices mapping according to it. If None, counter must be provided. Default: None. - unk_token (str): special token for unknow token. If no need, it also - could be None. Default: '<unk>'. - pad_token (str): special token for padding token. If no need, it also - could be None. Default: '<pad>'. - bos_token (str): special token for bos token. If no need, it also - could be None. Default: '<bos>. - eos_token (str): special token for eos token. If no need, it also - could be None. Default: '<eos>'. + unk_token (str): special token for unknown token '<unk>'. If no need, it also + could be None. Default: None. + pad_token (str): special token for padding token '<pad>'. If no need, it also + could be None. Default: None. + bos_token (str): special token for bos token '<bos>'. If no need, it also + could be None. Default: None. + eos_token (str): special token for eos token '<eos>'. If no need, it also + could be None. Default: None. **kwargs (dict): Keyword arguments ending with `_token`. It can be used to specify further special tokens that will be exposed as attribute of the vocabulary and associated with an index.