Commit cb01a99b authored by qianlong

fix dataset.text api doc

Parent e8639ad9
@@ -30,7 +30,8 @@ from ..core.datatypes import mstype_to_detype
class Lookup(cde.LookupOp):
"""
Lookup operator that looks up a word to an id.

Args:
vocab (Vocab): a Vocab object.
unknown (int, optional): default id to look up a word that is out of vocab. If no argument is passed, 1 will be
@@ -48,21 +49,22 @@ class Lookup(cde.LookupOp):
class Ngram(cde.NgramOp):
"""
TensorOp to generate n-gram from a 1-D string Tensor.

Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.
Args:
n (list of int): n in n-gram, n >= 1. n is a list of positive integers, e.g. n=[4, 3], the result
would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
an n-gram, an empty string is returned, e.g. 3-grams on ["mindspore", "best"] would result in an
empty string being produced.
left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on the left side of the sequence. pad_width
will be capped at n-1. left_pad=("_", 2) would pad the left side of the sequence with "__" (default=None).
right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on the right side of the sequence.
pad_width will be capped at n-1. right_pad=("-", 2) would pad the right side of the sequence with "--"
(default=None).
separator (str, optional): symbol used to join strings together, e.g. for 2-grams of ["mindspore", "amazing"]
with separator="-" the result would be ["mindspore-amazing"] (default=None, which means whitespace is
used).
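Examples:
    >>> # A minimal usage sketch; assumes the op is exposed as mindspore.dataset.text.Ngram
    >>> # and that `data` is a dataset whose "text" column holds 1-D string tensors of words.
    >>> import mindspore.dataset.text as text
    >>> ngram_op = text.Ngram([3], left_pad=("_", 2), right_pad=("_", 2), separator=" ")
    >>> data = data.map(input_columns=["text"], operations=ngram_op)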
"""
@@ -86,11 +88,12 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
Args:
hmm_path (str): the dictionary file used by the HMMSegment algorithm;
the dictionary can be obtained on the official website of cppjieba.
mp_path (str): the dictionary file used by the MPSegment algorithm;
the dictionary can be obtained on the official website of cppjieba.
mode (JiebaMode, optional): "MP" mode will tokenize with the MPSegment algorithm,
"HMM" mode will tokenize with the Hidden Markov Model Segment algorithm,
"MIX" mode will tokenize with a mix of the MPSegment and HMMSegment algorithms
(default="MIX").
"""
@check_jieba_init
@@ -104,13 +107,15 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
@check_jieba_add_word
def add_word(self, word, freq=None):
"""
Add a user defined word to JiebaTokenizer's dictionary.

Args:
word (str): The word to be added to the JiebaTokenizer instance.
The added word will not be written into the built-in dictionary on disk.
freq (int, optional): The frequency of the word to be added. The higher the frequency,
the better chance the word will be tokenized (default=None, use the default frequency).
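Examples:
    >>> # A minimal sketch; `jieba_op` is assumed to be an existing JiebaTokenizer instance.
    >>> jieba_op.add_word("mindspore")             # use the default frequency
    >>> jieba_op.add_word("tokenizer", freq=100)   # a higher frequency makes the word more likely to be kept whole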
"""
if freq is None:
super().add_word(word, 0)
else:
@@ -119,15 +124,20 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
@check_jieba_add_dict
def add_dict(self, user_dict):
"""
Add user defined words to JiebaTokenizer's dictionary.

Args:
user_dict (str or dict): Dictionary to be added, either a file path or a Python dictionary.
Python dict format: {word1: freq1, word2: freq2, ...}.
Jieba dictionary format: word (required), freq (optional), such as:

.. code-block::

    word1 freq1
    word2
    word3 freq3
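Examples:
    >>> # A minimal sketch; the file path is a placeholder and `jieba_op` is an existing JiebaTokenizer.
    >>> jieba_op.add_dict({"mindspore": 10, "tokenizer": 5})
    >>> jieba_op.add_dict("/path/to/user_dict.txt")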
"""
if isinstance(user_dict, str):
self.__add_dict_py_file(user_dict)
elif isinstance(user_dict, dict):
@@ -190,12 +200,12 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
"""
Tokenize a scalar token or 1-D tokens into subword tokens.

Args:
vocab (Vocab): a Vocab object.
suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is an empty string,
return the token directly, else return 'unknown_token' (default='[UNK]').
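Examples:
    >>> # A minimal sketch; assumes the "text" column already holds whitespace-split tokens.
    >>> import mindspore.dataset.text as text
    >>> vocab = text.Vocab.from_list(["book", "##keeper", "cat", "[UNK]"])
    >>> wordpiece_op = text.WordpieceTokenizer(vocab, unknown_token='[UNK]')
    >>> data = data.map(input_columns=["text"], operations=wordpiece_op)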
"""
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'):
@@ -209,7 +219,7 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
if platform.system().lower() != 'windows':
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\t', '\\r', '\\n').
"""
@@ -218,7 +228,7 @@ if platform.system().lower() != 'windows':
Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
Args:
keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False).
"""
def __init__(self, keep_whitespace=False):
@@ -246,9 +256,9 @@ if platform.system().lower() != 'windows':
Apply a normalization operation to a UTF-8 string tensor.
Args:
normalize_form (NormalizeForm, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
If set to "NONE", nothing is done to the input string tensor.
If set to any of "NFC", "NFKC", "NFD", "NFKD", the corresponding normalization is applied (default="NFKC").
See http://unicode.org/reports/tr15/ for details.
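Examples:
    >>> # A minimal sketch; assumes NormalizeForm is exported from the same module (op not available on Windows).
    >>> import mindspore.dataset.text as text
    >>> from mindspore.dataset.text import NormalizeForm
    >>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFKC)
    >>> data = data.map(input_columns=["text"], operations=normalize_op)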
"""
@@ -260,13 +270,14 @@ if platform.system().lower() != 'windows':
class RegexReplace(cde.RegexReplaceOp):
"""
Replace parts of a UTF-8 string tensor with 'replace' according to the regular expression 'pattern'.
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Args:
pattern (str): the regex expression pattern.
replace (str): the string to replace the matched element.
replace_all (bool, optional): If False, only replace the first matched element;
if True, replace all matched elements (default=True).
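Examples:
    >>> # A minimal sketch; masks every run of digits in the "text" column with '#'.
    >>> import mindspore.dataset.text as text
    >>> replace_op = text.RegexReplace(pattern="[0-9]+", replace="#")
    >>> data = data.map(input_columns=["text"], operations=replace_op)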
"""
def __init__(self, pattern, replace, replace_all=True):
@@ -279,13 +290,14 @@ if platform.system().lower() != 'windows':
class RegexTokenizer(cde.RegexTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string by a regex expression pattern.
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Args:
delim_pattern (str): The pattern of regex delimiters.
The original string will be split by matched elements.
keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
if it can be matched by 'keep_delim_pattern'. The default value is an empty string ('');
in this situation, delimiters will not be kept as output tokens.
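Examples:
    >>> # A minimal sketch; splits the "text" column on commas and drops the delimiters.
    >>> import mindspore.dataset.text as text
    >>> tokenizer_op = text.RegexTokenizer(delim_pattern=",", keep_delim_pattern="")
    >>> data = data.map(input_columns=["text"], operations=tokenizer_op)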
"""
@@ -302,12 +314,12 @@ if platform.system().lower() != 'windows':
Args:
lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
on the input text to lower case the text and strip accented characters; If False, only apply
the NormalizeUTF8 ('normalization_form' mode) operation on the input text (default=False).
keep_whitespace (bool, optional): If True, the whitespace will be kept in the output tokens (default=False).
normalization_form (NormalizeForm, optional): Used to specify a specific normalization mode,
only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
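Examples:
    >>> # A minimal sketch; lower-cases and strips accents before tokenizing the "text" column.
    >>> import mindspore.dataset.text as text
    >>> basic_op = text.BasicTokenizer(lower_case=True)
    >>> data = data.map(input_columns=["text"], operations=basic_op)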
"""
def __init__(self, lower_case=False, keep_whitespace=False,
@@ -326,18 +338,18 @@ if platform.system().lower() != 'windows':
Args:
vocab (Vocab): a Vocab object.
suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is an empty string,
return the token directly, else return 'unknown_token' (default='[UNK]').
lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
on the input text to lower case the text and strip accented characters; If False, only apply
the NormalizeUTF8 ('normalization_form' mode) operation on the input text (default=False).
keep_whitespace (bool, optional): If True, the whitespace will be kept in the output tokens (default=False).
normalization_form (NormalizeForm, optional): Used to specify a specific normalization mode,
only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
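Examples:
    >>> # A minimal sketch; a real BERT vocabulary would normally be loaded with Vocab.from_file.
    >>> import mindspore.dataset.text as text
    >>> vocab = text.Vocab.from_list(["[UNK]", "[CLS]", "[SEP]", "book", "##keeper"])
    >>> bert_op = text.BertTokenizer(vocab, lower_case=True)
    >>> data = data.map(input_columns=["text"], operations=bert_op)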
"""
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
@@ -25,7 +25,9 @@ from .validators import check_from_file, check_from_list, check_from_dict, check
class Vocab(cde.Vocab):
"""
Vocab object that is used to look up a word.

It contains a map that maps each word (str) to an id (int).
"""
@classmethod
@@ -33,29 +35,32 @@ class Vocab(cde.Vocab):
def from_dataset(cls, dataset, columns=None, freq_range=None, top_k=None, special_tokens=None,
special_first=None):
"""
Build a vocab from a dataset.

This would collect all unique words in a dataset and return a vocab within
the frequency range specified by the user in freq_range. The user would be warned if no words fall into the range.
Words in vocab are ordered from highest frequency to lowest frequency. Words with the same frequency would be
ordered lexicographically.
Args:
dataset (Dataset): dataset to build the vocab from.
columns (list of str, optional): column names to get words from. It can be a list of column names
(default=None, where all columns will be used; if any column isn't of string type, an error will be returned).
freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency
range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency=0 is the same as
min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words.
min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
(default=None, all words are included).
top_k (int, optional): top_k > 0. Number of words to be built into the vocab. The top_k most frequent words are
taken. top_k is applied after freq_range. If fewer than top_k words remain, all of them are taken (default=None,
all words are included).
special_tokens(list, optional): a list of strings, each one is a special token. for example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
is specified and special_first is set to None, special_tokens will be prepended. (default=None).
Returns:
Vocab, Vocab object built from the dataset.
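Examples:
    >>> # A minimal sketch; `data` is assumed to be a dataset whose "text" column already holds word tokens.
    >>> import mindspore.dataset.text as text
    >>> vocab = text.Vocab.from_dataset(data, columns=["text"], freq_range=None, top_k=5000)
    >>> data = data.map(input_columns=["text"], operations=text.Lookup(vocab))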
"""
vocab = Vocab()
@@ -69,7 +74,8 @@ class Vocab(cde.Vocab):
@check_from_list
def from_list(cls, word_list, special_tokens=None, special_first=None):
"""
Build a vocab object from a list of words.

Args:
word_list (list): a list of strings, where each element is a word of type string.
special_tokens (list, optional): a list of strings, each one is a special token, for example
@@ -77,34 +83,40 @@ class Vocab(cde.Vocab):
special_first (bool, optional): whether special_tokens will be prepended or appended to the vocab. If special_tokens
is specified and special_first is set to None, special_tokens will be prepended (default=None).
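Examples:
    >>> # A minimal sketch; builds a small vocab and pairs it with the Lookup operator.
    >>> import mindspore.dataset.text as text
    >>> vocab = text.Vocab.from_list(["home", "behind", "the", "world"], special_tokens=["<pad>", "<unk>"])
    >>> lookup_op = text.Lookup(vocab)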
"""
return super().from_list(word_list, special_tokens, special_first)
@classmethod
@check_from_file
def from_file(cls, file_path, delimiter=None, vocab_size=None, special_tokens=None, special_first=None):
"""
Build a vocab object from a list of words stored in a file.

Args:
file_path (str): path to the file which contains the vocab list.
delimiter (str, optional): a delimiter to break up each line in the file, the first element is taken to be
the word (default=None).
vocab_size (int, optional): number of words to read from file_path (default=None, all words are taken).
special_tokens (list, optional): a list of strings, each one is a special token, for example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first (bool, optional): whether special_tokens will be prepended or appended to the vocab.
If special_tokens is specified and special_first is set to None,
special_tokens will be prepended (default=None).
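Examples:
    >>> # A minimal sketch; the path is a placeholder for a vocab file with one "word,freq" entry per line.
    >>> import mindspore.dataset.text as text
    >>> vocab = text.Vocab.from_file("/path/to/vocab.txt", delimiter=",", special_tokens=["<unk>"])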
"""
return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first)
@classmethod
@check_from_dict
def from_dict(cls, word_dict):
"""
Build a vocab object from a dict.

Args:
word_dict (dict): dict that contains word, id pairs, where word should be str and id should be int. id is recommended
to start from 0 and be continuous. ValueError will be raised if id is negative.
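Examples:
    >>> # A minimal sketch; ids start from 0 and are continuous, as recommended above.
    >>> import mindspore.dataset.text as text
    >>> vocab = text.Vocab.from_dict({"home": 0, "behind": 1, "the": 2, "world": 3, "<unk>": 4})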
"""
return super().from_dict(word_dict)
@@ -113,11 +125,11 @@ def to_str(array, encoding='utf8'):
Convert numpy array of `bytes` to array of `str` by decoding each element based on charset `encoding`.
Args:
array (numpy.ndarray): Array of type `bytes` representing strings.
encoding (str): Indicating the charset for decoding.
Returns:
numpy.ndarray, numpy array of `str`.
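Examples:
    >>> # A minimal sketch; decodes a numpy byte-string array such as one produced by a dataset iterator.
    >>> import numpy as np
    >>> from mindspore.dataset.text import to_str
    >>> strings = to_str(np.array([b"mindspore", b"amazing"]), encoding='utf8')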
"""
if not isinstance(array, np.ndarray):
@@ -131,11 +143,11 @@ def to_bytes(array, encoding='utf8'):
Convert numpy array of `str` to array of `bytes` by encoding each element based on charset `encoding`.
Args:
array (numpy.ndarray): Array of type `str` representing strings.
encoding (str): Indicating the charset for encoding.
Returns:
numpy.ndarray, numpy array of `bytes`.
"""
if not isinstance(array, np.ndarray):
@@ -75,7 +75,6 @@ class Slice(cde.SliceOp):
Slice operation to extract a tensor out using the given n slices.
The functionality of Slice is similar to NumPy indexing feature.
(Currently only rank 1 Tensors are supported)
Args:
@@ -87,17 +86,17 @@ class Slice(cde.SliceOp):
4. Ellipses ...: slice all dimensions between the two slices.
Examples:
>>> # Data before
>>> # | col     |
>>> # +---------+
>>> # | [1,2,3] |
>>> # +---------+
>>> data = data.map(operations=Slice(slice(1,3))) # slice indices 1 and 2 only
>>> # Data after
>>> # | col   |
>>> # +-------+
>>> # | [1,2] |
>>> # +-------+
"""
@check_slice_op
......