magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit cb01a99b
Authored Jun 20, 2020 by qianlong

fix dataset.text api doc

Parent commit: e8639ad9
Showing 3 changed files with 115 additions and 92 deletions (+115 -92)

mindspore/dataset/text/transforms.py          +67 -55
mindspore/dataset/text/utils.py               +37 -25
mindspore/dataset/transforms/c_transforms.py  +11 -12
mindspore/dataset/text/transforms.py
@@ -30,7 +30,8 @@ from ..core.datatypes import mstype_to_detype
 class Lookup(cde.LookupOp):
     """
     Lookup operator that looks up a word to an id.

     Args:
         vocab(Vocab): a Vocab object.
         unknown(int, optional): default id to lookup a word that is out of vocab. If no argument is passed, 1 will be

@@ -48,21 +49,22 @@ class Lookup(cde.LookupOp):
 class Ngram(cde.NgramOp):
     """
-    TensorOp to generate n-gram from a 1-D string Tensor
+    TensorOp to generate n-gram from a 1-D string Tensor.

     Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.

     Args:
-        n([int, list]): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
+        n (list of int): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
             would be a 4-gram followed by a 3-gram in the same tensor. If number of words is not enough to make up for
             a n-gram, an empty string would be returned. For e.g. 3 grams on ["mindspore","best"] would result in an
             empty string be produced.
-        left_pad(tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width
-            will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (Default is None).
-        right_pad(tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence.
-            pad_width will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--"
-            (Default is None).
-        separator(str, optional): symbol used to join strings together. for e.g. if 2-gram the ["mindspore", "amazing"]
-            with separator="-" the result would be ["mindspore-amazing"] (Default is None which means whitespace is
-            used).
+        left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width
+            will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (default=None).
+        right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence.
+            pad_width will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--"
+            (default=None).
+        separator (str, optional): symbol used to join strings together. for e.g. if 2-gram the ["mindspore", "amazing"]
+            with separator="-" the result would be ["mindspore-amazing"] (default=None, which means whitespace is
+            used).
     """
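A minimal usage sketch for the Ngram op documented in this hunk, assuming the mindspore.dataset.text API of this release; the corpus path, column name, and map() keywords below are illustrative assumptions, not part of the change.

import mindspore.dataset as ds
import mindspore.dataset.text as text

# Hypothetical one-column corpus; TextFileDataset exposes its lines under a "text" column.
data = ds.TextFileDataset("corpus.txt")
data = data.map(input_columns=["text"], operations=text.WhitespaceTokenizer())

# 2-grams and 3-grams emitted into the same tensor, joined with "-".
# Pad width is capped at n-1, as the docstring above notes.
ngram = text.Ngram([2, 3], left_pad=("_", 1), right_pad=("_", 1), separator="-")
data = data.map(input_columns=["text"], operations=ngram)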
@@ -86,11 +88,12 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
     Args:
         hmm_path (str): the dictionary file is used by HMMSegment algorithm,
             the dictionary can be obtained on the official website of cppjieba.
-        mp_path(str): the dictionary file is used by MPSegment algorithm,
+        mp_path (str): the dictionary file is used by MPSegment algorithm,
             the dictionary can be obtained on the official website of cppjieba.
-        mode (Enum): [Default "MIX"], "MP" model will tokenize with MPSegment algorithm,
+        mode (JiebaMode, optional): "MP" model will tokenize with MPSegment algorithm,
             "HMM" mode will tokenize with Hiddel Markov Model Segment algorithm,
-            "MIX" model will tokenize with a mix of MPSegment and HMMSegment algorithm.
+            "MIX" model will tokenize with a mix of MPSegment and HMMSegment algorithm
+            (default="MIX").
     """

     @check_jieba_init
@@ -104,13 +107,15 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
     @check_jieba_add_word
     def add_word(self, word, freq=None):
         """
-        Add user defined word to JiebaTokenizer's dictionary
+        Add user defined word to JiebaTokenizer's dictionary.

         Args:
-            word (required, string): The word to be added to the JiebaTokenizer instance.
+            word (str): The word to be added to the JiebaTokenizer instance.
                 The added word will not be written into the built-in dictionary on disk.
-            freq (optional, int): The frequency of the word to be added, The higher the frequency,
-                the better change the word will be tokenized(default None, use default frequency).
+            freq (int, optional): The frequency of the word to be added, The higher the frequency,
+                the better change the word will be tokenized(default=None, use default frequency).
         """
         if freq is None:
             super().add_word(word, 0)
         else:
@@ -119,15 +124,20 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
     @check_jieba_add_dict
     def add_dict(self, user_dict):
         """
-        Add user defined word to JiebaTokenizer's dictionary
+        Add user defined word to JiebaTokenizer's dictionary.

         Args:
-            user_dict(path/dict):Dictionary to be added, file path or Python dictionary,
-                Python Dict format: {word1:freq1, word2:freq2,...}
-                Jieba dictionary format : word(required), freq(optional), such as:
-                word1 freq1
-                word2
-                word3 freq3
+            user_dict (str or dict): Dictionary to be added, file path or Python dictionary,
+                Python Dict format: {word1:freq1, word2:freq2,...}.
+                Jieba dictionary format : word(required), freq(optional), such as:
+
+                .. code-block::
+
+                    word1 freq1
+                    word2
+                    word3 freq3
         """
         if isinstance(user_dict, str):
             self.__add_dict_py_file(user_dict)
         elif isinstance(user_dict, dict):
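A short sketch of the JiebaTokenizer workflow described in these hunks. The dictionary paths are placeholders for the cppjieba files the docstring mentions, and the JiebaMode import location is an assumption.

import mindspore.dataset.text as text
from mindspore.dataset.text import JiebaMode  # assumed export location for the mode enum

HMM_FILE = "./cppjieba_dict/hmm_model.utf8"   # placeholder paths; obtain from the cppjieba project
MP_FILE = "./cppjieba_dict/jieba.dict.utf8"

tokenizer = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)

# Both additions only affect this tokenizer instance; nothing is written back to disk.
tokenizer.add_word("长江大桥", freq=20000)
tokenizer.add_dict({"南京市": 10, "江大桥": 5})   # Python dict form: {word: freq}
tokenizer.add_dict("./user_dict.txt")             # or a Jieba-format dictionary file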
@@ -190,12 +200,12 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
     """
     Tokenize scalar token or 1-D tokens to subword tokens.

-    Args
-        vocab(Vocab): a Vocab object.
-        suffix_indicator(string, optional): Used to show that the subword is the last part of a word(default '##').
-        max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default 100).
-        unknown_token(string, optional): When we can not found the token: if 'unknown_token' is empty string,
-            return the token directly, else return 'unknown_token'(default '[UNK]').
+    Args:
+        vocab (Vocab): a Vocab object.
+        suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default '##').
+        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default 100).
+        unknown_token (str, optional): When we can not found the token: if 'unknown_token' is empty string,
+            return the token directly, else return 'unknown_token'(default='[UNK]').
     """

     def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'):
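A sketch of WordpieceTokenizer built from a small in-memory vocab; the vocab contents are illustrative only.

import mindspore.dataset.text as text

vocab = text.Vocab.from_list(["my", "favor", "##ite", "book", "[UNK]"])
tokenizer = text.WordpieceTokenizer(vocab, suffix_indicator="##",
                                    max_bytes_per_token=100, unknown_token="[UNK]")
# Applied through dataset.map(), "favorite" splits into ["favor", "##ite"]; a token the
# vocab cannot cover comes back as "[UNK]" because unknown_token is non-empty.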
@@ -209,7 +219,7 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
 if platform.system().lower() != 'windows':
     class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
         """
-        Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\t', '\r', '\n').
+        Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\t', '\\r', '\\n').
         """
@@ -218,7 +228,7 @@ if platform.system().lower() != 'windows':
         Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.

         Args:
-            keep_whitespace(bool, optional): If or not emit whitespace tokens (default False)
+            keep_whitespace(bool, optional): If or not emit whitespace tokens (default=False).
        """

        def __init__(self, keep_whitespace=False):
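The whitespace- and script-based tokenizers above are only registered on non-Windows platforms, per the surrounding guard. A minimal sketch:

import mindspore.dataset.text as text

ws_tokenizer = text.WhitespaceTokenizer()                        # splits on ICU-defined whitespace
script_tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False)
# dataset = dataset.map(input_columns=["text"], operations=ws_tokenizer)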
@@ -246,9 +256,9 @@ if platform.system().lower() != 'windows':
         Apply normalize operation on utf-8 string tensor.

         Args:
-            normalize_form(Enum, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
+            normalize_form(NormalizeForm, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
                 If set "NONE", will do nothing for input string tensor.
-                If set to any of "NFC", "NFKC", "NFD", "NFKD", will apply normalize operation(default "NFKC").
+                If set to any of "NFC", "NFKC", "NFD", "NFKD", will apply normalize operation(default="NFKC").
                 See http://unicode.org/reports/tr15/ for details.
         """
@@ -260,13 +270,14 @@ if platform.system().lower() != 'windows':
     class RegexReplace(cde.RegexReplaceOp):
         """
         Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.

         See http://userguide.icu-project.org/strings/regexp for support regex pattern.

         Args:
-            pattern(string): the regex expression patterns.
-            replace(string): the string to replace matched element.
+            pattern(str): the regex expression patterns.
+            replace(str): the string to replace matched element.
             replace_all(bool, optional): If False, only replace first matched element;
-                if True, replace all matched elements(default True).
+                if True, replace all matched elements(default=True).
         """

         def __init__(self, pattern, replace, replace_all=True):
@@ -279,13 +290,14 @@ if platform.system().lower() != 'windows':
     class RegexTokenizer(cde.RegexTokenizerOp):
         """
         Tokenize a scalar tensor of UTF-8 string by regex expression pattern.

         See http://userguide.icu-project.org/strings/regexp for support regex pattern.

         Args:
-            delim_pattern(string): The pattern of regex delimiters.
+            delim_pattern(str): The pattern of regex delimiters.
                 The original string will be split by matched elements.
-            keep_delim_pattern(string, optional): The string matched by 'delim_pattern' can be kept as a token
-                if it can be matched by 'keep_delim_pattern'. And the default value is empty string(''),
+            keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token
+                if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''),
                 in this situation, delimiters will not kept as a output token.
         """
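A sketch chaining the normalization and regex ops documented above. The NormalizeForm import location is an assumption, and the regex patterns are illustrative.

import mindspore.dataset.text as text
from mindspore.dataset.text import NormalizeForm  # assumed export location for the enum

normalize = text.NormalizeUTF8(normalize_form=NormalizeForm.NFKC)
strip_digits = text.RegexReplace(pattern=r"\d+", replace="", replace_all=True)
split_tokens = text.RegexTokenizer(delim_pattern=r"\s+", keep_delim_pattern="")
# dataset = dataset.map(input_columns=["text"], operations=[normalize, strip_digits, split_tokens])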
@@ -302,12 +314,12 @@ if platform.system().lower() != 'windows':
         Args:
             lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
                 on input text to make the text to lower case and strip accents characters; If False, only apply
-                NormalizeUTF8('normalization_form' mode) operation on input text(default False).
-            keep_whitespace(bool, optional), If True, the whitespace will be kept in out tokens(default False).
-            normalization_form(Enum, optional), Used to specify a specific normlaize mode,
-                only effective when 'lower_case' is False. See NormalizeUTF8 for details(default 'NONE').
-            preserve_unused_token(bool, optional), If True, do not split special tokens like
-                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default True).
+                NormalizeUTF8('normalization_form' mode) operation on input text(default=False).
+            keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False).
+            normalization_form(NormalizeForm, optional): Used to specify a specific normlaize mode,
+                only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
+            preserve_unused_token(bool, optional): If True, do not split special tokens like
+                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
         """

         def __init__(self, lower_case=False, keep_whitespace=False,
@@ -326,18 +338,18 @@ if platform.system().lower() != 'windows':
         Args:
             vocab(Vocab): a Vocab object.
-            suffix_indicator(string, optional): Used to show that the subword is the last part of a word(default '##').
-            max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default 100).
-            unknown_token(string, optional): When we can not found the token: if 'unknown_token' is empty string,
-                return the token directly, else return 'unknown_token'(default '[UNK]').
+            suffix_indicator(str, optional): Used to show that the subword is the last part of a word(default='##').
+            max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default=100).
+            unknown_token(str, optional): When we can not found the token: if 'unknown_token' is empty string,
+                return the token directly, else return 'unknown_token'(default='[UNK]').
             lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
                 on input text to make the text to lower case and strip accents characters; If False, only apply
-                NormalizeUTF8('normalization_form' mode) operation on input text(default False).
-            keep_whitespace(bool, optional), If True, the whitespace will be kept in out tokens(default False).
-            normalization_form(Enum, optional), Used to specify a specific normlaize mode,
-                only effective when 'lower_case' is False. See NormalizeUTF8 for details(default 'NONE').
-            preserve_unused_token(bool, optional), If True, do not split special tokens like
-                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default True).
+                NormalizeUTF8('normalization_form' mode) operation on input text(default=False).
+            keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False).
+            normalization_form(NormalizeForm, optional): Used to specify a specific normlaize mode,
+                only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
+            preserve_unused_token(bool, optional): If True, do not split special tokens like
+                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
         """

         def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
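A sketch of BertTokenizer with a few of the documented keywords spelled out; "vocab.txt" is a placeholder wordpiece vocabulary file, and only parameters named in the docstring above are passed.

import mindspore.dataset.text as text

vocab = text.Vocab.from_file("vocab.txt")          # one token per line (placeholder file)
tokenizer = text.BertTokenizer(vocab,
                               suffix_indicator="##",
                               max_bytes_per_token=100,
                               unknown_token="[UNK]",
                               lower_case=True,             # CaseFold + NFD + accent stripping
                               preserve_unused_token=True)  # keep [CLS]/[SEP]/[UNK]/[PAD]/[MASK] whole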
mindspore/dataset/text/utils.py
@@ -25,7 +25,9 @@ from .validators import check_from_file, check_from_list, check_from_dict, check
 class Vocab(cde.Vocab):
     """
-    Vocab object that is used to lookup a word. It contains a map that maps each word(str) to an id (int)
+    Vocab object that is used to lookup a word.
+
+    It contains a map that maps each word(str) to an id (int).
     """

     @classmethod
@@ -33,29 +35,32 @@ class Vocab(cde.Vocab):
     def from_dataset(cls, dataset, columns=None, freq_range=None, top_k=None, special_tokens=None,
                      special_first=None):
         """
-        Build a vocab from a dataset. This would collect all unique words in a dataset and return a vocab within
+        Build a vocab from a dataset.
+
+        This would collect all unique words in a dataset and return a vocab within
         the frequency range specified by user in freq_range. User would be warned if no words fall into the frequency.
         Words in vocab are ordered from highest frequency to lowest frequency. Words with the same frequency would be
         ordered lexicographically.

         Args:
             dataset(Dataset): dataset to build vocab from.
-            columns([str, list], optional): column names to get words from. It can be a list of column names.
-                (Default=None where all columns will be used. If any column isn't string type, will return error)
+            columns(list of str, optional): column names to get words from. It can be a list of column names.
+                (default=None, where all columns will be used. If any column isn't string type, will return error).
             freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency
                 range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency=0 is the same as
                 min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words.
                 min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
                 (default=None, all words are included).
             top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
-                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None
+                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None,
                 all words are included).
             special_tokens(list, optional): a list of strings, each one is a special token. for example
                 special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
             special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
                 is specified and special_first is set to None, special_tokens will be prepended. (default=None).

-        return:
-            text.Vocab: Vocab object built from dataset.
+        Returns:
+            Vocab, Vocab object built from dataset.
         """
         vocab = Vocab()
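A from_dataset sketch following the parameters documented above; the corpus path and column name are placeholders.

import mindspore.dataset as ds
import mindspore.dataset.text as text

data = ds.TextFileDataset("corpus.txt")
data = data.map(input_columns=["text"], operations=text.WhitespaceTokenizer())

# Keep words seen at least twice, cap the vocab at the 5000 most frequent words,
# and prepend the special tokens (special_first=None would also prepend).
vocab = text.Vocab.from_dataset(data, columns=["text"], freq_range=(2, None), top_k=5000,
                                special_tokens=["<pad>", "<unk>"], special_first=True)
data = data.map(input_columns=["text"], operations=text.Lookup(vocab, unknown=1))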
@@ -69,7 +74,8 @@ class Vocab(cde.Vocab):
     @check_from_list
     def from_list(cls, word_list, special_tokens=None, special_first=None):
         """
-        build a vocab object from a list of word.
+        Build a vocab object from a list of word.

         Args:
             word_list(list): a list of string where each element is a word of type string.
             special_tokens(list, optional): a list of strings, each one is a special token. for example
@@ -77,34 +83,40 @@ class Vocab(cde.Vocab):
             special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens
                 is specified and special_first is set to None, special_tokens will be prepended. (default=None).
         """
         return super().from_list(word_list, special_tokens, special_first)

     @classmethod
     @check_from_file
     def from_file(cls, file_path, delimiter=None, vocab_size=None, special_tokens=None, special_first=None):
         """
-        build a vocab object from a list of word.
+        Build a vocab object from a list of word.

         Args:
-            file_path(str): path to the file which contains the vocab list.
-            delimiter(str, optional): a delimiter to break up each line in file, the first element is taken to be
+            file_path (str): path to the file which contains the vocab list.
+            delimiter (str, optional): a delimiter to break up each line in file, the first element is taken to be
                 the word (default=None).
-            vocab_size(int, optional): number of words to read from file_path (default=None, all words are taken).
-            special_tokens(list, optional): a list of strings, each one is a special token. for example
+            vocab_size (int, optional): number of words to read from file_path (default=None, all words are taken).
+            special_tokens (list, optional): a list of strings, each one is a special token. for example
                 special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
-            special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens
-                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
+            special_first (bool, optional): whether special_tokens will be prepended/appended to vocab,
+                If special_tokens is specified and special_first is set to None,
+                special_tokens will be prepended. (default=None).
         """
         return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first)

     @classmethod
     @check_from_dict
     def from_dict(cls, word_dict):
         """
-        build a vocab object from a dict.
+        Build a vocab object from a dict.

         Args:
-            word_dict(dict): dict contains word, id pairs where word should be str and id int. id is recommended to
-                start from 0 and be continuous. ValueError will be raised if id is negative.
+            word_dict (dict): dict contains word, id pairs where word should be str and id int. id is recommended to
+                start from 0 and be continuous. ValueError will be raised if id is negative.
         """
         return super().from_dict(word_dict)
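The three remaining constructors side by side, per the signatures above; the file name and word ids are illustrative.

import mindspore.dataset.text as text

v1 = text.Vocab.from_list(["home", "behind", "the", "world"],
                          special_tokens=["<pad>", "<unk>"], special_first=True)
v2 = text.Vocab.from_file("vocab.txt", delimiter=",",
                          special_tokens=["<pad>", "<unk>"], special_first=True)
v3 = text.Vocab.from_dict({"home": 0, "behind": 1, "the": 2, "world": 3, "<unk>": 4})

lookup = text.Lookup(v1, unknown=1)   # out-of-vocab tokens map to id 1 ("<unk>")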
@@ -113,11 +125,11 @@ def to_str(array, encoding='utf8'):
     Convert numpy array of `bytes` to array of `str` by decoding each element based on charset `encoding`.

     Args:
-        array (numpy array): Array of type `bytes` representing strings.
+        array (numpy.ndarray): Array of type `bytes` representing strings.
         encoding (string): Indicating the charset for decoding.

     Returns:
-        Numpy array of `str`.
+        numpy.ndarray, numpy array of `str`.
    """
    if not isinstance(array, np.ndarray):
@@ -131,11 +143,11 @@ def to_bytes(array, encoding='utf8'):
     Convert numpy array of `str` to array of `bytes` by encoding each element based on charset `encoding`.

     Args:
-        array (numpy array): Array of type `str` representing strings.
-        encoding (string): Indicating the charset for encoding.
+        array (numpy.ndarray): Array of type `str` representing strings.
+        encoding (str): Indicating the charset for encoding.

     Returns:
-        Numpy array of `bytes`.
+        numpy.ndarray, numpy array of `bytes`.
    """
    if not isinstance(array, np.ndarray):
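A quick round trip through to_str and to_bytes as documented above:

import numpy as np
import mindspore.dataset.text as text

raw = np.array([b"hello", "你好".encode("utf8")])   # numpy array of bytes
as_str = text.to_str(raw, encoding="utf8")          # -> numpy array of str: ['hello', '你好']
back = text.to_bytes(as_str, encoding="utf8")       # -> numpy array of bytes again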
mindspore/dataset/transforms/c_transforms.py
@@ -75,7 +75,6 @@ class Slice(cde.SliceOp):
     Slice operation to extract a tensor out using the given n slices.

     The functionality of Slice is similar to NumPy indexing feature.
     (Currently only rank 1 Tensors are supported)

     Args:
@@ -87,17 +86,17 @@ class Slice(cde.SliceOp):
         4. Ellipses ...: slice all dimensions between the two slices.

     Examples:
         >>> # Data before
         >>> # |   col   |
         >>> # +---------+
         >>> # | [1,2,3] |
         >>> # +---------|
         >>> data = data.map(operations=Slice(slice(1,3))) # slice indices 1 and 2 only
         >>> # Data after
         >>> # |    col     |
         >>> # +------------+
         >>> # |   [1,2]    |
         >>> # +------------|
     """

     @check_slice_op