diff --git a/mindspore/dataset/text/__init__.py b/mindspore/dataset/text/__init__.py index 7856b55b7a8be04b44debd331cbd72386f831f3d..7c43a2888c2e0278906bbecba1414639d09c98da 100644 --- a/mindspore/dataset/text/__init__.py +++ b/mindspore/dataset/text/__init__.py @@ -11,9 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ -mindspore.dataset.text +This module is to support text processing for nlp. It includes two parts: +transforms and utils. transforms is a high performance +nlp text processing module which is developed with icu4c and cppjieba. +utils provides some general methods for nlp text processing. """ import platform from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \ diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index 9995ae14d024f2af9407bda977cf607fb5f86d8d..fe970e06cc8bf453a74ee987a6699cc623c9a9e6 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -12,9 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -c transforms for all text related operators +The module text.transforms is inherited from _c_dataengine +which is implemented based on icu4c and cppjieba in C++. +It's a high performance module to process nlp text. +Users can use Vocab to build their own dictionary, +use appropriate tokenizers to split sentences into different tokens, +and use Lookup to find the index of tokens in Vocab. + +.. Note:: + Constructor's arguments for every class in this module must be saved into the + class attributes (self.xxx) to support save() and load(). 
+ +Examples: + >>> import mindspore.dataset as ds + >>> import mindspore.dataset.text as text + >>> dataset_file = "path/to/text_file_path" + >>> # sentences as line data saved in a file + >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False) + >>> # tokenize sentence to unicode characters + >>> tokenizer = text.UnicodeCharTokenizer() + >>> # load vocabulary from list + >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) + >>> # lookup is an operation for mapping tokens to ids + >>> lookup = text.Lookup(vocab) + >>> dataset = dataset.map(operations=[tokenizer, lookup]) + >>> for i in dataset.create_dict_iterator(): + >>> print(i) + >>> # if text line in dataset_file is: + >>> # 深圳欢迎您 + >>> # then the output will be: + >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)} """ - import os import re import platform @@ -203,8 +231,8 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): Args: vocab (Vocab): a Vocab object. - suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default '##'). - max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default 100). + suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default='##'). + max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100). unknown_token (str, optional): When we can not found the token: if 'unknown_token' is empty string, return the token directly, else return 'unknown_token'(default='[UNK]'). """ @@ -299,7 +327,7 @@ if platform.system().lower() != 'windows': The original string will be split by matched elements. keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''), - in this situation, delimiters will not kept as a output token. + in this situation, delimiters will not be kept as an output token(default=''). 
""" def __init__(self, delim_pattern, keep_delim_pattern=''): diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py index dc2427aabdaf20b182db61fb088e2333e18f967c..766de76e0141282971a4195a37cc372ff8e7ed1b 100644 --- a/mindspore/dataset/text/utils.py +++ b/mindspore/dataset/text/utils.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Some basic function for text +The module text.utils provides some general methods for nlp text processing. +For example, you can use Vocab to build a dictionary, +use to_bytes and to_str to encode and decode strings into a specified format. """ from enum import IntEnum @@ -52,12 +54,12 @@ class Vocab(cde.Vocab): min_frequency/max_frequency can be None, which corresponds to 0/total_words separately (default=None, all words are included). top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are - taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None, + taken. top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None, all words are included). special_tokens(list, optional): a list of strings, each one is a special token. for example special_tokens=["",""] (default=None, no special tokens will be added). special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens - is specified and special_first is set to None, special_tokens will be prepended. (default=None). + is specified and special_first is set to None, special_tokens will be prepended (default=None). Returns: Vocab, Vocab object built from dataset. @@ -81,7 +83,7 @@ class Vocab(cde.Vocab): special_tokens(list, optional): a list of strings, each one is a special token. for example special_tokens=["",""] (default=None, no special tokens will be added). 
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens - is specified and special_first is set to None, special_tokens will be prepended. (default=None). + is specified and special_first is set to None, special_tokens will be prepended (default=None). """ return super().from_list(word_list, special_tokens, special_first) @@ -101,7 +103,7 @@ class Vocab(cde.Vocab): special_tokens=["",""] (default=None, no special tokens will be added). special_first (bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens is specified and special_first is set to None, - special_tokens will be prepended. (default=None). + special_tokens will be prepended (default=None). """ return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first) @@ -157,12 +159,14 @@ def to_bytes(array, encoding='utf8'): class JiebaMode(IntEnum): + """An enumeration for JiebaTokenizer, effective enumeration types are MIX, MP, HMM.""" MIX = 0 MP = 1 HMM = 2 class NormalizeForm(IntEnum): + """An enumeration for NormalizeUTF8, effective enumeration types are NONE, NFC, NFKC, NFD, NFKD.""" NONE = 0 NFC = 1 NFKC = 2