From fb4e0c3b42c87a271e85c702aceacdaf0d8ff347 Mon Sep 17 00:00:00 2001
From: LiuChiachi <709153940@qq.com>
Date: Mon, 14 Sep 2020 20:00:41 +0800
Subject: [PATCH] add text/datasets Chinese doc (#2628)

---
 .../text/datasets/conll05/Conll05st_cn.rst    | 84 ++++++++---------
 .../api/paddle/text/datasets/imdb/Imdb_cn.rst | 57 ++++++------
 .../text/datasets/imikolov/Imikolov_cn.rst    | 65 +++++++------
 .../movie_reviews/MovieReviews_cn.rst         | 55 ++++++-----
 .../text/datasets/movielens/Movielens_cn.rst  | 65 +++++++------
 .../datasets/uci_housing/UCIHousing_cn.rst    | 57 ++++++------
 .../paddle/text/datasets/wmt14/WMT14_cn.rst   | 66 +++++++------
 .../paddle/text/datasets/wmt16/WMT16_cn.rst   | 93 +++++++++----------
 8 files changed, 266 insertions(+), 276 deletions(-)

diff --git a/doc/paddle/api/paddle/text/datasets/conll05/Conll05st_cn.rst b/doc/paddle/api/paddle/text/datasets/conll05/Conll05st_cn.rst
index 2bf48e8ee..a459259d2 100644
--- a/doc/paddle/api/paddle/text/datasets/conll05/Conll05st_cn.rst
+++ b/doc/paddle/api/paddle/text/datasets/conll05/Conll05st_cn.rst
@@ -6,59 +6,57 @@ Conll05st
 
 .. py:class:: paddle.text.datasets.Conll05st()
 
-    Implementation of `Conll05st `_
-    test dataset.
+该类是对 `Conll05st `_
+测试数据集的实现。
 
-    Note: only support download test dataset automatically for that
-    only test dataset of Conll05st is public.
+.. note::
+    由于只有Conll05st的测试数据集是公开的,因此仅支持自动下载该测试数据集。
 
-    参数
+参数
 :::::::::
-    data_file(str): path to data tar file, can be set None if
-        :attr:`download` is True. Default None
-    word_dict_file(str): path to word dictionary file, can be set None if
-        :attr:`download` is True. Default None
-    verb_dict_file(str): path to verb dictionary file, can be set None if
-        :attr:`download` is True. Default None
-    target_dict_file(str): path to target dictionary file, can be set None if
-        :attr:`download` is True. Default None
-    emb_file(str): path to embedding dictionary file, only used for
-        :code:`get_embedding` can be set None if :attr:`download` is
-        True. Default None
-    download(bool): whether to download dataset automatically if
-        :attr:`data_file` :attr:`word_dict_file` :attr:`verb_dict_file`
-        :attr:`target_dict_file` is not set. Default True
-
-    Returns:
-        Dataset: instance of conll05st dataset
-
-    代码示例
+    - data_file(str)- 保存数据的路径,如果参数:attr:`download`设置为True,
+    可设置为None。默认为None。
+    - word_dict_file(str)- 保存词典的路径。如果参数:attr:`download`设置为True,
+    可设置为None。默认为None。
+    - verb_dict_file(str)- 保存动词词典的路径。如果参数:attr:`download`设置为True,
+    可设置为None。默认为None。
+    - target_dict_file(str)- 保存目标词典的路径。如果参数:attr:`download`设置为True,
+    可设置为None。默认为None。
+    - emb_file(str)- 保存词嵌入词典的文件,仅在调用:code:`get_embedding`时使用。
+    如果参数:attr:`download`设置为True,可设置为None。默认为None。
+    - download(bool)- 如果:attr:`data_file` :attr:`word_dict_file`
+    :attr:`verb_dict_file` 和:attr:`target_dict_file` 未设置,是否自动下载数据集。默认为True。
+
+返回值
 :::::::::
+``Dataset``,conll05st数据集实例。
 
-    .. code-block:: python
+代码示例
+:::::::::
+
+.. code-block:: python
+
+    import paddle
+    from paddle.text.datasets import Conll05st
+
+    class SimpleNet(paddle.nn.Layer):
+        def __init__(self):
+            super(SimpleNet, self).__init__()
+
+        def forward(self, pred_idx, mark, label):
+            return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label)
+
+    paddle.disable_static()
+
+    conll05st = Conll05st()
+
+    for i in range(10):
+        pred_idx, mark, label = conll05st[i][-3:]
+        pred_idx = paddle.to_tensor(pred_idx)
+        mark = paddle.to_tensor(mark)
+        label = paddle.to_tensor(label)
+
+        model = SimpleNet()
+        pred_idx, mark, label = model(pred_idx, mark, label)
+        print(pred_idx.numpy(), mark.numpy(), label.numpy())
-
\ No newline at end of file
diff --git a/doc/paddle/api/paddle/text/datasets/imdb/Imdb_cn.rst b/doc/paddle/api/paddle/text/datasets/imdb/Imdb_cn.rst
index 27ede2bef..455393089 100644
--- a/doc/paddle/api/paddle/text/datasets/imdb/Imdb_cn.rst
+++ b/doc/paddle/api/paddle/text/datasets/imdb/Imdb_cn.rst
@@ -6,46 +6,45 @@ Imdb
 
 .. py:class:: paddle.text.datasets.Imdb()
 
-    Implementation of `IMDB `_ dataset.
+该类是对 `IMDB `_ 数据集的实现。
 
-    参数
+参数
 :::::::::
-    data_file(str): path to data tar file, can be set None if
-        :attr:`download` is True. Default None
-    mode(str): 'train' 'test' mode. Default 'train'.
-    cutoff(int): cutoff number for building word dictionary. Default 150.
-    download(bool): whether to download dataset automatically if
-        :attr:`data_file` is not set. Default True
-
-    Returns:
-        Dataset: instance of IMDB dataset
+    - data_file(str) - 保存压缩数据的路径,如果参数:attr:`download`设置为True,
+    可设置为None。默认为None。
+    - mode(str) - 'train' 或'test' 模式。默认为'train'。
+    - cutoff(int) - 构建词典的截止大小。默认为150。
+    - download(bool) - 如果:attr:`data_file`未设置,是否自动下载数据集。默认为True。
+
+返回值
 :::::::::
+``Dataset``,IMDB数据集实例。
 
-    代码示例
-    .. code-block:: python
+代码示例
+:::::::::
+
+.. code-block:: python
+
+    import paddle
+    from paddle.text.datasets import Imdb
+
+    class SimpleNet(paddle.nn.Layer):
+        def __init__(self):
+            super(SimpleNet, self).__init__()
+
+        def forward(self, doc, label):
+            return paddle.sum(doc), label
+
+    paddle.disable_static()
+
+    imdb = Imdb(mode='train')
+
+    for i in range(10):
+        doc, label = imdb[i]
+        doc = paddle.to_tensor(doc)
+        label = paddle.to_tensor(label)
+
+        model = SimpleNet()
+        doc, label = model(doc, label)
+        print(doc.numpy().shape, label.numpy().shape)
-
\ No newline at end of file
diff --git a/doc/paddle/api/paddle/text/datasets/imikolov/Imikolov_cn.rst b/doc/paddle/api/paddle/text/datasets/imikolov/Imikolov_cn.rst
index b1c4043c6..714ca6668 100644
--- a/doc/paddle/api/paddle/text/datasets/imikolov/Imikolov_cn.rst
+++ b/doc/paddle/api/paddle/text/datasets/imikolov/Imikolov_cn.rst
@@ -6,48 +6,47 @@ Imikolov
 
 .. py:class:: paddle.text.datasets.Imikolov()
 
-    Implementation of imikolov dataset.
+该类是对imikolov数据集的实现。
 
-    参数
+参数
 :::::::::
-    data_file(str): path to data tar file, can be set None if
-        :attr:`download` is True. Default None
-    data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'.
-    window_size(int): sliding window size for 'NGRAM' data. Default -1.
-    mode(str): 'train' 'test' mode. Default 'train'.
-    min_word_freq(int): minimal word frequence for building word dictionary. Default 50.
-    download(bool): whether to download dataset automatically if
-        :attr:`data_file` is not set. Default True
-
-    Returns:
-        Dataset: instance of imikolov dataset
+    - data_file(str)- 保存数据的路径,如果参数:attr:`download`设置为True,
+    可设置为None。默认为None。
+    - data_type(str)- 'NGRAM'或'SEQ'。默认为'NGRAM'。
+    - window_size(int) - 'NGRAM'数据滑动窗口的大小。默认为-1。
+    - mode(str)- 'train' 或'test' 模式。默认为'train'。
+    - min_word_freq(int)- 构建词典的最小词频。默认为50。
+    - download(bool)- 如果:attr:`data_file`未设置,是否自动下载数据集。默认为True。
+
+返回值
 :::::::::
+``Dataset``,imikolov数据集实例。
 
-    代码示例
-    .. code-block:: python
+代码示例
+:::::::::
+
+.. code-block:: python
+
+    import paddle
+    from paddle.text.datasets import Imikolov
+
+    class SimpleNet(paddle.nn.Layer):
+        def __init__(self):
+            super(SimpleNet, self).__init__()
+
+        def forward(self, src, trg):
+            return paddle.sum(src), paddle.sum(trg)
+
+    paddle.disable_static()
+
+    imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2)
+
+    for i in range(10):
+        src, trg = imikolov[i]
+        src = paddle.to_tensor(src)
+        trg = paddle.to_tensor(trg)
+
+        model = SimpleNet()
+        src, trg = model(src, trg)
+        print(src.numpy().shape, trg.numpy().shape)
-
\ No newline at end of file
diff --git a/doc/paddle/api/paddle/text/datasets/movie_reviews/MovieReviews_cn.rst b/doc/paddle/api/paddle/text/datasets/movie_reviews/MovieReviews_cn.rst
index 6f7458567..b4c99bccf 100644
--- a/doc/paddle/api/paddle/text/datasets/movie_reviews/MovieReviews_cn.rst
+++ b/doc/paddle/api/paddle/text/datasets/movie_reviews/MovieReviews_cn.rst
@@ -6,45 +6,44 @@ MovieReviews
 
 .. py:class:: paddle.text.datasets.MovieReviews()
 
-    Implementation of `NLTK movie reviews `_ dataset.
+该类是对 `NLTK movie reviews `_ 数据集的实现。
 
-    参数
+参数
 :::::::::
-    data_file(str): path to data tar file, can be set None if
-        :attr:`download` is True. Default None
-    mode(str): 'train' 'test' mode. Default 'train'.
-    download(bool): whether auto download cifar dataset if
-        :attr:`data_file` unset. Default True.
-
-    Returns:
-        Dataset: instance of movie reviews dataset
+    - data_file(str)- 保存压缩数据的路径,如果参数:attr:`download`设置为True,
+    可设置为None。默认为None。
+    - mode(str)- 'train'或 'test' 模式。默认为'train'。
+    - download(bool)- 如果:attr:`data_file`未设置,是否自动下载数据集。默认为True。
+
+返回值
 :::::::::
+``Dataset``,NLTK movie reviews数据集实例。
 
-    代码示例
-    .. code-block:: python
+代码示例
+:::::::::
+
+.. code-block:: python
+
+    import paddle
+    from paddle.text.datasets import MovieReviews
+
+    class SimpleNet(paddle.nn.Layer):
+        def __init__(self):
+            super(SimpleNet, self).__init__()
+
+        def forward(self, word, category):
+            return paddle.sum(word), category
+
+    paddle.disable_static()
+
+    movie_reviews = MovieReviews(mode='train')
+
+    for i in range(10):
+        word_list, category = movie_reviews[i]
+        word_list = paddle.to_tensor(word_list)
+        category = paddle.to_tensor(category)
+
+        model = SimpleNet()
+        word_list, category = model(word_list, category)
+        print(word_list.numpy().shape, category.numpy())
-
\ No newline at end of file
diff --git a/doc/paddle/api/paddle/text/datasets/movielens/Movielens_cn.rst b/doc/paddle/api/paddle/text/datasets/movielens/Movielens_cn.rst
index d6cd792bb..7921ec9e0 100644
--- a/doc/paddle/api/paddle/text/datasets/movielens/Movielens_cn.rst
+++ b/doc/paddle/api/paddle/text/datasets/movielens/Movielens_cn.rst
@@ -6,48 +6,47 @@ Movielens
 
 .. py:class:: paddle.text.datasets.Movielens()
 
-    Implementation of `Movielens 1-M `_ dataset.
+该类是对 `Movielens 1-M `_
+数据集的实现。
 
-    参数
+参数
 :::::::::
-    data_file(str): path to data tar file, can be set None if
-        :attr:`download` is True. Default None
-    mode(str): 'train' or 'test' mode. Default 'train'.
-    test_ratio(float): split ratio for test sample. Default 0.1.
-    rand_seed(int): random seed. Default 0.
-    download(bool): whether to download dataset automatically if
-        :attr:`data_file` is not set. Default True
-
-    Returns:
-        Dataset: instance of Movielens 1-M dataset
+    - data_file(str)- 保存压缩数据的路径,如果参数:attr:`download`设置为True,
+    可设置为None。默认为None。
+    - mode(str)- 'train' 或 'test' 模式。默认为'train'。
+    - test_ratio(float) - 为测试集划分的比例。默认为0.1。
+    - rand_seed(int)- 随机数种子。默认为0。
+    - download(bool)- 如果:attr:`data_file`未设置,是否自动下载数据集。默认为True。
+
+返回值
 :::::::::
+``Dataset``,Movielens 1-M数据集实例。
 
-    代码示例
-    .. code-block:: python
+代码示例
+:::::::::
+
+.. code-block:: python
+
+    import paddle
+    from paddle.text.datasets import Movielens
+
+    class SimpleNet(paddle.nn.Layer):
+        def __init__(self):
+            super(SimpleNet, self).__init__()
+
+        def forward(self, category, title, rating):
+            return paddle.sum(category), paddle.sum(title), paddle.sum(rating)
+
+    paddle.disable_static()
+
+    movielens = Movielens(mode='train')
+
+    for i in range(10):
+        category, title, rating = movielens[i][-3:]
+        category = paddle.to_tensor(category)
+        title = paddle.to_tensor(title)
+        rating = paddle.to_tensor(rating)
+
+        model = SimpleNet()
+        category, title, rating = model(category, title, rating)
+        print(category.numpy().shape, title.numpy().shape, rating.numpy().shape)
-
\ No newline at end of file
diff --git a/doc/paddle/api/paddle/text/datasets/uci_housing/UCIHousing_cn.rst b/doc/paddle/api/paddle/text/datasets/uci_housing/UCIHousing_cn.rst
index bebbce26b..8f9ea0974 100644
--- a/doc/paddle/api/paddle/text/datasets/uci_housing/UCIHousing_cn.rst
+++ b/doc/paddle/api/paddle/text/datasets/uci_housing/UCIHousing_cn.rst
@@ -6,46 +6,45 @@ UCIHousing
 
 .. py:class:: paddle.text.datasets.UCIHousing()
 
-    Implementation of `UCI housing `_
-    dataset
+该类是对 `UCI housing `_
+数据集的实现。
 
-    参数
+参数
 :::::::::
-    data_file(str): path to data file, can be set None if
-        :attr:`download` is True. Default None
-    mode(str): 'train' or 'test' mode. Default 'train'.
-    download(bool): whether to download dataset automatically if
-        :attr:`data_file` is not set. Default True
-
-    Returns:
-        Dataset: instance of UCI housing dataset.
+    - data_file(str)- 保存数据的路径,如果参数:attr:`download`设置为True,
+    可设置为None。默认为None。
+    - mode(str)- 'train'或'test'模式。默认为'train'。
+    - download(bool)- 如果:attr:`data_file`未设置,是否自动下载数据集。默认为True。
+
+返回值
 :::::::::
+``Dataset``,UCI housing数据集实例。
 
-    代码示例
-    .. code-block:: python
+代码示例
+:::::::::
+
+.. code-block:: python
+
+    import paddle
+    from paddle.text.datasets import UCIHousing
+
+    class SimpleNet(paddle.nn.Layer):
+        def __init__(self):
+            super(SimpleNet, self).__init__()
+
+        def forward(self, feature, target):
+            return paddle.sum(feature), target
+
+    paddle.disable_static()
+
+    uci_housing = UCIHousing(mode='train')
+
+    for i in range(10):
+        feature, target = uci_housing[i]
+        feature = paddle.to_tensor(feature)
+        target = paddle.to_tensor(target)
+
+        model = SimpleNet()
+        feature, target = model(feature, target)
+        print(feature.numpy().shape, target.numpy())
-
\ No newline at end of file
diff --git a/doc/paddle/api/paddle/text/datasets/wmt14/WMT14_cn.rst b/doc/paddle/api/paddle/text/datasets/wmt14/WMT14_cn.rst
index bf98e04ed..eb25b94c6 100644
--- a/doc/paddle/api/paddle/text/datasets/wmt14/WMT14_cn.rst
+++ b/doc/paddle/api/paddle/text/datasets/wmt14/WMT14_cn.rst
@@ -6,50 +6,48 @@ WMT14
 
 .. py:class:: paddle.text.datasets.WMT14()
 
-    Implementation of `WMT14 `_ test dataset.
-    The original WMT14 dataset is too large and a small set of data for set is
-    provided. This module will download dataset from
-    http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz
+该类是对 `WMT14 `_ 测试数据集的实现。
+由于原始WMT14数据集太大,这里提供了一组较小的数据集。该类将从
+http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz
+下载数据集。
 
-    参数
+参数
 :::::::::
-    data_file(str): path to data tar file, can be set None if
-        :attr:`download` is True. Default None
-    mode(str): 'train', 'test' or 'gen'. Default 'train'
-    dict_size(int): word dictionary size. Default -1.
-    download(bool): whether to download dataset automatically if
-        :attr:`data_file` is not set. Default True
-
-    Returns:
-        Dataset: instance of WMT14 dataset
+    - data_file(str)- 保存数据集压缩文件的路径,如果参数:attr:`download`设置为True,可设置为None。
+    默认为None。
+    - mode(str)- 'train'、'test' 或'gen'。默认为'train'。
+    - dict_size(int)- 词典大小。默认为-1。
+    - download(bool)- 如果:attr:`data_file`未设置,是否自动下载数据集。默认为True。
+
+返回值
 :::::::::
+``Dataset``,WMT14数据集实例。
 
-    代码示例
-    .. code-block:: python
+代码示例
+:::::::::
+
+.. code-block:: python
+
+    import paddle
+    from paddle.text.datasets import WMT14
+
+    class SimpleNet(paddle.nn.Layer):
+        def __init__(self):
+            super(SimpleNet, self).__init__()
+
+        def forward(self, src_ids, trg_ids, trg_ids_next):
+            return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)
+
+    paddle.disable_static()
+
+    wmt14 = WMT14(mode='train', dict_size=50)
+
+    for i in range(10):
+        src_ids, trg_ids, trg_ids_next = wmt14[i]
+        src_ids = paddle.to_tensor(src_ids)
+        trg_ids = paddle.to_tensor(trg_ids)
+        trg_ids_next = paddle.to_tensor(trg_ids_next)
+
+        model = SimpleNet()
+        src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
+        print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy())
-
\ No newline at end of file
diff --git a/doc/paddle/api/paddle/text/datasets/wmt16/WMT16_cn.rst b/doc/paddle/api/paddle/text/datasets/wmt16/WMT16_cn.rst
index 1e150e4ab..8166a1d90 100644
--- a/doc/paddle/api/paddle/text/datasets/wmt16/WMT16_cn.rst
+++ b/doc/paddle/api/paddle/text/datasets/wmt16/WMT16_cn.rst
@@ -6,65 +6,64 @@ WMT16
 
 .. py:class:: paddle.text.datasets.WMT16()
 
-    Implementation of `WMT16 `_ test dataset.
-    ACL2016 Multimodal Machine Translation. Please see this website for more
-    details: http://www.statmt.org/wmt16/multimodal-task.html#task1
+该类是对 `WMT16 `_ 测试数据集的实现。
+ACL2016多模态机器翻译。有关更多详细信息,请访问此网站:
+http://www.statmt.org/wmt16/multimodal-task.html#task1
 
-    If you use the dataset created for your task, please cite the following paper:
-    Multi30K: Multilingual English-German Image Descriptions.
+如果您在任务中使用了该数据集,请引用如下论文:
+Multi30K: Multilingual English-German Image Descriptions.
 
+.. code-block:: text
+
+    @article{elliott-EtAl:2016:VL16,
+        author = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
+        title = {Multi30K: Multilingual English-German Image Descriptions},
+        booktitle = {Proceedings of the 6th Workshop on Vision and Language},
+        year = {2016},
+        pages = {70--74},
+        year = 2016
+    }
 
-    参数
+参数
 :::::::::
-    data_file(str): path to data tar file, can be set None if
-        :attr:`download` is True. Default None
-    mode(str): 'train', 'test' or 'val'. Default 'train'
-    src_dict_size(int): word dictionary size for source language word. Default -1.
-    trg_dict_size(int): word dictionary size for target language word. Default -1.
-    lang(str): source language, 'en' or 'de'. Default 'en'.
-    download(bool): whether to download dataset automatically if
-        :attr:`data_file` is not set. Default True
-
-    Returns:
-        Dataset: instance of WMT16 dataset
-
-    代码示例
+    - data_file(str)- 保存数据集压缩文件的路径,如果参数:attr:`download`设置为True,可设置为None。
+    默认值为None。
+    - mode(str)- 'train', 'test' 或 'val'。默认为'train'。
+    - src_dict_size(int)- 源语言词典大小。默认为-1。
+    - trg_dict_size(int) - 目标语言词典大小。默认为-1。
+    - lang(str)- 源语言,'en' 或 'de'。默认为 'en'。
+    - download(bool)- 如果:attr:`data_file`未设置,是否自动下载数据集。默认为True。
+
+返回值
 :::::::::
+``Dataset``,WMT16数据集实例。
 
-    .. code-block:: python
+代码示例
+:::::::::
+
+.. code-block:: python
+
+    import paddle
+    from paddle.text.datasets import WMT16
+
+    class SimpleNet(paddle.nn.Layer):
+        def __init__(self):
+            super(SimpleNet, self).__init__()
+
+        def forward(self, src_ids, trg_ids, trg_ids_next):
+            return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)
+
+    paddle.disable_static()
+
+    wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50)
+
+    for i in range(10):
+        src_ids, trg_ids, trg_ids_next = wmt16[i]
+        src_ids = paddle.to_tensor(src_ids)
+        trg_ids = paddle.to_tensor(trg_ids)
+        trg_ids_next = paddle.to_tensor(trg_ids_next)
+
+        model = SimpleNet()
+        src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
+        print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy())
-
\ No newline at end of file
-- 
GitLab
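
补充用法示意:上述文档中的各个类都是 ``paddle.io.Dataset`` 的实例,下面给出一个示意性的小例子,展示如何将其放入 ``paddle.io.DataLoader`` 按 batch 读取(此处为假设性写法:沿用上文示例中的 paddle 2.0 动态图 API,选用样本形状固定的 UCIHousing;``batch_size``、``shuffle``、``return_list`` 等取值仅作示意,并非上述文档规定的默认值)。

.. code-block:: python

    # 示意:用 DataLoader 按 batch 读取文档中定义的数据集实例
    # (假设 paddle.io.DataLoader 可用;参数取值仅为示意)
    import paddle
    from paddle.io import DataLoader
    from paddle.text.datasets import UCIHousing

    paddle.disable_static()

    uci_housing = UCIHousing(mode='train')
    # UCIHousing 每条样本的 feature/target 形状固定,可直接使用默认的组 batch 方式
    loader = DataLoader(uci_housing, batch_size=32, shuffle=True, return_list=True)

    for batch in loader:
        feature, target = batch
        print(feature.shape, target.shape)
        break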