Commit e4451a1a authored by mindspore-ci-bot and committed by Gitee

!2464 [Dataset] code review & add citation

Merge pull request !2464 from luoyang/pylint
@@ -104,7 +104,7 @@ Status DatasetOp::InsertAsParent(std::shared_ptr<DatasetOp> to_add) {
void DatasetOp::AddParent(DatasetOp *parent) { parent_.push_back(parent); }
// Removes a parent operator from this operator
-void DatasetOp::RemoveParent(DatasetOp *parent) {
+void DatasetOp::RemoveParent(const DatasetOp *parent) {
parent_.erase(std::remove(parent_.begin(), parent_.end(), parent), parent_.end());
}
......
@@ -275,7 +275,7 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
// Removes a parent operator from this operator
// @notes External callers do not have access to this function.
// @param parent - The parent node to remove
-void RemoveParent(DatasetOp *parent);
+void RemoveParent(const DatasetOp *parent);
// Compute the current op's column map using its child's column map.
// Get called during the tree post-prepare phase in PrepareNodePostAction.
......
@@ -782,7 +782,7 @@ Status UpdateBBoxesForCrop(std::shared_ptr<Tensor> *bboxList, size_t *bboxCount,
return Status::OK();
}
-Status PadBBoxes(std::shared_ptr<Tensor> *bboxList, const size_t &bboxCount, int32_t pad_top, int32_t pad_left) {
+Status PadBBoxes(const std::shared_ptr<Tensor> *bboxList, const size_t &bboxCount, int32_t pad_top, int32_t pad_left) {
for (int i = 0; i < bboxCount; i++) {
uint32_t xMin, yMin;
RETURN_IF_NOT_OK((*bboxList)->GetUnsignedIntAt(&xMin, {i, 0}));
......
@@ -244,7 +244,7 @@ Status UpdateBBoxesForCrop(std::shared_ptr<Tensor> *bboxList, size_t *bboxCount,
// @param bboxCount: total number of bounding boxes - required within caller function to run update loop
// @param pad_top: Total amount of padding applied to image top
// @param pad_left: Total amount of padding applied to image left side
-Status PadBBoxes(std::shared_ptr<Tensor> *bboxList, const size_t &bboxCount, int32_t pad_top, int32_t pad_left);
+Status PadBBoxes(const std::shared_ptr<Tensor> *bboxList, const size_t &bboxCount, int32_t pad_top, int32_t pad_left);
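As a side note for reviewers, here is a minimal Python sketch of the coordinate shift PadBBoxes performs, assuming each box row stores (xMin, yMin, width, height) as the GetUnsignedIntAt(&xMin, {i, 0}) call in the definition suggests; the function name and numpy layout are illustrative, not the C++ API.

.. code-block:: python

    import numpy as np

    def pad_bboxes(bboxes, pad_top, pad_left):
        # Padding an image on the top/left shifts every box by the same
        # offset; width and height are unchanged.
        out = np.array(bboxes, copy=True)
        out[:, 0] += pad_left  # xMin moves right by the left padding
        out[:, 1] += pad_top   # yMin moves down by the top padding
        return out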
// Updates bounding boxes for an Image Resize Operation - Takes in set of valid BBoxes
// For e.g those that remain after a crop
......
@@ -81,9 +81,9 @@ Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vect
if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
-int end;
+int end = 0;
for (int start = 0; start < input_token.size();) {
-bool found;
+bool found = false;
RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end));
if (found) {
RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens));
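The loop above is the standard greedy longest-match WordPiece scan. A hedged Python sketch of the same control flow, with `vocab` and the "##" continuation prefix assumed from the usual WordPiece scheme rather than taken from LookupWord's implementation:

.. code-block:: python

    def wordpiece_tokens(token, vocab, unknown="[UNK]"):
        tokens, start = [], 0
        while start < len(token):
            found = False
            # LookupWord: try the longest candidate first, shrinking it
            # until a piece is found in the vocabulary.
            for end in range(len(token), start, -1):
                piece = token[start:end]
                if start > 0:
                    piece = "##" + piece
                if piece in vocab:
                    found = True
                    break
            if not found:         # mirrors `found == false` above
                return [unknown]
            tokens.append(piece)  # AddSubword(input_token, start, end, ...)
            start = end
        return tokens

    # wordpiece_tokens("unhappily", {"un", "##happ", "##ily"})
    # -> ['un', '##happ', '##ily']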
......
@@ -2575,6 +2575,22 @@ class MnistDataset(MappableDataset):
- False
- not allowed
Citation of the MNIST dataset:
.. code-block::
@article{lecun2010mnist,
title = {MNIST handwritten digit database},
author = {LeCun, Yann and Cortes, Corinna and Burges, CJ},
journal = {ATT Labs [Online]},
volume = {2},
year = {2010},
howpublished = {http://yann.lecun.com/exdb/mnist},
description = {The MNIST database of handwritten digits has a training set of 60,000 examples,
and a test set of 10,000 examples. It is a subset of a larger set available from
NIST. The digits have been size-normalized and centered in a fixed-size image.}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
num_samples (int, optional): The number of images to be included in the dataset
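A hedged usage sketch for MnistDataset, in the style of the CLUEDataset example later in this diff; the directory path is a placeholder:

>>> import mindspore.dataset as ds
>>> mnist = ds.MnistDataset(dataset_dir="/path/to/mnist_folder", num_samples=100)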
@@ -3536,6 +3552,20 @@ class Cifar10Dataset(MappableDataset):
- False
- not allowed
Citation of the CIFAR-10 dataset:
.. code-block::
@techreport{Krizhevsky09,
author = {Alex Krizhevsky},
title = {Learning multiple layers of features from tiny images},
institution = {},
year = {2009},
howpublished = {http://www.cs.toronto.edu/~kriz/cifar.html},
description = {The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
with 6000 images per class. There are 50000 training images and 10000 test images.}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
num_samples (int, optional): The number of images to be included in the dataset.
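A matching hedged sketch for Cifar10Dataset (placeholder path):

>>> import mindspore.dataset as ds
>>> cifar10 = ds.Cifar10Dataset(dataset_dir="/path/to/cifar10_folder", num_samples=100)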
@@ -3658,6 +3688,22 @@ class Cifar100Dataset(MappableDataset):
- False
- not allowed
Citation of the CIFAR-100 dataset:
.. code-block::
@techreport{Krizhevsky09,
author = {Alex Krizhevsky},
title = {Learning multiple layers of features from tiny images},
institution = {},
year = {2009},
howpublished = {http://www.cs.toronto.edu/~kriz/cifar.html},
description = {This dataset is just like the CIFAR-10, except it has 100 classes containing 600 images
each. There are 500 training images and 100 testing images per class. The 100 classes in
the CIFAR-100 are grouped into 20 superclasses. Each image comes with a "fine" label (the
class to which it belongs) and a "coarse" label (the superclass to which it belongs).}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
num_samples (int, optional): The number of images to be included in the dataset.
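And the same hedged sketch for Cifar100Dataset (placeholder path):

>>> import mindspore.dataset as ds
>>> cifar100 = ds.Cifar100Dataset(dataset_dir="/path/to/cifar100_folder", num_samples=100)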
@@ -4026,6 +4072,27 @@ class VOCDataset(MappableDataset):
- False
- not allowed
Citation of the VOC dataset:
.. code-block::
@article{Everingham10,
author = {Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.},
title = {The Pascal Visual Object Classes (VOC) Challenge},
journal = {International Journal of Computer Vision},
volume = {88},
year = {2010},
number = {2},
month = {jun},
pages = {303--338},
biburl = {http://host.robots.ox.ac.uk/pascal/VOC/pubs/everingham10.html#bibtex},
howpublished = {http://host.robots.ox.ac.uk/pascal/VOC/voc{year}/index.html},
description = {The PASCAL Visual Object Classes (VOC) challenge is a benchmark in visual
object category recognition and detection, providing the vision and machine
learning communities with a standard dataset of images and annotation, and
standard evaluation procedures.}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
task (str): Set the task type for reading VOC data; currently only "Segmentation" and "Detection" are supported
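A hedged usage sketch with the two documented arguments; the path is a placeholder and the remaining keyword arguments are elided:

>>> import mindspore.dataset as ds
>>> voc = ds.VOCDataset(dataset_dir="/path/to/voc_folder", task="Detection")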
@@ -4206,6 +4273,30 @@ class CocoDataset(MappableDataset):
- False
- not allowed
Citation of the COCO dataset:
.. code-block::
@article{DBLP:journals/corr/LinMBHPRDZ14,
author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and
Lubomir D. Bourdev and Ross B. Girshick and James Hays and
Pietro Perona and Deva Ramanan and Piotr Doll{\'{a}}r and C. Lawrence Zitnick},
title = {Microsoft {COCO:} Common Objects in Context},
journal = {CoRR},
volume = {abs/1405.0312},
year = {2014},
url = {http://arxiv.org/abs/1405.0312},
archivePrefix = {arXiv},
eprint = {1405.0312},
timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
biburl = {https://dblp.org/rec/journals/corr/LinMBHPRDZ14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
description = {COCO is a large-scale object detection, segmentation, and captioning dataset.
It contains 91 common object categories with 82 of them having more than 5,000
labeled instances. In contrast to the popular ImageNet dataset, COCO has fewer
categories but more instances per category.}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
annotation_file (str): Path to the annotation json.
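A hedged usage sketch with the two documented arguments; both paths are placeholders, and other arguments such as the task type are elided:

>>> import mindspore.dataset as ds
>>> coco = ds.CocoDataset(dataset_dir="/path/to/coco_folder", annotation_file="/path/to/annotation.json")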
@@ -4343,6 +4434,35 @@ class CelebADataset(MappableDataset):
into (default=None).
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument should be specified only when num_shards is also specified.
Citation of the CelebA dataset:
.. code-block::
@article{DBLP:journals/corr/LiuLWT14,
author = {Ziwei Liu and Ping Luo and Xiaogang Wang and Xiaoou Tang},
title = {Deep Learning Face Attributes in the Wild},
journal = {CoRR},
volume = {abs/1411.7766},
year = {2014},
url = {http://arxiv.org/abs/1411.7766},
archivePrefix = {arXiv},
eprint = {1411.7766},
timestamp = {Tue, 10 Dec 2019 15:37:26 +0100},
biburl = {https://dblp.org/rec/journals/corr/LiuLWT14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
howpublished = {http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html},
description = {CelebFaces Attributes Dataset (CelebA) is a large-scale face attributes dataset
with more than 200K celebrity images, each with 40 attribute annotations. The
images in this dataset cover large pose variations and background clutter. CelebA
has large diversities, large quantities, and rich annotations, including
* 10,177 number of identities,
* 202,599 number of face images, and
* 5 landmark locations, 40 binary attributes annotations per image.
The dataset can be employed as the training and test sets for the following computer
vision tasks: face attribute recognition, face detection, landmark (or facial part)
localization, and face editing & synthesis.}
}
"""
@check_celebadataset
@@ -4442,11 +4562,28 @@ class CLUEDataset(SourceDataset):
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument should be specified only when num_shards is also specified.
Citation of the CLUE dataset:
.. code-block::
@article{CLUEbenchmark,
title = {CLUE: A Chinese Language Understanding Evaluation Benchmark},
author = {Liang Xu, Xuanwei Zhang, Lu Li, Hai Hu, Chenjie Cao, Weitang Liu, Junyi Li, Yudong Li,
Kai Sun, Yechen Xu, Yiming Cui, Cong Yu, Qianqian Dong, Yin Tian, Dian Yu, Bo Shi, Jun Zeng,
Rongzhao Wang, Weijian Xie, Yanting Li, Yina Patterson, Zuoyu Tian, Yiwen Zhang, He Zhou,
Shaoweihua Liu, Qipeng Zhao, Cong Yue, Xinrui Zhang, Zhengliang Yang, Zhenzhong Lan},
journal = {arXiv preprint arXiv:2004.05986},
year = {2020},
howpublished = {https://github.com/CLUEbenchmark/CLUE},
description = {CLUE, a Chinese Language Understanding Evaluation benchmark. It contains eight different
tasks, including single-sentence classification, sentence pair classification, and machine
reading comprehension.}
}
Examples:
>>> import mindspore.dataset as ds
>>> dataset_files = ["/path/to/1", "/path/to/2"] # contains one or more text files
>>> dataset = ds.CLUEDataset(dataset_files=dataset_files, task='AFQMC', usage='train')
"""
@check_cluedataset
......
@@ -190,7 +190,8 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
if not os.path.exists(file_path):
raise ValueError(
"user dict file {} is not exist".format(file_path))
-file_dict = open(file_path)
+real_file_path = os.path.realpath(file_path)
+file_dict = open(real_file_path)
data_re = re.compile('^(.+?)( [0-9]+)?$', re.U)
words_list = []
for item in file_dict:
@@ -200,8 +201,9 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
words = data_re.match(data).groups()
if len(words) != 2:
raise ValueError(
"user dict file {} format error".format(file_path))
"user dict file {} format error".format(real_file_path))
words_list.append(words)
file_dict.close()
return words_list
def __decode(self, data):
......
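To make the accepted user-dict format concrete, here is a hedged Python illustration of what the regex above matches: one word per line, optionally followed by a space and an integer frequency. The sample lines are invented.

.. code-block:: python

    import re

    # Same pattern as in the tokenizer code above: a word, then an
    # optional " <frequency>" suffix.
    data_re = re.compile('^(.+?)( [0-9]+)?$', re.U)
    words = [data_re.match(line).groups() for line in ["天安门 10", "北京大学"]]
    # -> [('天安门', ' 10'), ('北京大学', None)]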