Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
e4451a1a
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e4451a1a
编写于
6月 29, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
6月 29, 2020
浏览文件
操作
浏览文件
下载
差异文件
!2464 [Dataset] code review & add citation
Merge pull request !2464 from luoyang/pylint
上级
9be17e2a
36d1613f
变更
7
隐藏空白更改
内联
并排
Showing 7 changed files with 148 additions and 9 deletions
+148
-9
mindspore/ccsrc/dataset/engine/datasetops/dataset_op.cc
mindspore/ccsrc/dataset/engine/datasetops/dataset_op.cc
+1
-1
mindspore/ccsrc/dataset/engine/datasetops/dataset_op.h
mindspore/ccsrc/dataset/engine/datasetops/dataset_op.h
+1
-1
mindspore/ccsrc/dataset/kernels/image/image_utils.cc
mindspore/ccsrc/dataset/kernels/image/image_utils.cc
+1
-1
mindspore/ccsrc/dataset/kernels/image/image_utils.h
mindspore/ccsrc/dataset/kernels/image/image_utils.h
+1
-1
mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
...pore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
+2
-2
mindspore/dataset/engine/datasets.py
mindspore/dataset/engine/datasets.py
+138
-1
mindspore/dataset/text/transforms.py
mindspore/dataset/text/transforms.py
+4
-2
未找到文件。
mindspore/ccsrc/dataset/engine/datasetops/dataset_op.cc
浏览文件 @
e4451a1a
...
...
@@ -104,7 +104,7 @@ Status DatasetOp::InsertAsParent(std::shared_ptr<DatasetOp> to_add) {
void
DatasetOp
::
AddParent
(
DatasetOp
*
parent
)
{
parent_
.
push_back
(
parent
);
}
// Removes a parent operator from this operator
void
DatasetOp
::
RemoveParent
(
DatasetOp
*
parent
)
{
void
DatasetOp
::
RemoveParent
(
const
DatasetOp
*
parent
)
{
parent_
.
erase
(
std
::
remove
(
parent_
.
begin
(),
parent_
.
end
(),
parent
),
parent_
.
end
());
}
...
...
mindspore/ccsrc/dataset/engine/datasetops/dataset_op.h
浏览文件 @
e4451a1a
...
...
@@ -275,7 +275,7 @@ class DatasetOp : public std::enable_shared_from_this<DatasetOp> {
// Removes a parent operator from this operator
// @notes External callers do not have access to this function.
// @param parent - The parent node to remove
void
RemoveParent
(
DatasetOp
*
parent
);
void
RemoveParent
(
const
DatasetOp
*
parent
);
// Compute the current op's column map using its child's column map.
// Get called during the tree post-prepare phase in PrepareNodePostAction.
...
...
mindspore/ccsrc/dataset/kernels/image/image_utils.cc
浏览文件 @
e4451a1a
...
...
@@ -782,7 +782,7 @@ Status UpdateBBoxesForCrop(std::shared_ptr<Tensor> *bboxList, size_t *bboxCount,
return
Status
::
OK
();
}
Status
PadBBoxes
(
std
::
shared_ptr
<
Tensor
>
*
bboxList
,
const
size_t
&
bboxCount
,
int32_t
pad_top
,
int32_t
pad_left
)
{
Status
PadBBoxes
(
const
std
::
shared_ptr
<
Tensor
>
*
bboxList
,
const
size_t
&
bboxCount
,
int32_t
pad_top
,
int32_t
pad_left
)
{
for
(
int
i
=
0
;
i
<
bboxCount
;
i
++
)
{
uint32_t
xMin
,
yMin
;
RETURN_IF_NOT_OK
((
*
bboxList
)
->
GetUnsignedIntAt
(
&
xMin
,
{
i
,
0
}));
...
...
mindspore/ccsrc/dataset/kernels/image/image_utils.h
浏览文件 @
e4451a1a
...
...
@@ -244,7 +244,7 @@ Status UpdateBBoxesForCrop(std::shared_ptr<Tensor> *bboxList, size_t *bboxCount,
// @param bboxCount: total Number of bounding boxes - required within caller function to run update loop
// @param pad_top: Total amount of padding applied to image top
// @param pad_left: Total amount of padding applied to image left side
Status
PadBBoxes
(
std
::
shared_ptr
<
Tensor
>
*
bboxList
,
const
size_t
&
bboxCount
,
int32_t
pad_top
,
int32_t
pad_left
);
Status
PadBBoxes
(
const
std
::
shared_ptr
<
Tensor
>
*
bboxList
,
const
size_t
&
bboxCount
,
int32_t
pad_top
,
int32_t
pad_left
);
// Updates bounding boxes for an Image Resize Operation - Takes in set of valid BBoxes
// For e.g those that remain after a crop
...
...
mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
浏览文件 @
e4451a1a
...
...
@@ -81,9 +81,9 @@ Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vect
if
(
!
DecodeRunesInString
(
input_token
.
data
(),
input_token
.
size
(),
runes
))
{
RETURN_STATUS_UNEXPECTED
(
"Decode utf8 string failed."
);
}
int
end
;
int
end
=
0
;
for
(
int
start
=
0
;
start
<
input_token
.
size
();)
{
bool
found
;
bool
found
=
false
;
RETURN_IF_NOT_OK
(
LookupWord
(
input_token
,
runes
,
start
,
&
found
,
&
end
));
if
(
found
)
{
RETURN_IF_NOT_OK
(
AddSubword
(
input_token
,
start
,
end
,
out_tokens
));
...
...
mindspore/dataset/engine/datasets.py
浏览文件 @
e4451a1a
...
...
@@ -2575,6 +2575,22 @@ class MnistDataset(MappableDataset):
- False
- not allowed
Citation of Mnist dataset.
.. code-block::
@article{lecun2010mnist,
title = {MNIST handwritten digit database},
author = {LeCun, Yann and Cortes, Corinna and Burges, CJ},
journal = {ATT Labs [Online]},
volume = {2},
year = {2010},
howpublished = {http://yann.lecun.com/exdb/mnist},
description = {The MNIST database of handwritten digits has a training set of 60,000 examples,
and a test set of 10,000 examples. It is a subset of a larger set available from
NIST. The digits have been size-normalized and centered in a fixed-size image.}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
num_samples (int, optional): The number of images to be included in the dataset
...
...
@@ -3536,6 +3552,20 @@ class Cifar10Dataset(MappableDataset):
- False
- not allowed
Citation of Cifar10 dataset.
.. code-block::
@techreport{Krizhevsky09,
author = {Alex Krizhevsky},
title = {Learning multiple layers of features from tiny images},
institution = {},
year = {2009},
howpublished = {http://www.cs.toronto.edu/~kriz/cifar.html},
description = {The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
with 6000 images per class. There are 50000 training images and 10000 test images.}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
num_samples (int, optional): The number of images to be included in the dataset.
...
...
@@ -3658,6 +3688,22 @@ class Cifar100Dataset(MappableDataset):
- False
- not allowed
Citation of Cifar100 dataset.
.. code-block::
@techreport{Krizhevsky09,
author = {Alex Krizhevsky},
title = {Learning multiple layers of features from tiny images},
institution = {},
year = {2009},
howpublished = {http://www.cs.toronto.edu/~kriz/cifar.html},
description = {This dataset is just like the CIFAR-10, except it has 100 classes containing 600 images
each. There are 500 training images and 100 testing images per class. The 100 classes in
the CIFAR-100 are grouped into 20 superclasses. Each image comes with a "fine" label (the
class to which it belongs) and a "coarse" label (the superclass to which it belongs).}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
num_samples (int, optional): The number of images to be included in the dataset.
...
...
@@ -4026,6 +4072,27 @@ class VOCDataset(MappableDataset):
- False
- not allowed
Citation of VOC dataset.
.. code-block::
@article{Everingham10,
author = {Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.},
title = {The Pascal Visual Object Classes (VOC) Challenge},
journal = {International Journal of Computer Vision},
volume = {88},
year = {2010},
number = {2},
month = {jun},
pages = {303--338},
biburl = {http://host.robots.ox.ac.uk/pascal/VOC/pubs/everingham10.html#bibtex},
howpublished = {http://host.robots.ox.ac.uk/pascal/VOC/voc{year}/index.html},
description = {The PASCAL Visual Object Classes (VOC) challenge is a benchmark in visual
object category recognition and detection, providing the vision and machine
learning communities with a standard dataset of images and annotation, and
standard evaluation procedures.}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
task (str): Set the task type of reading voc data, now only support "Segmentation" or "Detection"
...
...
@@ -4206,6 +4273,30 @@ class CocoDataset(MappableDataset):
- False
- not allowed
Citation of Coco dataset.
.. code-block::
@article{DBLP:journals/corr/LinMBHPRDZ14,
author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and
Lubomir D. Bourdev and Ross B. Girshick and James Hays and
Pietro Perona and Deva Ramanan and Piotr Doll{\'{a}}r and C. Lawrence Zitnick},
title = {Microsoft {COCO:} Common Objects in Context},
journal = {CoRR},
volume = {abs/1405.0312},
year = {2014},
url = {http://arxiv.org/abs/1405.0312},
archivePrefix = {arXiv},
eprint = {1405.0312},
timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
biburl = {https://dblp.org/rec/journals/corr/LinMBHPRDZ14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
description = {COCO is a large-scale object detection, segmentation, and captioning dataset.
It contains 91 common object categories with 82 of them having more than 5,000
labeled instances. In contrast to the popular ImageNet dataset, COCO has fewer
categories but more instances per category.}
}
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
annotation_file (str): Path to the annotation json.
...
...
@@ -4343,6 +4434,35 @@ class CelebADataset(MappableDataset):
into (default=None).
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument should be specified only when num_shards is also specified.
Citation of CelebA dataset.
.. code-block::
@article{DBLP:journals/corr/LiuLWT14,
author = {Ziwei Liu and Ping Luo and Xiaogang Wang and Xiaoou Tang},
title = {Deep Learning Face Attributes in the Wild},
journal = {CoRR},
volume = {abs/1411.7766},
year = {2014},
url = {http://arxiv.org/abs/1411.7766},
archivePrefix = {arXiv},
eprint = {1411.7766},
timestamp = {Tue, 10 Dec 2019 15:37:26 +0100},
biburl = {https://dblp.org/rec/journals/corr/LiuLWT14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
howpublished = {http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html},
description = {CelebFaces Attributes Dataset (CelebA) is a large-scale face attributes dataset
with more than 200K celebrity images, each with 40 attribute annotations. The
images in this dataset cover large pose variations and background clutter. CelebA
has large diversities, large quantities, and rich annotations, including
* 10,177 number of identities,
* 202,599 number of face images, and
* 5 landmark locations, 40 binary attributes annotations per image.
The dataset can be employed as the training and test sets for the following computer
vision tasks: face attribute recognition, face detection, landmark (or facial part)
localization, and face editing & synthesis.}
}
"""
@
check_celebadataset
...
...
@@ -4442,11 +4562,28 @@ class CLUEDataset(SourceDataset):
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument should be specified only when num_shards is also specified.
Citation of CLUE dataset.
.. code-block::
@article{CLUEbenchmark,
title = {CLUE: A Chinese Language Understanding Evaluation Benchmark},
author = {Liang Xu and Xuanwei Zhang and Lu Li and Hai Hu and Chenjie Cao and Weitang Liu and Junyi Li and
Yudong Li and Kai Sun and Yechen Xu and Yiming Cui and Cong Yu and Qianqian Dong and Yin Tian and
Dian Yu and Bo Shi and Jun Zeng and Rongzhao Wang and Weijian Xie and Yanting Li and Yina Patterson and
Zuoyu Tian and Yiwen Zhang and He Zhou and Shaoweihua Liu and Qipeng Zhao and Cong Yue and
Xinrui Zhang and Zhengliang Yang and Zhenzhong Lan},
journal = {arXiv preprint arXiv:2004.05986},
year = {2020},
howpublished = {https://github.com/CLUEbenchmark/CLUE},
description = {CLUE, a Chinese Language Understanding Evaluation benchmark. It contains eight different
tasks, including single-sentence classification, sentence pair classification, and machine
reading comprehension.}
}
Examples:
>>> import mindspore.dataset as ds
>>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files
>>> dataset = ds.CLUEDataset(dataset_files=dataset_files, task='AFQMC', usage='train')
"""
@
check_cluedataset
...
...
mindspore/dataset/text/transforms.py
浏览文件 @
e4451a1a
...
...
@@ -190,7 +190,8 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
if
not
os
.
path
.
exists
(
file_path
):
raise
ValueError
(
"user dict file {} is not exist"
.
format
(
file_path
))
file_dict
=
open
(
file_path
)
real_file_path
=
os
.
path
.
realpath
(
file_path
)
file_dict
=
open
(
real_file_path
)
data_re
=
re
.
compile
(
'^(.+?)( [0-9]+)?$'
,
re
.
U
)
words_list
=
[]
for
item
in
file_dict
:
...
...
@@ -200,8 +201,9 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
words
=
data_re
.
match
(
data
).
groups
()
if
len
(
words
)
!=
2
:
raise
ValueError
(
"user dict file {} format error"
.
format
(
file_path
))
"user dict file {} format error"
.
format
(
real_
file_path
))
words_list
.
append
(
words
)
file_dict
.
close
()
return
words_list
def
__decode
(
self
,
data
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录