Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
e44f0538
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e44f0538
编写于
2月 28, 2017
作者:
Y
Yu Yang
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'wangyi/dataset' into feature/clean_mnist_v2
上级
792875e3
6bc82c8e
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
333 addition
and
41 deletion
+333
-41
python/paddle/v2/dataset/cifar.py
python/paddle/v2/dataset/cifar.py
+82
-0
python/paddle/v2/dataset/common.py
python/paddle/v2/dataset/common.py
+35
-0
python/paddle/v2/dataset/config.py
python/paddle/v2/dataset/config.py
+0
-8
python/paddle/v2/dataset/mnist.py
python/paddle/v2/dataset/mnist.py
+47
-33
python/paddle/v2/dataset/movielens.py
python/paddle/v2/dataset/movielens.py
+120
-0
python/paddle/v2/dataset/tests/common_test.py
python/paddle/v2/dataset/tests/common_test.py
+23
-0
python/paddle/v2/dataset/tests/mnist_test.py
python/paddle/v2/dataset/tests/mnist_test.py
+26
-0
未找到文件。
python/paddle/v2/dataset/cifar.py
0 → 100644
浏览文件 @
e44f0538
"""
CIFAR Dataset.
URL: https://www.cs.toronto.edu/~kriz/cifar.html
the default train_creator, test_creator used for CIFAR-10 dataset.
"""
import
cPickle
import
itertools
import
tarfile
import
numpy
from
common
import
download
__all__
=
[
'cifar_100_train_creator'
,
'cifar_100_test_creator'
,
'train_creator'
,
'test_creator'
]
CIFAR10_URL
=
'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
CIFAR10_MD5
=
'c58f30108f718f92721af3b95e74349a'
CIFAR100_URL
=
'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
CIFAR100_MD5
=
'eb9058c3a382ffc7106e4002c42a8d85'
def __read_batch__(filename, sub_name):
    """
    Build a reader over the pickled batches stored in a CIFAR tarball.

    :param filename: path of the tar archive to read.
    :param sub_name: substring an archive member's name must contain for the
        member to be loaded (callers pass e.g. 'data_batch' or 'test').
    :return: a zero-argument callable; calling it returns a generator of
        ``(sample, label)`` pairs where ``sample`` is the stored row divided
        by 255.0 and cast to ``numpy.float32`` and ``label`` is an ``int``.
    """

    def samples_of(batch):
        # Labels are looked up under 'labels' first and 'fine_labels' second.
        data = batch['data']
        fallback = batch.get('fine_labels', None)
        labels = batch.get('labels', fallback)
        assert labels is not None
        for pixels, tag in itertools.izip(data, labels):
            yield (pixels / 255.0).astype(numpy.float32), int(tag)

    def reader():
        with tarfile.open(filename, mode='r') as archive:
            for member in archive:
                if sub_name not in member.name:
                    continue
                # NOTE(review): unpickling executes arbitrary code; this
                # relies on the archive being the trusted, verified download.
                batch = cPickle.load(archive.extractfile(member.name))
                for pair in samples_of(batch):
                    yield pair

    return reader
def cifar_100_train_creator():
    """
    Train reader creator for the CIFAR-100 dataset.

    Fetches the CIFAR-100 archive through ``download`` and returns a reader
    over the archive members whose name contains 'train'.
    """
    # download() is declared as (url, module_name, md5sum); the previous call
    # passed an unknown ``md5`` keyword and no module name, which raises
    # TypeError before anything is fetched.
    fn = download(url=CIFAR100_URL, module_name='cifar', md5sum=CIFAR100_MD5)
    return __read_batch__(fn, 'train')
def cifar_100_test_creator():
    """
    Test reader creator for the CIFAR-100 dataset.

    Fetches the CIFAR-100 archive through ``download`` and returns a reader
    over the archive members whose name contains 'test'.
    """
    # download() is declared as (url, module_name, md5sum); the previous call
    # passed an unknown ``md5`` keyword and no module name, which raises
    # TypeError before anything is fetched.
    fn = download(url=CIFAR100_URL, module_name='cifar', md5sum=CIFAR100_MD5)
    return __read_batch__(fn, 'test')
def train_creator():
    """
    Default train reader creator. Use CIFAR-10 dataset.

    Returns a reader over the archive members whose name contains
    'data_batch'.
    """
    # download() is declared as (url, module_name, md5sum); the previous call
    # passed an unknown ``md5`` keyword and no module name, which raises
    # TypeError before anything is fetched.
    fn = download(url=CIFAR10_URL, module_name='cifar', md5sum=CIFAR10_MD5)
    return __read_batch__(fn, 'data_batch')
def test_creator():
    """
    Default test reader creator. Use CIFAR-10 dataset.

    Returns a reader over the archive members whose name contains
    'test_batch'.
    """
    # download() is declared as (url, module_name, md5sum); the previous call
    # passed an unknown ``md5`` keyword and no module name, which raises
    # TypeError before anything is fetched.
    fn = download(url=CIFAR10_URL, module_name='cifar', md5sum=CIFAR10_MD5)
    return __read_batch__(fn, 'test_batch')
def unittest():
    """
    Smoke test: exhaust the default train reader, then the default test
    reader, discarding every sample.
    """
    for creator in (train_creator, test_creator):
        for _ in creator()():
            pass
if
__name__
==
'__main__'
:
unittest
()
python/paddle/v2/dataset/common.py
0 → 100644
浏览文件 @
e44f0538
import
requests
import
hashlib
import
os
import
shutil
__all__
=
[
'DATA_HOME'
,
'download'
,
'md5file'
]
DATA_HOME
=
os
.
path
.
expanduser
(
'~/.cache/paddle/dataset'
)
if
not
os
.
path
.
exists
(
DATA_HOME
):
os
.
makedirs
(
DATA_HOME
)
def md5file(fname):
    """
    Return the hexadecimal MD5 digest of a file's contents.

    :param fname: path of the file to hash; it is opened in binary mode.
    :return: the digest as a hexadecimal string.
    """
    hash_md5 = hashlib.md5()
    # The context manager closes the handle even when a read fails; the
    # file is consumed in 4096-byte chunks so it never has to fit in memory.
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def download(url, module_name, md5sum):
    """
    Fetch ``url`` into the dataset cache unless a matching copy exists.

    The file is stored as ``DATA_HOME/<module_name>/<last segment of url>``.
    It is downloaded when that path does not exist or when its MD5 digest
    differs from ``md5sum``.

    :param url: address of the file to fetch.
    :param module_name: name of the sub-directory of DATA_HOME to store into.
    :param md5sum: expected hexadecimal MD5 digest of the cached file.
    :return: local path of the cached file.
    :raises requests.exceptions.HTTPError: if the server answers with an
        error status; nothing is written in that case.
    """
    dirname = os.path.join(DATA_HOME, module_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    filename = os.path.join(dirname, url.split('/')[-1])
    if not (os.path.exists(filename) and md5file(filename) == md5sum):
        # If file doesn't exist or MD5 doesn't match, then download.
        r = requests.get(url, stream=True)
        try:
            # Refuse to cache an error page as if it were the dataset.
            r.raise_for_status()
            # The payload is raw bytes, so the target must be opened in
            # binary mode ('w' would translate newlines on some platforms).
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        finally:
            r.close()
        # NOTE(review): the freshly downloaded file is not checked against
        # md5sum here; a mismatch only triggers a re-download on a later call.

    return filename
python/paddle/v2/dataset/config.py
已删除
100644 → 0
浏览文件 @
792875e3
import
os
__all__
=
[
'DATA_HOME'
]
DATA_HOME
=
os
.
path
.
expanduser
(
'~/.cache/paddle_data_set'
)
if
not
os
.
path
.
exists
(
DATA_HOME
):
os
.
makedirs
(
DATA_HOME
)
python/paddle/v2/dataset/mnist.py
浏览文件 @
e44f0538
import
sklearn.datasets.mldata
import
s
klearn.model_selection
import
paddle.v2.dataset.common
import
s
ubprocess
import
numpy
from
config
import
DATA_HOME
__all__
=
[
'train
_creator'
,
'test_creator
'
]
__all__
=
[
'train
'
,
'test
'
]
URL_PREFIX
=
'http://yann.lecun.com/exdb/mnist/'
TEST_IMAGE_URL
=
URL_PREFIX
+
't10k-images-idx3-ubyte.gz'
TEST_IMAGE_MD5
=
'25e3cc63507ef6e98d5dc541e8672bb6'
TEST_LABEL_URL
=
URL_PREFIX
+
't10k-labels-idx1-ubyte.gz'
TEST_LABEL_MD5
=
'4e9511fe019b2189026bd0421ba7b688'
TRAIN_IMAGE_URL
=
URL_PREFIX
+
'train-images-idx3-ubyte.gz'
TRAIN_IMAGE_MD5
=
'f68b3c2dcbeaaa9fbdd348bbdeb94873'
TRAIN_LABEL_URL
=
URL_PREFIX
+
'train-labels-idx1-ubyte.gz'
TRAIN_LABEL_MD5
=
'd53e105ee54ea40749a09fcbcd1e9432'
def
__mnist_reader_creator__
(
data
,
target
):
def
reader
():
n_samples
=
data
.
shape
[
0
]
for
i
in
xrange
(
n_samples
):
yield
(
data
[
i
]
/
255.0
).
astype
(
numpy
.
float32
),
int
(
target
[
i
])
return
reader
def
reader_creator
(
image_filename
,
label_filename
,
buffer_size
):
def
reader
():
# According to http://stackoverflow.com/a/38061619/724872, we
# cannot use standard package gzip here.
m
=
subprocess
.
Popen
([
"zcat"
,
image_filename
],
stdout
=
subprocess
.
PIPE
)
m
.
stdout
.
read
(
16
)
# skip some magic bytes
l
=
subprocess
.
Popen
([
"zcat"
,
label_filename
],
stdout
=
subprocess
.
PIPE
)
l
.
stdout
.
read
(
8
)
# skip some magic bytes
TEST_SIZE
=
10000
X_train
=
None
X_test
=
None
y_train
=
None
y_test
=
None
while
True
:
labels
=
numpy
.
fromfile
(
l
.
stdout
,
'ubyte'
,
count
=
buffer_size
).
astype
(
"int"
)
if
labels
.
size
!=
buffer_size
:
break
# numpy.fromfile returns empty slice after EOF.
def
__initialize_dataset__
():
global
X_train
,
X_test
,
y_train
,
y_test
if
X_train
is
not
None
:
return
data
=
sklearn
.
datasets
.
mldata
.
fetch_mldata
(
"MNIST original"
,
data_home
=
DATA_HOME
)
X_train
,
X_test
,
y_train
,
y_test
=
sklearn
.
model_selection
.
train_test_split
(
data
.
data
,
data
.
target
,
test_size
=
TEST_SIZE
,
random_state
=
0
)
images
=
numpy
.
fromfile
(
m
.
stdout
,
'ubyte'
,
count
=
buffer_size
*
28
*
28
).
reshape
(
(
buffer_size
,
28
*
28
)).
astype
(
'float32'
)
images
=
images
/
255.0
*
2.0
-
1.0
def
train_creator
():
__initialize_dataset__
()
return
__mnist_reader_creator__
(
X_train
,
y_train
)
for
i
in
xrange
(
buffer_size
):
yield
images
[
i
,
:],
labels
[
i
]
m
.
terminate
()
l
.
terminate
()
def
test_creator
():
__initialize_dataset__
()
return
__mnist_reader_creator__
(
X_test
,
y_test
)
return
reader
()
def
unittest
():
assert
len
(
list
(
test_creator
()()))
==
TEST_SIZE
def train():
    """
    Return the reader built by ``reader_creator`` for the MNIST training
    image and label files, using a buffer size of 100.

    Both files are obtained through ``paddle.v2.dataset.common.download``
    under the 'mnist' module name.
    """
    fetch = paddle.v2.dataset.common.download
    image_path = fetch(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
    label_path = fetch(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
    return reader_creator(image_path, label_path, 100)
if
__name__
==
'__main__'
:
unittest
()
def test():
    """
    Return the reader built by ``reader_creator`` for the MNIST test image
    and label files, using a buffer size of 100.

    Both files are obtained through ``paddle.v2.dataset.common.download``
    under the 'mnist' module name.
    """
    fetch = paddle.v2.dataset.common.download
    image_path = fetch(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
    label_path = fetch(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
    return reader_creator(image_path, label_path, 100)
python/paddle/v2/dataset/movielens.py
0 → 100644
浏览文件 @
e44f0538
import
zipfile
from
common
import
download
import
re
import
random
import
functools
__all__
=
[
'train_creator'
,
'test_creator'
]
class MovieInfo(object):
    """One movie record: integer id, list of category names and title."""

    def __init__(self, index, categories, title):
        """
        :param index: movie id; converted with ``int``.
        :param categories: iterable of category names, stored as given.
        :param title: movie title, stored as given.
        """
        self.index = int(index)
        self.categories = categories
        self.title = title

    def value(self):
        """
        Return ``[index, category ids, title word ids]``.

        Category names are mapped through the module-level CATEGORIES_DICT
        and the lower-cased, whitespace-split title words through
        MOVIE_TITLE_DICT, so both tables must be populated beforehand.
        """
        category_ids = [CATEGORIES_DICT[name] for name in self.categories]
        word_ids = [
            MOVIE_TITLE_DICT[word.lower()] for word in self.title.split()
        ]
        return [self.index, category_ids, word_ids]
class UserInfo(object):
    """One user record: integer id, gender flag, age bucket and job id."""

    def __init__(self, index, gender, age, job_id):
        """
        :param index: user id; converted with ``int``.
        :param gender: the user counts as male only when this equals 'M'.
        :param age: one of 1, 18, 25, 35, 45, 50, 56 (after ``int``); it is
            stored as its position in that list, and any other value raises
            ``ValueError``.
        :param job_id: job id; converted with ``int``.
        """
        age_buckets = [1, 18, 25, 35, 45, 50, 56]
        self.index = int(index)
        self.is_male = gender == 'M'
        self.age = age_buckets.index(int(age))
        self.job_id = int(job_id)

    def value(self):
        """Return ``[index, gender flag, age bucket, job id]``; the gender
        flag is 0 for male users and 1 otherwise."""
        gender_flag = 0 if self.is_male else 1
        return [self.index, gender_flag, self.age, self.job_id]
MOVIE_INFO
=
None
MOVIE_TITLE_DICT
=
None
CATEGORIES_DICT
=
None
USER_INFO
=
None
def __initialize_meta_info__():
    """
    Fetch the MovieLens 1M archive and, on first use, load its metadata.

    When MOVIE_INFO is still None, ``ml-1m/movies.dat`` and
    ``ml-1m/users.dat`` are parsed from the archive and the module-level
    tables MOVIE_INFO (movie id -> MovieInfo), USER_INFO (user id ->
    UserInfo), MOVIE_TITLE_DICT (lower-cased title word -> index) and
    CATEGORIES_DICT (category name -> index) are filled in.

    :return: local path of the downloaded ``ml-1m.zip`` archive.
    """
    global MOVIE_INFO, MOVIE_TITLE_DICT, CATEGORIES_DICT, USER_INFO

    # download() is declared as (url, module_name, md5sum); the previous call
    # passed an unknown ``md5`` keyword and no module name.
    fn = download(
        url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
        module_name='movielens',
        md5sum='c4d9eecfca2ab87c1945afe126590906')
    if MOVIE_INFO is not None:
        return fn

    # Matches "<text>(<digits>)"; group 1 keeps the text before the
    # parenthesised number.
    pattern = re.compile(r'^(.*)\((\d+)\)$')

    # Everything is collected in locals first so that a failure half-way
    # through cannot leave the module tables partially initialised.
    movie_info = dict()
    user_info = dict()
    title_word_set = set()
    categories_set = set()

    with zipfile.ZipFile(file=fn) as package:
        with package.open('ml-1m/movies.dat') as movie_file:
            for line in movie_file:
                movie_id, title, categories = line.strip().split('::')
                categories = categories.split('|')
                categories_set.update(categories)
                title = pattern.match(title).group(1)
                movie_info[int(movie_id)] = MovieInfo(
                    index=movie_id, categories=categories, title=title)
                title_word_set.update(w.lower() for w in title.split())

        with package.open('ml-1m/users.dat') as user_file:
            for line in user_file:
                uid, gender, age, job, _ = line.strip().split("::")
                user_info[int(uid)] = UserInfo(
                    index=uid, gender=gender, age=age, job_id=job)

    MOVIE_TITLE_DICT = dict((w, i) for i, w in enumerate(title_word_set))
    CATEGORIES_DICT = dict((c, i) for i, c in enumerate(categories_set))
    USER_INFO = user_info
    # Assigned last: MOVIE_INFO doubles as the "already initialised" flag.
    MOVIE_INFO = movie_info
    return fn
def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
    """
    Generate samples from ``ml-1m/ratings.dat``.

    One pseudo-random number is drawn per rating line from a generator
    seeded with ``rand_seed``; the line is emitted only when "the draw is
    below ``test_ratio``" equals ``is_test``.

    :param rand_seed: seed for the private ``random.Random`` instance.
    :param test_ratio: threshold compared against each draw.
    :param is_test: which side of the split to emit.
    :return: generator of ``usr.value() + mov.value() + [[score]]`` lists,
        where ``score`` is the rating rescaled as ``rating * 2 - 5.0``.
    """
    fn = __initialize_meta_info__()
    rand = random.Random(x=rand_seed)
    with zipfile.ZipFile(file=fn) as package:
        with package.open('ml-1m/ratings.dat') as rating_file:
            for line in rating_file:
                # The draw happens for every line, kept or not, so the
                # train and test selections stay complementary.
                drawn_as_test = rand.random() < test_ratio
                if not (drawn_as_test == is_test):
                    continue

                uid, mov_id, raw_score, _ = line.strip().split("::")
                uid = int(uid)
                mov_id = int(mov_id)
                score = float(raw_score) * 2 - 5.0

                mov = MOVIE_INFO[mov_id]
                usr = USER_INFO[uid]
                yield usr.value() + mov.value() + [[score]]
def __reader_creator__(**kwargs):
    """
    Return a zero-argument callable that invokes ``__reader__`` with the
    keyword arguments captured here each time it is called.
    """

    def reader():
        return __reader__(**kwargs)

    return reader
train_creator
=
functools
.
partial
(
__reader_creator__
,
is_test
=
False
)
test_creator
=
functools
.
partial
(
__reader_creator__
,
is_test
=
True
)
def unittest():
    """
    Smoke test: exhaust the train and test readers and print the last
    enumeration index seen in each, separated by a space.
    """
    for train_count, _ in enumerate(train_creator()()):
        pass
    for test_count, _ in enumerate(test_creator()()):
        pass

    # A single parenthesised string prints identically with the print
    # statement and the print function.
    print('%d %d' % (train_count, test_count))
if
__name__
==
'__main__'
:
unittest
()
python/paddle/v2/dataset/tests/common_test.py
0 → 100644
浏览文件 @
e44f0538
import
paddle.v2.dataset.common
import
unittest
import
tempfile
class TestCommon(unittest.TestCase):
    """Tests for the helpers in paddle.v2.dataset.common."""

    def test_md5file(self):
        """md5file returns the known MD5 digest of a file holding "Hello\\n"."""
        # Local import: this module's import block does not bring in ``os``.
        import os

        fd, temp_path = tempfile.mkstemp()
        try:
            # mkstemp hands back an open descriptor; wrapping it ensures it
            # is closed instead of leaking for the rest of the test run.
            with os.fdopen(fd, 'w') as f:
                f.write("Hello\n")
            self.assertEqual('09f7e02f1290be211da707a266f153b3',
                             paddle.v2.dataset.common.md5file(temp_path))
        finally:
            # mkstemp never deletes the file it creates.
            os.remove(temp_path)

    def test_download(self):
        """download stores the file under DATA_HOME/<module>/<url tail> and
        returns that path. Requires network access."""
        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
        self.assertEqual(
            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
            paddle.v2.dataset.common.download(
                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/v2/dataset/tests/mnist_test.py
0 → 100644
浏览文件 @
e44f0538
import
paddle.v2.dataset.mnist
import
unittest
class TestMNIST(unittest.TestCase):
    """Checks the sample layout and sample counts of the MNIST readers."""

    def check_reader(self, reader):
        """
        Validate every sample produced by ``reader`` and count them.

        Each sample must carry 784 values in position 0 and a single value
        in position 1 that lies in the range [0, 10).

        :param reader: iterable of samples.
        :return: number of samples seen.
        """
        seen = 0
        for sample in reader:
            pixels = sample[0]
            label = sample[1]
            self.assertEqual(pixels.size, 784)
            self.assertEqual(label.size, 1)
            self.assertLess(label, 10)
            self.assertGreaterEqual(label, 0)
            seen += 1
        return seen

    def test_train(self):
        """The training reader yields 60000 samples."""
        total = self.check_reader(paddle.v2.dataset.mnist.train())
        self.assertEqual(total, 60000)

    def test_test(self):
        """The test reader yields 10000 samples."""
        total = self.check_reader(paddle.v2.dataset.mnist.test())
        self.assertEqual(total, 10000)
if
__name__
==
'__main__'
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录