Commit f837eee7 authored by: H Helin Wang

add paddle.v2.reader,dataset back for backward compatibility

Parent 991b582e
@@ -81,6 +81,7 @@ if (WITH_TESTING)
# enable v2 API unittest only when paddle swig api is compiled
add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/plot/tests)
add_subdirectory(paddle/v2/reader/tests)
endif()
endif()
add_subdirectory(paddle/fluid/tests)
@@ -37,7 +37,7 @@ __all__ = [
'cifar',
'movielens',
'conll05',
'sentiment'
'sentiment',
'uci_housing',
'wmt14',
'wmt16',
@@ -22,13 +22,17 @@ import data_type
import topology
import networks
import evaluator
from . import dataset
from . import reader
from . import plot
import attr
import op
import pooling
import inference
import networks
import minibatch
import plot
import image
import paddle.trainer.config_parser as cp
__all__ = [
@@ -44,11 +48,14 @@ __all__ = [
'data_type',
'attr',
'pooling',
'dataset',
'reader',
'topology',
'networks',
'infer',
'plot',
'evaluator',
'image',
'master',
]
@@ -146,3 +153,4 @@ def init(**kwargs):
infer = inference.infer
batch = minibatch.batch
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Dataset package.
"""
import mnist
import imikolov
import imdb
import cifar
import movielens
import conll05
import uci_housing
import sentiment
import wmt14
import wmt16
import mq2007
import flowers
import voc2012
__all__ = [
'mnist',
'imikolov',
'imdb',
'cifar',
'movielens',
'conll05',
    'sentiment',
'uci_housing',
'wmt14',
'wmt16',
'mq2007',
'flowers',
'voc2012',
]
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CIFAR dataset.
This module will download dataset from
https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
paddle reader creators.
The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
with 6000 images per class. There are 50000 training images and 10000 test
images.
The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
containing 600 images each. There are 500 training images and 100 testing
images per class.
"""
import cPickle
import itertools
import numpy
import paddle.v2.dataset.common
import tarfile
__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
def reader_creator(filename, sub_name):
def read_batch(batch):
data = batch['data']
labels = batch.get('labels', batch.get('fine_labels', None))
assert labels is not None
for sample, label in itertools.izip(data, labels):
yield (sample / 255.0).astype(numpy.float32), int(label)
def reader():
with tarfile.open(filename, mode='r') as f:
names = (each_item.name for each_item in f
if sub_name in each_item.name)
for name in names:
batch = cPickle.load(f.extractfile(name))
for item in read_batch(batch):
yield item
return reader
def train100():
"""
CIFAR-100 training set creator.
It returns a reader creator, each sample in the reader is image pixels in
[0, 1] and label in [0, 99].
:return: Training reader creator
:rtype: callable
"""
return reader_creator(
paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
'train')
def test100():
"""
CIFAR-100 test set creator.
It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].
:return: Test reader creator.
:rtype: callable
"""
return reader_creator(
paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
'test')
def train10():
"""
CIFAR-10 training set creator.
It returns a reader creator, each sample in the reader is image pixels in
[0, 1] and label in [0, 9].
:return: Training reader creator
:rtype: callable
"""
return reader_creator(
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
'data_batch')
def test10():
"""
CIFAR-10 test set creator.
It returns a reader creator, each sample in the reader is image pixels in
[0, 1] and label in [0, 9].
:return: Test reader creator.
:rtype: callable
"""
return reader_creator(
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
'test_batch')
def fetch():
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
def convert(path):
"""
Converts dataset to recordio format
"""
paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
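For reference, a minimal sketch of how these CIFAR reader creators are consumed (illustrative only; it assumes a built paddle.v2 package is importable and the data can be downloaded):

import paddle.v2.dataset.cifar as cifar

# Each sample from train10()/test10() is a (3072-dim float32 vector, int label) pair.
for image, label in cifar.train10()():
    print image.shape, label  # (3072,), label in [0, 9]
    break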
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests
import hashlib
import os
import errno
import shutil
import sys
import importlib
import paddle.v2.dataset
import cPickle
import glob
import cPickle as pickle
__all__ = [
'DATA_HOME',
'download',
'md5file',
'split',
'cluster_files_reader',
'convert',
]
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
# When running unit tests, there could be multiple processes trying to
# create the DATA_HOME directory simultaneously, so we cannot use an if
# condition to check for the existence of the directory; instead, we use
# the filesystem as the synchronization mechanism by catching the returned
# errors.
def must_mkdirs(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
must_mkdirs(DATA_HOME)
def md5file(fname):
hash_md5 = hashlib.md5()
f = open(fname, "rb")
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
f.close()
return hash_md5.hexdigest()
def download(url, module_name, md5sum, save_name=None):
dirname = os.path.join(DATA_HOME, module_name)
if not os.path.exists(dirname):
os.makedirs(dirname)
filename = os.path.join(dirname,
url.split('/')[-1]
if save_name is None else save_name)
retry = 0
retry_limit = 3
while not (os.path.exists(filename) and md5file(filename) == md5sum):
if os.path.exists(filename):
print "file md5", md5file(filename), md5sum
if retry < retry_limit:
retry += 1
else:
raise RuntimeError("Cannot download {0} within retry limit {1}".
format(url, retry_limit))
print "Cache file %s not found, downloading %s" % (filename, url)
r = requests.get(url, stream=True)
total_length = r.headers.get('content-length')
if total_length is None:
with open(filename, 'w') as f:
shutil.copyfileobj(r.raw, f)
else:
with open(filename, 'w') as f:
dl = 0
total_length = int(total_length)
for data in r.iter_content(chunk_size=4096):
dl += len(data)
f.write(data)
done = int(50 * dl / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done,
' ' * (50 - done)))
sys.stdout.flush()
return filename
def fetch_all():
for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.v2.dataset)):
if "fetch" in dir(
importlib.import_module("paddle.v2.dataset.%s" % module_name)):
getattr(
importlib.import_module("paddle.v2.dataset.%s" % module_name),
"fetch")()
def fetch_all_recordio(path):
for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.v2.dataset)):
if "convert" in dir(
importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
not module_name == "common":
ds_path = os.path.join(path, module_name)
must_mkdirs(ds_path)
getattr(
importlib.import_module("paddle.v2.dataset.%s" % module_name),
"convert")(ds_path)
def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
"""
you can call the function as:
split(paddle.v2.dataset.cifar.train10(), line_count=1000,
suffix="imikolov-train-%05d.pickle")
    the output files will be:
|-imikolov-train-00000.pickle
|-imikolov-train-00001.pickle
|- ...
|-imikolov-train-00480.pickle
    :param reader: a reader creator
    :param line_count: the number of lines written to each output file
    :param suffix: the suffix of the output file names; it must contain "%d",
        which is replaced by the index of each file. Default is "%05d.pickle".
    :param dumper: a callable that dumps an object to a file; it is called as
        dumper(obj, f), where obj is the object to be dumped and f is a file
        object. Default is cPickle.dump.
"""
if not callable(dumper):
raise TypeError("dumper should be callable.")
lines = []
indx_f = 0
for i, d in enumerate(reader()):
lines.append(d)
if i >= line_count and i % line_count == 0:
with open(suffix % indx_f, "w") as f:
dumper(lines, f)
lines = []
indx_f += 1
if lines:
with open(suffix % indx_f, "w") as f:
dumper(lines, f)
def cluster_files_reader(files_pattern,
trainer_count,
trainer_id,
loader=cPickle.load):
"""
    Create a reader that yields elements from the given files, selecting
    a subset of the files according to trainer_count and trainer_id.
    :param files_pattern: the file pattern of the files generated by split(...)
    :param trainer_count: the total number of trainers
    :param trainer_id: the rank id of this trainer
    :param loader: a callable that loads an object from a file; it is called
        as loader(f), where f is a file object. Default is cPickle.load.
"""
def reader():
if not callable(loader):
raise TypeError("loader should be callable.")
file_list = glob.glob(files_pattern)
file_list.sort()
my_file_list = []
for idx, fn in enumerate(file_list):
if idx % trainer_count == trainer_id:
print "append file: %s" % fn
my_file_list.append(fn)
for fn in my_file_list:
with open(fn, "r") as f:
lines = loader(f)
for line in lines:
yield line
return reader
def convert(output_path, reader, line_count, name_prefix):
import recordio
"""
Convert data from reader to recordio format files.
:param output_path: directory in which output files will be saved.
:param reader: a data reader, from which the convert program will read
data instances.
:param name_prefix: the name prefix of generated files.
    :param line_count: the number of data instances written to each output
        file.
"""
assert line_count >= 1
indx_f = 0
def write_data(indx_f, lines):
filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
writer = recordio.writer(filename)
for l in lines:
# FIXME(Yancey1989):
# dumps with protocol: pickle.HIGHEST_PROTOCOL
writer.write(cPickle.dumps(l))
writer.close()
lines = []
for i, d in enumerate(reader()):
lines.append(d)
if i % line_count == 0 and i >= line_count:
write_data(indx_f, lines)
lines = []
indx_f += 1
continue
write_data(indx_f, lines)
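A hedged sketch of how split() and cluster_files_reader() fit together when sharding a dataset across trainers (the /tmp paths below are illustrative; it assumes a built paddle.v2 package):

import paddle.v2.dataset.common as common
import paddle.v2.dataset.cifar as cifar

# Split the CIFAR-10 test reader into pickle shards of 1000 samples each.
common.split(cifar.test10(), line_count=1000, suffix="/tmp/cifar-test-%05d.pickle")

# Trainer 0 of 4 reads back only its own subset of the shard files.
reader = common.cluster_files_reader("/tmp/cifar-test-*.pickle", trainer_count=4, trainer_id=0)
for sample, label in reader():
    break  # consume this trainer's shard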
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Conll05 dataset.
The PaddlePaddle semantic role labeling Book and demo use this dataset as an
example. Because CoNLL-05 is not freely available, the default download URL
points to the CoNLL-05 test set (which is public). Users can change the URL
and MD5 to point to their own CoNLL dataset. A pre-trained word vector model
based on a Wikipedia corpus is used to initialize the SRL model.
"""
import tarfile
import gzip
import itertools
import paddle.v2.dataset.common
__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
UNK_IDX = 0
def load_label_dict(filename):
d = dict()
tag_dict = set()
with open(filename, 'r') as f:
for i, line in enumerate(f):
line = line.strip()
if line.startswith("B-"):
tag_dict.add(line[2:])
elif line.startswith("I-"):
tag_dict.add(line[2:])
index = 0
for tag in tag_dict:
d["B-" + tag] = index
index += 1
d["I-" + tag] = index
index += 1
d["O"] = index
return d
def load_dict(filename):
d = dict()
with open(filename, 'r') as f:
for i, line in enumerate(f):
d[line.strip()] = i
return d
def corpus_reader(data_path, words_name, props_name):
"""
    Read one corpus. It returns an iterator. Each element of
    this iterator is a tuple of a sentence and its labels. The sentence
    consists of a list of word IDs, and the labels are a list of label IDs.
    :return: an iterator over the data.
    :rtype: iterator
"""
def reader():
tf = tarfile.open(data_path)
wf = tf.extractfile(words_name)
pf = tf.extractfile(props_name)
with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
fileobj=pf) as props_file:
sentences = []
labels = []
one_seg = []
for word, label in itertools.izip(words_file, props_file):
word = word.strip()
label = label.strip().split()
if len(label) == 0: # end of sentence
for i in xrange(len(one_seg[0])):
a_kind_lable = [x[i] for x in one_seg]
labels.append(a_kind_lable)
if len(labels) >= 1:
verb_list = []
for x in labels[0]:
if x != '-':
verb_list.append(x)
for i, lbl in enumerate(labels[1:]):
cur_tag = 'O'
is_in_bracket = False
lbl_seq = []
verb_word = ''
for l in lbl:
if l == '*' and is_in_bracket == False:
lbl_seq.append('O')
elif l == '*' and is_in_bracket == True:
lbl_seq.append('I-' + cur_tag)
elif l == '*)':
lbl_seq.append('I-' + cur_tag)
is_in_bracket = False
elif l.find('(') != -1 and l.find(')') != -1:
cur_tag = l[1:l.find('*')]
lbl_seq.append('B-' + cur_tag)
is_in_bracket = False
elif l.find('(') != -1 and l.find(')') == -1:
cur_tag = l[1:l.find('*')]
lbl_seq.append('B-' + cur_tag)
is_in_bracket = True
else:
raise RuntimeError('Unexpected label: %s' %
l)
yield sentences, verb_list[i], lbl_seq
sentences = []
labels = []
one_seg = []
else:
sentences.append(word)
one_seg.append(label)
pf.close()
wf.close()
tf.close()
return reader
def reader_creator(corpus_reader,
word_dict=None,
predicate_dict=None,
label_dict=None):
def reader():
for sentence, predicate, labels in corpus_reader():
sen_len = len(sentence)
verb_index = labels.index('B-V')
mark = [0] * len(labels)
if verb_index > 0:
mark[verb_index - 1] = 1
ctx_n1 = sentence[verb_index - 1]
else:
ctx_n1 = 'bos'
if verb_index > 1:
mark[verb_index - 2] = 1
ctx_n2 = sentence[verb_index - 2]
else:
ctx_n2 = 'bos'
mark[verb_index] = 1
ctx_0 = sentence[verb_index]
if verb_index < len(labels) - 1:
mark[verb_index + 1] = 1
ctx_p1 = sentence[verb_index + 1]
else:
ctx_p1 = 'eos'
if verb_index < len(labels) - 2:
mark[verb_index + 2] = 1
ctx_p2 = sentence[verb_index + 2]
else:
ctx_p2 = 'eos'
word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
pred_idx = [predicate_dict.get(predicate)] * sen_len
label_idx = [label_dict.get(w) for w in labels]
yield word_idx, ctx_n2_idx, ctx_n1_idx, \
ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
return reader
def get_dict():
"""
Get the word, verb and label dictionary of Wikipedia corpus.
"""
word_dict = load_dict(
paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
WORDDICT_MD5))
verb_dict = load_dict(
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
VERBDICT_MD5))
label_dict = load_label_dict(
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
TRGDICT_MD5))
return word_dict, verb_dict, label_dict
def get_embedding():
"""
Get the trained word vector based on Wikipedia corpus.
"""
return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
def test():
"""
Conll05 test set creator.
Because the training dataset is not free, the test dataset is used for
    training. It returns a reader creator; each sample in the reader consists
    of nine features, including the sentence sequence, predicate, predicate
    context, predicate context flag and the tagged sequence.
    :return: Test reader creator
:rtype: callable
"""
word_dict, verb_dict, label_dict = get_dict()
reader = corpus_reader(
paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
return reader_creator(reader, word_dict, verb_dict, label_dict)
def fetch():
paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
def convert(path):
"""
Converts dataset to recordio format
"""
paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train")
paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test")
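A minimal sketch of the nine features yielded per sample by the CoNLL-05 reader above (it assumes the dictionaries and test data can be downloaded):

import paddle.v2.dataset.conll05 as conll05

word_dict, verb_dict, label_dict = conll05.get_dict()
reader = conll05.test()
# Each sample unpacks into the nine features documented in test() above.
word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, pred, mark, label = next(reader())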
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module will download the dataset from
http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
and parse the train/test set into paddle reader creators.
This set contains images of flowers belonging to 102 different categories.
The images were acquired by searching the web and taking pictures. There are a
minimum of 40 images for each category.
The database was used in:
Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
number of classes. Proceedings of the Indian Conference on Computer Vision,
Graphics and Image Processing (2008)
http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
"""
import cPickle
import itertools
import functools
from common import download
import tarfile
import scipy.io as scio
from paddle.v2.image import *
from paddle.v2.reader import *
import os
import numpy as np
from multiprocessing import cpu_count
__all__ = ['train', 'test', 'valid']
DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
# In the official 'readme', tstid flags the test data and trnid flags the
# train data. But there are more test images than training images, so we
# swap the train and test splits.
TRAIN_FLAG = 'tstid'
TEST_FLAG = 'trnid'
VALID_FLAG = 'valid'
def default_mapper(is_train, sample):
'''
map image bytes data to type needed by model input layer
'''
img, label = sample
img = load_image_bytes(img)
img = simple_transform(
img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
return img.flatten().astype('float32'), label
train_mapper = functools.partial(default_mapper, True)
test_mapper = functools.partial(default_mapper, False)
def reader_creator(data_file,
label_file,
setid_file,
dataset_name,
mapper,
buffered_size=1024,
use_xmap=True):
'''
1. read images from tar file and
merge images into batch files in 102flowers.tgz_batch/
2. get a reader to read sample from batch file
:param data_file: downloaded data file
:type data_file: string
:param label_file: downloaded label file
:type label_file: string
:param setid_file: downloaded setid file containing information
about how to split dataset
:type setid_file: string
:param dataset_name: data set name (tstid|trnid|valid)
:type dataset_name: string
:param mapper: a function to map image bytes data to type
needed by model input layer
:type mapper: callable
:param buffered_size: the size of buffer used to process images
:type buffered_size: int
:return: data reader
:rtype: callable
'''
labels = scio.loadmat(label_file)['labels'][0]
indexes = scio.loadmat(setid_file)[dataset_name][0]
img2label = {}
for i in indexes:
img = "jpg/image_%05d.jpg" % i
img2label[img] = labels[i - 1]
file_list = batch_images_from_tar(data_file, dataset_name, img2label)
def reader():
for file in open(file_list):
file = file.strip()
batch = None
with open(file, 'r') as f:
batch = cPickle.load(f)
data = batch['data']
labels = batch['label']
for sample, label in itertools.izip(data, batch['label']):
yield sample, int(label) - 1
if use_xmap:
return xmap_readers(mapper, reader, cpu_count(), buffered_size)
else:
return map_readers(mapper, reader)
def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
'''
Create flowers training set reader.
It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [0, 101]
translated from original color image by steps:
1. resize to 256*256
2. random crop to 224*224
3. flatten
:param mapper: a function to map sample.
:type mapper: callable
:param buffered_size: the size of buffer used to process images
:type buffered_size: int
:return: train data reader
:rtype: callable
'''
return reader_creator(
download(DATA_URL, 'flowers', DATA_MD5),
download(LABEL_URL, 'flowers', LABEL_MD5),
download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
buffered_size, use_xmap)
def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
'''
Create flowers test set reader.
It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [0, 101]
translated from original color image by steps:
1. resize to 256*256
2. random crop to 224*224
3. flatten
:param mapper: a function to map sample.
:type mapper: callable
:param buffered_size: the size of buffer used to process images
:type buffered_size: int
:return: test data reader
:rtype: callable
'''
return reader_creator(
download(DATA_URL, 'flowers', DATA_MD5),
download(LABEL_URL, 'flowers', LABEL_MD5),
download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
buffered_size, use_xmap)
def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
'''
Create flowers validation set reader.
It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [0, 101]
translated from original color image by steps:
1. resize to 256*256
2. random crop to 224*224
3. flatten
:param mapper: a function to map sample.
:type mapper: callable
:param buffered_size: the size of buffer used to process images
:type buffered_size: int
:return: test data reader
:rtype: callable
'''
return reader_creator(
download(DATA_URL, 'flowers', DATA_MD5),
download(LABEL_URL, 'flowers', LABEL_MD5),
download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
buffered_size, use_xmap)
def fetch():
download(DATA_URL, 'flowers', DATA_MD5)
download(LABEL_URL, 'flowers', LABEL_MD5)
download(SETID_URL, 'flowers', SETID_MD5)
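An illustrative sketch of the flowers reader above; use_xmap=False keeps the image mapping single-process, which is easier to debug (assumes a built paddle.v2 package and network access):

import paddle.v2.dataset.flowers as flowers

reader = flowers.train(use_xmap=False)
for image, label in reader():
    print image.shape, label  # flattened 3 * 224 * 224 float32 vector
    break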
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
IMDB dataset.
This module downloads IMDB dataset from
http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary.
"""
import paddle.v2.dataset.common
import collections
import tarfile
import re
import string
__all__ = ['build_dict', 'train', 'test', 'convert']
URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
def tokenize(pattern):
"""
Read files that match the given pattern. Tokenize and yield each file.
"""
with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
MD5)) as tarf:
# Note that we should use tarfile.next(), which does
# sequential access of member files, other than
# tarfile.extractfile, which does random access and might
# destroy hard disks.
tf = tarf.next()
while tf != None:
if bool(pattern.match(tf.name)):
# newline and punctuations removal and ad-hoc tokenization.
yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
None, string.punctuation).lower().split()
tf = tarf.next()
def build_dict(pattern, cutoff):
"""
Build a word dictionary from the corpus. Keys of the dictionary are words,
and values are zero-based IDs of these words.
"""
word_freq = collections.defaultdict(int)
for doc in tokenize(pattern):
for word in doc:
word_freq[word] += 1
# Not sure if we should prune less-frequent words here.
word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*dictionary))
word_idx = dict(zip(words, xrange(len(words))))
word_idx['<unk>'] = len(words)
return word_idx
def reader_creator(pos_pattern, neg_pattern, word_idx):
UNK = word_idx['<unk>']
INS = []
def load(pattern, out, label):
for doc in tokenize(pattern):
out.append(([word_idx.get(w, UNK) for w in doc], label))
load(pos_pattern, INS, 0)
load(neg_pattern, INS, 1)
def reader():
for doc, label in INS:
yield doc, label
return reader
def train(word_idx):
"""
IMDB training set creator.
    It returns a reader creator; each sample in the reader is a zero-based ID
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Training reader creator
:rtype: callable
"""
return reader_creator(
re.compile("aclImdb/train/pos/.*\.txt$"),
re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
def test(word_idx):
"""
IMDB test set creator.
    It returns a reader creator; each sample in the reader is a zero-based ID
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Test reader creator
:rtype: callable
"""
return reader_creator(
re.compile("aclImdb/test/pos/.*\.txt$"),
re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
def word_dict():
"""
Build a word dictionary from the corpus.
:return: Word dictionary
:rtype: dict
"""
return build_dict(
re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
def fetch():
paddle.v2.dataset.common.download(URL, 'imdb', MD5)
def convert(path):
"""
Converts dataset to recordio format
"""
w = word_dict()
paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
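A minimal usage sketch of the IMDB readers defined above; word_dict() builds the vocabulary from the downloaded tarball, which can take a while (illustrative only):

import paddle.v2.dataset.imdb as imdb

w = imdb.word_dict()
for ids, label in imdb.train(w)():
    # ids is a list of word indices, label is 0 or 1
    break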
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
imikolov's simple dataset.
This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators.
"""
import paddle.v2.dataset.common
import collections
import tarfile
__all__ = ['train', 'test', 'build_dict', 'convert']
URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
MD5 = '30177ea32e27c525793142b6bf2c8e2d'
class DataType(object):
NGRAM = 1
SEQ = 2
def word_count(f, word_freq=None):
if word_freq is None:
word_freq = collections.defaultdict(int)
for l in f:
for w in l.strip().split():
word_freq[w] += 1
word_freq['<s>'] += 1
word_freq['<e>'] += 1
return word_freq
def build_dict(min_word_freq=50):
"""
    Build a word dictionary from the corpus. Keys of the dictionary are words,
and values are zero-based IDs of these words.
"""
train_filename = './simple-examples/data/ptb.train.txt'
test_filename = './simple-examples/data/ptb.valid.txt'
with tarfile.open(
paddle.v2.dataset.common.download(
paddle.v2.dataset.imikolov.URL, 'imikolov',
paddle.v2.dataset.imikolov.MD5)) as tf:
trainf = tf.extractfile(train_filename)
testf = tf.extractfile(test_filename)
word_freq = word_count(testf, word_count(trainf))
if '<unk>' in word_freq:
# remove <unk> for now, since we will set it as last index
del word_freq['<unk>']
word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(zip(words, xrange(len(words))))
word_idx['<unk>'] = len(words)
return word_idx
def reader_creator(filename, word_idx, n, data_type):
def reader():
with tarfile.open(
paddle.v2.dataset.common.download(
paddle.v2.dataset.imikolov.URL, 'imikolov',
paddle.v2.dataset.imikolov.MD5)) as tf:
f = tf.extractfile(filename)
UNK = word_idx['<unk>']
for l in f:
if DataType.NGRAM == data_type:
assert n > -1, 'Invalid gram length'
l = ['<s>'] + l.strip().split() + ['<e>']
if len(l) >= n:
l = [word_idx.get(w, UNK) for w in l]
for i in range(n, len(l) + 1):
yield tuple(l[i - n:i])
elif DataType.SEQ == data_type:
l = l.strip().split()
l = [word_idx.get(w, UNK) for w in l]
src_seq = [word_idx['<s>']] + l
trg_seq = l + [word_idx['<e>']]
if n > 0 and len(src_seq) > n: continue
yield src_seq, trg_seq
else:
                    assert False, 'Unknown data type'
return reader
def train(word_idx, n, data_type=DataType.NGRAM):
"""
imikolov training set creator.
It returns a reader creator, each sample in the reader is a word ID
tuple.
:param word_idx: word dictionary
:type word_idx: dict
:param n: sliding window size if type is ngram, otherwise max length of sequence
:type n: int
:param data_type: data type (ngram or sequence)
:type data_type: member variable of DataType (NGRAM or SEQ)
:return: Training reader creator
:rtype: callable
"""
return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n,
data_type)
def test(word_idx, n, data_type=DataType.NGRAM):
"""
imikolov test set creator.
It returns a reader creator, each sample in the reader is a word ID
tuple.
:param word_idx: word dictionary
:type word_idx: dict
:param n: sliding window size if type is ngram, otherwise max length of sequence
:type n: int
:param data_type: data type (ngram or sequence)
:type data_type: member variable of DataType (NGRAM or SEQ)
:return: Test reader creator
:rtype: callable
"""
return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n,
data_type)
def fetch():
paddle.v2.dataset.common.download(URL, "imikolov", MD5)
def convert(path):
"""
Converts dataset to recordio format
"""
N = 5
word_dict = build_dict()
paddle.v2.dataset.common.convert(path,
train(word_dict, N), 1000,
"imikolov_train")
paddle.v2.dataset.common.convert(path,
test(word_dict, N), 1000, "imikolov_test")
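A minimal sketch of the imikolov n-gram reader above (N=5 is illustrative; assumes the dataset can be downloaded):

import paddle.v2.dataset.imikolov as imikolov

word_dict = imikolov.build_dict()
# Each sample is a tuple of 5 word ids when data_type is NGRAM.
for gram in imikolov.train(word_dict, 5)():
    print gram
    break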
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MNIST dataset.
This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
parse training set and test set into paddle reader creators.
"""
import paddle.v2.dataset.common
import subprocess
import numpy
import platform
__all__ = ['train', 'test', 'convert']
URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
def reader_creator(image_filename, label_filename, buffer_size):
def reader():
if platform.system() == 'Darwin':
zcat_cmd = 'gzcat'
elif platform.system() == 'Linux':
zcat_cmd = 'zcat'
else:
raise NotImplementedError()
# According to http://stackoverflow.com/a/38061619/724872, we
# cannot use standard package gzip here.
m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
m.stdout.read(16) # skip some magic bytes
l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
l.stdout.read(8) # skip some magic bytes
try: # reader could be break.
while True:
labels = numpy.fromfile(
l.stdout, 'ubyte', count=buffer_size).astype("int")
if labels.size != buffer_size:
break # numpy.fromfile returns empty slice after EOF.
images = numpy.fromfile(
m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
(buffer_size, 28 * 28)).astype('float32')
images = images / 255.0 * 2.0 - 1.0
for i in xrange(buffer_size):
yield images[i, :], int(labels[i])
finally:
m.terminate()
l.terminate()
return reader
def train():
"""
MNIST training set creator.
It returns a reader creator, each sample in the reader is image pixels in
    [-1, 1] and label in [0, 9].
:return: Training reader creator
:rtype: callable
"""
return reader_creator(
paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
TRAIN_IMAGE_MD5),
paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
TRAIN_LABEL_MD5), 100)
def test():
"""
MNIST test set creator.
It returns a reader creator, each sample in the reader is image pixels in
    [-1, 1] and label in [0, 9].
:return: Test reader creator.
:rtype: callable
"""
return reader_creator(
paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
TEST_IMAGE_MD5),
paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
TEST_LABEL_MD5), 100)
def fetch():
paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
def convert(path):
"""
Converts dataset to recordio format
"""
paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train")
paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test")
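A minimal usage sketch of the MNIST readers above (requires zcat/gzcat on the host, as noted in reader_creator):

import paddle.v2.dataset.mnist as mnist

for image, label in mnist.train()():
    # image is a 784-dim float32 vector scaled to [-1, 1], label is in [0, 9]
    break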
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Movielens 1-M dataset.
Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
movies, which was collected by GroupLens Research. This module will download
Movielens 1-M dataset from
http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
set and test set into paddle reader creators.
"""
import zipfile
import paddle.v2.dataset.common
import re
import random
import functools
__all__ = [
'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
'convert'
]
age_table = [1, 18, 25, 35, 45, 50, 56]
URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
MD5 = 'c4d9eecfca2ab87c1945afe126590906'
class MovieInfo(object):
"""
Movie id, title and categories information are stored in MovieInfo.
"""
def __init__(self, index, categories, title):
self.index = int(index)
self.categories = categories
self.title = title
def value(self):
"""
Get information from a movie.
"""
return [
self.index, [CATEGORIES_DICT[c] for c in self.categories],
[MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
]
def __str__(self):
return "<MovieInfo id(%d), title(%s), categories(%s)>" % (
self.index, self.title, self.categories)
def __repr__(self):
return self.__str__()
class UserInfo(object):
"""
User id, gender, age, and job information are stored in UserInfo.
"""
def __init__(self, index, gender, age, job_id):
self.index = int(index)
self.is_male = gender == 'M'
self.age = age_table.index(int(age))
self.job_id = int(job_id)
def value(self):
"""
Get information from a user.
"""
return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
def __str__(self):
return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
self.index, "M"
if self.is_male else "F", age_table[self.age], self.job_id)
def __repr__(self):
return str(self)
MOVIE_INFO = None
MOVIE_TITLE_DICT = None
CATEGORIES_DICT = None
USER_INFO = None
def __initialize_meta_info__():
fn = paddle.v2.dataset.common.download(URL, "movielens", MD5)
global MOVIE_INFO
if MOVIE_INFO is None:
pattern = re.compile(r'^(.*)\((\d+)\)$')
with zipfile.ZipFile(file=fn) as package:
for info in package.infolist():
assert isinstance(info, zipfile.ZipInfo)
MOVIE_INFO = dict()
title_word_set = set()
categories_set = set()
with package.open('ml-1m/movies.dat') as movie_file:
for i, line in enumerate(movie_file):
movie_id, title, categories = line.strip().split('::')
categories = categories.split('|')
for c in categories:
categories_set.add(c)
title = pattern.match(title).group(1)
MOVIE_INFO[int(movie_id)] = MovieInfo(
index=movie_id, categories=categories, title=title)
for w in title.split():
title_word_set.add(w.lower())
global MOVIE_TITLE_DICT
MOVIE_TITLE_DICT = dict()
for i, w in enumerate(title_word_set):
MOVIE_TITLE_DICT[w] = i
global CATEGORIES_DICT
CATEGORIES_DICT = dict()
for i, c in enumerate(categories_set):
CATEGORIES_DICT[c] = i
global USER_INFO
USER_INFO = dict()
with package.open('ml-1m/users.dat') as user_file:
for line in user_file:
uid, gender, age, job, _ = line.strip().split("::")
USER_INFO[int(uid)] = UserInfo(
index=uid, gender=gender, age=age, job_id=job)
return fn
def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
fn = __initialize_meta_info__()
rand = random.Random(x=rand_seed)
with zipfile.ZipFile(file=fn) as package:
with package.open('ml-1m/ratings.dat') as rating:
for line in rating:
if (rand.random() < test_ratio) == is_test:
uid, mov_id, rating, _ = line.strip().split("::")
uid = int(uid)
mov_id = int(mov_id)
rating = float(rating) * 2 - 5.0
mov = MOVIE_INFO[mov_id]
usr = USER_INFO[uid]
yield usr.value() + mov.value() + [[rating]]
def __reader_creator__(**kwargs):
return lambda: __reader__(**kwargs)
train = functools.partial(__reader_creator__, is_test=False)
test = functools.partial(__reader_creator__, is_test=True)
def get_movie_title_dict():
"""
Get movie title dictionary.
"""
__initialize_meta_info__()
return MOVIE_TITLE_DICT
def __max_index_info__(a, b):
if a.index > b.index:
return a
else:
return b
def max_movie_id():
"""
Get the maximum value of movie id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
def max_user_id():
"""
Get the maximum value of user id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, USER_INFO.viewvalues()).index
def __max_job_id_impl__(a, b):
if a.job_id > b.job_id:
return a
else:
return b
def max_job_id():
"""
Get the maximum value of job id.
"""
__initialize_meta_info__()
return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
def movie_categories():
"""
    Get movie categories dictionary.
"""
__initialize_meta_info__()
return CATEGORIES_DICT
def user_info():
"""
Get user info dictionary.
"""
__initialize_meta_info__()
return USER_INFO
def movie_info():
"""
Get movie info dictionary.
"""
__initialize_meta_info__()
return MOVIE_INFO
def unittest():
for train_count, _ in enumerate(train()()):
pass
for test_count, _ in enumerate(test()()):
pass
print train_count, test_count
def fetch():
paddle.v2.dataset.common.download(URL, "movielens", MD5)
def convert(path):
"""
Converts dataset to recordio format
"""
paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train")
paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test")
if __name__ == '__main__':
unittest()
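A minimal sketch of the MovieLens readers and helper queries above (illustrative only; downloads the ml-1m archive on first use):

import paddle.v2.dataset.movielens as movielens

print movielens.max_user_id(), movielens.max_movie_id(), movielens.max_job_id()
for sample in movielens.train()():
    # user fields + movie fields + [[rescaled rating]]
    break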
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MQ2007 dataset
MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross
validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set,
validation set and testing set.
This module downloads the MQ2007 dataset from
http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parses the training set and test set into paddle reader creators.
"""
import os
import sys
import functools
import rarfile
from common import download
import numpy as np
# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
MD5 = "7be1640ae95c6408dab0ae7207bdc706"
def __initialize_meta_info__():
"""
download and extract the MQ2007 dataset
"""
fn = fetch()
rar = rarfile.RarFile(fn)
dirpath = os.path.dirname(fn)
rar.extractall(path=dirpath)
return dirpath
class Query(object):
"""
    Queries used for learning-to-rank algorithms. A Query is created from a
    relevance score and a query-document feature vector.
Parameters:
----------
query_id : int
query_id in dataset, mapping from query to relevance documents
relevance_score : int
relevance score of query and document pair
feature_vector : array, dense feature
feature in vector format
description : string
comment section in query doc pair data
"""
def __init__(self,
query_id=-1,
relevance_score=-1,
feature_vector=None,
description=""):
self.query_id = query_id
self.relevance_score = relevance_score
if feature_vector is None:
self.feature_vector = []
else:
self.feature_vector = feature_vector
self.description = description
def __str__(self):
string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
" ".join(str(f) for f in self.feature_vector))
return string
# @classmethod
def _parse_(self, text):
"""
parse line into Query
"""
comment_position = text.find('#')
line = text[:comment_position].strip()
self.description = text[comment_position + 1:].strip()
parts = line.split()
if len(parts) != 48:
sys.stdout.write("expect 48 space split parts, get %d" %
(len(parts)))
return None
# format : 0 qid:10 1:0.000272 2:0.000000 ....
self.relevance_score = int(parts[0])
self.query_id = int(parts[1].split(':')[1])
for p in parts[2:]:
pair = p.split(':')
self.feature_vector.append(float(pair[1]))
return self
class QueryList(object):
"""
    Group queries into a list; every item in the list is a Query.
"""
def __init__(self, querylist=None):
self.query_id = -1
if querylist is None:
self.querylist = []
else:
self.querylist = querylist
for query in self.querylist:
if self.query_id == -1:
self.query_id = query.query_id
else:
if self.query_id != query.query_id:
raise ValueError("query in list must be same query_id")
def __iter__(self):
for query in self.querylist:
yield query
def __len__(self):
return len(self.querylist)
def __getitem__(self, i):
return self.querylist[i]
def _correct_ranking_(self):
if self.querylist is None:
return
self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)
def _add_query(self, query):
if self.query_id == -1:
self.query_id = query.query_id
else:
if self.query_id != query.query_id:
raise ValueError("query in list must be same query_id")
self.querylist.append(query)
def gen_plain_txt(querylist):
"""
gen plain text in list for other usage
    Parameters:
    --------
    querylist : querylist, one query matches many document pairs in the list, see QueryList
return :
------
query_id : np.array, shape=(samples_num, )
label : np.array, shape=(samples_num, )
querylist : np.array, shape=(samples_num, feature_dimension)
"""
if not isinstance(querylist, QueryList):
querylist = QueryList(querylist)
querylist._correct_ranking_()
for query in querylist:
yield querylist.query_id, query.relevance_score, np.array(
query.feature_vector)
def gen_point(querylist):
"""
gen item in list for point-wise learning to rank algorithm
    Parameters:
    --------
    querylist : querylist, one query matches many document pairs in the list, see QueryList
return :
------
label : np.array, shape=(samples_num, )
querylist : np.array, shape=(samples_num, feature_dimension)
"""
if not isinstance(querylist, QueryList):
querylist = QueryList(querylist)
querylist._correct_ranking_()
for query in querylist:
yield query.relevance_score, np.array(query.feature_vector)
def gen_pair(querylist, partial_order="full"):
"""
gen pair for pair-wise learning to rank algorithm
    Parameters:
    --------
    querylist : querylist, one query matches many document pairs in the list, see QueryList
    partial_order : "full" or "neighbour"
        there is redundancy in the full set of possible pair combinations,
        which can be simplified; generate pairs for neighbouring items only,
        or the full set of partial-order pairs
return :
------
label : np.array, shape=(1)
query_left : np.array, shape=(1, feature_dimension)
query_right : same as left
"""
if not isinstance(querylist, QueryList):
querylist = QueryList(querylist)
querylist._correct_ranking_()
labels = []
docpairs = []
# C(n,2)
for i in range(len(querylist)):
query_left = querylist[i]
for j in range(i + 1, len(querylist)):
query_right = querylist[j]
if query_left.relevance_score > query_right.relevance_score:
labels.append([1])
docpairs.append([
np.array(query_left.feature_vector),
np.array(query_right.feature_vector)
])
elif query_left.relevance_score < query_right.relevance_score:
labels.append([1])
docpairs.append([
np.array(query_right.feature_vector),
np.array(query_left.feature_vector)
])
for label, pair in zip(labels, docpairs):
yield np.array(label), pair[0], pair[1]
def gen_list(querylist):
"""
gen item in list for list-wise learning to rank algorithm
    Parameters:
    --------
    querylist : querylist, one query matches many document pairs in the list, see QueryList
return :
------
label : np.array, shape=(samples_num, )
querylist : np.array, shape=(samples_num, feature_dimension)
"""
if not isinstance(querylist, QueryList):
querylist = QueryList(querylist)
querylist._correct_ranking_()
relevance_score_list = [[query.relevance_score] for query in querylist]
feature_vector_list = [query.feature_vector for query in querylist]
yield np.array(relevance_score_list), np.array(feature_vector_list)
def query_filter(querylists):
"""
    Filter out query lists in which every document has relevance label 0.
    Labels 0, 1, 2 are the relevance scores of a document with respect to the query.
    parameters :
        querylists : QueryList list
    return :
        querylist : QueryList list
"""
filter_query = []
for querylist in querylists:
relevance_score_list = [query.relevance_score for query in querylist]
if sum(relevance_score_list) != .0:
filter_query.append(querylist)
return filter_query
def load_from_text(filepath, shuffle=False, fill_missing=-1):
"""
parse data file into querys
"""
prev_query_id = -1
querylists = []
querylist = None
fn = __initialize_meta_info__()
with open(os.path.join(fn, filepath)) as f:
for line in f:
query = Query()
query = query._parse_(line)
if query == None:
continue
if query.query_id != prev_query_id:
if querylist is not None:
querylists.append(querylist)
querylist = QueryList()
prev_query_id = query.query_id
querylist._add_query(query)
if querylist is not None:
querylists.append(querylist)
return querylists
def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
"""
Parameters
--------
    filepath : string
    fill_missing : the fill value for missing features; the default in MQ2007 is -1
Returns
------
yield
label query_left, query_right # format = "pairwise"
label querylist # format = "listwise"
"""
querylists = query_filter(
load_from_text(
filepath, shuffle=shuffle, fill_missing=fill_missing))
for querylist in querylists:
if format == "plain_txt":
yield next(gen_plain_txt(querylist))
elif format == "pointwise":
yield next(gen_point(querylist))
elif format == "pairwise":
for pair in gen_pair(querylist):
yield pair
elif format == "listwise":
yield next(gen_list(querylist))
train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
def fetch():
return download(URL, "MQ2007", MD5)
if __name__ == "__main__":
fetch()
mytest = functools.partial(
__reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
for label, query in mytest():
print label, query
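A minimal pairwise sketch (the default format) of the MQ2007 reader above; note that the reader is a generator, so it is iterated directly (illustrative only):

import paddle.v2.dataset.mq2007 as mq2007

for label, left_doc, right_doc in mq2007.train(format="pairwise"):
    # label is np.array([1]); left_doc/right_doc are 46-dim feature vectors
    break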
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script fetches and preprocesses the movie_reviews data set provided by NLTK.
TODO(yuyang18): Complete dataset.
"""
import collections
from itertools import chain
import nltk
from nltk.corpus import movie_reviews
import paddle.v2.dataset.common
__all__ = ['train', 'test', 'get_word_dict', 'convert']
NUM_TRAINING_INSTANCES = 1600
NUM_TOTAL_INSTANCES = 2000
def download_data_if_not_yet():
"""
    Download the data set if it has not been downloaded yet.
"""
try:
# make sure that nltk can find the data
if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
movie_reviews.categories()
except LookupError:
print "Downloading movie_reviews data set, please wait....."
nltk.download(
'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
print "Download data set success....."
print "Path is " + nltk.data.find('corpora/movie_reviews').path
def get_word_dict():
"""
    Sort the words by the frequency with which they occur in the samples
:return:
words_freq_sorted
"""
words_freq_sorted = list()
word_freq_dict = collections.defaultdict(int)
download_data_if_not_yet()
for category in movie_reviews.categories():
for field in movie_reviews.fileids(category):
for words in movie_reviews.words(field):
word_freq_dict[words] += 1
words_sort_list = word_freq_dict.items()
words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
for index, word in enumerate(words_sort_list):
words_freq_sorted.append((word[0], index))
return words_freq_sorted
def sort_files():
"""
    Sort the sample files so that negative and positive samples are interleaved when reading
:return:
files_list
"""
files_list = list()
neg_file_list = movie_reviews.fileids('neg')
pos_file_list = movie_reviews.fileids('pos')
files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
return files_list
def load_sentiment_data():
"""
Load the data set
:return:
data_set
"""
data_set = list()
download_data_if_not_yet()
words_ids = dict(get_word_dict())
for sample_file in sort_files():
words_list = list()
category = 0 if 'neg' in sample_file else 1
for word in movie_reviews.words(sample_file):
words_list.append(words_ids[word.lower()])
data_set.append((words_list, category))
return data_set
def reader_creator(data):
"""
Reader creator, generate an iterator for data set
:param data:
train data set or test data set
"""
for each in data:
yield each[0], each[1]
def train():
"""
Default training set reader creator
"""
data_set = load_sentiment_data()
return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
def test():
"""
Default test set reader creator
"""
data_set = load_sentiment_data()
return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
def fetch():
nltk.download(
'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
def convert(path):
"""
Converts dataset to recordio format
"""
paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train")
paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test")
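A minimal sketch of the sentiment readers above; note that train()/test() return generators directly (illustrative only, requires the NLTK movie_reviews corpus):

import paddle.v2.dataset.sentiment as sentiment

for word_ids, category in sentiment.test():
    # category is 0 for a negative review and 1 for a positive one
    break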
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.dataset.cifar
import unittest
class TestCIFAR(unittest.TestCase):
def check_reader(self, reader):
sum = 0
label = 0
for l in reader():
self.assertEqual(l[0].size, 3072)
if l[1] > label:
label = l[1]
sum += 1
return sum, label
def test_test10(self):
instances, max_label_value = self.check_reader(
paddle.v2.dataset.cifar.test10())
self.assertEqual(instances, 10000)
self.assertEqual(max_label_value, 9)
def test_train10(self):
instances, max_label_value = self.check_reader(
paddle.v2.dataset.cifar.train10())
self.assertEqual(instances, 50000)
self.assertEqual(max_label_value, 9)
def test_test100(self):
instances, max_label_value = self.check_reader(
paddle.v2.dataset.cifar.test100())
self.assertEqual(instances, 10000)
self.assertEqual(max_label_value, 99)
def test_train100(self):
instances, max_label_value = self.check_reader(
paddle.v2.dataset.cifar.train100())
self.assertEqual(instances, 50000)
self.assertEqual(max_label_value, 99)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.dataset.common
import unittest
import tempfile
import glob
import recordio
class TestCommon(unittest.TestCase):
def test_md5file(self):
_, temp_path = tempfile.mkstemp()
with open(temp_path, 'w') as f:
f.write("Hello\n")
self.assertEqual('09f7e02f1290be211da707a266f153b3',
paddle.v2.dataset.common.md5file(temp_path))
def test_download(self):
yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
self.assertEqual(
paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
paddle.v2.dataset.common.download(
yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
def test_split(self):
def test_reader():
def reader():
for x in xrange(10):
yield x
return reader
_, temp_path = tempfile.mkstemp()
paddle.v2.dataset.common.split(
test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
        files = glob.glob(temp_path + '/test-*.pickle')
self.assertEqual(len(files), 3)
def test_cluster_file_reader(self):
_, temp_path = tempfile.mkstemp()
for x in xrange(5):
            with open(temp_path + '/%05d.test' % x, 'w') as f:
f.write('%d\n' % x)
reader = paddle.v2.dataset.common.cluster_files_reader(
temp_path + '/*.test', 5, 0)
for idx, e in enumerate(reader()):
self.assertEqual(e, str("0"))
def test_convert(self):
record_num = 10
num_shards = 4
def test_reader():
def reader():
for x in xrange(record_num):
yield x
return reader
path = tempfile.mkdtemp()
paddle.v2.dataset.common.convert(path,
test_reader(), num_shards,
'random_images')
files = glob.glob(path + '/random_images-*')
self.assertEqual(len(files), num_shards)
recs = []
for i in range(0, num_shards):
n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
r = recordio.reader(n)
while True:
d = r.read()
if d is None:
break
recs.append(d)
recs.sort()
        self.assertEqual(len(recs), record_num)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.dataset.flowers
import unittest
class TestFlowers(unittest.TestCase):
def check_reader(self, reader):
sum = 0
label = 0
size = 224 * 224 * 3
for l in reader():
self.assertEqual(l[0].size, size)
if l[1] > label:
label = l[1]
sum += 1
return sum, label
def test_train(self):
instances, max_label_value = self.check_reader(
paddle.v2.dataset.flowers.train())
self.assertEqual(instances, 6149)
self.assertEqual(max_label_value, 102)
def test_test(self):
instances, max_label_value = self.check_reader(
paddle.v2.dataset.flowers.test())
self.assertEqual(instances, 1020)
self.assertEqual(max_label_value, 102)
def test_valid(self):
instances, max_label_value = self.check_reader(
paddle.v2.dataset.flowers.valid())
self.assertEqual(instances, 1020)
self.assertEqual(max_label_value, 102)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.dataset.imdb
import unittest
import re
TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
class TestIMDB(unittest.TestCase):
word_idx = None
def test_build_dict(self):
        if self.word_idx is None:
self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
150)
self.assertEqual(len(self.word_idx), 7036)
def check_dataset(self, dataset, expected_size):
        if self.word_idx is None:
self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
150)
sum = 0
for l in dataset(self.word_idx):
self.assertEqual(l[1], sum % 2)
sum += 1
self.assertEqual(sum, expected_size)
def test_train(self):
self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
def test_test(self):
self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.dataset.imikolov
import unittest
WORD_DICT = paddle.v2.dataset.imikolov.build_dict()
class TestMikolov(unittest.TestCase):
def check_reader(self, reader, n):
for l in reader():
self.assertEqual(len(l), n)
def test_train(self):
n = 5
self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)
first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
'rake regatta rubens sim snack-food ssangyong swapo wachter'
first_line = [
WORD_DICT.get(ch, WORD_DICT['<unk>'])
for ch in first_line.split(' ')
]
for l in paddle.v2.dataset.imikolov.train(
WORD_DICT, n=-1,
data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
read_line = l[0][1:]
break
self.assertEqual(first_line, read_line)
def test_test(self):
n = 5
self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
first_line = 'consumers may want to move their telephones a little '\
'closer to the tv set'
first_line = [
WORD_DICT.get(ch, WORD_DICT['<unk>'])
for ch in first_line.split(' ')
]
for l in paddle.v2.dataset.imikolov.test(
WORD_DICT, n=-1,
data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
read_line = l[0][1:]
break
self.assertEqual(first_line, read_line)
def test_total(self):
_, idx = zip(*WORD_DICT.items())
self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.dataset.mnist
import unittest
class TestMNIST(unittest.TestCase):
def check_reader(self, reader):
sum = 0
label = 0
for l in reader():
self.assertEqual(l[0].size, 784)
if l[1] > label:
label = l[1]
sum += 1
return sum, label
def test_train(self):
instances, max_label_value = self.check_reader(
paddle.v2.dataset.mnist.train())
self.assertEqual(instances, 60000)
self.assertEqual(max_label_value, 9)
def test_test(self):
instances, max_label_value = self.check_reader(
paddle.v2.dataset.mnist.test())
self.assertEqual(instances, 10000)
self.assertEqual(max_label_value, 9)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.dataset.mq2007
import unittest
class TestMQ2007(unittest.TestCase):
def test_pairwise(self):
for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
format="pairwise"):
self.assertEqual(query_left.shape(), (46, ))
self.assertEqual(query_right.shape(), (46, ))
def test_listwise(self):
for label_array, query_array in paddle.v2.dataset.mq2007.test(
format="listwise"):
self.assertEqual(len(label_array), len(query_array))
if __name__ == "__main__":
unittest.main()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import nltk
import paddle.v2.dataset.sentiment as st
from nltk.corpus import movie_reviews
class TestSentimentMethods(unittest.TestCase):
def test_get_word_dict(self):
word_dict = st.get_word_dict()[0:10]
test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
(u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
(u'is', 8), (u'in', 9)]
for idx, each in enumerate(word_dict):
self.assertEqual(each, test_word_list[idx])
self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
def test_sort_files(self):
last_label = ''
for sample_file in st.sort_files():
current_label = sample_file.split("/")[0]
self.assertNotEqual(current_label, last_label)
last_label = current_label
def test_data_set(self):
data_set = st.load_sentiment_data()
last_label = -1
for each in st.test():
self.assertNotEqual(each[1], last_label)
last_label = each[1]
self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
self.assertEqual(
len(list(st.test())),
(st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.dataset.voc2012
import unittest
class TestVOC(unittest.TestCase):
def check_reader(self, reader):
sum = 0
label = 0
for l in reader():
self.assertEqual(l[0].size, 3 * l[1].size)
sum += 1
return sum
def test_train(self):
        count = self.check_reader(paddle.v2.dataset.voc2012.train())
self.assertEqual(count, 2913)
def test_test(self):
        count = self.check_reader(paddle.v2.dataset.voc2012.test())
self.assertEqual(count, 1464)
def test_val(self):
        count = self.check_reader(paddle.v2.dataset.voc2012.val())
self.assertEqual(count, 1449)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.dataset.wmt16
import unittest
class TestWMT16(unittest.TestCase):
def checkout_one_sample(self, sample):
        # train data has 3 fields: source language word indices,
        # target language word indices, and target next word indices.
        self.assertEqual(len(sample), 3)
        # test start mark and end mark in source word indices.
        self.assertEqual(sample[0][0], 0)
        self.assertEqual(sample[0][-1], 1)
        # test start mark in target word indices
        self.assertEqual(sample[1][0], 0)
        # test end mark in target next word indices
        self.assertEqual(sample[2][-1], 1)
def test_train(self):
for idx, sample in enumerate(
paddle.v2.dataset.wmt16.train(
src_dict_size=100000, trg_dict_size=100000)()):
if idx >= 10: break
self.checkout_one_sample(sample)
def test_test(self):
for idx, sample in enumerate(
paddle.v2.dataset.wmt16.test(
src_dict_size=1000, trg_dict_size=1000)()):
if idx >= 10: break
self.checkout_one_sample(sample)
def test_val(self):
for idx, sample in enumerate(
paddle.v2.dataset.wmt16.validation(
src_dict_size=1000, trg_dict_size=1000)()):
if idx >= 10: break
self.checkout_one_sample(sample)
def test_get_dict(self):
dict_size = 1000
word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True)
self.assertEqual(len(word_dict), dict_size)
self.assertEqual(word_dict[0], "<s>")
self.assertEqual(word_dict[1], "<e>")
self.assertEqual(word_dict[2], "<unk>")
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
UCI Housing dataset.
This module will download dataset from
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
parse training set and test set into paddle reader creators.
"""
import numpy as np
import os
import paddle.v2.dataset.common
from paddle.v2.parameters import Parameters
__all__ = ['train', 'test']
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
MD5 = 'd4accdce7a25600298819f8e28e8d593'
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT'
]
UCI_TRAIN_DATA = None
UCI_TEST_DATA = None
URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
def feature_range(maximums, minimums):
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
feature_num = len(maximums)
ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
ax.set_title('feature scale')
plt.xticks(range(feature_num), feature_names)
plt.xlim([-1, feature_num])
fig.set_figheight(6)
fig.set_figwidth(10)
if not os.path.exists('./image'):
os.makedirs('./image')
fig.savefig('image/ranges.png', dpi=48)
plt.close(fig)
def load_data(filename, feature_num=14, ratio=0.8):
global UCI_TRAIN_DATA, UCI_TEST_DATA
if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
return
data = np.fromfile(filename, sep=' ')
data = data.reshape(data.shape[0] / feature_num, feature_num)
maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
axis=0) / data.shape[0]
feature_range(maximums[:-1], minimums[:-1])
for i in xrange(feature_num - 1):
data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
offset = int(data.shape[0] * ratio)
UCI_TRAIN_DATA = data[:offset]
UCI_TEST_DATA = data[offset:]
def train():
"""
UCI_HOUSING training set creator.
    It returns a reader creator; each sample in the reader is a pair of
    normalized features and the house price.
:return: Training reader creator
:rtype: callable
"""
global UCI_TRAIN_DATA
load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
def reader():
for d in UCI_TRAIN_DATA:
yield d[:-1], d[-1:]
return reader
def test():
"""
UCI_HOUSING test set creator.
    It returns a reader creator; each sample in the reader is a pair of
    normalized features and the house price.
:return: Test reader creator
:rtype: callable
"""
global UCI_TEST_DATA
load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
def reader():
for d in UCI_TEST_DATA:
yield d[:-1], d[-1:]
return reader
def model():
tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
MD5_MODEL)
with open(tar_file, 'r') as f:
parameters = Parameters.from_tar(f)
return parameters
def fetch():
paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
def convert(path):
"""
Converts dataset to recordio format
"""
paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train")
    paddle.v2.dataset.common.convert(path, test(), 1000, "uci_housing_test")
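# A minimal usage sketch, not part of the original module: the train() reader
# yields (normalized feature vector, price) pairs after downloading the data.
def _demo_uci_housing(num_samples=3):
    samples = []
    for features, price in train()():
        samples.append((features, price))
        if len(samples) == num_samples:
            break
    return samples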
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image dataset for segmentation.
The 2012 dataset contains images from 2008-2011 for which additional
segmentations have been prepared. As in previous years the assignment
to training/test sets has been maintained. The total number of images
with segmentation has been increased from 7,062 to 9,993.
"""
import tarfile
import io
import numpy as np
from paddle.v2.dataset.common import download
from paddle.v2.image import *
from PIL import Image
__all__ = ['train', 'test', 'val']
VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\
VOCtrainval_11-May-2012.tar'
VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
CACHE_DIR = 'voc2012'
def reader_creator(filename, sub_name):
tarobject = tarfile.open(filename)
name2mem = {}
for ele in tarobject.getmembers():
name2mem[ele.name] = ele
def reader():
set_file = SET_FILE.format(sub_name)
sets = tarobject.extractfile(name2mem[set_file])
for line in sets:
line = line.strip()
data_file = DATA_FILE.format(line)
label_file = LABEL_FILE.format(line)
data = tarobject.extractfile(name2mem[data_file]).read()
label = tarobject.extractfile(name2mem[label_file]).read()
data = Image.open(io.BytesIO(data))
label = Image.open(io.BytesIO(label))
data = np.array(data)
label = np.array(label)
yield data, label
return reader
def train():
"""
Create a train dataset reader containing 2913 images in HWC order.
"""
return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval')
def test():
"""
Create a test dataset reader containing 1464 images in HWC order.
"""
return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train')
def val():
"""
Create a val dataset reader containing 1449 images in HWC order.
"""
return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val')
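# A minimal usage sketch, not part of the original module: fetch the first
# (image, label) pair from the train reader; both are numpy arrays.
def _demo_first_sample():
    for image_array, label_array in train()():
        # image is in HWC layout, label is an HW class-index map
        return image_array.shape, label_array.shape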
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
WMT14 dataset.
The original WMT14 dataset is too large, so a shrunk subset is provided
instead. This module will download the dataset from
http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse the training set and test set into paddle reader creators.
"""
import tarfile
import gzip
import paddle.v2.dataset.common
from paddle.v2.parameters import Parameters
__all__ = [
'train',
'test',
'get_dict',
'convert',
]
URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
'cslm_joint_paper/data/dev+test.tgz')
MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
# this is a small set of data for testing. The original data is too large and
# will be added later.
URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
'wmt_shrinked_data/wmt14.tgz')
MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
# BLEU of this trained model is 26.92
URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
START = "<s>"
END = "<e>"
UNK = "<unk>"
UNK_IDX = 2
def __read_to_dict(tar_file, dict_size):
def __to_dict(fd, size):
out_dict = dict()
for line_count, line in enumerate(fd):
if line_count < size:
out_dict[line.strip()] = line_count
else:
break
return out_dict
with tarfile.open(tar_file, mode='r') as f:
names = [
each_item.name for each_item in f
if each_item.name.endswith("src.dict")
]
assert len(names) == 1
src_dict = __to_dict(f.extractfile(names[0]), dict_size)
names = [
each_item.name for each_item in f
if each_item.name.endswith("trg.dict")
]
assert len(names) == 1
trg_dict = __to_dict(f.extractfile(names[0]), dict_size)
return src_dict, trg_dict
def reader_creator(tar_file, file_name, dict_size):
def reader():
src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
with tarfile.open(tar_file, mode='r') as f:
names = [
each_item.name for each_item in f
if each_item.name.endswith(file_name)
]
for name in names:
for line in f.extractfile(name):
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
src_seq = line_split[0] # one source sequence
src_words = src_seq.split()
src_ids = [
src_dict.get(w, UNK_IDX)
for w in [START] + src_words + [END]
]
trg_seq = line_split[1] # one target sequence
trg_words = trg_seq.split()
trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
# remove sequence whose length > 80 in training mode
if len(src_ids) > 80 or len(trg_ids) > 80:
continue
trg_ids_next = trg_ids + [trg_dict[END]]
trg_ids = [trg_dict[START]] + trg_ids
yield src_ids, trg_ids, trg_ids_next
return reader
def train(dict_size):
"""
WMT14 training set creator.
It returns a reader creator, each sample in the reader is source language
word ID sequence, target language word ID sequence and next word ID
sequence.
:return: Training reader creator
:rtype: callable
"""
return reader_creator(
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
'train/train', dict_size)
def test(dict_size):
"""
WMT14 test set creator.
It returns a reader creator, each sample in the reader is source language
word ID sequence, target language word ID sequence and next word ID
sequence.
:return: Test reader creator
:rtype: callable
"""
return reader_creator(
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
'test/test', dict_size)
def gen(dict_size):
return reader_creator(
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
'gen/gen', dict_size)
def model():
tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
with gzip.open(tar_file, 'r') as f:
parameters = Parameters.from_tar(f)
return parameters
def get_dict(dict_size, reverse=True):
# if reverse = False, return dict = {'a':'001', 'b':'002', ...}
# else reverse = true, return dict = {'001':'a', '002':'b', ...}
tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
if reverse:
src_dict = {v: k for k, v in src_dict.items()}
trg_dict = {v: k for k, v in trg_dict.items()}
return src_dict, trg_dict
def fetch():
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
def convert(path):
"""
Converts dataset to recordio format
"""
dict_size = 30000
paddle.v2.dataset.common.convert(path,
train(dict_size), 1000, "wmt14_train")
paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
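# A minimal usage sketch, not part of the original module: each sample is a
# (source ids, target ids, target next-word ids) tuple; dict_size=30000
# matches the illustrative value used by convert() above.
def _demo_wmt14_sample(dict_size=30000):
    for src_ids, trg_ids, trg_ids_next in train(dict_size)():
        return len(src_ids), len(trg_ids), len(trg_ids_next)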
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
ACL2016 Multimodal Machine Translation. Please see this website for more
details: http://www.statmt.org/wmt16/multimodal-task.html#task1
If you use the dataset created for your task, please cite the following paper:
Multi30K: Multilingual English-German Image Descriptions.
@article{elliott-EtAl:2016:VL16,
 author = {{Elliott}, D. and {Frank}, S. and {Sima'an}, K. and {Specia}, L.},
 title = {Multi30K: Multilingual English-German Image Descriptions},
 booktitle = {Proceedings of the 6th Workshop on Vision and Language},
 year = {2016},
 pages = {70--74}
}
"""
import os
import tarfile
import gzip
from collections import defaultdict
import paddle.v2.dataset.common
__all__ = [
"train",
"test",
"validation",
"convert",
"fetch",
"get_dict",
]
DATA_URL = ("http://cloud.dlnel.org/filepub/"
"?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed")
DATA_MD5 = "0c38be43600334966403524a40dcd81e"
TOTAL_EN_WORDS = 11250
TOTAL_DE_WORDS = 19220
START_MARK = "<s>"
END_MARK = "<e>"
UNK_MARK = "<unk>"
def __build_dict(tar_file, dict_size, save_path, lang):
word_dict = defaultdict(int)
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile("wmt16/train"):
line_split = line.strip().split("\t")
if len(line_split) != 2: continue
sen = line_split[0] if lang == "en" else line_split[1]
for w in sen.split():
word_dict[w] += 1
with open(save_path, "w") as fout:
fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
for idx, word in enumerate(
sorted(
word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
if idx + 3 == dict_size: break
fout.write("%s\n" % (word[0]))
def __load_dict(tar_file, dict_size, lang, reverse=False):
dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
"wmt16/%s_%d.dict" % (lang, dict_size))
if not os.path.exists(dict_path) or (
len(open(dict_path, "r").readlines()) != dict_size):
__build_dict(tar_file, dict_size, dict_path, lang)
word_dict = {}
with open(dict_path, "r") as fdict:
for idx, line in enumerate(fdict):
if reverse:
word_dict[idx] = line.strip()
else:
word_dict[line.strip()] = idx
return word_dict
def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
TOTAL_DE_WORDS))
trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
TOTAL_ENG_WORDS))
return src_dict_size, trg_dict_size
def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
def reader():
src_dict = __load_dict(tar_file, src_dict_size, src_lang)
trg_dict = __load_dict(tar_file, trg_dict_size,
("de" if src_lang == "en" else "en"))
        # the indices for the start mark, end mark, and unk token are the
        # same in the source language and the target language. Here the
        # source language dictionary is used to determine their indices.
start_id = src_dict[START_MARK]
end_id = src_dict[END_MARK]
unk_id = src_dict[UNK_MARK]
src_col = 0 if src_lang == "en" else 1
trg_col = 1 - src_col
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile(file_name):
line_split = line.strip().split("\t")
if len(line_split) != 2:
continue
src_words = line_split[src_col].split()
src_ids = [start_id] + [
src_dict.get(w, unk_id) for w in src_words
] + [end_id]
trg_words = line_split[trg_col].split()
trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
trg_ids_next = trg_ids + [end_id]
trg_ids = [start_id] + trg_ids
yield src_ids, trg_ids, trg_ids_next
return reader
def train(src_dict_size, trg_dict_size, src_lang="en"):
"""
WMT16 train set reader.
This function returns the reader for train data. Each sample the reader
returns is made up of three fields: the source language word index sequence,
target language word index sequence and next word index sequence.
NOTE:
    The original link for the training data is:
http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz
paddle.dataset.wmt16 provides a tokenized version of the original dataset by
using moses's tokenization script:
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
Args:
src_dict_size(int): Size of the source language dictionary. Three
special tokens will be added into the dictionary:
<s> for start mark, <e> for end mark, and <unk> for
unknown word.
trg_dict_size(int): Size of the target language dictionary. Three
special tokens will be added into the dictionary:
<s> for start mark, <e> for end mark, and <unk> for
unknown word.
src_lang(string): A string indicating which language is the source
language. Available options are: "en" for English
and "de" for Germany.
Returns:
callable: The train reader.
"""
if src_lang not in ["en", "de"]:
raise ValueError("An error language type. Only support: "
"en (for English); de(for Germany).")
src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
src_lang)
return reader_creator(
tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
"wmt16.tar.gz"),
file_name="wmt16/train",
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
src_lang=src_lang)
def test(src_dict_size, trg_dict_size, src_lang="en"):
"""
WMT16 test set reader.
This function returns the reader for test data. Each sample the reader
returns is made up of three fields: the source language word index sequence,
target language word index sequence and next word index sequence.
NOTE:
    The original link for the test data is:
http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz
paddle.dataset.wmt16 provides a tokenized version of the original dataset by
using moses's tokenization script:
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
Args:
src_dict_size(int): Size of the source language dictionary. Three
special tokens will be added into the dictionary:
<s> for start mark, <e> for end mark, and <unk> for
unknown word.
trg_dict_size(int): Size of the target language dictionary. Three
special tokens will be added into the dictionary:
<s> for start mark, <e> for end mark, and <unk> for
unknown word.
src_lang(string): A string indicating which language is the source
language. Available options are: "en" for English
and "de" for Germany.
Returns:
callable: The test reader.
"""
if src_lang not in ["en", "de"]:
raise ValueError("An error language type. "
"Only support: en (for English); de(for Germany).")
src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
src_lang)
return reader_creator(
tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
"wmt16.tar.gz"),
file_name="wmt16/test",
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
src_lang=src_lang)
def validation(src_dict_size, trg_dict_size, src_lang="en"):
"""
WMT16 validation set reader.
This function returns the reader for validation data. Each sample the reader
returns is made up of three fields: the source language word index sequence,
target language word index sequence and next word index sequence.
NOTE:
    The original link for the validation data is:
http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
paddle.dataset.wmt16 provides a tokenized version of the original dataset by
using moses's tokenization script:
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
Args:
src_dict_size(int): Size of the source language dictionary. Three
special tokens will be added into the dictionary:
<s> for start mark, <e> for end mark, and <unk> for
unknown word.
trg_dict_size(int): Size of the target language dictionary. Three
special tokens will be added into the dictionary:
<s> for start mark, <e> for end mark, and <unk> for
unknown word.
src_lang(string): A string indicating which language is the source
language. Available options are: "en" for English
and "de" for Germany.
Returns:
callable: The validation reader.
"""
if src_lang not in ["en", "de"]:
raise ValueError("An error language type. "
"Only support: en (for English); de(for Germany).")
src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
src_lang)
return reader_creator(
tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
"wmt16.tar.gz"),
file_name="wmt16/val",
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
src_lang=src_lang)
def get_dict(lang, dict_size, reverse=False):
"""
return the word dictionary for the specified language.
Args:
lang(string): A string indicating which language is the source
language. Available options are: "en" for English
and "de" for Germany.
dict_size(int): Size of the specified language dictionary.
reverse(bool): If reverse is set to False, the returned python
dictionary will use word as key and use index as value.
If reverse is set to True, the returned python
dictionary will use index as key and word as value.
Returns:
dict: The word dictionary for the specific language.
"""
if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS)
else: dict_size = min(dict_size, TOTAL_DE_WORDS)
dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
"wmt16/%s_%d.dict" % (lang, dict_size))
assert os.path.exists(dict_path), "Word dictionary does not exist. "
"Please invoke paddle.dataset.wmt16.train/test/validation first "
"to build the dictionary."
tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz")
return __load_dict(tar_file, dict_size, lang, reverse)
def fetch():
"""download the entire dataset.
"""
paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
"wmt16.tar.gz")
def convert(path, src_dict_size, trg_dict_size, src_lang):
"""Converts dataset to recordio format.
"""
paddle.v2.dataset.common.convert(
path,
train(
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
src_lang=src_lang),
1000,
"wmt16_train")
paddle.v2.dataset.common.convert(
path,
test(
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
src_lang=src_lang),
1000,
"wmt16_test")
paddle.v2.dataset.common.convert(
path,
validation(
src_dict_size=src_dict_size,
trg_dict_size=trg_dict_size,
src_lang=src_lang),
1000,
"wmt16_validation")
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file contains some common interfaces for image preprocessing.
Many users are confused about the image layout. We introduce
the image layout as follows.
- CHW Layout
- The abbreviations: C=channel, H=Height, W=Width
- The default layout of image opened by cv2 or PIL is HWC.
    PaddlePaddle only supports the CHW layout, and CHW is simply
    a transpose of HWC, so the input image must be transposed.
- Color format: RGB or BGR
  OpenCV uses the BGR color format. PIL uses the RGB color format. Both
  formats can be used for training. Note that the format should be kept
  consistent between the training and inference periods.
"""
import numpy as np
try:
import cv2
except ImportError:
cv2 = None
import os
import tarfile
import cPickle
__all__ = [
"load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
"random_crop", "left_right_flip", "simple_transform", "load_and_transform",
"batch_images_from_tar"
]
def batch_images_from_tar(data_file,
dataset_name,
img2label,
num_per_batch=1024):
"""
    Read images from a tar file and batch them into batch files.
:param data_file: path of image tar file
:type data_file: string
:param dataset_name: 'train','test' or 'valid'
:type dataset_name: string
    :param img2label: a dict with image file name as key
                      and image's label as value
    :type img2label: dict
:param num_per_batch: image number per batch file
:type num_per_batch: int
:return: path of list file containing paths of batch file
:rtype: string
"""
batch_dir = data_file + "_batch"
out_path = "%s/%s" % (batch_dir, dataset_name)
meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
if os.path.exists(out_path):
return meta_file
else:
os.makedirs(out_path)
tf = tarfile.open(data_file)
mems = tf.getmembers()
data = []
labels = []
file_id = 0
for mem in mems:
if mem.name in img2label:
data.append(tf.extractfile(mem).read())
labels.append(img2label[mem.name])
if len(data) == num_per_batch:
output = {}
output['label'] = labels
output['data'] = data
cPickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
protocol=cPickle.HIGHEST_PROTOCOL)
file_id += 1
data = []
labels = []
if len(data) > 0:
output = {}
output['label'] = labels
output['data'] = data
cPickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
protocol=cPickle.HIGHEST_PROTOCOL)
with open(meta_file, 'a') as meta:
for file in os.listdir(out_path):
meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
return meta_file
def load_image_bytes(bytes, is_color=True):
"""
    Load a color or gray image from a bytes array.
Example usage:
.. code-block:: python
with open('cat.jpg') as f:
im = load_image_bytes(f.read())
:param bytes: the input image bytes array.
:type bytes: str
:param is_color: If set is_color True, it will load and
return a color image. Otherwise, it will
load and return a gray image.
:type is_color: bool
"""
flag = 1 if is_color else 0
file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
img = cv2.imdecode(file_bytes, flag)
return img
def load_image(file, is_color=True):
"""
    Load a color or gray image from the file path.
Example usage:
.. code-block:: python
im = load_image('cat.jpg')
:param file: the input image path.
:type file: string
:param is_color: If set is_color True, it will load and
return a color image. Otherwise, it will
load and return a gray image.
:type is_color: bool
"""
# cv2.IMAGE_COLOR for OpenCV3
# cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
# cv2.IMAGE_GRAYSCALE for OpenCV3
# cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version
# Here, use constant 1 and 0
# 1: COLOR, 0: GRAYSCALE
flag = 1 if is_color else 0
im = cv2.imread(file, flag)
return im
def resize_short(im, size):
"""
Resize an image so that the length of shorter edge is size.
Example usage:
.. code-block:: python
im = load_image('cat.jpg')
im = resize_short(im, 256)
:param im: the input image with HWC layout.
:type im: ndarray
:param size: the shorter edge size of image after resizing.
:type size: int
"""
h, w = im.shape[:2]
h_new, w_new = size, size
if h > w:
h_new = size * h / w
else:
w_new = size * w / h
    # note: cv2.resize takes dsize as (width, height)
    im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
return im
def to_chw(im, order=(2, 0, 1)):
"""
    Transpose the image layout. Images opened by cv2 or PIL are in HWC
    format; this transposes the input image to CHW layout according to
    the given order, (2, 0, 1) by default.
Example usage:
.. code-block:: python
im = load_image('cat.jpg')
im = resize_short(im, 256)
im = to_chw(im)
:param im: the input image with HWC layout.
:type im: ndarray
:param order: the transposed order.
:type order: tuple|list
"""
assert len(im.shape) == len(order)
im = im.transpose(order)
return im
def center_crop(im, size, is_color=True):
"""
Crop the center of image with size.
Example usage:
.. code-block:: python
im = center_crop(im, 224)
:param im: the input image with HWC layout.
:type im: ndarray
:param size: the cropping size.
:type size: int
:param is_color: whether the image is color or not.
:type is_color: bool
"""
h, w = im.shape[:2]
h_start = (h - size) / 2
w_start = (w - size) / 2
h_end, w_end = h_start + size, w_start + size
if is_color:
im = im[h_start:h_end, w_start:w_end, :]
else:
im = im[h_start:h_end, w_start:w_end]
return im
def random_crop(im, size, is_color=True):
"""
Randomly crop input image with size.
Example usage:
.. code-block:: python
im = random_crop(im, 224)
:param im: the input image with HWC layout.
:type im: ndarray
:param size: the cropping size.
:type size: int
:param is_color: whether the image is color or not.
:type is_color: bool
"""
h, w = im.shape[:2]
h_start = np.random.randint(0, h - size + 1)
w_start = np.random.randint(0, w - size + 1)
h_end, w_end = h_start + size, w_start + size
if is_color:
im = im[h_start:h_end, w_start:w_end, :]
else:
im = im[h_start:h_end, w_start:w_end]
return im
def left_right_flip(im, is_color=True):
"""
Flip an image along the horizontal direction.
Return the flipped image.
Example usage:
.. code-block:: python
im = left_right_flip(im)
:param im: input image with HWC layout or HW layout for gray image
:type im: ndarray
:param is_color: whether input image is color or not
:type is_color: bool
"""
if len(im.shape) == 3 and is_color:
return im[:, ::-1, :]
else:
return im[:, ::-1]
def simple_transform(im,
resize_size,
crop_size,
is_train,
is_color=True,
mean=None):
"""
    Simple data augmentation for training. These operations include
    resizing, cropping and flipping.
Example usage:
.. code-block:: python
im = simple_transform(im, 256, 224, True)
:param im: The input image with HWC layout.
:type im: ndarray
:param resize_size: The shorter edge length of the resized image.
:type resize_size: int
:param crop_size: The cropping size.
:type crop_size: int
:param is_train: Whether it is training or not.
:type is_train: bool
:param is_color: whether the image is color or not.
:type is_color: bool
:param mean: the mean values, which can be element-wise mean values or
mean values per channel.
:type mean: numpy array | list
"""
im = resize_short(im, resize_size)
    if is_train:
        im = random_crop(im, crop_size, is_color=is_color)
        if np.random.randint(2) == 0:
            im = left_right_flip(im, is_color)
    else:
        im = center_crop(im, crop_size, is_color=is_color)
if len(im.shape) == 3:
im = to_chw(im)
im = im.astype('float32')
if mean is not None:
mean = np.array(mean, dtype=np.float32)
# mean value, may be one value per channel
if mean.ndim == 1 and is_color:
mean = mean[:, np.newaxis, np.newaxis]
elif mean.ndim == 1:
mean = mean
else:
# elementwise mean
            assert len(mean.shape) == len(im.shape)
im -= mean
return im
def load_and_transform(filename,
resize_size,
crop_size,
is_train,
is_color=True,
mean=None):
"""
    Load image from the input file `filename` and transform image for
    data augmentation. Please refer to the `simple_transform` interface
for the transform operations.
Example usage:
.. code-block:: python
im = load_and_transform('cat.jpg', 256, 224, True)
:param filename: The file name of input image.
:type filename: string
:param resize_size: The shorter edge length of the resized image.
:type resize_size: int
:param crop_size: The cropping size.
:type crop_size: int
:param is_train: Whether it is training or not.
:type is_train: bool
:param is_color: whether the image is color or not.
:type is_color: bool
:param mean: the mean values, which can be element-wise mean values or
mean values per channel.
:type mean: numpy array | list
"""
im = load_image(filename, is_color)
im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
return im
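# A minimal end-to-end sketch, not part of the original module: load an image,
# apply the training-time resize/crop/flip pipeline, and subtract a
# per-channel mean. 'cat.jpg' and the mean values below are illustrative
# placeholders only.
def _demo_transform_pipeline():
    channel_mean = [103.94, 116.78, 123.68]  # illustrative per-channel mean
    im = load_and_transform(
        'cat.jpg', resize_size=256, crop_size=224, is_train=True,
        is_color=True, mean=channel_mean)
    return im.shape  # CHW layout, e.g. (3, 224, 224)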
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['batch']
def batch(reader, batch_size):
"""
Create a batched reader.
:param reader: the data reader to read from.
:type reader: callable
:param batch_size: size of each mini-batch
:type batch_size: int
:return: the batched reader.
:rtype: callable
"""
def batch_reader():
r = reader()
b = []
for instance in r:
b.append(instance)
if len(b) == batch_size:
yield b
b = []
if b:
yield b
return batch_reader
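# A minimal usage sketch, not part of the original module: batch a toy reader
# of integers into mini-batches of size 4.
def _demo_batch():
    def toy_reader():
        for i in xrange(10):
            yield i
    # produces [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
    return list(batch(toy_reader, 4)())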
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
At training and testing time, PaddlePaddle programs need to read data. To ease
the users' work to write data reading code, we define that
- A *reader* is a function that reads data (from file, network, random number
generator, etc) and yields data items.
- A *reader creator* is a function that returns a reader function.
- A *reader decorator* is a function, which accepts one or more readers, and
returns a reader.
- A *batch reader* is a function that reads data (from *reader*, file, network,
random number generator, etc) and yields a batch of data items.
#####################
Data Reader Interface
#####################
Indeed, *data reader* doesn't have to be a function that reads and yields data
items. It can be any function with no parameter that creates an iterable
(anything can be used in :code:`for x in iterable`)\:
.. code-block:: python
iterable = data_reader()
Element produced from the iterable should be a **single** entry of data,
**not** a mini batch. That entry of data could be a single item, or a tuple of
items.
Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider
/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d
array of float32, int, list of int)
An example implementation for single item data reader creator:
.. code-block:: python
def reader_creator_random_image(width, height):
def reader():
while True:
yield numpy.random.uniform(-1, 1, size=width*height)
return reader
An example implementation for multiple item data reader creator:
.. code-block:: python
def reader_creator_random_image_and_label(width, height, label):
def reader():
while True:
yield numpy.random.uniform(-1, 1, size=width*height), label
return reader
TODO(yuyang18): Should we add whole design doc here?
"""
import decorator
from decorator import *
import creator
__all__ = decorator.__all__ + ['creator']
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The creator package contains some simple reader creators, which can be
used in user programs.
"""
__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
def np_array(x):
"""
    Creates a reader that yields the elements of x if it is a
    numpy vector, the rows of x if it is a numpy matrix, or, in
    general, the sub-arrays indexed along the first dimension.
:param x: the numpy array to create reader from.
:returns: data reader created from x.
"""
def reader():
        if x.ndim < 1:
            yield x
            return
        for e in x:
            yield e
return reader
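# A minimal usage sketch, not part of the original module: wrap a 2-D numpy
# array so that each row becomes one data item.
def _demo_np_array():
    import numpy
    mat = numpy.arange(6).reshape(3, 2)
    # produces [array([0, 1]), array([2, 3]), array([4, 5])]
    return [row for row in np_array(mat)()]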
def text_file(path):
"""
Creates a data reader that outputs text line by line from given text file.
Trailing new line ('\\\\n') of each line will be removed.
:path: path of the text file.
:returns: data reader of text file
"""
def reader():
f = open(path, "r")
for l in f:
yield l.rstrip('\n')
f.close()
return reader
def recordio(paths, buf_size=100):
"""
Creates a data reader from given RecordIO file paths separated by ",",
glob pattern is supported.
:path: path of recordio files, can be a string or a string list.
:returns: data reader of recordio files.
"""
import recordio as rec
import paddle.v2.reader.decorator as dec
import cPickle as pickle
def reader():
if isinstance(paths, basestring):
path = paths
else:
path = ",".join(paths)
f = rec.reader(path)
while True:
r = f.read()
if r is None:
break
yield pickle.loads(r)
f.close()
return dec.buffered(reader, buf_size)
pass_num = 0
def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
"""
    Create a data reader that yields records one by one from
    the paths:
:paths: path of recordio files, can be a string or a string list.
:etcd_endpoints: the endpoints for etcd cluster
:returns: data reader of recordio files.
.. code-block:: python
from paddle.v2.reader.creator import cloud_reader
etcd_endpoints = "http://127.0.0.1:2379"
        trainer.train(
reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
)
"""
import os
import cPickle as pickle
import paddle.v2.master as master
c = master.client(etcd_endpoints, timeout_sec, buf_size)
if isinstance(paths, basestring):
path = [paths]
else:
path = paths
c.set_dataset(path)
def reader():
global pass_num
c.paddle_start_get_records(pass_num)
pass_num += 1
while True:
r, e = c.next_record()
if not r:
if e != -2:
print "get record error: ", e
break
yield pickle.loads(r)
return reader
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = [
'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
]
from threading import Thread
import subprocess
from Queue import Queue
import itertools
import random
import zlib
def map_readers(func, *readers):
"""
    Creates a data reader that outputs the return value of the given
    function, using the output of each input data reader as its arguments.
:param func: function to use. The type of func should be (Sample) => Sample
:type: callable
:param readers: readers whose outputs will be used as arguments of func.
:return: the created data reader.
:rtype: callable
"""
def reader():
rs = []
for r in readers:
rs.append(r())
for e in itertools.imap(func, *rs):
yield e
return reader
def shuffle(reader, buf_size):
"""
Creates a data reader whose data output is shuffled.
Output from the iterator that created by original reader will be
buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
is determined by argument buf_size.
:param reader: the original reader whose output will be shuffled.
:type reader: callable
:param buf_size: shuffle buffer size.
:type buf_size: int
:return: the new reader whose output is shuffled.
:rtype: callable
"""
def data_reader():
buf = []
for e in reader():
buf.append(e)
if len(buf) >= buf_size:
random.shuffle(buf)
for b in buf:
yield b
buf = []
if len(buf) > 0:
random.shuffle(buf)
for b in buf:
yield b
return data_reader
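# A minimal usage sketch, not part of the original module: shuffle a toy
# reader with a buffer large enough to hold the whole stream, so the output
# is a full permutation of 0..9.
def _demo_shuffle():
    def toy_reader():
        for i in xrange(10):
            yield i
    return list(shuffle(toy_reader, buf_size=10)())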
def chain(*readers):
"""
Creates a data reader whose output is the outputs of input data
readers chained together.
If input readers output following data entries:
[0, 0, 0]
[1, 1, 1]
[2, 2, 2]
The chained reader will output:
[0, 0, 0, 1, 1, 1, 2, 2, 2]
:param readers: input readers.
:return: the new data reader.
:rtype: callable
"""
def reader():
rs = []
for r in readers:
rs.append(r())
for e in itertools.chain(*rs):
yield e
return reader
class ComposeNotAligned(ValueError):
pass
def compose(*readers, **kwargs):
"""
Creates a data reader whose output is the combination of input readers.
If input readers output following data entries:
(1, 2) 3 (4, 5)
The composed reader will output:
(1, 2, 3, 4, 5)
:param readers: readers that will be composed together.
:param check_alignment: if True, will check if input readers are aligned
correctly. If False, will not check alignment and trailing outputs
will be discarded. Defaults to True.
:type check_alignment: bool
:return: the new data reader.
:raises ComposeNotAligned: outputs of readers are not aligned.
Will not raise when check_alignment is set to False.
"""
check_alignment = kwargs.pop('check_alignment', True)
def make_tuple(x):
if isinstance(x, tuple):
return x
else:
return (x, )
def reader():
rs = []
for r in readers:
rs.append(r())
if not check_alignment:
for outputs in itertools.izip(*rs):
yield sum(map(make_tuple, outputs), ())
else:
for outputs in itertools.izip_longest(*rs):
for o in outputs:
if o is None:
                        # None will not be present if compose is aligned
raise ComposeNotAligned(
"outputs of readers are not aligned.")
yield sum(map(make_tuple, outputs), ())
return reader
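# A minimal usage sketch, not part of the original module: compose two aligned
# toy readers; each output item is the flattened tuple of both readers' items.
def _demo_compose():
    def reader_a():
        for i in xrange(3):
            yield (i, i)
    def reader_b():
        for i in xrange(3):
            yield i * 10
    # produces [(0, 0, 0), (1, 1, 10), (2, 2, 20)]
    return list(compose(reader_a, reader_b)())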
def buffered(reader, size):
"""
Creates a buffered data reader.
The buffered data reader will read and save data entries into a
buffer. Reading from the buffered data reader will proceed as long
as the buffer is not empty.
:param reader: the data reader to read from.
:type reader: callable
:param size: max buffer size.
:type size: int
:returns: the buffered data reader.
"""
class EndSignal():
pass
end = EndSignal()
def read_worker(r, q):
for d in r:
q.put(d)
q.put(end)
def data_reader():
r = reader()
q = Queue(maxsize=size)
t = Thread(
target=read_worker, args=(
r,
q, ))
t.daemon = True
t.start()
e = q.get()
while e != end:
yield e
e = q.get()
return data_reader
def firstn(reader, n):
"""
Limit the maximum number of samples the reader can return.
:param reader: the data reader to read from.
:type reader: callable
:param n: the maximum number of samples to return.
:type n: int
:return: the decorated reader.
:rtype: callable
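A minimal usage sketch (the toy reader below is illustrative):
.. code-block:: python
    def read():
        for i in range(1000):
            yield i
    small = firstn(read, 10)
    # small() yields only the first 10 samples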
"""
# TODO(yuyang18): Check whether simply dropping the reader would clean up
# the opened resources or not.
def firstn_reader():
for i, item in enumerate(reader()):
if i == n:
break
yield item
return firstn_reader
class XmapEndSignal():
pass
def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
"""
Use multiple worker threads to map samples from the reader with a
user-defined mapper. The mapped samples are buffered, so this function
also acts as a buffered decorator.
:param mapper: a function that maps a sample.
:type mapper: callable
:param reader: the data reader to read from.
:type reader: callable
:param process_num: number of worker threads that apply the mapper.
:type process_num: int
:param buffer_size: max buffer size.
:type buffer_size: int
:param order: whether to keep the sample order of the original reader.
:type order: bool
:return: the decorated reader.
:rtype: callable
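A minimal usage sketch (the toy reader and mapper below are
illustrative; in practice the mapper would do expensive work such as
image decoding):
.. code-block:: python
    def read():
        for i in range(8):
            yield i
    mapped = xmap_readers(lambda x: x * x, read, 4, 16, order=True)
    # mapped() yields 0, 1, 4, 9, ... in the original order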
"""
end = XmapEndSignal()
# define a worker to read samples from reader to in_queue
def read_worker(reader, in_queue):
for i in reader():
in_queue.put(i)
in_queue.put(end)
# define a worker to read samples from reader to in_queue with order flag
def order_read_worker(reader, in_queue):
in_order = 0
for i in reader():
in_queue.put((in_order, i))
in_order += 1
in_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue
def handle_worker(in_queue, out_queue, mapper):
sample = in_queue.get()
while not isinstance(sample, XmapEndSignal):
r = mapper(sample)
out_queue.put(r)
sample = in_queue.get()
in_queue.put(end)
out_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue by order
def order_handle_worker(in_queue, out_queue, mapper, out_order):
ins = in_queue.get()
while not isinstance(ins, XmapEndSignal):
order, sample = ins
r = mapper(sample)
# busy-wait until it is this sample's turn, so that results are
# emitted in the original order
while order != out_order[0]:
pass
out_queue.put(r)
out_order[0] += 1
ins = in_queue.get()
in_queue.put(end)
out_queue.put(end)
def xreader():
in_queue = Queue(buffer_size)
out_queue = Queue(buffer_size)
out_order = [0]
# start a read worker in a thread
target = order_read_worker if order else read_worker
t = Thread(target=target, args=(reader, in_queue))
t.daemon = True
t.start()
# start several handle_workers
target = order_handle_worker if order else handle_worker
args = (in_queue, out_queue, mapper, out_order) if order else (
in_queue, out_queue, mapper)
workers = []
for i in xrange(process_num):
worker = Thread(target=target, args=args)
worker.daemon = True
workers.append(worker)
for w in workers:
w.start()
sample = out_queue.get()
while not isinstance(sample, XmapEndSignal):
yield sample
sample = out_queue.get()
finish = 1
while finish < process_num:
sample = out_queue.get()
if isinstance(sample, XmapEndSignal):
finish += 1
else:
yield sample
return xreader
def _buf2lines(buf, line_break="\n"):
# FIXME: line_break should be automatically configured.
lines = buf.split(line_break)
return lines[:-1], lines[-1]
class PipeReader:
"""
PipeReader reads data streamed from a command: it takes the command's
stdout into a pipe buffer and redirects it to a parser, then yields
data in the desired format.
You can use a standard Linux command or call another program to read
data, e.g. from HDFS, Ceph, a URL, or AWS S3:
.. code-block:: python
cmd = "hadoop fs -cat /path/to/some/file"
cmd = "cat sample_file.tar.gz"
cmd = "curl http://someurl"
cmd = "python print_s3_bucket.py"
An example:
.. code-block:: python
def example_reader():
for f in myfiles:
pr = PipeReader("cat %s"%f)
for l in pr.get_line():
sample = l.split(" ")
yield sample
"""
def __init__(self, command, bufsize=8192, file_type="plain"):
if not isinstance(command, str):
raise TypeError("command must be a string")
if file_type == "gzip":
self.dec = zlib.decompressobj(
32 + zlib.MAX_WBITS) # offset 32 to skip the header
self.file_type = file_type
self.bufsize = bufsize
self.process = subprocess.Popen(
command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
def get_line(self, cut_lines=True, line_break="\n"):
"""
:param cut_lines: whether to cut the buffer into lines
:type cut_lines: bool
:param line_break: line break of the file, like \n or \r
:type line_break: string
:return: one line or a buffer of bytes
:rtype: string
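A minimal usage sketch ("some_file.gz" is an assumed gzip file on disk):
.. code-block:: python
    pr = PipeReader("cat some_file.gz", file_type="gzip")
    for line in pr.get_line():
        print line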
"""
remained = ""
while True:
buff = self.process.stdout.read(self.bufsize)
if buff:
if self.file_type == "gzip":
decomp_buff = self.dec.decompress(buff)
elif self.file_type == "plain":
decomp_buff = buff
else:
raise TypeError("file_type %s is not allowed" %
self.file_type)
if cut_lines:
lines, remained = _buf2lines(''.join(
[remained, decomp_buff]), line_break)
for line in lines:
yield line
else:
yield decomp_buff
else:
break
py_test(creator_test SRCS creator_test.py)
py_test(decorator_test SRCS decorator_test.py)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright PaddlePaddle contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import numpy as np
import paddle.v2.reader.creator
class TestNumpyArray(unittest.TestCase):
def test_numpy_array(self):
l = [[1, 2, 3], [4, 5, 6]]
x = np.array(l, np.int32)
reader = paddle.v2.reader.creator.np_array(x)
for idx, e in enumerate(reader()):
self.assertItemsEqual(e, l[idx])
class TestTextFile(unittest.TestCase):
def test_text_file(self):
path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
reader = paddle.v2.reader.creator.text_file(path)
for idx, e in enumerate(reader()):
self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
class TestRecordIO(unittest.TestCase):
def do_test(self, path):
reader = paddle.v2.reader.creator.recordio(path)
idx = 0
for e in reader():
if idx == 0:
self.assertEqual(e, (1, 2, 3))
elif idx == 1:
self.assertEqual(e, (4, 5, 6))
idx += 1
self.assertEqual(idx, 2)
def test_recordIO(self):
self.do_test(
os.path.join(
os.path.dirname(__file__), "test_reader_recordio.dat"))
self.do_test([
os.path.join(
os.path.dirname(__file__), "test_reader_recordio.dat")
])
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import unittest
import paddle.v2.reader
def reader_creator_10(dur):
def reader():
for i in range(10):
# this sleep helps test paddle.v2.reader.buffered
time.sleep(dur)
yield i
return reader
class TestMap(unittest.TestCase):
def test_map(self):
d = {"h": 0, "i": 1}
def tokenize(x):
return d[x]
def read():
yield "h"
yield "i"
r = paddle.v2.reader.map_readers(tokenize, read)
for i, e in enumerate(r()):
self.assertEqual(e, i)
class TestBuffered(unittest.TestCase):
def test_read(self):
for size in range(20):
b = paddle.v2.reader.buffered(reader_creator_10(0), size)
c = 0
for i in b():
self.assertEqual(i, c)
c += 1
self.assertEqual(c, 10)
def test_buffering(self):
# each read has a 30ms delay.
b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
last_time = time.time()
for idx, i in enumerate(b()):
elapsed_time = time.time() - last_time
if i == 0:
time.sleep(0.3)
else:
# read time should be short, meaning already buffered.
self.assertLess(elapsed_time, 0.05)
last_time = time.time()
class TestCompose(unittest.TestCase):
def test_compose(self):
reader = paddle.v2.reader.compose(
reader_creator_10(0), reader_creator_10(0))
for idx, e in enumerate(reader()):
self.assertEqual(e, (idx, idx))
def test_compose_not_aligned(self):
total = 0
reader = paddle.v2.reader.compose(
paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
reader_creator_10(0))
with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
for e in reader():
total += 1
# expecting 10, not 20
self.assertEqual(total, 10)
def test_compose_not_aligned_no_check(self):
total = 0
reader = paddle.v2.reader.compose(
paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
reader_creator_10(0),
check_alignment=False)
for e in reader():
total += 1
# expecting 10, not 20
self.assertEqual(total, 10)
class TestChain(unittest.TestCase):
def test_chain(self):
c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
idx = 0
for e in c():
self.assertEqual(e, idx % 10)
idx += 1
self.assertEqual(idx, 20)
class TestShuffle(unittest.TestCase):
def test_shuffle(self):
case = [(0, True), (1, True), (10, False), (100, False)]
a = reader_creator_10(0)
for size, checkEq in case:
s = paddle.v2.reader.shuffle(a, size)
total = 0
for idx, e in enumerate(s()):
if checkEq:
self.assertEqual(idx, e)
total += 1
self.assertEqual(total, 10)
class TestXmap(unittest.TestCase):
def test_xmap(self):
def mapper(x):
return (x + 1)
orders = (True, False)
thread_nums = (1, 2, 4, 8, 16)
buffered_size = (1, 2, 4, 8, 16)
for order in orders:
for tNum in thread_nums:
for size in buffered_size:
reader = paddle.v2.reader.xmap_readers(mapper,
reader_creator_10(0),
tNum, size, order)
for n in xrange(3):
result = []
for i in reader():
result.append(i)
if not order:
result.sort()
for idx, e in enumerate(result):
self.assertEqual(e, mapper(idx))
class TestPipeReader(unittest.TestCase):
def test_pipe_reader(self):
def example_reader(myfiles):
for f in myfiles:
pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
for l in pr.get_line():
yield l
import tempfile
records = [str(i) for i in xrange(5)]
temp = tempfile.NamedTemporaryFile()
try:
with open(temp.name, 'w') as f:
for r in records:
f.write('%s\n' % r)
result = []
for r in example_reader([temp.name]):
result.append(r)
for idx, e in enumerate(records):
self.assertEqual(e, result[idx])
finally:
# delete the temporary file
temp.close()
if __name__ == '__main__':
unittest.main()
py_test(test_op SRCS test_op.py)
py_test(test_image SRCS test_image.py)
py_test(test_layer SRCS test_layer.py)
py_test(test_topology SRCS test_topology.py)
py_test(test_rnn_layer SRCS test_rnn_layer.py)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.v2.image as image
class Image(unittest.TestCase):
def test_resize_flip_chw(self):
# resize
im = image.load_image('cat.jpg')
im = image.resize_short(im, 256)
self.assertEqual(256, min(im.shape[:2]))
self.assertEqual(3, im.shape[2])
# flip: left_right_flip should match np.flip along the width axis
im2 = np.flip(im, 1)
im = image.left_right_flip(im)
self.assertTrue(np.array_equal(im, im2))
# to_chw
h, w, c = im.shape
im = image.to_chw(im)
self.assertEqual(c, im.shape[0])
self.assertEqual(h, im.shape[1])
self.assertEqual(w, im.shape[2])
if __name__ == '__main__':
unittest.main()
......@@ -27,7 +27,6 @@
# limitations under the License.
import unittest
import math
import paddle.dataset as dataset
import paddle.v2 as paddle
......@@ -41,7 +40,7 @@ def wordemb(inlayer):
def train():
word_dict = dataset.imikolov.build_dict()
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
# Every layer takes integer value of range [0, dict_size)
firstword = paddle.layer.data(
......
......@@ -77,6 +77,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
'paddle.v2',
'paddle.v2.master',
'paddle.v2.plot',
'paddle.v2.reader',
'paddle.v2.dataset',
'py_paddle']
with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
......