diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index f5ae553c8571e21b351d0f5507afdf1539843a51..d074b0136d77fa5a1ce5c29cd52347d04475b029 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -81,6 +81,7 @@ if (WITH_TESTING) # enable v2 API unittest only when paddle swig api is compiled add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/plot/tests) + add_subdirectory(paddle/v2/reader/tests) endif() endif() add_subdirectory(paddle/fluid/tests) diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 1fdfd49f1c970d89bfde9d12a24076d38c54ba66..3315e826e82a33dfeb9c5223ce196cffb1ae7234 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -37,7 +37,7 @@ __all__ = [ 'cifar', 'movielens', 'conll05', - 'sentiment' + 'sentiment', 'uci_housing', 'wmt14', 'wmt16', diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 02b0d077eefa431baed05c421a367ebe3581626c..df710c33d0c0ca16d358dac1eb42327e9cd4c7ae 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -22,13 +22,17 @@ import data_type import topology import networks import evaluator +from . import dataset +from . import reader from . import plot import attr import op import pooling import inference import networks +import minibatch import plot +import image import paddle.trainer.config_parser as cp __all__ = [ @@ -44,11 +48,14 @@ __all__ = [ 'data_type', 'attr', 'pooling', + 'dataset', + 'reader', 'topology', 'networks', 'infer', 'plot', 'evaluator', + 'image', 'master', ] @@ -146,3 +153,4 @@ def init(**kwargs): infer = inference.infer +batch = minibatch.batch diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..38056fe0a9496bcb5de76634bbab267e324dc2a4 --- /dev/null +++ b/python/paddle/v2/dataset/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Dataset package. +""" + +import mnist +import imikolov +import imdb +import cifar +import movielens +import conll05 +import uci_housing +import sentiment +import wmt14 +import wmt16 +import mq2007 +import flowers +import voc2012 + +__all__ = [ + 'mnist', + 'imikolov', + 'imdb', + 'cifar', + 'movielens', + 'conll05', + 'sentiment', + 'uci_housing', + 'wmt14', + 'wmt16', + 'mq2007', + 'flowers', + 'voc2012', +] diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py new file mode 100644 index 0000000000000000000000000000000000000000..0a2a1ced11ee5cb2fb407b229ce810d553c2fa46 --- /dev/null +++ b/python/paddle/v2/dataset/cifar.py @@ -0,0 +1,139 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset.
+
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
+
+"""
+
+import cPickle
+import itertools
+import numpy
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+
+def reader_creator(filename, sub_name):
+    def read_batch(batch):
+        data = batch['data']
+        labels = batch.get('labels', batch.get('fine_labels', None))
+        assert labels is not None
+        for sample, label in itertools.izip(data, labels):
+            yield (sample / 255.0).astype(numpy.float32), int(label)
+
+    def reader():
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
+
+            for name in names:
+                batch = cPickle.load(f.extractfile(name))
+                for item in read_batch(batch):
+                    yield item
+
+    return reader
+
+
+def train100():
+    """
+    CIFAR-100 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 99].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'train')
+
+
+def test100():
+    """
+    CIFAR-100 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 99].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'test')
+
+
+def train10():
+    """
+    CIFAR-10 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch')
+
+
+def test10():
+    """
+    CIFAR-10 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'test_batch')
+
+
+def fetch():
+    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
+    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
+    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
+    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
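A minimal usage sketch of the cifar readers added above (assuming `paddle.v2` is importable and the module layout is as in this diff; the loop bound is arbitrary):

```python
import paddle.v2 as paddle

# Each sample is a (numpy array of 3072 floats in [0, 1], int label) pair;
# for CIFAR-10 the label is in [0, 9], for CIFAR-100 in [0, 99].
reader = paddle.dataset.cifar.train10()
for i, (image, label) in enumerate(reader()):
    print image.shape, label
    if i >= 2:
        break
```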
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6ff09a1d1e3ca56877e986c3ed3ae9ecd0a7316
--- /dev/null
+++ b/python/paddle/v2/dataset/common.py
@@ -0,0 +1,236 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import hashlib
+import os
+import errno
+import shutil
+import sys
+import importlib
+import paddle.v2.dataset
+import cPickle
+import glob
+import cPickle as pickle
+
+__all__ = [
+    'DATA_HOME',
+    'download',
+    'md5file',
+    'split',
+    'cluster_files_reader',
+    'convert',
+]
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
+
+
+# When running unit tests, there could be multiple processes trying to
+# create the DATA_HOME directory simultaneously, so we cannot use an if
+# condition to check for the existence of the directory; instead, we use
+# the filesystem as the synchronization mechanism by catching returned
+# errors.
+def must_mkdirs(path):
+    try:
+        os.makedirs(path)
+    except OSError as exc:
+        if exc.errno != errno.EEXIST:
+            raise
+        pass
+
+
+must_mkdirs(DATA_HOME)
+
+
+def md5file(fname):
+    hash_md5 = hashlib.md5()
+    f = open(fname, "rb")
+    for chunk in iter(lambda: f.read(4096), b""):
+        hash_md5.update(chunk)
+    f.close()
+    return hash_md5.hexdigest()
+
+
+def download(url, module_name, md5sum, save_name=None):
+    dirname = os.path.join(DATA_HOME, module_name)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+    filename = os.path.join(dirname,
+                            url.split('/')[-1]
+                            if save_name is None else save_name)
+
+    retry = 0
+    retry_limit = 3
+    while not (os.path.exists(filename) and md5file(filename) == md5sum):
+        if os.path.exists(filename):
+            print "file md5", md5file(filename), md5sum
+        if retry < retry_limit:
+            retry += 1
+        else:
+            raise RuntimeError("Cannot download {0} within retry limit {1}".
+                               format(url, retry_limit))
+        print "Cache file %s not found, downloading %s" % (filename, url)
+        r = requests.get(url, stream=True)
+        total_length = r.headers.get('content-length')
+
+        if total_length is None:
+            with open(filename, 'wb') as f:  # binary mode: downloads are not text
+                shutil.copyfileobj(r.raw, f)
+        else:
+            with open(filename, 'wb') as f:
+                dl = 0
+                total_length = int(total_length)
+                for data in r.iter_content(chunk_size=4096):
+                    dl += len(data)
+                    f.write(data)
+                    done = int(50 * dl / total_length)
+                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                                                   ' ' * (50 - done)))
+                    sys.stdout.flush()
+
+    return filename
+
+
+def fetch_all():
+    for module_name in filter(lambda x: not x.startswith("__"),
+                              dir(paddle.v2.dataset)):
+        if "fetch" in dir(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
+            getattr(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name),
+                "fetch")()
+
+
+def fetch_all_recordio(path):
+    for module_name in filter(lambda x: not x.startswith("__"),
+                              dir(paddle.v2.dataset)):
+        if "convert" in dir(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
+                not module_name == "common":
+            ds_path = os.path.join(path, module_name)
+            must_mkdirs(ds_path)
+            getattr(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name),
+                "convert")(ds_path)
+
+
+def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
+    """
+    you can call the function as:
+
+    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
+        suffix="imikolov-train-%05d.pickle")
+
+    the output files as:
+
+    |-imikolov-train-00000.pickle
+    |-imikolov-train-00001.pickle
+    |- ...
+    |-imikolov-train-00480.pickle
+
+    :param reader: is a reader creator
+    :param line_count: line count for each file
+    :param suffix: the suffix for the output files, should contain "%d",
+                which will be filled with the id of each file. Default is
+                "%05d.pickle"
+    :param dumper: is a callable function that dumps an object to a file.
+                It will be called as dumper(obj, f), where obj is the object
+                to be dumped and f is a file object. Default is cPickle.dump.
+    """
+    if not callable(dumper):
+        raise TypeError("dumper should be callable.")
+    lines = []
+    indx_f = 0
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if i >= line_count and i % line_count == 0:
+            with open(suffix % indx_f, "w") as f:
+                dumper(lines, f)
+                lines = []
+                indx_f += 1
+    if lines:
+        with open(suffix % indx_f, "w") as f:
+            dumper(lines, f)
+
+
+def cluster_files_reader(files_pattern,
+                         trainer_count,
+                         trainer_id,
+                         loader=cPickle.load):
+    """
+    Create a reader that yields elements from the given files, selecting
+    a file subset according to the trainer count and trainer_id.
+
+    :param files_pattern: the files which were generated by split(...)
+    :param trainer_count: total trainer count
+    :param trainer_id: the trainer rank id
+    :param loader: is a callable function that loads an object from a file.
+                It will be called as loader(f), where f is a file object.
+                Default is cPickle.load.
+    """
+
+    def reader():
+        if not callable(loader):
+            raise TypeError("loader should be callable.")
+        file_list = glob.glob(files_pattern)
+        file_list.sort()
+        my_file_list = []
+        for idx, fn in enumerate(file_list):
+            if idx % trainer_count == trainer_id:
+                print "append file: %s" % fn
+                my_file_list.append(fn)
+        for fn in my_file_list:
+            with open(fn, "r") as f:
+                lines = loader(f)
+                for line in lines:
+                    yield line
+
+    return reader
+
+
+def convert(output_path, reader, line_count, name_prefix):
+    """
+    Convert data from reader to recordio format files.
+
+    :param output_path: directory in which output files will be saved.
+    :param reader: a data reader, from which the convert program will read
+                   data instances.
+    :param line_count: the number of instances that will be written into
+                       each output file.
+    :param name_prefix: the name prefix of generated files.
+    """
+    import recordio
+
+    assert line_count >= 1
+    indx_f = 0
+
+    def write_data(indx_f, lines):
+        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
+        writer = recordio.writer(filename)
+        for l in lines:
+            # FIXME(Yancey1989):
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            writer.write(cPickle.dumps(l))
+        writer.close()
+
+    lines = []
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if i % line_count == 0 and i >= line_count:
+            write_data(indx_f, lines)
+            lines = []
+            indx_f += 1
+            continue
+
+    write_data(indx_f, lines)
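The `split` and `cluster_files_reader` helpers above are designed to pair up; a sketch of that round trip (the shard file names here are illustrative, and the files land in the current working directory):

```python
import paddle.v2.dataset.common as common
import paddle.v2.dataset.cifar as cifar

# Shard the CIFAR-10 training set into pickle files of 1000 samples each,
# then read back only the shards assigned to trainer 0 of 2.
common.split(cifar.train10(), line_count=1000,
             suffix="cifar-train-%05d.pickle")
read_shards = common.cluster_files_reader(
    "cifar-train-*.pickle", trainer_count=2, trainer_id=0)
for sample in read_shards():
    pass  # each sample is one (image, label) pair
```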
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc
--- /dev/null
+++ b/python/paddle/v2/dataset/conll05.py
@@ -0,0 +1,257 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Conll05 dataset.
+The Paddle semantic role labeling book and demo use this dataset as an
+example. Because the Conll05 training set is not freely available, the
+default download URL points to the Conll05 test set (which is public).
+Users can change the URL and MD5 to point at their own Conll dataset.
+A pre-trained word vector model based on the Wikipedia corpus is used to
+initialize the SRL model.
+"""
+
+import tarfile
+import gzip
+import itertools
+import paddle.v2.dataset.common
+
+__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
+
+DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
+DATA_MD5 = '387719152ae52d60422c016e92a742fc'
+WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
+WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
+VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
+VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
+TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
+TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
+EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
+EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
+
+UNK_IDX = 0
+
+
+def load_label_dict(filename):
+    d = dict()
+    tag_dict = set()
+    with open(filename, 'r') as f:
+        for i, line in enumerate(f):
+            line = line.strip()
+            if line.startswith("B-"):
+                tag_dict.add(line[2:])
+            elif line.startswith("I-"):
+                tag_dict.add(line[2:])
+    index = 0
+    for tag in tag_dict:
+        d["B-" + tag] = index
+        index += 1
+        d["I-" + tag] = index
+        index += 1
+    d["O"] = index
+    return d
+
+
+def load_dict(filename):
+    d = dict()
+    with open(filename, 'r') as f:
+        for i, line in enumerate(f):
+            d[line.strip()] = i
+    return d
+
+
+def corpus_reader(data_path, words_name, props_name):
+    """
+    Read one corpus. It returns an iterator. Each element of
+    this iterator is a tuple including sentence and labels. The sentence
+    consists of a list of word IDs. The labels include a list of label IDs.
+    :return: an iterator of data.
+    :rtype: iterator
+    """
+
+    def reader():
+        tf = tarfile.open(data_path)
+        wf = tf.extractfile(words_name)
+        pf = tf.extractfile(props_name)
+        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
+                fileobj=pf) as props_file:
+            sentences = []
+            labels = []
+            one_seg = []
+            for word, label in itertools.izip(words_file, props_file):
+                word = word.strip()
+                label = label.strip().split()
+
+                if len(label) == 0:  # end of sentence
+                    for i in xrange(len(one_seg[0])):
+                        a_kind_label = [x[i] for x in one_seg]
+                        labels.append(a_kind_label)
+
+                    if len(labels) >= 1:
+                        verb_list = []
+                        for x in labels[0]:
+                            if x != '-':
+                                verb_list.append(x)
+
+                        for i, lbl in enumerate(labels[1:]):
+                            cur_tag = 'O'
+                            is_in_bracket = False
+                            lbl_seq = []
+                            verb_word = ''
+                            for l in lbl:
+                                if l == '*' and not is_in_bracket:
+                                    lbl_seq.append('O')
+                                elif l == '*' and is_in_bracket:
+                                    lbl_seq.append('I-' + cur_tag)
+                                elif l == '*)':
+                                    lbl_seq.append('I-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') != -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') == -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = True
+                                else:
+                                    raise RuntimeError('Unexpected label: %s' %
+                                                       l)
+
+                            yield sentences, verb_list[i], lbl_seq
+
+                    sentences = []
+                    labels = []
+                    one_seg = []
+                else:
+                    sentences.append(word)
+                    one_seg.append(label)
+
+        pf.close()
+        wf.close()
+        tf.close()
+
+    return reader
+
+
+def reader_creator(corpus_reader,
+                   word_dict=None,
+                   predicate_dict=None,
+                   label_dict=None):
+    def reader():
+        for sentence, predicate, labels in corpus_reader():
+
+            sen_len = len(sentence)
+
+            verb_index = labels.index('B-V')
+            mark = [0] * len(labels)
+            if verb_index > 0:
+                mark[verb_index - 1] = 1
+                ctx_n1 = sentence[verb_index - 1]
+            else:
+                ctx_n1 = 'bos'
+
+            if verb_index > 1:
+                mark[verb_index - 2] = 1
+                ctx_n2 = sentence[verb_index - 2]
+            else:
+                ctx_n2 = 'bos'
+
+            mark[verb_index] = 1
+            ctx_0 = sentence[verb_index]
+
+            if verb_index < len(labels) - 1:
+                mark[verb_index + 1] = 1
+                ctx_p1 = sentence[verb_index + 1]
+            else:
+                ctx_p1 = 'eos'
+
+            if verb_index < len(labels) - 2:
+                mark[verb_index + 2] = 1
+                ctx_p2 = sentence[verb_index + 2]
+            else:
+                ctx_p2 = 'eos'
+
+            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
+
+            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+            pred_idx = [predicate_dict.get(predicate)] * sen_len
+            label_idx = [label_dict.get(w) for w in labels]
+
+            yield word_idx, ctx_n2_idx, ctx_n1_idx, \
+                ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
+
+    return reader
+
+
+def get_dict():
+    """
+    Get the word, verb and label dictionary of Wikipedia corpus.
+    """
+    word_dict = load_dict(
+        paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
+                                          WORDDICT_MD5))
+    verb_dict = load_dict(
+        paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
+                                          VERBDICT_MD5))
+    label_dict = load_label_dict(
+        paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
+                                          TRGDICT_MD5))
+    return word_dict, verb_dict, label_dict
+
+
+def get_embedding():
+    """
+    Get the trained word vector based on Wikipedia corpus.
+    """
+    return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+
+
+def test():
+    """
+    Conll05 test set creator.
+
+    Because the training dataset is not free, the test dataset is used for
+    training. It returns a reader creator, each sample in the reader is nine
+    features, including sentence sequence, predicate, predicate context,
+    predicate context flag and tagged sequence.
+
+    :return: Test reader creator
+    :rtype: callable
+    """
+    word_dict, verb_dict, label_dict = get_dict()
+    reader = corpus_reader(
+        paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
+        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
+        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
+    return reader_creator(reader, word_dict, verb_dict, label_dict)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, test(), 1000, "conll05_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "conll05_test")
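A usage sketch for the conll05 reader above (the field layout follows the `yield` in `reader_creator`):

```python
import paddle.v2.dataset.conll05 as conll05

word_dict, verb_dict, label_dict = conll05.get_dict()
# Each sample carries nine parallel fields: word ids, five context windows,
# predicate ids, mark flags, and label ids.
for sample in conll05.test()():
    assert len(sample) == 9
    break
```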
+ """ + word_dict = load_dict( + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', + WORDDICT_MD5)) + verb_dict = load_dict( + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', + VERBDICT_MD5)) + label_dict = load_label_dict( + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', + TRGDICT_MD5)) + return word_dict, verb_dict, label_dict + + +def get_embedding(): + """ + Get the trained word vector based on Wikipedia corpus. + """ + return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + + +def test(): + """ + Conll05 test set creator. + + Because the training dataset is not free, the test dataset is used for + training. It returns a reader creator, each sample in the reader is nine + features, including sentence sequence, predicate, predicate context, + predicate context flag and tagged sequence. + + :return: Training reader creator + :rtype: callable + """ + word_dict, verb_dict, label_dict = get_dict() + reader = corpus_reader( + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5), + words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', + props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') + return reader_creator(reader, word_dict, verb_dict, label_dict) + + +def fetch(): + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) + paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py new file mode 100644 index 0000000000000000000000000000000000000000..7bdddeaabec733ef26b3f766c6437f5c53d65044 --- /dev/null +++ b/python/paddle/v2/dataset/flowers.py @@ -0,0 +1,199 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module will download dataset from +http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html +and parse train/test set intopaddle reader creators. + +This set contains images of flowers belonging to 102 different categories. +The images were acquired by searching the web and taking pictures. There are a +minimum of 40 images for each category. + +The database was used in: + +Nilsback, M-E. and Zisserman, A. Automated flower classification over a large + number of classes.Proceedings of the Indian Conference on Computer Vision, +Graphics and Image Processing (2008) +http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}. 
+ +""" +import cPickle +import itertools +import functools +from common import download +import tarfile +import scipy.io as scio +from paddle.v2.image import * +from paddle.v2.reader import * +import os +import numpy as np +from multiprocessing import cpu_count +__all__ = ['train', 'test', 'valid'] + +DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' +LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' +SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' +DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118' +LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' +SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' +# In official 'readme', tstid is the flag of test data +# and trnid is the flag of train data. But test data is more than train data. +# So we exchange the train data and test data. +TRAIN_FLAG = 'tstid' +TEST_FLAG = 'trnid' +VALID_FLAG = 'valid' + + +def default_mapper(is_train, sample): + ''' + map image bytes data to type needed by model input layer + ''' + img, label = sample + img = load_image_bytes(img) + img = simple_transform( + img, 256, 224, is_train, mean=[103.94, 116.78, 123.68]) + return img.flatten().astype('float32'), label + + +train_mapper = functools.partial(default_mapper, True) +test_mapper = functools.partial(default_mapper, False) + + +def reader_creator(data_file, + label_file, + setid_file, + dataset_name, + mapper, + buffered_size=1024, + use_xmap=True): + ''' + 1. read images from tar file and + merge images into batch files in 102flowers.tgz_batch/ + 2. get a reader to read sample from batch file + + :param data_file: downloaded data file + :type data_file: string + :param label_file: downloaded label file + :type label_file: string + :param setid_file: downloaded setid file containing information + about how to split dataset + :type setid_file: string + :param dataset_name: data set name (tstid|trnid|valid) + :type dataset_name: string + :param mapper: a function to map image bytes data to type + needed by model input layer + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: data reader + :rtype: callable + ''' + labels = scio.loadmat(label_file)['labels'][0] + indexes = scio.loadmat(setid_file)[dataset_name][0] + img2label = {} + for i in indexes: + img = "jpg/image_%05d.jpg" % i + img2label[img] = labels[i - 1] + file_list = batch_images_from_tar(data_file, dataset_name, img2label) + + def reader(): + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) - 1 + + if use_xmap: + return xmap_readers(mapper, reader, cpu_count(), buffered_size) + else: + return map_readers(mapper, reader) + + +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers training set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. 
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..37c4296f9bcea7e16daa46f778934331513c30c4
--- /dev/null
+++ b/python/paddle/v2/dataset/imdb.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+IMDB dataset.
+
+This module downloads IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
+of 25,000 highly polar movie reviews for training, and 25,000 for testing.
+Besides, this module also provides API for building dictionary.
+"""
+
+import paddle.v2.dataset.common
+import collections
+import tarfile
+import re
+import string
+
+__all__ = ['build_dict', 'train', 'test', 'convert']
+
+URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+def tokenize(pattern):
+    """
+    Read files that match the given pattern.  Tokenize and yield each file.
+    """
+
+    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
+                                                        MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, rather than
+        # tarfile.extractfile, which does random access and might
+        # destroy hard disks.
+        tf = tarf.next()
+        while tf is not None:
+            if bool(pattern.match(tf.name)):
+                # newline and punctuations removal and ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
+                    None, string.punctuation).lower().split()
+            tf = tarf.next()
+
+
+def build_dict(pattern, cutoff):
+    """
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
+    word_freq = collections.defaultdict(int)
+    for doc in tokenize(pattern):
+        for word in doc:
+            word_freq[word] += 1
+
+    # Not sure if we should prune less-frequent words here.
+    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+
+    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+    words, _ = list(zip(*dictionary))
+    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx['<unk>'] = len(words)
+    return word_idx
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx):
+    UNK = word_idx['<unk>']
+    INS = []
+
+    def load(pattern, out, label):
+        for doc in tokenize(pattern):
+            out.append(([word_idx.get(w, UNK) for w in doc], label))
+
+    load(pos_pattern, INS, 0)
+    load(neg_pattern, INS, 1)
+
+    def reader():
+        for doc, label in INS:
+            yield doc, label
+
+    return reader
+
+
+def train(word_idx):
+    """
+    IMDB training set creator.
+
+    It returns a reader creator, each sample in the reader is a zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("aclImdb/train/pos/.*\.txt$"),
+        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
+
+
+def test(word_idx):
+    """
+    IMDB test set creator.
+
+    It returns a reader creator, each sample in the reader is a zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("aclImdb/test/pos/.*\.txt$"),
+        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
+
+
+def word_dict():
+    """
+    Build a word dictionary from the corpus.
+
+    :return: Word dictionary
+    :rtype: dict
+    """
+    return build_dict(
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(URL, 'imdb', MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    w = word_dict()
+    paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
+    paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..617c722c4165cdfed9e650fc968d623ef6ed4391
--- /dev/null
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+imikolov's simple dataset.
+
+This module will download dataset from
+http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
+into paddle reader creators.
+"""
+import paddle.v2.dataset.common
+import collections
+import tarfile
+
+__all__ = ['train', 'test', 'build_dict', 'convert']
+
+URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+class DataType(object):
+    NGRAM = 1
+    SEQ = 2
+
+
+def word_count(f, word_freq=None):
+    if word_freq is None:
+        word_freq = collections.defaultdict(int)
+
+    for l in f:
+        for w in l.strip().split():
+            word_freq[w] += 1
+        word_freq['<s>'] += 1
+        word_freq['<e>'] += 1
+
+    return word_freq
+
+
+def build_dict(min_word_freq=50):
+    """
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
+    train_filename = './simple-examples/data/ptb.train.txt'
+    test_filename = './simple-examples/data/ptb.valid.txt'
+    with tarfile.open(
+            paddle.v2.dataset.common.download(
+                paddle.v2.dataset.imikolov.URL, 'imikolov',
+                paddle.v2.dataset.imikolov.MD5)) as tf:
+        trainf = tf.extractfile(train_filename)
+        testf = tf.extractfile(test_filename)
+        word_freq = word_count(testf, word_count(trainf))
+        if '<unk>' in word_freq:
+            # remove <unk> for now, since we will set it as last index
+            del word_freq['<unk>']
+
+        word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
+
+        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+        words, _ = list(zip(*word_freq_sorted))
+        word_idx = dict(zip(words, xrange(len(words))))
+        word_idx['<unk>'] = len(words)
+
+    return word_idx
+
+
+def reader_creator(filename, word_idx, n, data_type):
+    def reader():
+        with tarfile.open(
+                paddle.v2.dataset.common.download(
+                    paddle.v2.dataset.imikolov.URL, 'imikolov',
+                    paddle.v2.dataset.imikolov.MD5)) as tf:
+            f = tf.extractfile(filename)
+
+            UNK = word_idx['<unk>']
+            for l in f:
+                if DataType.NGRAM == data_type:
+                    assert n > -1, 'Invalid gram length'
+                    l = ['<s>'] + l.strip().split() + ['<e>']
+                    if len(l) >= n:
+                        l = [word_idx.get(w, UNK) for w in l]
+                        for i in range(n, len(l) + 1):
+                            yield tuple(l[i - n:i])
+                elif DataType.SEQ == data_type:
+                    l = l.strip().split()
+                    l = [word_idx.get(w, UNK) for w in l]
+                    src_seq = [word_idx['<s>']] + l
+                    trg_seq = l + [word_idx['<e>']]
+                    if n > 0 and len(src_seq) > n: continue
+                    yield src_seq, trg_seq
+                else:
+                    assert False, 'Unknown data type'
+
+    return reader
+
+
+def train(word_idx, n, data_type=DataType.NGRAM):
+    """
+    imikolov training set creator.
+
+    It returns a reader creator, each sample in the reader is a word ID
+    tuple.
+ + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Training reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n, + data_type) + + +def test(word_idx, n, data_type=DataType.NGRAM): + """ + imikolov test set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Test reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n, + data_type) + + +def fetch(): + paddle.v2.dataset.common.download(URL, "imikolov", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + N = 5 + word_dict = build_dict() + paddle.v2.dataset.common.convert(path, + train(word_dict, N), 1000, + "imikolov_train") + paddle.v2.dataset.common.convert(path, + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..9f675bed895223e054cd3bb6e504fe1607f19858 --- /dev/null +++ b/python/paddle/v2/dataset/mnist.py @@ -0,0 +1,123 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MNIST dataset. + +This module will download dataset from http://yann.lecun.com/exdb/mnist/ and +parse training set and test set into paddle reader creators. +""" +import paddle.v2.dataset.common +import subprocess +import numpy +import platform +__all__ = ['train', 'test', 'convert'] + +URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' +TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' +TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3' +TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz' +TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c' +TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz' +TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' +TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz' +TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' + + +def reader_creator(image_filename, label_filename, buffer_size): + def reader(): + if platform.system() == 'Darwin': + zcat_cmd = 'gzcat' + elif platform.system() == 'Linux': + zcat_cmd = 'zcat' + else: + raise NotImplementedError() + + # According to http://stackoverflow.com/a/38061619/724872, we + # cannot use standard package gzip here. 
+        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
+        m.stdout.read(16)  # skip some magic bytes
+
+        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
+        l.stdout.read(8)  # skip some magic bytes
+
+        try:  # the reader may be stopped early by its consumer.
+            while True:
+                labels = numpy.fromfile(
+                    l.stdout, 'ubyte', count=buffer_size).astype("int")
+
+                if labels.size != buffer_size:
+                    break  # numpy.fromfile returns empty slice after EOF.
+
+                images = numpy.fromfile(
+                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
+                        (buffer_size, 28 * 28)).astype('float32')
+
+                images = images / 255.0 * 2.0 - 1.0
+
+                for i in xrange(buffer_size):
+                    yield images[i, :], int(labels[i])
+        finally:
+            m.terminate()
+            l.terminate()
+
+    return reader
+
+
+def train():
+    """
+    MNIST training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
+                                          TRAIN_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
+                                          TRAIN_LABEL_MD5), 100)
+
+
+def test():
+    """
+    MNIST test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
+                                          TEST_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
+                                          TEST_LABEL_MD5), 100)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train(), 1000, "mnist_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "mnist_test")
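A sketch combining the mnist reader with the newly exported `paddle.batch` (an alias for `minibatch.batch`, per the `__init__.py` change earlier in this diff):

```python
import paddle.v2 as paddle

# Wrap the per-sample reader into a mini-batch reader.
train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=128)
for batch in train_reader():
    print len(batch)  # up to 128 (image, label) pairs
    break
```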
+ +""" + +import zipfile +import paddle.v2.dataset.common +import re +import random +import functools + +__all__ = [ + 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info', + 'convert' +] + +age_table = [1, 18, 25, 35, 45, 50, 56] + +URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' +MD5 = 'c4d9eecfca2ab87c1945afe126590906' + + +class MovieInfo(object): + """ + Movie id, title and categories information are stored in MovieInfo. + """ + + def __init__(self, index, categories, title): + self.index = int(index) + self.categories = categories + self.title = title + + def value(self): + """ + Get information from a movie. + """ + return [ + self.index, [CATEGORIES_DICT[c] for c in self.categories], + [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()] + ] + + def __str__(self): + return "" % ( + self.index, self.title, self.categories) + + def __repr__(self): + return self.__str__() + + +class UserInfo(object): + """ + User id, gender, age, and job information are stored in UserInfo. + """ + + def __init__(self, index, gender, age, job_id): + self.index = int(index) + self.is_male = gender == 'M' + self.age = age_table.index(int(age)) + self.job_id = int(job_id) + + def value(self): + """ + Get information from a user. + """ + return [self.index, 0 if self.is_male else 1, self.age, self.job_id] + + def __str__(self): + return "" % ( + self.index, "M" + if self.is_male else "F", age_table[self.age], self.job_id) + + def __repr__(self): + return str(self) + + +MOVIE_INFO = None +MOVIE_TITLE_DICT = None +CATEGORIES_DICT = None +USER_INFO = None + + +def __initialize_meta_info__(): + fn = paddle.v2.dataset.common.download(URL, "movielens", MD5) + global MOVIE_INFO + if MOVIE_INFO is None: + pattern = re.compile(r'^(.*)\((\d+)\)$') + with zipfile.ZipFile(file=fn) as package: + for info in package.infolist(): + assert isinstance(info, zipfile.ZipInfo) + MOVIE_INFO = dict() + title_word_set = set() + categories_set = set() + with package.open('ml-1m/movies.dat') as movie_file: + for i, line in enumerate(movie_file): + movie_id, title, categories = line.strip().split('::') + categories = categories.split('|') + for c in categories: + categories_set.add(c) + title = pattern.match(title).group(1) + MOVIE_INFO[int(movie_id)] = MovieInfo( + index=movie_id, categories=categories, title=title) + for w in title.split(): + title_word_set.add(w.lower()) + + global MOVIE_TITLE_DICT + MOVIE_TITLE_DICT = dict() + for i, w in enumerate(title_word_set): + MOVIE_TITLE_DICT[w] = i + + global CATEGORIES_DICT + CATEGORIES_DICT = dict() + for i, c in enumerate(categories_set): + CATEGORIES_DICT[c] = i + + global USER_INFO + USER_INFO = dict() + with package.open('ml-1m/users.dat') as user_file: + for line in user_file: + uid, gender, age, job, _ = line.strip().split("::") + USER_INFO[int(uid)] = UserInfo( + index=uid, gender=gender, age=age, job_id=job) + return fn + + +def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): + fn = __initialize_meta_info__() + rand = random.Random(x=rand_seed) + with zipfile.ZipFile(file=fn) as package: + with package.open('ml-1m/ratings.dat') as rating: + for line in rating: + if (rand.random() < test_ratio) == is_test: + uid, mov_id, rating, _ = line.strip().split("::") + uid = int(uid) + mov_id = int(mov_id) + rating = float(rating) * 2 - 5.0 + + mov = MOVIE_INFO[mov_id] + usr = USER_INFO[uid] + yield usr.value() + mov.value() + [[rating]] + + +def 
__reader_creator__(**kwargs): + return lambda: __reader__(**kwargs) + + +train = functools.partial(__reader_creator__, is_test=False) +test = functools.partial(__reader_creator__, is_test=True) + + +def get_movie_title_dict(): + """ + Get movie title dictionary. + """ + __initialize_meta_info__() + return MOVIE_TITLE_DICT + + +def __max_index_info__(a, b): + if a.index > b.index: + return a + else: + return b + + +def max_movie_id(): + """ + Get the maximum value of movie id. + """ + __initialize_meta_info__() + return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index + + +def max_user_id(): + """ + Get the maximum value of user id. + """ + __initialize_meta_info__() + return reduce(__max_index_info__, USER_INFO.viewvalues()).index + + +def __max_job_id_impl__(a, b): + if a.job_id > b.job_id: + return a + else: + return b + + +def max_job_id(): + """ + Get the maximum value of job id. + """ + __initialize_meta_info__() + return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id + + +def movie_categories(): + """ + Get movie categoriges dictionary. + """ + __initialize_meta_info__() + return CATEGORIES_DICT + + +def user_info(): + """ + Get user info dictionary. + """ + __initialize_meta_info__() + return USER_INFO + + +def movie_info(): + """ + Get movie info dictionary. + """ + __initialize_meta_info__() + return MOVIE_INFO + + +def unittest(): + for train_count, _ in enumerate(train()()): + pass + for test_count, _ in enumerate(test()()): + pass + + print train_count, test_count + + +def fetch(): + paddle.v2.dataset.common.download(URL, "movielens", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") + + +if __name__ == '__main__': + unittest() diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py new file mode 100644 index 0000000000000000000000000000000000000000..d3b3dd524c34be660c5f2d4fc5ce2fa0420efbc1 --- /dev/null +++ b/python/paddle/v2/dataset/mq2007.py @@ -0,0 +1,333 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MQ2007 dataset + +MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross +validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set, +validation set and testing set. 
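A usage sketch for the movielens helpers above (`train()` returns a reader creator, so it is called twice):

```python
import paddle.v2.dataset.movielens as movielens

print movielens.max_user_id(), movielens.max_movie_id(), movielens.max_job_id()
# Each sample is user fields + movie fields + [[rescaled rating]].
for sample in movielens.train()():
    print sample
    break
```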
diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3b3dd524c34be660c5f2d4fc5ce2fa0420efbc1
--- /dev/null
+++ b/python/paddle/v2/dataset/mq2007.py
@@ -0,0 +1,334 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MQ2007 dataset
+
+MQ2007 is a query set from the Million Query track of TREC 2007. There are
+about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross
+validation strategy is adopted and the 5-fold partitions are included in the
+package. In each fold, there are three subsets for learning: training set,
+validation set and testing set.
+
+This module downloads the MQ2007 dataset from
+http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar
+and parses the training set and test set into paddle reader creators.
+
+"""
+
+import os
+import sys
+import functools
+import rarfile
+from common import download
+import numpy as np
+
+# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
+URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
+MD5 = "7be1640ae95c6408dab0ae7207bdc706"
+
+
+def __initialize_meta_info__():
+    """
+    download and extract the MQ2007 dataset
+    """
+    fn = fetch()
+    rar = rarfile.RarFile(fn)
+    dirpath = os.path.dirname(fn)
+    rar.extractall(path=dirpath)
+    return dirpath
+
+
+class Query(object):
+    """
+    queries used for learning to rank algorithms. It is created from
+    relevance scores and query-document feature vectors.
+
+    Parameters:
+    ----------
+    query_id : int
+        query_id in dataset, mapping from query to relevance documents
+    relevance_score : int
+        relevance score of query and document pair
+    feature_vector : array, dense feature
+        feature in vector format
+    description : string
+        comment section in query doc pair data
+    """
+
+    def __init__(self,
+                 query_id=-1,
+                 relevance_score=-1,
+                 feature_vector=None,
+                 description=""):
+        self.query_id = query_id
+        self.relevance_score = relevance_score
+        if feature_vector is None:
+            self.feature_vector = []
+        else:
+            self.feature_vector = feature_vector
+        self.description = description
+
+    def __str__(self):
+        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
+                               " ".join(str(f) for f in self.feature_vector))
+        return string
+
+    # @classmethod
+    def _parse_(self, text):
+        """
+        parse line into Query
+        """
+        comment_position = text.find('#')
+        line = text[:comment_position].strip()
+        self.description = text[comment_position + 1:].strip()
+        parts = line.split()
+        if len(parts) != 48:
+            sys.stdout.write("expect 48 space split parts, get %d" %
+                             (len(parts)))
+            return None
+        # format : 0 qid:10 1:0.000272 2:0.000000 ....
+        self.relevance_score = int(parts[0])
+        self.query_id = int(parts[1].split(':')[1])
+        for p in parts[2:]:
+            pair = p.split(':')
+            self.feature_vector.append(float(pair[1]))
+        return self
+
+
+class QueryList(object):
+    """
+    group query into list, every item in list is a Query
+    """
+
+    def __init__(self, querylist=None):
+        self.query_id = -1
+        if querylist is None:
+            self.querylist = []
+        else:
+            self.querylist = querylist
+            for query in self.querylist:
+                if self.query_id == -1:
+                    self.query_id = query.query_id
+                else:
+                    if self.query_id != query.query_id:
+                        raise ValueError("query in list must be same query_id")
+
+    def __iter__(self):
+        for query in self.querylist:
+            yield query
+
+    def __len__(self):
+        return len(self.querylist)
+
+    def __getitem__(self, i):
+        return self.querylist[i]
+
+    def _correct_ranking_(self):
+        if self.querylist is None:
+            return
+        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)
+
+    def _add_query(self, query):
+        if self.query_id == -1:
+            self.query_id = query.query_id
+        else:
+            if self.query_id != query.query_id:
+                raise ValueError("query in list must be same query_id")
+        self.querylist.append(query)
+
+
+def gen_plain_txt(querylist):
+    """
+    gen plain text in list for other usage
+    Parameters:
+    --------
+    querylist : QueryList, one query matched with many document pairs,
+                see QueryList
+
+    return :
+    ------
+    query_id : np.array, shape=(samples_num, )
+    label : np.array, shape=(samples_num, )
+    querylist : np.array, shape=(samples_num, feature_dimension)
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield querylist.query_id, query.relevance_score, np.array(
+            query.feature_vector)
+
+
+def gen_point(querylist):
+    """
+    gen item in list for point-wise learning to rank algorithm
+    Parameters:
+    --------
+    querylist : QueryList, one query matched with many document pairs,
+                see QueryList
+
+    return :
+    ------
+    label : np.array, shape=(samples_num, )
+    querylist : np.array, shape=(samples_num, feature_dimension)
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield query.relevance_score, np.array(query.feature_vector)
+
+
+def gen_pair(querylist, partial_order="full"):
+    """
+    gen pair for pair-wise learning to rank algorithm
+    Parameters:
+    --------
+    querylist : QueryList, one query matched with many document pairs,
+                see QueryList
+    partial_order : "full" or "neighbour"
+        there is redundancy in all possible pair combinations, which can be
+        simplified: gen pairs for neighbour items or for the full partial
+        order pairs
+
+    return :
+    ------
+    label : np.array, shape=(1)
+    query_left : np.array, shape=(1, feature_dimension)
+    query_right : same as left
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    labels = []
+    docpairs = []
+
+    # C(n,2)
+    for i in range(len(querylist)):
+        query_left = querylist[i]
+        for j in range(i + 1, len(querylist)):
+            query_right = querylist[j]
+            if query_left.relevance_score > query_right.relevance_score:
+                labels.append([1])
+                docpairs.append([
+                    np.array(query_left.feature_vector),
+                    np.array(query_right.feature_vector)
+                ])
+            elif query_left.relevance_score < query_right.relevance_score:
+                labels.append([1])
+                docpairs.append([
+                    np.array(query_right.feature_vector),
+                    np.array(query_left.feature_vector)
+                ])
+    for label, pair in zip(labels, docpairs):
+        yield np.array(label), pair[0], pair[1]
+
+
+def gen_list(querylist):
+    """
+    gen item in list for list-wise learning to rank algorithm
+    Parameters:
+    --------
+    querylist : QueryList, one query matched with many document pairs,
+                see QueryList
+
+    return :
+    ------
+    label : np.array, shape=(samples_num, )
+    querylist : np.array, shape=(samples_num, feature_dimension)
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    relevance_score_list = [[query.relevance_score] for query in querylist]
+    feature_vector_list = [query.feature_vector for query in querylist]
+    yield np.array(relevance_score_list), np.array(feature_vector_list)
+
+
+def query_filter(querylists):
+    """
+    Filter out queries whose documents all have relevance label 0.
+    Labels 0, 1 and 2 indicate the relevance of a document to its query.
+    parameters :
+        querylists : QueryList list
+
+    return :
+        querylist : QueryList list
+    """
+    filter_query = []
+    for querylist in querylists:
+        relevance_score_list = [query.relevance_score for query in querylist]
+        if sum(relevance_score_list) != .0:
+            filter_query.append(querylist)
+    return filter_query
+
+
+def load_from_text(filepath, shuffle=False, fill_missing=-1):
+    """
+    parse data file into querys
+    """
+    prev_query_id = -1
+    querylists = []
+    querylist = None
+    fn = __initialize_meta_info__()
+    with open(os.path.join(fn, filepath)) as f:
+        for line in f:
+            query = Query()
+            query = query._parse_(line)
+            if query is None:
+                continue
+            if query.query_id != prev_query_id:
+                if querylist is not None:
+                    querylists.append(querylist)
+                querylist = QueryList()
+                prev_query_id = query.query_id
+            querylist._add_query(query)
+    if querylist is not None:
+        querylists.append(querylist)
+    return querylists
+
+
+def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
+    """
+    Parameters
+    --------
+    filepath : string
+    fill_missing : fill the missing value. default in MQ2007 is -1
+
+    Returns
+    ------
+    yield
+        label query_left, query_right  # format = "pairwise"
+        label querylist  # format = "listwise"
+    """
+    querylists = query_filter(
+        load_from_text(
+            filepath, shuffle=shuffle, fill_missing=fill_missing))
+    for querylist in querylists:
+        if format == "plain_txt":
+            yield next(gen_plain_txt(querylist))
+        elif format == "pointwise":
+            yield next(gen_point(querylist))
+        elif format == "pairwise":
+            for pair in gen_pair(querylist):
+                yield pair
+        elif format == "listwise":
+            yield next(gen_list(querylist))
+
+
+train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
+test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
+
+
+def fetch():
+    return download(URL, "MQ2007", MD5)
+
+
+if __name__ == "__main__":
+    fetch()
+    mytest = functools.partial(
+        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
+    for label, query in mytest():
+        print label, query
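A usage sketch for the mq2007 readers (note that `train` and `test` here return generators directly when called, since `__reader__` is a generator function; MQ2007 has 46 features per query-document pair, matching the 48-part lines parsed above):

```python
import paddle.v2.dataset.mq2007 as mq2007

# Pairwise format yields (label, left_doc_features, right_doc_features).
for label, left, right in mq2007.train(format="pairwise"):
    print label.shape, left.shape, right.shape  # (1,), (46,), (46,)
    break
```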
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The script fetch and preprocess movie_reviews data set that provided by NLTK + +TODO(yuyang18): Complete dataset. +""" + +import collections +from itertools import chain + +import nltk +from nltk.corpus import movie_reviews + +import paddle.v2.dataset.common + +__all__ = ['train', 'test', 'get_word_dict', 'convert'] +NUM_TRAINING_INSTANCES = 1600 +NUM_TOTAL_INSTANCES = 2000 + + +def download_data_if_not_yet(): + """ + Download the data set, if the data set is not download. + """ + try: + # make sure that nltk can find the data + if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME) + movie_reviews.categories() + except LookupError: + print "Downloading movie_reviews data set, please wait....." + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + print "Download data set success....." + print "Path is " + nltk.data.find('corpora/movie_reviews').path + + +def get_word_dict(): + """ + Sorted the words by the frequency of words which occur in sample + :return: + words_freq_sorted + """ + words_freq_sorted = list() + word_freq_dict = collections.defaultdict(int) + download_data_if_not_yet() + + for category in movie_reviews.categories(): + for field in movie_reviews.fileids(category): + for words in movie_reviews.words(field): + word_freq_dict[words] += 1 + words_sort_list = word_freq_dict.items() + words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) + for index, word in enumerate(words_sort_list): + words_freq_sorted.append((word[0], index)) + return words_freq_sorted + + +def sort_files(): + """ + Sorted the sample for cross reading the sample + :return: + files_list + """ + files_list = list() + neg_file_list = movie_reviews.fileids('neg') + pos_file_list = movie_reviews.fileids('pos') + files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list))) + return files_list + + +def load_sentiment_data(): + """ + Load the data set + :return: + data_set + """ + data_set = list() + download_data_if_not_yet() + words_ids = dict(get_word_dict()) + for sample_file in sort_files(): + words_list = list() + category = 0 if 'neg' in sample_file else 1 + for word in movie_reviews.words(sample_file): + words_list.append(words_ids[word.lower()]) + data_set.append((words_list, category)) + return data_set + + +def reader_creator(data): + """ + Reader creator, generate an iterator for data set + :param data: + train data set or test data set + """ + for each in data: + yield each[0], each[1] + + +def train(): + """ + Default training set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[0:NUM_TRAINING_INSTANCES]) + + +def test(): + """ + Default test set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) + + +def fetch(): + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + 
paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e0e18229da7818be5752ee592e094a00da286ad9 --- /dev/null +++ b/python/paddle/v2/dataset/tests/cifar_test.py @@ -0,0 +1,56 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.cifar +import unittest + + +class TestCIFAR(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + for l in reader(): + self.assertEqual(l[0].size, 3072) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_test10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test10()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 9) + + def test_train10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train10()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 9) + + def test_test100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test100()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 99) + + def test_train100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train100()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 99) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py new file mode 100644 index 0000000000000000000000000000000000000000..cfa194eba38ea70311c4deeac2635dc0a0103576 --- /dev/null +++ b/python/paddle/v2/dataset/tests/common_test.py @@ -0,0 +1,94 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
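
The sentiment readers above yield one sample at a time rather than a mini-batch. A minimal consumption sketch (illustrative only, not part of this diff; it assumes the module lands as defined above):

.. code-block:: python

    import paddle.v2.dataset.sentiment as st

    # train() returns an iterable of samples. Each sample is a pair:
    # a list of word ids and a 0 (negative) / 1 (positive) label.
    for word_ids, label in st.train():
        print len(word_ids), label  # Python 2 print, matching the codebase
        break
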
+
+import paddle.v2.dataset.common
+import unittest
+import tempfile
+import glob
+import recordio
+
+
+class TestCommon(unittest.TestCase):
+    def test_md5file(self):
+        _, temp_path = tempfile.mkstemp()
+        with open(temp_path, 'w') as f:
+            f.write("Hello\n")
+        self.assertEqual('09f7e02f1290be211da707a266f153b3',
+                         paddle.v2.dataset.common.md5file(temp_path))
+
+    def test_download(self):
+        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
+        self.assertEqual(
+            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.v2.dataset.common.download(
+                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
+
+    def test_split(self):
+        def test_reader():
+            def reader():
+                for x in xrange(10):
+                    yield x
+
+            return reader
+
+        # split needs a directory to write shards into, not a temp file.
+        temp_path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.split(
+            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
+        files = glob.glob(temp_path + '/test-*.pickle')
+        self.assertEqual(len(files), 3)
+
+    def test_cluster_file_reader(self):
+        temp_path = tempfile.mkdtemp()
+        for x in xrange(5):
+            with open(temp_path + '/%05d.test' % x, 'w') as f:
+                f.write('%d\n' % x)
+        reader = paddle.v2.dataset.common.cluster_files_reader(
+            temp_path + '/*.test', 5, 0)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str("0"))
+
+    def test_convert(self):
+        record_num = 10
+        num_shards = 4
+
+        def test_reader():
+            def reader():
+                for x in xrange(record_num):
+                    yield x
+
+            return reader
+
+        path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.convert(path,
+                                         test_reader(), num_shards,
+                                         'random_images')
+
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+
+        recs.sort()
+        self.assertEqual(len(recs), record_num)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8ae9a07acc22eb9d3c0cc5ebb07f8f11ed21233
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/flowers_test.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
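
For reference, a sketch of how `common.split` shards a reader into pickle files, matching what `test_split` above exercises (hypothetical output path; a sketch, not part of the diff):

.. code-block:: python

    import tempfile
    import paddle.v2.dataset.common as common

    def reader():
        for i in xrange(10):
            yield i

    # 10 records at 4 per shard produce 3 pickle files named
    # test-00000.pickle, test-00001.pickle, test-00002.pickle.
    out_dir = tempfile.mkdtemp()
    common.split(reader, 4, suffix=out_dir + '/test-%05d.pickle')
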
+ +import paddle.v2.dataset.flowers +import unittest + + +class TestFlowers(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + size = 224 * 224 * 3 + for l in reader(): + self.assertEqual(l[0].size, size) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_train(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.train()) + self.assertEqual(instances, 6149) + self.assertEqual(max_label_value, 102) + + def test_test(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.test()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + def test_valid(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.valid()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c4d82f26895d77d05c6e936bd636b1239e1a0cd8 --- /dev/null +++ b/python/paddle/v2/dataset/tests/imdb_test.py @@ -0,0 +1,57 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.imdb +import unittest +import re + +TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$") +TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$") +TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$") + +TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$") +TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$") +TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$") + + +class TestIMDB(unittest.TestCase): + word_idx = None + + def test_build_dict(self): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + self.assertEqual(len(self.word_idx), 7036) + + def check_dataset(self, dataset, expected_size): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + sum = 0 + for l in dataset(self.word_idx): + self.assertEqual(l[1], sum % 2) + sum += 1 + self.assertEqual(sum, expected_size) + + def test_train(self): + self.check_dataset(paddle.v2.dataset.imdb.train, 25000) + + def test_test(self): + self.check_dataset(paddle.v2.dataset.imdb.test, 25000) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py new file mode 100644 index 0000000000000000000000000000000000000000..714a75d6f1ff31697eec2d893d350a726d6390fe --- /dev/null +++ b/python/paddle/v2/dataset/tests/imikolov_test.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
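
The IMDB test above first builds a word dictionary from the training corpus, then checks that samples alternate between the two classes. A usage sketch under the module as defined in this diff (illustrative only):

.. code-block:: python

    import re
    import paddle.v2.dataset.imdb as imdb

    # Build a dictionary from tokens appearing more than 150 times in
    # the training set, then read (word_ids, 0-or-1 label) samples.
    pattern = re.compile("aclImdb/train/.*\.txt$")
    word_idx = imdb.build_dict(pattern, 150)
    for word_ids, label in imdb.train(word_idx):
        break
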
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.imikolov +import unittest + +WORD_DICT = paddle.v2.dataset.imikolov.build_dict() + + +class TestMikolov(unittest.TestCase): + def check_reader(self, reader, n): + for l in reader(): + self.assertEqual(len(l), n) + + def test_train(self): + n = 5 + self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n) + + first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\ + 'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\ + 'rake regatta rubens sim snack-food ssangyong swapo wachter' + first_line = [ + WORD_DICT.get(ch, WORD_DICT['']) + for ch in first_line.split(' ') + ] + for l in paddle.v2.dataset.imikolov.train( + WORD_DICT, n=-1, + data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + read_line = l[0][1:] + break + self.assertEqual(first_line, read_line) + + def test_test(self): + n = 5 + self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n) + + first_line = 'consumers may want to move their telephones a little '\ + 'closer to the tv set' + first_line = [ + WORD_DICT.get(ch, WORD_DICT['']) + for ch in first_line.split(' ') + ] + for l in paddle.v2.dataset.imikolov.test( + WORD_DICT, n=-1, + data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + read_line = l[0][1:] + break + self.assertEqual(first_line, read_line) + + def test_total(self): + _, idx = zip(*WORD_DICT.items()) + self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1d344cac3e7483a351033570fbec75a4d19f4a55 --- /dev/null +++ b/python/paddle/v2/dataset/tests/mnist_test.py @@ -0,0 +1,44 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
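
The imikolov readers are n-gram generators over the PTB text, as the test above shows. A quick sketch of the basic access pattern (illustrative, not part of the diff):

.. code-block:: python

    import paddle.v2.dataset.imikolov as imikolov

    word_dict = imikolov.build_dict()

    # N-gram mode: every sample is a tuple of n word ids, the last one
    # being the word to predict from the preceding context.
    for ngram in imikolov.train(word_dict, 5)():
        assert len(ngram) == 5
        break
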
+
+import paddle.v2.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 784)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.train())
+        self.assertEqual(instances, 60000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.test())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/v2/dataset/tests/mq2007_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..59847b6c18eadb12123cae824e8bce1051a69d4c
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mq2007_test.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mq2007
+import unittest
+
+
+class TestMQ2007(unittest.TestCase):
+    def test_pairwise(self):
+        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
+                format="pairwise"):
+            # numpy's shape is an attribute, not a method
+            self.assertEqual(query_left.shape, (46, ))
+            self.assertEqual(query_right.shape, (46, ))
+
+    def test_listwise(self):
+        for label_array, query_array in paddle.v2.dataset.mq2007.test(
+                format="listwise"):
+            self.assertEqual(len(label_array), len(query_array))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..407405290734609059c1767600748d530e8a13a6
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/test_sentiment.py
@@ -0,0 +1,55 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
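
To make the pairwise shape assertions above concrete, here is how the MQ2007 reader is consumed (a sketch under the module as defined earlier in this diff):

.. code-block:: python

    import paddle.v2.dataset.mq2007 as mq2007

    # gen_pair always emits the more relevant document first, so the
    # label is 1 and each document is a 46-dimensional feature vector.
    for label, doc_left, doc_right in mq2007.test(format="pairwise"):
        assert doc_left.shape == (46, )
        assert doc_right.shape == (46, )
        break
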
+
+import unittest
+import nltk
+import paddle.v2.dataset.sentiment as st
+from nltk.corpus import movie_reviews
+
+
+class TestSentimentMethods(unittest.TestCase):
+    def test_get_word_dict(self):
+        word_dict = st.get_word_dict()[0:10]
+        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          (u'is', 8), (u'in', 9)]
+        for idx, each in enumerate(word_dict):
+            self.assertEqual(each, test_word_list[idx])
+        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
+
+    def test_sort_files(self):
+        last_label = ''
+        for sample_file in st.sort_files():
+            current_label = sample_file.split("/")[0]
+            self.assertNotEqual(current_label, last_label)
+            last_label = current_label
+
+    def test_data_set(self):
+        data_set = st.load_sentiment_data()
+        last_label = -1
+        for each in st.test():
+            self.assertNotEqual(each[1], last_label)
+            last_label = each[1]
+        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
+        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
+        self.assertEqual(
+            len(list(st.test())),
+            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31e72ebf5eac0508d12783f9ceaa6eef0fa6d353
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.voc2012
+import unittest
+
+
+class TestVOC(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3 * l[1].size)
+            sum += 1
+        return sum
+
+    def test_train(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.train())
+        self.assertEqual(count, 2913)
+
+    def test_test(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.test())
+        self.assertEqual(count, 1464)
+
+    def test_val(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.val())
+        self.assertEqual(count, 1449)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/v2/dataset/tests/wmt16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef6c3216e7de8d9785a063976e63f88d90b24df
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/wmt16_test.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.wmt16 +import unittest + + +class TestWMT16(unittest.TestCase): + def checkout_one_sample(self, sample): + # train data has 3 field: source language word indices, + # target language word indices, and target next word indices. + self.assertEqual(len(sample), 3) + + # test start mark and end mark in source word indices. + self.assertEqual(sample[0][0], 0) + self.assertEqual(sample[0][-1], 1) + + # test start mask in target word indices + self.assertEqual(sample[1][0], 0) + + # test en mask in target next word indices + self.assertEqual(sample[2][-1], 1) + + def test_train(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.train( + src_dict_size=100000, trg_dict_size=100000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_test(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.test( + src_dict_size=1000, trg_dict_size=1000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_val(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.validation( + src_dict_size=1000, trg_dict_size=1000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_get_dict(self): + dict_size = 1000 + word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True) + self.assertEqual(len(word_dict), dict_size) + self.assertEqual(word_dict[0], "") + self.assertEqual(word_dict[1], "") + self.assertEqual(word_dict[2], "") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py new file mode 100644 index 0000000000000000000000000000000000000000..f10bf7e42a1ead09b3eba0d61e55701215e4360f --- /dev/null +++ b/python/paddle/v2/dataset/uci_housing.py @@ -0,0 +1,134 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +UCI Housing dataset. + +This module will download dataset from +https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and +parse training set and test set into paddle reader creators. 
+""" + +import numpy as np +import os +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = ['train', 'test'] + +URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' +MD5 = 'd4accdce7a25600298819f8e28e8d593' +feature_names = [ + 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', + 'PTRATIO', 'B', 'LSTAT', 'convert' +] + +UCI_TRAIN_DATA = None +UCI_TEST_DATA = None +URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar' +MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b' + + +def feature_range(maximums, minimums): + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + fig, ax = plt.subplots() + feature_num = len(maximums) + ax.bar(range(feature_num), maximums - minimums, color='r', align='center') + ax.set_title('feature scale') + plt.xticks(range(feature_num), feature_names) + plt.xlim([-1, feature_num]) + fig.set_figheight(6) + fig.set_figwidth(10) + if not os.path.exists('./image'): + os.makedirs('./image') + fig.savefig('image/ranges.png', dpi=48) + plt.close(fig) + + +def load_data(filename, feature_num=14, ratio=0.8): + global UCI_TRAIN_DATA, UCI_TEST_DATA + if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None: + return + + data = np.fromfile(filename, sep=' ') + data = data.reshape(data.shape[0] / feature_num, feature_num) + maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( + axis=0) / data.shape[0] + feature_range(maximums[:-1], minimums[:-1]) + for i in xrange(feature_num - 1): + data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) + offset = int(data.shape[0] * ratio) + UCI_TRAIN_DATA = data[:offset] + UCI_TEST_DATA = data[offset:] + + +def train(): + """ + UCI_HOUSING training set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Training reader creator + :rtype: callable + """ + global UCI_TRAIN_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TRAIN_DATA: + yield d[:-1], d[-1:] + + return reader + + +def test(): + """ + UCI_HOUSING test set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Test reader creator + :rtype: callable + """ + global UCI_TEST_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TEST_DATA: + yield d[:-1], d[-1:] + + return reader + + +def model(): + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', + MD5_MODEL) + with open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'uci_housing', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py new file mode 100644 index 0000000000000000000000000000000000000000..617e212d67fbe37f9d9663e9c83c62045411fa77 --- /dev/null +++ b/python/paddle/v2/dataset/voc2012.py @@ -0,0 +1,85 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
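
Each UCI Housing sample pairs normalized features with the target price; the 0.8/0.2 train/test split happens once inside `load_data`. A consumption sketch (illustrative only, not part of the diff):

.. code-block:: python

    import paddle.v2.dataset.uci_housing as uci_housing

    # Each sample is (13 normalized features, 1-element price array);
    # train() returns a reader callable, hence the double call.
    for features, price in uci_housing.train()():
        assert features.shape == (13, )
        break
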
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image dataset for segmentation. +The 2012 dataset contains images from 2008-2011 for which additional +segmentations have been prepared. As in previous years the assignment +to training/test sets has been maintained. The total number of images +with segmentation has been increased from 7,062 to 9,993. +""" + +import tarfile +import io +import numpy as np +from paddle.v2.dataset.common import download +from paddle.v2.image import * +from PIL import Image + +__all__ = ['train', 'test', 'val'] + +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ +VOCtrainval_11-May-2012.tar' + +VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' +SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' +DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' +LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' + +CACHE_DIR = 'voc2012' + + +def reader_creator(filename, sub_name): + + tarobject = tarfile.open(filename) + name2mem = {} + for ele in tarobject.getmembers(): + name2mem[ele.name] = ele + + def reader(): + set_file = SET_FILE.format(sub_name) + sets = tarobject.extractfile(name2mem[set_file]) + for line in sets: + line = line.strip() + data_file = DATA_FILE.format(line) + label_file = LABEL_FILE.format(line) + data = tarobject.extractfile(name2mem[data_file]).read() + label = tarobject.extractfile(name2mem[label_file]).read() + data = Image.open(io.BytesIO(data)) + label = Image.open(io.BytesIO(label)) + data = np.array(data) + label = np.array(label) + yield data, label + + return reader + + +def train(): + """ + Create a train dataset reader containing 2913 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval') + + +def test(): + """ + Create a test dataset reader containing 1464 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train') + + +def val(): + """ + Create a val dataset reader containing 1449 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val') diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py new file mode 100644 index 0000000000000000000000000000000000000000..5104e29051e4480f3a7eb18421f1b519841b009b --- /dev/null +++ b/python/paddle/v2/dataset/wmt14.py @@ -0,0 +1,182 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +WMT14 dataset. +The original WMT14 dataset is too large and a small set of data for set is +provided. 
This module will download dataset from +http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and +parse training set and test set into paddle reader creators. + +""" +import tarfile +import gzip + +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = [ + 'train', + 'test', + 'get_dict', + 'convert', +] + +URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' + 'cslm_joint_paper/data/dev+test.tgz') +MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' +# this is a small set of data for test. The original data is too large and +# will be add later. +URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' + 'wmt_shrinked_data/wmt14.tgz') +MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' +# BLEU of this trained model is 26.92 +URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' + +START = "" +END = "" +UNK = "" +UNK_IDX = 2 + + +def __read_to_dict(tar_file, dict_size): + def __to_dict(fd, size): + out_dict = dict() + for line_count, line in enumerate(fd): + if line_count < size: + out_dict[line.strip()] = line_count + else: + break + return out_dict + + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith("src.dict") + ] + assert len(names) == 1 + src_dict = __to_dict(f.extractfile(names[0]), dict_size) + names = [ + each_item.name for each_item in f + if each_item.name.endswith("trg.dict") + ] + assert len(names) == 1 + trg_dict = __to_dict(f.extractfile(names[0]), dict_size) + return src_dict, trg_dict + + +def reader_creator(tar_file, file_name, dict_size): + def reader(): + src_dict, trg_dict = __read_to_dict(tar_file, dict_size) + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith(file_name) + ] + for name in names: + for line in f.extractfile(name): + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_words = src_seq.split() + src_ids = [ + src_dict.get(w, UNK_IDX) + for w in [START] + src_words + [END] + ] + + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [trg_dict[END]] + trg_ids = [trg_dict[START]] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def train(dict_size): + """ + WMT14 training set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'train/train', dict_size) + + +def test(dict_size): + """ + WMT14 test set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. 
+ + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'test/test', dict_size) + + +def gen(dict_size): + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'gen/gen', dict_size) + + +def model(): + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + with gzip.open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def get_dict(dict_size, reverse=True): + # if reverse = False, return dict = {'a':'001', 'b':'002', ...} + # else reverse = true, return dict = {'001':'a', '002':'b', ...} + tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + src_dict, trg_dict = __read_to_dict(tar_file, dict_size) + if reverse: + src_dict = {v: k for k, v in src_dict.items()} + trg_dict = {v: k for k, v in trg_dict.items()} + return src_dict, trg_dict + + +def fetch(): + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + + +def convert(path): + """ + Converts dataset to recordio format + """ + dict_size = 30000 + paddle.v2.dataset.common.convert(path, + train(dict_size), 1000, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py new file mode 100644 index 0000000000000000000000000000000000000000..c8818f715beadd9499ae588f2c19a57fbf26f372 --- /dev/null +++ b/python/paddle/v2/dataset/wmt16.py @@ -0,0 +1,349 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +ACL2016 Multimodal Machine Translation. Please see this website for more +details: http://www.statmt.org/wmt16/multimodal-task.html#task1 + +If you use the dataset created for your task, please cite the following paper: +Multi30K: Multilingual English-German Image Descriptions. + +@article{elliott-EtAl:2016:VL16, + author = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. 
and {Specia}, L.},
+    title = {Multi30K: Multilingual English-German Image Descriptions},
+    booktitle = {Proceedings of the 6th Workshop on Vision and Language},
+    year = {2016},
+    pages = {70--74},
+}
+"""
+
+import os
+import tarfile
+import gzip
+from collections import defaultdict
+
+import paddle.v2.dataset.common
+
+__all__ = [
+    "train",
+    "test",
+    "validation",
+    "convert",
+    "fetch",
+    "get_dict",
+]
+
+DATA_URL = ("http://cloud.dlnel.org/filepub/"
+            "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed")
+DATA_MD5 = "0c38be43600334966403524a40dcd81e"
+
+TOTAL_EN_WORDS = 11250
+TOTAL_DE_WORDS = 19220
+
+START_MARK = "<s>"
+END_MARK = "<e>"
+UNK_MARK = "<unk>"
+
+
+def __build_dict(tar_file, dict_size, save_path, lang):
+    word_dict = defaultdict(int)
+    with tarfile.open(tar_file, mode="r") as f:
+        for line in f.extractfile("wmt16/train"):
+            line_split = line.strip().split("\t")
+            if len(line_split) != 2: continue
+            sen = line_split[0] if lang == "en" else line_split[1]
+            for w in sen.split():
+                word_dict[w] += 1
+
+    with open(save_path, "w") as fout:
+        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
+        for idx, word in enumerate(
+                sorted(
+                    word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
+            if idx + 3 == dict_size: break
+            fout.write("%s\n" % (word[0]))
+
+
+def __load_dict(tar_file, dict_size, lang, reverse=False):
+    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    if not os.path.exists(dict_path) or (
+            len(open(dict_path, "r").readlines()) != dict_size):
+        __build_dict(tar_file, dict_size, dict_path, lang)
+
+    word_dict = {}
+    with open(dict_path, "r") as fdict:
+        for idx, line in enumerate(fdict):
+            if reverse:
+                word_dict[idx] = line.strip()
+            else:
+                word_dict[line.strip()] = idx
+    return word_dict
+
+
+def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
+    src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
+                                        TOTAL_DE_WORDS))
+    trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
+                                        TOTAL_EN_WORDS))
+    return src_dict_size, trg_dict_size
+
+
+def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
+    def reader():
+        src_dict = __load_dict(tar_file, src_dict_size, src_lang)
+        trg_dict = __load_dict(tar_file, trg_dict_size,
+                               ("de" if src_lang == "en" else "en"))
+
+        # The indices for the start mark, end mark, and unknown token are
+        # the same in the source and target languages, so the source
+        # language dictionary is used to determine them.
+        start_id = src_dict[START_MARK]
+        end_id = src_dict[END_MARK]
+        unk_id = src_dict[UNK_MARK]
+
+        src_col = 0 if src_lang == "en" else 1
+        trg_col = 1 - src_col
+
+        with tarfile.open(tar_file, mode="r") as f:
+            for line in f.extractfile(file_name):
+                line_split = line.strip().split("\t")
+                if len(line_split) != 2:
+                    continue
+                src_words = line_split[src_col].split()
+                src_ids = [start_id] + [
+                    src_dict.get(w, unk_id) for w in src_words
+                ] + [end_id]
+
+                trg_words = line_split[trg_col].split()
+                trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
+
+                trg_ids_next = trg_ids + [end_id]
+                trg_ids = [start_id] + trg_ids
+
+                yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def train(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 train set reader.
+
+    This function returns the reader for the training data. Each sample the
+    reader returns is made up of three fields: the source language word index
+    sequence, the target language word index sequence, and the next word
+    index sequence.
+ + + NOTE: + The original like for training data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The train reader. + """ + + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. Only support: " + "en (for English); de(for Germany).") + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/train", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def test(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 test set reader. + + This function returns the reader for test data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. + + NOTE: + The original like for test data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The test reader. + """ + + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. " + "Only support: en (for English); de(for Germany).") + + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/test", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def validation(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 validation set reader. + + This function returns the reader for validation data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. 
+
+    NOTE:
+        The original link for the validation data is:
+        http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
+
+        paddle.dataset.wmt16 provides a tokenized version of the original
+        dataset by using moses's tokenization script:
+        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk>
+                            for unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk>
+                            for unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for German.
+
+    Returns:
+        callable: The validation reader.
+    """
+    if src_lang not in ["en", "de"]:
+        raise ValueError("Invalid source language type. "
+                         "Only en (for English) and de (for German) are "
+                         "supported.")
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/val",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def get_dict(lang, dict_size, reverse=False):
+    """
+    Return the word dictionary for the specified language.
+
+    Args:
+        lang(string): A string indicating which language is the source
+                      language. Available options are: "en" for English
+                      and "de" for German.
+        dict_size(int): Size of the specified language dictionary.
+        reverse(bool): If reverse is set to False, the returned python
+                       dictionary will use word as key and use index as value.
+                       If reverse is set to True, the returned python
+                       dictionary will use index as key and word as value.
+
+    Returns:
+        dict: The word dictionary for the specific language.
+    """
+
+    if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS)
+    else: dict_size = min(dict_size, TOTAL_DE_WORDS)
+
+    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    assert os.path.exists(dict_path), \
+        "Word dictionary does not exist. " \
+        "Please invoke paddle.dataset.wmt16.train/test/validation first " \
+        "to build the dictionary."
+    tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz")
+    return __load_dict(tar_file, dict_size, lang, reverse)
+
+
+def fetch():
+    """Download the entire dataset.
+    """
+    paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                      "wmt16.tar.gz")
+
+
+def convert(path, src_dict_size, trg_dict_size, src_lang):
+    """Converts dataset to recordio format.
+    """
+
+    paddle.v2.dataset.common.convert(
+        path,
+        train(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_train")
+    paddle.v2.dataset.common.convert(
+        path,
+        test(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_test")
+    paddle.v2.dataset.common.convert(
+        path,
+        validation(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_validation")
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..9235c41e9eb95b25a0dc53a494a203e7a4525981
--- /dev/null
+++ b/python/paddle/v2/image.py
@@ -0,0 +1,381 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains some common interfaces for image preprocess. +Many users are confused about the image layout. We introduce +the image layout as follows. + +- CHW Layout + + - The abbreviations: C=channel, H=Height, W=Width + - The default layout of image opened by cv2 or PIL is HWC. + PaddlePaddle only supports the CHW layout. And CHW is simply + a transpose of HWC. It must transpose the input image. + +- Color format: RGB or BGR + + OpenCV use BGR color format. PIL use RGB color format. Both + formats can be used for training. Noted that, the format should + be keep consistent between the training and inference peroid. +""" +import numpy as np +try: + import cv2 +except ImportError: + cv2 = None +import os +import tarfile +import cPickle + +__all__ = [ + "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop", + "random_crop", "left_right_flip", "simple_transform", "load_and_transform", + "batch_images_from_tar" +] + + +def batch_images_from_tar(data_file, + dataset_name, + img2label, + num_per_batch=1024): + """ + Read images from tar file and batch them into batch file. + + :param data_file: path of image tar file + :type data_file: string + :param dataset_name: 'train','test' or 'valid' + :type dataset_name: string + :param img2label: a dic with image file name as key + and image's label as value + :type img2label: dic + :param num_per_batch: image number per batch file + :type num_per_batch: int + :return: path of list file containing paths of batch file + :rtype: string + """ + batch_dir = data_file + "_batch" + out_path = "%s/%s" % (batch_dir, dataset_name) + meta_file = "%s/%s.txt" % (batch_dir, dataset_name) + + if os.path.exists(out_path): + return meta_file + else: + os.makedirs(out_path) + + tf = tarfile.open(data_file) + mems = tf.getmembers() + data = [] + labels = [] + file_id = 0 + for mem in mems: + if mem.name in img2label: + data.append(tf.extractfile(mem).read()) + labels.append(img2label[mem.name]) + if len(data) == num_per_batch: + output = {} + output['label'] = labels + output['data'] = data + cPickle.dump( + output, + open('%s/batch_%d' % (out_path, file_id), 'w'), + protocol=cPickle.HIGHEST_PROTOCOL) + file_id += 1 + data = [] + labels = [] + if len(data) > 0: + output = {} + output['label'] = labels + output['data'] = data + cPickle.dump( + output, + open('%s/batch_%d' % (out_path, file_id), 'w'), + protocol=cPickle.HIGHEST_PROTOCOL) + + with open(meta_file, 'a') as meta: + for file in os.listdir(out_path): + meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n") + return meta_file + + +def load_image_bytes(bytes, is_color=True): + """ + Load an color or gray image from bytes array. + + Example usage: + + .. code-block:: python + + with open('cat.jpg') as f: + im = load_image_bytes(f.read()) + + :param bytes: the input image bytes array. + :type bytes: str + :param is_color: If set is_color True, it will load and + return a color image. 
Otherwise, it will + load and return a gray image. + :type is_color: bool + """ + flag = 1 if is_color else 0 + file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8) + img = cv2.imdecode(file_bytes, flag) + return img + + +def load_image(file, is_color=True): + """ + Load an color or gray image from the file path. + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + + :param file: the input image path. + :type file: string + :param is_color: If set is_color True, it will load and + return a color image. Otherwise, it will + load and return a gray image. + :type is_color: bool + """ + # cv2.IMAGE_COLOR for OpenCV3 + # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version + # cv2.IMAGE_GRAYSCALE for OpenCV3 + # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version + # Here, use constant 1 and 0 + # 1: COLOR, 0: GRAYSCALE + flag = 1 if is_color else 0 + im = cv2.imread(file, flag) + return im + + +def resize_short(im, size): + """ + Resize an image so that the length of shorter edge is size. + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + im = resize_short(im, 256) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the shorter edge size of image after resizing. + :type size: int + """ + h, w = im.shape[:2] + h_new, w_new = size, size + if h > w: + h_new = size * h / w + else: + w_new = size * w / h + im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + return im + + +def to_chw(im, order=(2, 0, 1)): + """ + Transpose the input image order. The image layout is HWC format + opened by cv2 or PIL. Transpose the input image to CHW layout + according the order (2,0,1). + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + im = resize_short(im, 256) + im = to_chw(im) + + :param im: the input image with HWC layout. + :type im: ndarray + :param order: the transposed order. + :type order: tuple|list + """ + assert len(im.shape) == len(order) + im = im.transpose(order) + return im + + +def center_crop(im, size, is_color=True): + """ + Crop the center of image with size. + + Example usage: + + .. code-block:: python + + im = center_crop(im, 224) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the cropping size. + :type size: int + :param is_color: whether the image is color or not. + :type is_color: bool + """ + h, w = im.shape[:2] + h_start = (h - size) / 2 + w_start = (w - size) / 2 + h_end, w_end = h_start + size, w_start + size + if is_color: + im = im[h_start:h_end, w_start:w_end, :] + else: + im = im[h_start:h_end, w_start:w_end] + return im + + +def random_crop(im, size, is_color=True): + """ + Randomly crop input image with size. + + Example usage: + + .. code-block:: python + + im = random_crop(im, 224) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the cropping size. + :type size: int + :param is_color: whether the image is color or not. + :type is_color: bool + """ + h, w = im.shape[:2] + h_start = np.random.randint(0, h - size + 1) + w_start = np.random.randint(0, w - size + 1) + h_end, w_end = h_start + size, w_start + size + if is_color: + im = im[h_start:h_end, w_start:w_end, :] + else: + im = im[h_start:h_end, w_start:w_end] + return im + + +def left_right_flip(im, is_color=True): + """ + Flip an image along the horizontal direction. + Return the flipped image. + + Example usage: + + .. 
code-block:: python + + im = left_right_flip(im) + + :param im: input image with HWC layout or HW layout for gray image + :type im: ndarray + :param is_color: whether input image is color or not + :type is_color: bool + """ + if len(im.shape) == 3 and is_color: + return im[:, ::-1, :] + else: + return im[:, ::-1] + + +def simple_transform(im, + resize_size, + crop_size, + is_train, + is_color=True, + mean=None): + """ + Simply data argumentation for training. These operations include + resizing, croping and flipping. + + Example usage: + + .. code-block:: python + + im = simple_transform(im, 256, 224, True) + + :param im: The input image with HWC layout. + :type im: ndarray + :param resize_size: The shorter edge length of the resized image. + :type resize_size: int + :param crop_size: The cropping size. + :type crop_size: int + :param is_train: Whether it is training or not. + :type is_train: bool + :param is_color: whether the image is color or not. + :type is_color: bool + :param mean: the mean values, which can be element-wise mean values or + mean values per channel. + :type mean: numpy array | list + """ + im = resize_short(im, resize_size) + if is_train: + im = random_crop(im, crop_size, is_color=is_color) + if np.random.randint(2) == 0: + im = left_right_flip(im, is_color) + else: + im = center_crop(im, crop_size, is_color) + im = center_crop(im, crop_size, is_color=is_color) + if len(im.shape) == 3: + im = to_chw(im) + + im = im.astype('float32') + if mean is not None: + mean = np.array(mean, dtype=np.float32) + # mean value, may be one value per channel + if mean.ndim == 1 and is_color: + mean = mean[:, np.newaxis, np.newaxis] + elif mean.ndim == 1: + mean = mean + else: + # elementwise mean + assert len(mean.shape) == len(im) + im -= mean + + return im + + +def load_and_transform(filename, + resize_size, + crop_size, + is_train, + is_color=True, + mean=None): + """ + Load image from the input file `filename` and transform image for + data argumentation. Please refer to the `simple_transform` interface + for the transform operations. + + Example usage: + + .. code-block:: python + + im = load_and_transform('cat.jpg', 256, 224, True) + + :param filename: The file name of input image. + :type filename: string + :param resize_size: The shorter edge length of the resized image. + :type resize_size: int + :param crop_size: The cropping size. + :type crop_size: int + :param is_train: Whether it is training or not. + :type is_train: bool + :param is_color: whether the image is color or not. + :type is_color: bool + :param mean: the mean values, which can be element-wise mean values or + mean values per channel. + :type mean: numpy array | list + """ + im = load_image(filename, is_color) + im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean) + return im diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py new file mode 100644 index 0000000000000000000000000000000000000000..317cf037c69f8639e3760fbfce20565127794fcb --- /dev/null +++ b/python/paddle/v2/minibatch.py @@ -0,0 +1,41 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
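
The transform helpers above compose into the usual train-time pipeline: resize the short edge, crop, randomly flip, transpose to CHW, and subtract the mean. A sketch of doing this by hand, roughly what `simple_transform` does internally (illustrative only):

.. code-block:: python

    import numpy as np
    from paddle.v2.image import (load_image, resize_short, random_crop,
                                 left_right_flip, to_chw)

    im = load_image('cat.jpg')          # HWC, BGR, as loaded by cv2
    im = resize_short(im, 256)          # shorter edge -> 256
    im = random_crop(im, 224)           # 224x224 training crop
    if np.random.randint(2) == 0:
        im = left_right_flip(im)        # flip half of the samples
    im = to_chw(im).astype('float32')   # CHW layout for PaddlePaddle
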
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..317cf037c69f8639e3760fbfce20565127794fcb
--- /dev/null
+++ b/python/paddle/v2/minibatch.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['batch']
+
+
+def batch(reader, batch_size):
+    """
+    Create a batched reader.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param batch_size: size of each mini-batch.
+    :type batch_size: int
+    :return: the batched reader.
+    :rtype: callable
+    """
+
+    def batch_reader():
+        r = reader()
+        b = []
+        for instance in r:
+            b.append(instance)
+            if len(b) == batch_size:
+                yield b
+                b = []
+        # yield the remaining, smaller-than-batch_size tail
+        if b:
+            yield b
+
+    return batch_reader
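A quick sketch of `batch` in use; the toy reader is illustrative only:

.. code-block:: python

    from paddle.v2.minibatch import batch

    def reader():
        for i in xrange(10):
            yield i

    batched = batch(reader, batch_size=4)
    for b in batched():
        print b  # [0, 1, 2, 3], then [4, 5, 6, 7], then the tail [8, 9]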
+""" + +import decorator +from decorator import * + +import creator + +__all__ = decorator.__all__ + ['creator'] diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py new file mode 100644 index 0000000000000000000000000000000000000000..fda5246d74f598200b439774a25e80ec3e504077 --- /dev/null +++ b/python/paddle/v2/reader/creator.py @@ -0,0 +1,130 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Creator package contains some simple reader creator, which could +be used in user program. +""" + +__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader'] + + +def np_array(x): + """ + Creates a reader that yields elements of x, if it is a + numpy vector. Or rows of x, if it is a numpy matrix. + Or any sub-hyperplane indexed by the highest dimension. + + :param x: the numpy array to create reader from. + :returns: data reader created from x. + """ + + def reader(): + if x.ndim < 1: + yield x + + for e in x: + yield e + + return reader + + +def text_file(path): + """ + Creates a data reader that outputs text line by line from given text file. + Trailing new line ('\\\\n') of each line will be removed. + + :path: path of the text file. + :returns: data reader of text file + """ + + def reader(): + f = open(path, "r") + for l in f: + yield l.rstrip('\n') + f.close() + + return reader + + +def recordio(paths, buf_size=100): + """ + Creates a data reader from given RecordIO file paths separated by ",", + glob pattern is supported. + :path: path of recordio files, can be a string or a string list. + :returns: data reader of recordio files. + """ + + import recordio as rec + import paddle.v2.reader.decorator as dec + import cPickle as pickle + + def reader(): + if isinstance(paths, basestring): + path = paths + else: + path = ",".join(paths) + f = rec.reader(path) + while True: + r = f.read() + if r is None: + break + yield pickle.loads(r) + f.close() + + return dec.buffered(reader, buf_size) + + +pass_num = 0 + + +def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64): + """ + Create a data reader that yield a record one by one from + the paths: + :paths: path of recordio files, can be a string or a string list. + :etcd_endpoints: the endpoints for etcd cluster + :returns: data reader of recordio files. + + .. 
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..fda5246d74f598200b439774a25e80ec3e504077
--- /dev/null
+++ b/python/paddle/v2/reader/creator.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The creator package contains some simple reader creators that can be used
+in user programs.
+"""
+
+__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
+
+
+def np_array(x):
+    """
+    Creates a reader that yields the elements of x if it is a numpy
+    vector, or the rows of x if it is a numpy matrix, or, in general,
+    the sub-arrays along the first dimension.
+
+    :param x: the numpy array to create the reader from.
+    :returns: data reader created from x.
+    """
+
+    def reader():
+        if x.ndim < 1:
+            yield x
+            # a zero-dimensional array is not iterable, so stop here
+            return
+
+        for e in x:
+            yield e
+
+    return reader
+
+
+def text_file(path):
+    """
+    Creates a data reader that outputs text line by line from the given
+    text file. The trailing new line ('\\n') of each line will be removed.
+
+    :param path: path of the text file.
+    :returns: data reader of the text file.
+    """
+
+    def reader():
+        with open(path, "r") as f:
+            for l in f:
+                yield l.rstrip('\n')
+
+    return reader
+
+
+def recordio(paths, buf_size=100):
+    """
+    Creates a data reader from the given RecordIO file paths, separated
+    by ",". Glob patterns are supported.
+
+    :param paths: path of RecordIO files, can be a string or a string list.
+    :param buf_size: size of the internal buffer.
+    :returns: data reader of the RecordIO files.
+    """
+
+    import recordio as rec
+    import paddle.v2.reader.decorator as dec
+    import cPickle as pickle
+
+    def reader():
+        if isinstance(paths, basestring):
+            path = paths
+        else:
+            path = ",".join(paths)
+        f = rec.reader(path)
+        while True:
+            r = f.read()
+            if r is None:
+                break
+            yield pickle.loads(r)
+        f.close()
+
+    return dec.buffered(reader, buf_size)
+
+
+pass_num = 0
+
+
+def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
+    """
+    Create a data reader that yields records one by one from the given
+    paths.
+
+    :param paths: path of RecordIO files, can be a string or a string list.
+    :param etcd_endpoints: the endpoints for the etcd cluster.
+    :returns: data reader of the RecordIO files.
+
+    .. code-block:: python
+
+        from paddle.v2.reader.creator import cloud_reader
+        etcd_endpoints = "http://127.0.0.1:2379"
+        trainer.train(
+            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"],
+                                etcd_endpoints),
+        )
+    """
+    import os
+    import cPickle as pickle
+    import paddle.v2.master as master
+    c = master.client(etcd_endpoints, timeout_sec, buf_size)
+
+    if isinstance(paths, basestring):
+        path = [paths]
+    else:
+        path = paths
+    c.set_dataset(path)
+
+    def reader():
+        global pass_num
+        c.paddle_start_get_records(pass_num)
+        pass_num += 1
+
+        while True:
+            r, e = c.next_record()
+            if not r:
+                if e != -2:
+                    print "get record error: ", e
+                break
+            yield pickle.loads(r)
+
+    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
new file mode 100644
index 0000000000000000000000000000000000000000..44a6e344630bb35d28ee29078bf8727053a24bef
--- /dev/null
+++ b/python/paddle/v2/reader/decorator.py
@@ -0,0 +1,405 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
+]
+
+from threading import Thread
+import subprocess
+
+from Queue import Queue
+import itertools
+import random
+import zlib
+
+
+def map_readers(func, *readers):
+    """
+    Creates a data reader that outputs the return value of the given
+    function, using the outputs of the input data readers as its
+    arguments.
+
+    :param func: the function to use. Its type should be (Sample) => Sample.
+    :type func: callable
+    :param readers: readers whose outputs will be used as arguments of func.
+    :return: the created data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        for e in itertools.imap(func, *rs):
+            yield e
+
+    return reader
+
+
+def shuffle(reader, buf_size):
+    """
+    Creates a data reader whose output is shuffled.
+
+    Output from the iterator created by the original reader is buffered
+    into a shuffle buffer and then shuffled. The size of the shuffle
+    buffer is determined by the argument buf_size.
+
+    :param reader: the original reader whose output will be shuffled.
+    :type reader: callable
+    :param buf_size: shuffle buffer size.
+    :type buf_size: int
+
+    :return: the new reader whose output is shuffled.
+    :rtype: callable
+    """
+
+    def data_reader():
+        buf = []
+        for e in reader():
+            buf.append(e)
+            if len(buf) >= buf_size:
+                random.shuffle(buf)
+                for b in buf:
+                    yield b
+                buf = []
+
+        if len(buf) > 0:
+            random.shuffle(buf)
+            for b in buf:
+                yield b
+
+    return data_reader
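A small map_readers sketch (toy vocabulary and reader, illustrative only):

.. code-block:: python

    import paddle.v2.reader as reader

    vocab = {'hello': 0, 'world': 1}

    def words():
        yield 'hello'
        yield 'world'

    ids = reader.map_readers(lambda w: vocab[w], words)
    for i in ids():
        print i  # 0, then 1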
+def chain(*readers):
+    """
+    Creates a data reader whose output is the concatenation of the
+    outputs of the input data readers.
+
+    If the input readers output the following data entries:
+
+        [0, 0, 0], [1, 1, 1], [2, 2, 2]
+
+    the chained reader will output:
+
+        [0, 0, 0, 1, 1, 1, 2, 2, 2]
+
+    :param readers: input readers.
+    :return: the new data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+
+        for e in itertools.chain(*rs):
+            yield e
+
+    return reader
+
+
+class ComposeNotAligned(ValueError):
+    pass
+
+
+def compose(*readers, **kwargs):
+    """
+    Creates a data reader whose output is the combination of the input
+    readers.
+
+    If the input readers output the following data entries:
+
+        (1, 2)    3    (4, 5)
+
+    the composed reader will output:
+
+        (1, 2, 3, 4, 5)
+
+    :param readers: readers that will be composed together.
+    :param check_alignment: if True, will check if input readers are aligned
+        correctly. If False, will not check alignment and trailing outputs
+        will be discarded. Defaults to True.
+    :type check_alignment: bool
+
+    :return: the new data reader.
+
+    :raises ComposeNotAligned: outputs of readers are not aligned. This is
+        not raised when check_alignment is set to False.
+    """
+    check_alignment = kwargs.pop('check_alignment', True)
+
+    def make_tuple(x):
+        if isinstance(x, tuple):
+            return x
+        else:
+            return (x, )
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        if not check_alignment:
+            for outputs in itertools.izip(*rs):
+                yield sum(map(make_tuple, outputs), ())
+        else:
+            for outputs in itertools.izip_longest(*rs):
+                for o in outputs:
+                    if o is None:
+                        # None will not be present if the readers are aligned
+                        raise ComposeNotAligned(
+                            "outputs of readers are not aligned.")
+                yield sum(map(make_tuple, outputs), ())
+
+    return reader
+
+
+def buffered(reader, size):
+    """
+    Creates a buffered data reader.
+
+    The buffered data reader will read and save data entries into a
+    buffer. Reading from the buffered data reader will proceed as long
+    as the buffer is not empty.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param size: max buffer size.
+    :type size: int
+
+    :returns: the buffered data reader.
+    """
+
+    class EndSignal():
+        pass
+
+    end = EndSignal()
+
+    def read_worker(r, q):
+        for d in r:
+            q.put(d)
+        q.put(end)
+
+    def data_reader():
+        r = reader()
+        q = Queue(maxsize=size)
+        t = Thread(
+            target=read_worker, args=(
+                r,
+                q, ))
+        t.daemon = True
+        t.start()
+        e = q.get()
+        while e != end:
+            yield e
+            e = q.get()
+
+    return data_reader
+
+
+def firstn(reader, n):
+    """
+    Limit the max number of samples that the reader can return.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param n: the max number of samples to return.
+    :type n: int
+    :return: the decorated reader.
+    :rtype: callable
+    """
+
+    # TODO(yuyang18): Check whether dropping the reader would release
+    # the resources it has opened.
+
+    def firstn_reader():
+        for i, item in enumerate(reader()):
+            if i == n:
+                break
+            yield item
+
+    return firstn_reader
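How these decorators combine, as a sketch with toy readers (illustrative only):

.. code-block:: python

    import paddle.v2.reader as reader

    def nums():
        for i in range(3):
            yield i

    def pairs():
        for i in range(3):
            yield (i, i * i)

    combined = reader.compose(nums, pairs)          # (0, 0, 0), (1, 1, 1), (2, 2, 4)
    prefetched = reader.buffered(combined, size=8)  # background-thread prefetch
    head = reader.firstn(prefetched, 2)             # keep only the first two samples
    for e in head():
        print e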
+class XmapEndSignal():
+    pass
+
+
+def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
+    """
+    Use multiple worker threads to map samples from the reader with a
+    user-defined mapper. This function also buffers the input and output
+    queues internally.
+
+    :param mapper: a function to map a sample.
+    :type mapper: callable
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param process_num: the number of worker threads that apply the mapper.
+    :type process_num: int
+    :param buffer_size: max buffer size.
+    :type buffer_size: int
+    :param order: whether to keep the order of the reader.
+    :type order: bool
+    :return: the decorated reader.
+    :rtype: callable
+    """
+    end = XmapEndSignal()
+
+    # define a worker to read samples from reader to in_queue
+    def read_worker(reader, in_queue):
+        for i in reader():
+            in_queue.put(i)
+        in_queue.put(end)
+
+    # define a worker to read samples from reader to in_queue with order flag
+    def order_read_worker(reader, in_queue):
+        in_order = 0
+        for i in reader():
+            in_queue.put((in_order, i))
+            in_order += 1
+        in_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue
+    def handle_worker(in_queue, out_queue, mapper):
+        sample = in_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            r = mapper(sample)
+            out_queue.put(r)
+            sample = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue in order
+    def order_handle_worker(in_queue, out_queue, mapper, out_order):
+        ins = in_queue.get()
+        while not isinstance(ins, XmapEndSignal):
+            order, sample = ins
+            r = mapper(sample)
+            # busy-wait until it is this sample's turn to be emitted
+            while order != out_order[0]:
+                pass
+            out_queue.put(r)
+            out_order[0] += 1
+            ins = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    def xreader():
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+        # start several handle workers
+        target = order_handle_worker if order else handle_worker
+        args = (in_queue, out_queue, mapper, out_order) if order else (
+            in_queue, out_queue, mapper)
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(target=target, args=args)
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
+        sample = out_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            yield sample
+            sample = out_queue.get()
+        finish = 1
+        while finish < process_num:
+            sample = out_queue.get()
+            if isinstance(sample, XmapEndSignal):
+                finish += 1
+            else:
+                yield sample
+
+    return xreader
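A usage sketch for xmap_readers (toy mapper and reader; note the workers are threads, despite the parameter name):

.. code-block:: python

    import paddle.v2.reader as reader

    def nums():
        for i in range(10):
            yield i

    squared = reader.xmap_readers(
        lambda x: x * x,  # mapper run by the worker threads
        nums,
        process_num=4,
        buffer_size=16,
        order=True)  # preserve the input order in the output
    for v in squared():
        print v  # 0, 1, 4, 9, ... in the original order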
+
+
+def _buf2lines(buf, line_break="\n"):
+    # FIXME: line_break should be automatically configured.
+    lines = buf.split(line_break)
+    return lines[:-1], lines[-1]
+
+
+class PipeReader:
+    """
+    PipeReader reads data streamed from a command: it takes the command's
+    stdout, pipes it into a buffer, and redirects it to the parser to
+    parse, then yields data in the desired format.
+
+    You can use a standard Linux command, or call another program to read
+    data from HDFS, Ceph, a URL, AWS S3, and so on:
+
+    .. code-block:: python
+
+        cmd = "hadoop fs -cat /path/to/some/file"
+        cmd = "cat sample_file.tar.gz"
+        cmd = "curl http://someurl"
+        cmd = "python print_s3_bucket.py"
+
+    An example:
+
+    .. code-block:: python
+
+        def example_reader():
+            for f in myfiles:
+                pr = PipeReader("cat %s" % f)
+                for l in pr.get_line():
+                    sample = l.split(" ")
+                    yield sample
+    """
+
+    def __init__(self, command, bufsize=8192, file_type="plain"):
+        if not isinstance(command, str):
+            raise TypeError("command must be a string")
+        if file_type == "gzip":
+            self.dec = zlib.decompressobj(
+                32 + zlib.MAX_WBITS)  # offset 32 to skip the gzip header
+        self.file_type = file_type
+        self.bufsize = bufsize
+        self.process = subprocess.Popen(
+            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+
+    def get_line(self, cut_lines=True, line_break="\n"):
+        """
+        :param cut_lines: cut the buffer into lines.
+        :type cut_lines: bool
+        :param line_break: line break of the file, like '\\n' or '\\r'.
+        :type line_break: string
+
+        :return: one line, or a buffer of bytes.
+        :rtype: string
+        """
+        remained = ""
+        while True:
+            buff = self.process.stdout.read(self.bufsize)
+            if buff:
+                if self.file_type == "gzip":
+                    decomp_buff = self.dec.decompress(buff)
+                elif self.file_type == "plain":
+                    decomp_buff = buff
+                else:
+                    raise TypeError("file_type %s is not allowed" %
+                                    self.file_type)
+
+                if cut_lines:
+                    lines, remained = _buf2lines(''.join(
+                        [remained, decomp_buff]), line_break)
+                    for line in lines:
+                        yield line
+                else:
+                    yield decomp_buff
+            else:
+                break
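A short PipeReader sketch; 'sample.txt.gz' is a placeholder file and the command is assumed to be available on the host:

.. code-block:: python

    import paddle.v2.reader as reader

    pr = reader.PipeReader("cat sample.txt.gz", file_type="gzip")
    for line in pr.get_line():
        print line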
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..107d5912e1567e0c8721987a281272c7feb51e63
--- /dev/null
+++ b/python/paddle/v2/reader/tests/CMakeLists.txt
@@ -0,0 +1,2 @@
+py_test(creator_test SRCS creator_test.py)
+py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eca2dce114b069bf9b455d77ce670d73b5047fd2
--- /dev/null
+++ b/python/paddle/v2/reader/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fe374e663607607cd0839eb6ca9c70c4d15eef8
--- /dev/null
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import paddle.v2.reader.creator
+
+
+class TestNumpyArray(unittest.TestCase):
+    def test_numpy_array(self):
+        l = [[1, 2, 3], [4, 5, 6]]
+        x = np.array(l, np.int32)
+        reader = paddle.v2.reader.creator.np_array(x)
+        for idx, e in enumerate(reader()):
+            self.assertItemsEqual(e, l[idx])
+
+
+class TestTextFile(unittest.TestCase):
+    def test_text_file(self):
+        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
+        reader = paddle.v2.reader.creator.text_file(path)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
+
+
+class TestRecordIO(unittest.TestCase):
+    def do_test(self, path):
+        reader = paddle.v2.reader.creator.recordio(path)
+        idx = 0
+        for e in reader():
+            if idx == 0:
+                self.assertEqual(e, (1, 2, 3))
+            elif idx == 1:
+                self.assertEqual(e, (4, 5, 6))
+            idx += 1
+        self.assertEqual(idx, 2)
+
+    def test_recordIO(self):
+        self.do_test(
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat"))
+        self.do_test([
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat")
+        ])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b680e39f3fb299a14e7d8162470996d1d16b83d
--- /dev/null
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -0,0 +1,178 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import unittest
+
+import paddle.v2.reader
+
+
+def reader_creator_10(dur):
+    def reader():
+        for i in range(10):
+            # this sleep helps test paddle.v2.reader.buffered
+            time.sleep(dur)
+            yield i
+
+    return reader
+
+
+class TestMap(unittest.TestCase):
+    def test_map(self):
+        d = {"h": 0, "i": 1}
+
+        def tokenize(x):
+            return d[x]
+
+        def read():
+            yield "h"
+            yield "i"
+
+        r = paddle.v2.reader.map_readers(tokenize, read)
+        for i, e in enumerate(r()):
+            self.assertEqual(e, i)
+
+
+class TestBuffered(unittest.TestCase):
+    def test_read(self):
+        for size in range(20):
+            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
+            c = 0
+            for i in b():
+                self.assertEqual(i, c)
+                c += 1
+            self.assertEqual(c, 10)
+
+    def test_buffering(self):
+        # reads have a 30ms delay.
+        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
+        last_time = time.time()
+        for idx, i in enumerate(b()):
+            elapsed_time = time.time() - last_time
+            if i == 0:
+                time.sleep(0.3)
+            else:
+                # read time should be short, meaning it is already buffered.
+                self.assertLess(elapsed_time, 0.05)
+            last_time = time.time()
+
+
+class TestCompose(unittest.TestCase):
+    def test_compose(self):
+        reader = paddle.v2.reader.compose(
+            reader_creator_10(0), reader_creator_10(0))
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, (idx, idx))
+
+    def test_compose_not_aligned(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0))
+        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
+            for e in reader():
+                total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+    def test_compose_not_aligned_no_check(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0),
+            check_alignment=False)
+        for e in reader():
+            total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+
+class TestChain(unittest.TestCase):
+    def test_chain(self):
+        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
+        idx = 0
+        for e in c():
+            self.assertEqual(e, idx % 10)
+            idx += 1
+        self.assertEqual(idx, 20)
+
+
+class TestShuffle(unittest.TestCase):
+    def test_shuffle(self):
+        case = [(0, True), (1, True), (10, False), (100, False)]
+        a = reader_creator_10(0)
+        for size, checkEq in case:
+            s = paddle.v2.reader.shuffle(a, size)
+            total = 0
+            for idx, e in enumerate(s()):
+                if checkEq:
+                    self.assertEqual(idx, e)
+                total += 1
+            self.assertEqual(total, 10)
+
+
+class TestXmap(unittest.TestCase):
+    def test_xmap(self):
+        def mapper(x):
+            return (x + 1)
+
+        orders = (True, False)
+        thread_nums = (1, 2, 4, 8, 16)
+        buffered_size = (1, 2, 4, 8, 16)
+        for order in orders:
+            for tNum in thread_nums:
+                for size in buffered_size:
+                    reader = paddle.v2.reader.xmap_readers(
+                        mapper, reader_creator_10(0), tNum, size, order)
+                    for n in xrange(3):
+                        result = []
+                        for i in reader():
+                            result.append(i)
+                        if not order:
+                            result.sort()
+                        for idx, e in enumerate(result):
+                            self.assertEqual(e, mapper(idx))
+
+
+class TestPipeReader(unittest.TestCase):
+    def test_pipe_reader(self):
+        def example_reader(myfiles):
+            for f in myfiles:
+                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
+                for l in pr.get_line():
+                    yield l
+
+        import tempfile
+
+        records = [str(i) for i in xrange(5)]
+        temp = tempfile.NamedTemporaryFile()
+        try:
+            with open(temp.name, 'w') as f:
+                for r in records:
+                    f.write('%s\n' % r)
+
+            result = []
+            for r in example_reader([temp.name]):
+                result.append(r)
+
+            for idx, e in enumerate(records):
+                self.assertEqual(e, result[idx])
+        finally:
+            # delete the temporary file
+            temp.close()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a2a8d47d43868d369083808497697da79e620e31
--- /dev/null
+++ b/python/paddle/v2/reader/tests/test_data_creator.txt
@@ -0,0 +1,3 @@
+0 1
+2 3
+4 5
diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat
new file mode 100644
index 0000000000000000000000000000000000000000..a99a35bb829e066c4845d0b85b96cd1eb3a12491
Binary files /dev/null and b/python/paddle/v2/reader/tests/test_reader_recordio.dat differ
diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat
new file mode 100644
index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69
Binary files /dev/null and b/python/paddle/v2/reader/tests/test_recordio_creator.dat differ
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
index 46e4feb8e1ce1d12f214f5c49b1b589a46110603..b4333ed530ce464095ec38d72706949cc464fbe4 100644
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -1,4 +1,5 @@
 py_test(test_op SRCS test_op.py)
+py_test(test_image SRCS test_image.py)
 py_test(test_layer SRCS test_layer.py)
 py_test(test_topology SRCS test_topology.py)
 py_test(test_rnn_layer SRCS test_rnn_layer.py)
diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/v2/tests/cat.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bc1fbbd371216b9904b522ed302700c79d2e4876
Binary files /dev/null and b/python/paddle/v2/tests/cat.jpg differ
diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..c78bbdc40a25878b21ba7e678afedf9d8f0a87cf
--- /dev/null
+++ b/python/paddle/v2/tests/test_image.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.v2.image as image
+
+
+class Image(unittest.TestCase):
+    def test_resize_flip_chw(self):
+        # resize
+        im = image.load_image('cat.jpg')
+        im = image.resize_short(im, 256)
+        self.assertEqual(256, min(im.shape[:2]))
+        self.assertEqual(3, im.shape[2])
+
+        # flip: left_right_flip should match numpy's flip along axis 1
+        im2 = np.flip(im, 1)
+        im = image.left_right_flip(im)
+        self.assertTrue(np.array_equal(im, im2))
+
+        # to_chw
+        h, w, c = im.shape
+        im = image.to_chw(im)
+        self.assertEqual(c, im.shape[0])
+        self.assertEqual(h, im.shape[1])
+        self.assertEqual(w, im.shape[2])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py
index 8320217da2795da756cf12a80f39279182789eef..264442be182ea69c95b39b3bdb4c389d52eff66e 100644
--- a/python/paddle/v2/tests/test_paramconf_order.py
+++ b/python/paddle/v2/tests/test_paramconf_order.py
@@ -27,7 +27,6 @@
 # limitations under the License.
 import unittest
 import math
-import paddle.dataset as dataset
 import paddle.v2 as paddle
 
 
@@ -41,7 +40,7 @@ def wordemb(inlayer):
 
 
 def train():
-    word_dict = dataset.imikolov.build_dict()
+    word_dict = paddle.dataset.imikolov.build_dict()
     dict_size = len(word_dict)
     # Every layer takes integer value of range [0, dict_size)
     firstword = paddle.layer.data(
diff --git a/python/setup.py.in b/python/setup.py.in
index d73a3a6a1c41b87efb9600ac59983bd16547ec6a..08a448934d3248b46618acdef9e1894f94a93893 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -77,6 +77,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
              'paddle.v2',
              'paddle.v2.master',
              'paddle.v2.plot',
+             'paddle.v2.reader',
+             'paddle.v2.dataset',
              'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: