From f837eee724824a7f06025152400e70a1b9a2be53 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Mon, 2 Apr 2018 13:21:03 -0700 Subject: [PATCH] add paddle.v2.reader,dataset back for backward compatibility --- python/CMakeLists.txt | 1 + python/paddle/dataset/__init__.py | 2 +- python/paddle/v2/__init__.py | 8 + python/paddle/v2/dataset/__init__.py | 46 ++ python/paddle/v2/dataset/cifar.py | 139 ++++++ python/paddle/v2/dataset/common.py | 236 ++++++++++ python/paddle/v2/dataset/conll05.py | 257 +++++++++++ python/paddle/v2/dataset/flowers.py | 199 +++++++++ python/paddle/v2/dataset/imdb.py | 148 +++++++ python/paddle/v2/dataset/imikolov.py | 161 +++++++ python/paddle/v2/dataset/mnist.py | 123 ++++++ python/paddle/v2/dataset/movielens.py | 262 +++++++++++ python/paddle/v2/dataset/mq2007.py | 333 ++++++++++++++ python/paddle/v2/dataset/sentiment.py | 141 ++++++ python/paddle/v2/dataset/tests/cifar_test.py | 56 +++ python/paddle/v2/dataset/tests/common_test.py | 94 ++++ .../paddle/v2/dataset/tests/flowers_test.py | 51 +++ python/paddle/v2/dataset/tests/imdb_test.py | 57 +++ .../paddle/v2/dataset/tests/imikolov_test.py | 67 +++ python/paddle/v2/dataset/tests/mnist_test.py | 44 ++ python/paddle/v2/dataset/tests/mq2007_test.py | 33 ++ .../paddle/v2/dataset/tests/test_sentiment.py | 55 +++ .../paddle/v2/dataset/tests/voc2012_test.py | 42 ++ python/paddle/v2/dataset/tests/wmt16_test.py | 66 +++ python/paddle/v2/dataset/uci_housing.py | 134 ++++++ python/paddle/v2/dataset/voc2012.py | 85 ++++ python/paddle/v2/dataset/wmt14.py | 182 ++++++++ python/paddle/v2/dataset/wmt16.py | 349 +++++++++++++++ python/paddle/v2/image.py | 381 ++++++++++++++++ python/paddle/v2/minibatch.py | 41 ++ python/paddle/v2/reader/__init__.py | 74 ++++ python/paddle/v2/reader/creator.py | 130 ++++++ python/paddle/v2/reader/decorator.py | 405 ++++++++++++++++++ python/paddle/v2/reader/tests/CMakeLists.txt | 2 + python/paddle/v2/reader/tests/__init__.py | 13 + python/paddle/v2/reader/tests/creator_test.py | 74 ++++ .../paddle/v2/reader/tests/decorator_test.py | 178 ++++++++ .../v2/reader/tests/test_data_creator.txt | 3 + .../v2/reader/tests/test_reader_recordio.dat | Bin 0 -> 76 bytes .../v2/reader/tests/test_recordio_creator.dat | Bin 0 -> 88 bytes python/paddle/v2/tests/CMakeLists.txt | 1 + python/paddle/v2/tests/cat.jpg | Bin 0 -> 57218 bytes python/paddle/v2/tests/test_image.py | 43 ++ .../paddle/v2/tests/test_paramconf_order.py | 3 +- python/setup.py.in | 2 + 45 files changed, 4718 insertions(+), 3 deletions(-) create mode 100644 python/paddle/v2/dataset/__init__.py create mode 100644 python/paddle/v2/dataset/cifar.py create mode 100644 python/paddle/v2/dataset/common.py create mode 100644 python/paddle/v2/dataset/conll05.py create mode 100644 python/paddle/v2/dataset/flowers.py create mode 100644 python/paddle/v2/dataset/imdb.py create mode 100644 python/paddle/v2/dataset/imikolov.py create mode 100644 python/paddle/v2/dataset/mnist.py create mode 100644 python/paddle/v2/dataset/movielens.py create mode 100644 python/paddle/v2/dataset/mq2007.py create mode 100644 python/paddle/v2/dataset/sentiment.py create mode 100644 python/paddle/v2/dataset/tests/cifar_test.py create mode 100644 python/paddle/v2/dataset/tests/common_test.py create mode 100644 python/paddle/v2/dataset/tests/flowers_test.py create mode 100644 python/paddle/v2/dataset/tests/imdb_test.py create mode 100644 python/paddle/v2/dataset/tests/imikolov_test.py create mode 100644 python/paddle/v2/dataset/tests/mnist_test.py create mode 100644 
python/paddle/v2/dataset/tests/mq2007_test.py create mode 100644 python/paddle/v2/dataset/tests/test_sentiment.py create mode 100644 python/paddle/v2/dataset/tests/voc2012_test.py create mode 100644 python/paddle/v2/dataset/tests/wmt16_test.py create mode 100644 python/paddle/v2/dataset/uci_housing.py create mode 100644 python/paddle/v2/dataset/voc2012.py create mode 100644 python/paddle/v2/dataset/wmt14.py create mode 100644 python/paddle/v2/dataset/wmt16.py create mode 100644 python/paddle/v2/image.py create mode 100644 python/paddle/v2/minibatch.py create mode 100644 python/paddle/v2/reader/__init__.py create mode 100644 python/paddle/v2/reader/creator.py create mode 100644 python/paddle/v2/reader/decorator.py create mode 100644 python/paddle/v2/reader/tests/CMakeLists.txt create mode 100644 python/paddle/v2/reader/tests/__init__.py create mode 100644 python/paddle/v2/reader/tests/creator_test.py create mode 100644 python/paddle/v2/reader/tests/decorator_test.py create mode 100644 python/paddle/v2/reader/tests/test_data_creator.txt create mode 100644 python/paddle/v2/reader/tests/test_reader_recordio.dat create mode 100644 python/paddle/v2/reader/tests/test_recordio_creator.dat create mode 100644 python/paddle/v2/tests/cat.jpg create mode 100644 python/paddle/v2/tests/test_image.py diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index f5ae553c857..d074b0136d7 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -81,6 +81,7 @@ if (WITH_TESTING) # enable v2 API unittest only when paddle swig api is compiled add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/plot/tests) + add_subdirectory(paddle/v2/reader/tests) endif() endif() add_subdirectory(paddle/fluid/tests) diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 1fdfd49f1c9..3315e826e82 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -37,7 +37,7 @@ __all__ = [ 'cifar', 'movielens', 'conll05', - 'sentiment' + 'sentiment', 'uci_housing', 'wmt14', 'wmt16', diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 02b0d077eef..df710c33d0c 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -22,13 +22,17 @@ import data_type import topology import networks import evaluator +from . import dataset +from . import reader from . import plot import attr import op import pooling import inference import networks +import minibatch import plot +import image import paddle.trainer.config_parser as cp __all__ = [ @@ -44,11 +48,14 @@ __all__ = [ 'data_type', 'attr', 'pooling', + 'dataset', + 'reader', 'topology', 'networks', 'infer', 'plot', 'evaluator', + 'image', 'master', ] @@ -146,3 +153,4 @@ def init(**kwargs): infer = inference.infer +batch = minibatch.batch diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py new file mode 100644 index 00000000000..c1acbecd9c3 --- /dev/null +++ b/python/paddle/v2/dataset/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Dataset package. +""" + +import mnist +import imikolov +import imdb +import cifar +import movielens +import conll05 +import uci_housing +import sentiment +import wmt14 +import wmt16 +import mq2007 +import flowers +import voc2012 + +__all__ = [ + 'mnist', + 'imikolov', + 'imdb', + 'cifar', + 'movielens', + 'conll05', + 'sentiment', + 'uci_housing', + 'wmt14', + 'wmt16', + 'mq2007', + 'flowers', + 'voc2012', +] diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py new file mode 100644 index 00000000000..0a2a1ced11e --- /dev/null +++ b/python/paddle/v2/dataset/cifar.py @@ -0,0 +1,139 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +CIFAR dataset. + +This module will download dataset from +https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into +paddle reader creators. + +The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, +with 6000 images per class. There are 50000 training images and 10000 test +images. + +The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes +containing 600 images each. There are 500 training images and 100 testing +images per class. + +""" + +import cPickle +import itertools +import numpy +import paddle.v2.dataset.common +import tarfile + +__all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] + +URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/' +CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz' +CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a' +CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz' +CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' + + +def reader_creator(filename, sub_name): + def read_batch(batch): + data = batch['data'] + labels = batch.get('labels', batch.get('fine_labels', None)) + assert labels is not None + for sample, label in itertools.izip(data, labels): + yield (sample / 255.0).astype(numpy.float32), int(label) + + def reader(): + with tarfile.open(filename, mode='r') as f: + names = (each_item.name for each_item in f + if sub_name in each_item.name) + + for name in names: + batch = cPickle.load(f.extractfile(name)) + for item in read_batch(batch): + yield item + + return reader + + +def train100(): + """ + CIFAR-100 training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 99]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'train') + + +def test100(): + """ + CIFAR-100 test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 99]. + + :return: Test reader creator.
+ :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'test') + + +def train10(): + """ + CIFAR-10 training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'data_batch') + + +def test10(): + """ + CIFAR-10 test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'test_batch') + + +def fetch(): + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100") + paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100") + paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10") + paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py new file mode 100644 index 00000000000..c6ff09a1d1e --- /dev/null +++ b/python/paddle/v2/dataset/common.py @@ -0,0 +1,236 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +import hashlib +import os +import errno +import shutil +import sys +import importlib +import paddle.v2.dataset +import cPickle +import glob +import cPickle as pickle + +__all__ = [ + 'DATA_HOME', + 'download', + 'md5file', + 'split', + 'cluster_files_reader', + 'convert', +] + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') + + +# When running unit tests, there could be multiple processes that +# trying to create DATA_HOME directory simultaneously, so we cannot +# use a if condition to check for the existence of the directory; +# instead, we use the filesystem as the synchronization mechanism by +# catching returned errors. 
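+# (Editor's note, an illustrative sketch of the racy variant this comment
+# warns against:
+#     if not os.path.exists(path):
+#         os.makedirs(path)
+# a second process can create the directory between the check and the
+# mkdir and make it fail; catching EEXIST below keeps the call race-safe.)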
+def must_mkdirs(path): + try: + os.makedirs(path) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + + +must_mkdirs(DATA_HOME) + + +def md5file(fname): + hash_md5 = hashlib.md5() + f = open(fname, "rb") + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + f.close() + return hash_md5.hexdigest() + + +def download(url, module_name, md5sum, save_name=None): + dirname = os.path.join(DATA_HOME, module_name) + if not os.path.exists(dirname): + os.makedirs(dirname) + + filename = os.path.join(dirname, + url.split('/')[-1] + if save_name is None else save_name) + + retry = 0 + retry_limit = 3 + while not (os.path.exists(filename) and md5file(filename) == md5sum): + if os.path.exists(filename): + print "file md5", md5file(filename), md5sum + if retry < retry_limit: + retry += 1 + else: + raise RuntimeError("Cannot download {0} within retry limit {1}". + format(url, retry_limit)) + print "Cache file %s not found, downloading %s" % (filename, url) + r = requests.get(url, stream=True) + total_length = r.headers.get('content-length') + + if total_length is None: + with open(filename, 'wb') as f: + shutil.copyfileobj(r.raw, f) + else: + with open(filename, 'wb') as f: + dl = 0 + total_length = int(total_length) + for data in r.iter_content(chunk_size=4096): + dl += len(data) + f.write(data) + done = int(50 * dl / total_length) + sys.stdout.write("\r[%s%s]" % ('=' * done, + ' ' * (50 - done))) + sys.stdout.flush() + + return filename + + +def fetch_all(): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "fetch" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)): + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "fetch")() + + +def fetch_all_recordio(path): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "convert" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ + not module_name == "common": + ds_path = os.path.join(path, module_name) + must_mkdirs(ds_path) + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "convert")(ds_path) + + +def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): + """ + you can call the function as: + + split(paddle.v2.dataset.cifar.train10(), line_count=1000, + suffix="imikolov-train-%05d.pickle") + + the output files as: + + |-imikolov-train-00000.pickle + |-imikolov-train-00001.pickle + |- ... + |-imikolov-train-00480.pickle + + :param reader: is a reader creator + :param line_count: line count for each file + :param suffix: the suffix for the output files, should contain "%d" + means the id for each file. Default is "%05d.pickle" + :param dumper: is a callable function that dump object to file, this + function will be called as dumper(obj, f) and obj is the object + will be dumped, f is a file object. Default is cPickle.dump.
+ """ + if not callable(dumper): + raise TypeError("dumper should be callable.") + lines = [] + indx_f = 0 + for i, d in enumerate(reader()): + lines.append(d) + if i >= line_count and i % line_count == 0: + with open(suffix % indx_f, "w") as f: + dumper(lines, f) + lines = [] + indx_f += 1 + if lines: + with open(suffix % indx_f, "w") as f: + dumper(lines, f) + + +def cluster_files_reader(files_pattern, + trainer_count, + trainer_id, + loader=cPickle.load): + """ + Create a reader that yields elements from the given files, selecting + a file set according to trainer_count and trainer_id + + :param files_pattern: the files generated by split(...) + :param trainer_count: total trainer count + :param trainer_id: the trainer rank id + :param loader: is a callable function that load object from file, this + function will be called as loader(f) and f is a file object. + Default is cPickle.load + """ + + def reader(): + if not callable(loader): + raise TypeError("loader should be callable.") + file_list = glob.glob(files_pattern) + file_list.sort() + my_file_list = [] + for idx, fn in enumerate(file_list): + if idx % trainer_count == trainer_id: + print "append file: %s" % fn + my_file_list.append(fn) + for fn in my_file_list: + with open(fn, "r") as f: + lines = loader(f) + for line in lines: + yield line + + return reader + + +def convert(output_path, reader, line_count, name_prefix): + """ + Convert data from reader to recordio format files. + + :param output_path: directory in which output files will be saved. + :param reader: a data reader, from which the convert program will read + data instances. + :param line_count: the number of instances to be written in each + output file. + :param name_prefix: the name prefix of generated files. + """ + import recordio + + assert line_count >= 1 + indx_f = 0 + + def write_data(indx_f, lines): + filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) + writer = recordio.writer(filename) + for l in lines: + # FIXME(Yancey1989): + # dumps with protocol: pickle.HIGHEST_PROTOCOL + writer.write(cPickle.dumps(l)) + writer.close() + + lines = [] + for i, d in enumerate(reader()): + lines.append(d) + if i % line_count == 0 and i >= line_count: + write_data(indx_f, lines) + lines = [] + indx_f += 1 + + write_data(indx_f, lines) diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py new file mode 100644 index 00000000000..0d544efac9c --- /dev/null +++ b/python/paddle/v2/dataset/conll05.py @@ -0,0 +1,257 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Conll05 dataset. +Paddle semantic role labeling Book and demo use this dataset as an example. +Because Conll05 is not freely available to the public, the default download +URL is the test set of Conll05 (which is public). Users can change the URL +and MD5 to their own Conll dataset. A pre-trained word vector model based on +the Wikipedia corpus is used to initialize the SRL model.
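+
+A minimal usage sketch (editor's illustration; it only uses the public
+names defined in this module):
+
+    import paddle.v2.dataset.conll05 as conll05
+
+    word_dict, verb_dict, label_dict = conll05.get_dict()
+    emb_path = conll05.get_embedding()
+    for sample in conll05.test()():
+        # nine slots per sample: word ids, five context windows,
+        # predicate, mark sequence and label ids
+        pass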
+""" + +import tarfile +import gzip +import itertools +import paddle.v2.dataset.common + +__all__ = ['test, get_dict', 'get_embedding', 'convert'] + +DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' +DATA_MD5 = '387719152ae52d60422c016e92a742fc' +WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' +VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' +TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' +EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' + +UNK_IDX = 0 + + +def load_label_dict(filename): + d = dict() + tag_dict = set() + with open(filename, 'r') as f: + for i, line in enumerate(f): + line = line.strip() + if line.startswith("B-"): + tag_dict.add(line[2:]) + elif line.startswith("I-"): + tag_dict.add(line[2:]) + index = 0 + for tag in tag_dict: + d["B-" + tag] = index + index += 1 + d["I-" + tag] = index + index += 1 + d["O"] = index + return d + + +def load_dict(filename): + d = dict() + with open(filename, 'r') as f: + for i, line in enumerate(f): + d[line.strip()] = i + return d + + +def corpus_reader(data_path, words_name, props_name): + """ + Read one corpus. It returns an iterator. Each element of + this iterator is a tuple including sentence and labels. The sentence is + consist of a list of word IDs. The labels include a list of label IDs. + :return: a iterator of data. + :rtype: iterator + """ + + def reader(): + tf = tarfile.open(data_path) + wf = tf.extractfile(words_name) + pf = tf.extractfile(props_name) + with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile( + fileobj=pf) as props_file: + sentences = [] + labels = [] + one_seg = [] + for word, label in itertools.izip(words_file, props_file): + word = word.strip() + label = label.strip().split() + + if len(label) == 0: # end of sentence + for i in xrange(len(one_seg[0])): + a_kind_lable = [x[i] for x in one_seg] + labels.append(a_kind_lable) + + if len(labels) >= 1: + verb_list = [] + for x in labels[0]: + if x != '-': + verb_list.append(x) + + for i, lbl in enumerate(labels[1:]): + cur_tag = 'O' + is_in_bracket = False + lbl_seq = [] + verb_word = '' + for l in lbl: + if l == '*' and is_in_bracket == False: + lbl_seq.append('O') + elif l == '*' and is_in_bracket == True: + lbl_seq.append('I-' + cur_tag) + elif l == '*)': + lbl_seq.append('I-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') != -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') == -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = True + else: + raise RuntimeError('Unexpected label: %s' % + l) + + yield sentences, verb_list[i], lbl_seq + + sentences = [] + labels = [] + one_seg = [] + else: + sentences.append(word) + one_seg.append(label) + + pf.close() + wf.close() + tf.close() + + return reader + + +def reader_creator(corpus_reader, + word_dict=None, + predicate_dict=None, + label_dict=None): + def reader(): + for sentence, predicate, labels in corpus_reader(): + + sen_len = len(sentence) + + verb_index = labels.index('B-V') + mark = [0] * len(labels) + if verb_index > 0: + mark[verb_index - 1] = 1 + ctx_n1 = sentence[verb_index - 1] + else: 
+ ctx_n1 = 'bos' + + if verb_index > 1: + mark[verb_index - 2] = 1 + ctx_n2 = sentence[verb_index - 2] + else: + ctx_n2 = 'bos' + + mark[verb_index] = 1 + ctx_0 = sentence[verb_index] + + if verb_index < len(labels) - 1: + mark[verb_index + 1] = 1 + ctx_p1 = sentence[verb_index + 1] + else: + ctx_p1 = 'eos' + + if verb_index < len(labels) - 2: + mark[verb_index + 2] = 1 + ctx_p2 = sentence[verb_index + 2] + else: + ctx_p2 = 'eos' + + word_idx = [word_dict.get(w, UNK_IDX) for w in sentence] + + ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len + ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len + ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + + pred_idx = [predicate_dict.get(predicate)] * sen_len + label_idx = [label_dict.get(w) for w in labels] + + yield word_idx, ctx_n2_idx, ctx_n1_idx, \ + ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx + + return reader + + +def get_dict(): + """ + Get the word, verb and label dictionaries of the CoNLL-05 dataset. + """ + word_dict = load_dict( + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', + WORDDICT_MD5)) + verb_dict = load_dict( + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', + VERBDICT_MD5)) + label_dict = load_label_dict( + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', + TRGDICT_MD5)) + return word_dict, verb_dict, label_dict + + +def get_embedding(): + """ + Get the trained word vector based on Wikipedia corpus. + """ + return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + + +def test(): + """ + Conll05 test set creator. + + Because the training dataset is not free, the test dataset is used for + training. It returns a reader creator, each sample in the reader is nine + features, including sentence sequence, predicate, predicate context, + predicate context flag and tagged sequence. + + :return: Test reader creator + :rtype: callable + """ + word_dict, verb_dict, label_dict = get_dict() + reader = corpus_reader( + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5), + words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', + props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') + return reader_creator(reader, word_dict, verb_dict, label_dict) + + +def fetch(): + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) + paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, test(), 1000, "conll05_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conll05_test") diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py new file mode 100644 index 00000000000..7bdddeaabec --- /dev/null +++ b/python/paddle/v2/dataset/flowers.py @@ -0,0 +1,199 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module will download dataset from +http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html +and parse train/test set into paddle reader creators. + +This set contains images of flowers belonging to 102 different categories. +The images were acquired by searching the web and taking pictures. There are a +minimum of 40 images for each category. + +The database was used in: + +Nilsback, M-E. and Zisserman, A. Automated flower classification over a large + number of classes. Proceedings of the Indian Conference on Computer Vision, +Graphics and Image Processing (2008) +http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}. + +""" +import cPickle +import itertools +import functools +from common import download +import tarfile +import scipy.io as scio +from paddle.v2.image import * +from paddle.v2.reader import * +import os +import numpy as np +from multiprocessing import cpu_count +__all__ = ['train', 'test', 'valid'] + +DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' +LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' +SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' +DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118' +LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' +SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' +# In the official 'readme', tstid is the flag of test data +# and trnid is the flag of train data. But test data is more than train data. +# So we exchange the train data and test data. +TRAIN_FLAG = 'tstid' +TEST_FLAG = 'trnid' +VALID_FLAG = 'valid' + + +def default_mapper(is_train, sample): + ''' + map image bytes data to type needed by model input layer + ''' + img, label = sample + img = load_image_bytes(img) + img = simple_transform( + img, 256, 224, is_train, mean=[103.94, 116.78, 123.68]) + return img.flatten().astype('float32'), label + + +train_mapper = functools.partial(default_mapper, True) +test_mapper = functools.partial(default_mapper, False) + + +def reader_creator(data_file, + label_file, + setid_file, + dataset_name, + mapper, + buffered_size=1024, + use_xmap=True): + ''' + 1. read images from tar file and + merge images into batch files in 102flowers.tgz_batch/ + 2.
get a reader to read sample from batch file + + :param data_file: downloaded data file + :type data_file: string + :param label_file: downloaded label file + :type label_file: string + :param setid_file: downloaded setid file containing information + about how to split dataset + :type setid_file: string + :param dataset_name: data set name (tstid|trnid|valid) + :type dataset_name: string + :param mapper: a function to map image bytes data to type + needed by model input layer + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: data reader + :rtype: callable + ''' + labels = scio.loadmat(label_file)['labels'][0] + indexes = scio.loadmat(setid_file)[dataset_name][0] + img2label = {} + for i in indexes: + img = "jpg/image_%05d.jpg" % i + img2label[img] = labels[i - 1] + file_list = batch_images_from_tar(data_file, dataset_name, img2label) + + def reader(): + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) - 1 + + if use_xmap: + return xmap_readers(mapper, reader, cpu_count(), buffered_size) + else: + return map_readers(mapper, reader) + + +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers training set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [0, 101] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: train data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, + buffered_size, use_xmap) + + +def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers test set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [0, 101] + translated from original color image by steps: + 1. resize to 256*256 + 2. center crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: test data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, + buffered_size, use_xmap) + + +def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers validation set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [0, 101] + translated from original color image by steps: + 1. resize to 256*256 + 2. center crop to 224*224 + 3. flatten + :param mapper: a function to map sample.
+ :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: test data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper, + buffered_size, use_xmap) + + +def fetch(): + download(DATA_URL, 'flowers', DATA_MD5) + download(LABEL_URL, 'flowers', LABEL_MD5) + download(SETID_URL, 'flowers', SETID_MD5) diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py new file mode 100644 index 00000000000..37c4296f9bc --- /dev/null +++ b/python/paddle/v2/dataset/imdb.py @@ -0,0 +1,148 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +IMDB dataset. + +This module downloads IMDB dataset from +http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set +of 25,000 highly polar movie reviews for training, and 25,000 for testing. +Besides, this module also provides API for building dictionary. +""" + +import paddle.v2.dataset.common +import collections +import tarfile +import re +import string + +__all__ = ['build_dict', 'train', 'test', 'convert'] + +URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' +MD5 = '7c2ac02c03563afcf9b574c7e56c153a' + + +def tokenize(pattern): + """ + Read files that match the given pattern. Tokenize and yield each file. + """ + + with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', + MD5)) as tarf: + # Note that we should use tarfile.next(), which does + # sequential access of member files, other than + # tarfile.extractfile, which does random access and might + # destroy hard disks. + tf = tarf.next() + while tf != None: + if bool(pattern.match(tf.name)): + # newline and punctuations removal and ad-hoc tokenization. + yield tarf.extractfile(tf).read().rstrip("\n\r").translate( + None, string.punctuation).lower().split() + tf = tarf.next() + + +def build_dict(pattern, cutoff): + """ + Build a word dictionary from the corpus. Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + word_freq = collections.defaultdict(int) + for doc in tokenize(pattern): + for word in doc: + word_freq[word] += 1 + + # Not sure if we should prune less-frequent words here. 
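+    # Editor's note, a worked example of the filter/sort below
+    # (hypothetical corpus, cutoff=1): if word_freq is
+    # {'the': 3, 'cat': 2, 'rare': 1}, the filter drops 'rare' since
+    # 1 > 1 is false; sorting by (-freq, word) orders 'the' before
+    # 'cat', so word_idx becomes {'the': 0, 'cat': 1} and '<unk>'
+    # gets ID 2.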
+ word_freq = filter(lambda x: x[1] > cutoff, word_freq.items()) + + dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*dictionary)) + word_idx = dict(zip(words, xrange(len(words)))) + word_idx['<unk>'] = len(words) + return word_idx + + +def reader_creator(pos_pattern, neg_pattern, word_idx): + UNK = word_idx['<unk>'] + INS = [] + + def load(pattern, out, label): + for doc in tokenize(pattern): + out.append(([word_idx.get(w, UNK) for w in doc], label)) + + load(pos_pattern, INS, 0) + load(neg_pattern, INS, 1) + + def reader(): + for doc, label in INS: + yield doc, label + + return reader + + +def train(word_idx): + """ + IMDB training set creator. + + It returns a reader creator, each sample in the reader is a zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + re.compile("aclImdb/train/pos/.*\.txt$"), + re.compile("aclImdb/train/neg/.*\.txt$"), word_idx) + + +def test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is a zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + re.compile("aclImdb/test/pos/.*\.txt$"), + re.compile("aclImdb/test/neg/.*\.txt$"), word_idx) + + +def word_dict(): + """ + Build a word dictionary from the corpus. + + :return: Word dictionary + :rtype: dict + """ + return build_dict( + re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'imdb', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + w = word_dict() + paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py new file mode 100644 index 00000000000..617c722c416 --- /dev/null +++ b/python/paddle/v2/dataset/imikolov.py @@ -0,0 +1,161 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +imikolov's simple dataset. + +This module will download dataset from +http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set +into paddle reader creators.
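+
+A minimal usage sketch (editor's illustration; the 5-gram window and
+batch size are arbitrary choices, not part of this module):
+
+    import paddle.v2 as paddle
+
+    word_dict = paddle.dataset.imikolov.build_dict()
+    train_reader = paddle.batch(
+        paddle.dataset.imikolov.train(word_dict, 5), batch_size=64)
+    for mini_batch in train_reader():
+        pass  # each sample is a 5-tuple of word IDs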
+""" +import paddle.v2.dataset.common +import collections +import tarfile + +__all__ = ['train', 'test', 'build_dict', 'convert'] + +URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' +MD5 = '30177ea32e27c525793142b6bf2c8e2d' + + +class DataType(object): + NGRAM = 1 + SEQ = 2 + + +def word_count(f, word_freq=None): + if word_freq is None: + word_freq = collections.defaultdict(int) + + for l in f: + for w in l.strip().split(): + word_freq[w] += 1 + word_freq[''] += 1 + word_freq[''] += 1 + + return word_freq + + +def build_dict(min_word_freq=50): + """ + Build a word dictionary from the corpus, Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + train_filename = './simple-examples/data/ptb.train.txt' + test_filename = './simple-examples/data/ptb.valid.txt' + with tarfile.open( + paddle.v2.dataset.common.download( + paddle.v2.dataset.imikolov.URL, 'imikolov', + paddle.v2.dataset.imikolov.MD5)) as tf: + trainf = tf.extractfile(train_filename) + testf = tf.extractfile(test_filename) + word_freq = word_count(testf, word_count(trainf)) + if '' in word_freq: + # remove for now, since we will set it as last index + del word_freq[''] + + word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items()) + + word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*word_freq_sorted)) + word_idx = dict(zip(words, xrange(len(words)))) + word_idx[''] = len(words) + + return word_idx + + +def reader_creator(filename, word_idx, n, data_type): + def reader(): + with tarfile.open( + paddle.v2.dataset.common.download( + paddle.v2.dataset.imikolov.URL, 'imikolov', + paddle.v2.dataset.imikolov.MD5)) as tf: + f = tf.extractfile(filename) + + UNK = word_idx[''] + for l in f: + if DataType.NGRAM == data_type: + assert n > -1, 'Invalid gram length' + l = [''] + l.strip().split() + [''] + if len(l) >= n: + l = [word_idx.get(w, UNK) for w in l] + for i in range(n, len(l) + 1): + yield tuple(l[i - n:i]) + elif DataType.SEQ == data_type: + l = l.strip().split() + l = [word_idx.get(w, UNK) for w in l] + src_seq = [word_idx['']] + l + trg_seq = l + [word_idx['']] + if n > 0 and len(src_seq) > n: continue + yield src_seq, trg_seq + else: + assert False, 'Unknow data type' + + return reader + + +def train(word_idx, n, data_type=DataType.NGRAM): + """ + imikolov training set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Training reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n, + data_type) + + +def test(word_idx, n, data_type=DataType.NGRAM): + """ + imikolov test set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. 
+ + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Test reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n, + data_type) + + +def fetch(): + paddle.v2.dataset.common.download(URL, "imikolov", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + N = 5 + word_dict = build_dict() + paddle.v2.dataset.common.convert(path, + train(word_dict, N), 1000, + "imikolov_train") + paddle.v2.dataset.common.convert(path, + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py new file mode 100644 index 00000000000..9f675bed895 --- /dev/null +++ b/python/paddle/v2/dataset/mnist.py @@ -0,0 +1,123 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MNIST dataset. + +This module will download dataset from http://yann.lecun.com/exdb/mnist/ and +parse training set and test set into paddle reader creators. +""" +import paddle.v2.dataset.common +import subprocess +import numpy +import platform +__all__ = ['train', 'test', 'convert'] + +URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' +TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' +TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3' +TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz' +TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c' +TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz' +TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' +TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz' +TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' + + +def reader_creator(image_filename, label_filename, buffer_size): + def reader(): + if platform.system() == 'Darwin': + zcat_cmd = 'gzcat' + elif platform.system() == 'Linux': + zcat_cmd = 'zcat' + else: + raise NotImplementedError() + + # According to http://stackoverflow.com/a/38061619/724872, we + # cannot use standard package gzip here. + m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE) + m.stdout.read(16) # skip some magic bytes + + l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE) + l.stdout.read(8) # skip some magic bytes + + try: # reader could be break. + while True: + labels = numpy.fromfile( + l.stdout, 'ubyte', count=buffer_size).astype("int") + + if labels.size != buffer_size: + break # numpy.fromfile returns empty slice after EOF. 
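+                # Editor's note on the stream layout (IDX format of the
+                # files above): the label stream was advanced past its
+                # 8-byte header and carries one byte per label; the image
+                # stream was advanced past its 16-byte header and carries
+                # 28*28 bytes per image, so each pass of this loop consumes
+                # buffer_size labels and buffer_size * 784 image bytes.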
+ + images = numpy.fromfile( + m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape( + (buffer_size, 28 * 28)).astype('float32') + + images = images / 255.0 * 2.0 - 1.0 + + for i in xrange(buffer_size): + yield images[i, :], int(labels[i]) + finally: + m.terminate() + l.terminate() + + return reader + + +def train(): + """ + MNIST training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', + TRAIN_IMAGE_MD5), + paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', + TRAIN_LABEL_MD5), 100) + + +def test(): + """ + MNIST test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', + TEST_IMAGE_MD5), + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', + TEST_LABEL_MD5), 100) + + +def fetch(): + paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) + paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "mnist_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "mnist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py new file mode 100644 index 00000000000..5b61a9420af --- /dev/null +++ b/python/paddle/v2/dataset/movielens.py @@ -0,0 +1,262 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Movielens 1-M dataset. + +Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 +movies, which was collected by GroupLens Research. This module will download +Movielens 1-M dataset from +http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training +set and test set into paddle reader creators. + +""" + +import zipfile +import paddle.v2.dataset.common +import re +import random
 +import functools + +__all__ = [ + 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info', + 'convert' +] + +age_table = [1, 18, 25, 35, 45, 50, 56] + +URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' +MD5 = 'c4d9eecfca2ab87c1945afe126590906' + + +class MovieInfo(object): + """ + Movie id, title and categories information are stored in MovieInfo.
+ """ + + def __init__(self, index, categories, title): + self.index = int(index) + self.categories = categories + self.title = title + + def value(self): + """ + Get information from a movie. + """ + return [ + self.index, [CATEGORIES_DICT[c] for c in self.categories], + [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()] + ] + + def __str__(self): + return "<MovieInfo id(%d), title(%s), categories(%s)>" % ( + self.index, self.title, self.categories) + + def __repr__(self): + return self.__str__() + + +class UserInfo(object): + """ + User id, gender, age, and job information are stored in UserInfo. + """ + + def __init__(self, index, gender, age, job_id): + self.index = int(index) + self.is_male = gender == 'M' + self.age = age_table.index(int(age)) + self.job_id = int(job_id) + + def value(self): + """ + Get information from a user. + """ + return [self.index, 0 if self.is_male else 1, self.age, self.job_id] + + def __str__(self): + return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % ( + self.index, "M" + if self.is_male else "F", age_table[self.age], self.job_id) + + def __repr__(self): + return str(self) + + +MOVIE_INFO = None +MOVIE_TITLE_DICT = None +CATEGORIES_DICT = None +USER_INFO = None + + +def __initialize_meta_info__(): + fn = paddle.v2.dataset.common.download(URL, "movielens", MD5) + global MOVIE_INFO + if MOVIE_INFO is None: + pattern = re.compile(r'^(.*)\((\d+)\)$') + with zipfile.ZipFile(file=fn) as package: + for info in package.infolist(): + assert isinstance(info, zipfile.ZipInfo) + MOVIE_INFO = dict() + title_word_set = set() + categories_set = set() + with package.open('ml-1m/movies.dat') as movie_file: + for i, line in enumerate(movie_file): + movie_id, title, categories = line.strip().split('::') + categories = categories.split('|') + for c in categories: + categories_set.add(c) + title = pattern.match(title).group(1) + MOVIE_INFO[int(movie_id)] = MovieInfo( + index=movie_id, categories=categories, title=title) + for w in title.split(): + title_word_set.add(w.lower()) + + global MOVIE_TITLE_DICT + MOVIE_TITLE_DICT = dict() + for i, w in enumerate(title_word_set): + MOVIE_TITLE_DICT[w] = i + + global CATEGORIES_DICT + CATEGORIES_DICT = dict() + for i, c in enumerate(categories_set): + CATEGORIES_DICT[c] = i + + global USER_INFO + USER_INFO = dict() + with package.open('ml-1m/users.dat') as user_file: + for line in user_file: + uid, gender, age, job, _ = line.strip().split("::") + USER_INFO[int(uid)] = UserInfo( + index=uid, gender=gender, age=age, job_id=job) + return fn + + +def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): + fn = __initialize_meta_info__() + rand = random.Random(x=rand_seed) + with zipfile.ZipFile(file=fn) as package: + with package.open('ml-1m/ratings.dat') as rating: + for line in rating: + if (rand.random() < test_ratio) == is_test: + uid, mov_id, rating, _ = line.strip().split("::") + uid = int(uid) + mov_id = int(mov_id) + rating = float(rating) * 2 - 5.0 + + mov = MOVIE_INFO[mov_id] + usr = USER_INFO[uid] + yield usr.value() + mov.value() + [[rating]] + + +def __reader_creator__(**kwargs): + return lambda: __reader__(**kwargs) + + +train = functools.partial(__reader_creator__, is_test=False) +test = functools.partial(__reader_creator__, is_test=True) + + +def get_movie_title_dict(): + """ + Get movie title dictionary. + """ + __initialize_meta_info__() + return MOVIE_TITLE_DICT + + +def __max_index_info__(a, b): + if a.index > b.index: + return a + else: + return b + + +def max_movie_id(): + """ + Get the maximum value of movie id.
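+
+    Editor's note: a typical (hypothetical) use is sizing an embedding
+    input layer, e.g.
+    paddle.v2.layer.data(name='movie_id',
+        type=paddle.v2.data_type.integer_value(max_movie_id() + 1)).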
+ """ + __initialize_meta_info__() + return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index + + +def max_user_id(): + """ + Get the maximum value of user id. + """ + __initialize_meta_info__() + return reduce(__max_index_info__, USER_INFO.viewvalues()).index + + +def __max_job_id_impl__(a, b): + if a.job_id > b.job_id: + return a + else: + return b + + +def max_job_id(): + """ + Get the maximum value of job id. + """ + __initialize_meta_info__() + return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id + + +def movie_categories(): + """ + Get movie categories dictionary. + """ + __initialize_meta_info__() + return CATEGORIES_DICT + + +def user_info(): + """ + Get user info dictionary. + """ + __initialize_meta_info__() + return USER_INFO + + +def movie_info(): + """ + Get movie info dictionary. + """ + __initialize_meta_info__() + return MOVIE_INFO + + +def unittest(): + for train_count, _ in enumerate(train()()): + pass + for test_count, _ in enumerate(test()()): + pass + + print train_count, test_count + + +def fetch(): + paddle.v2.dataset.common.download(URL, "movielens", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") + + +if __name__ == '__main__': + unittest() diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py new file mode 100644 index 00000000000..d3b3dd524c3 --- /dev/null +++ b/python/paddle/v2/dataset/mq2007.py @@ -0,0 +1,333 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MQ2007 dataset + +MQ2007 is a query set from the Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross +validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set, +validation set and testing set. + +This module downloads the MQ2007 dataset from +http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parses the training set and test set into paddle reader creators + +""" + +import os +import sys +import functools +import rarfile +from common import download +import numpy as np + +# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar" +URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar" +MD5 = "7be1640ae95c6408dab0ae7207bdc706" + + +def __initialize_meta_info__(): + """ + download and extract the MQ2007 dataset + """ + fn = fetch() + rar = rarfile.RarFile(fn) + dirpath = os.path.dirname(fn) + rar.extractall(path=dirpath) + return dirpath + + +class Query(object): + """ + queries used for learning to rank algorithms.
It is created from relevance scores, query-document feature vectors + + Parameters: + ---------- + query_id : int + query_id in dataset, mapping from query to relevance documents + relevance_score : int + relevance score of query and document pair + feature_vector : array, dense feature + feature in vector format + description : string + comment section in query doc pair data + """ + + def __init__(self, + query_id=-1, + relevance_score=-1, + feature_vector=None, + description=""): + self.query_id = query_id + self.relevance_score = relevance_score + if feature_vector is None: + self.feature_vector = [] + else: + self.feature_vector = feature_vector + self.description = description + + def __str__(self): + string = "%s %s %s" % (str(self.relevance_score), str(self.query_id), + " ".join(str(f) for f in self.feature_vector)) + return string + + # @classmethod + def _parse_(self, text): + """ + parse line into Query + """ + comment_position = text.find('#') + line = text[:comment_position].strip() + self.description = text[comment_position + 1:].strip() + parts = line.split() + if len(parts) != 48: + sys.stdout.write("expect 48 space split parts, get %d" % + (len(parts))) + return None + # format : 0 qid:10 1:0.000272 2:0.000000 .... + self.relevance_score = int(parts[0]) + self.query_id = int(parts[1].split(':')[1]) + for p in parts[2:]: + pair = p.split(':') + self.feature_vector.append(float(pair[1])) + return self + + +class QueryList(object): + """ + group query into list, every item in list is a Query + """ + + def __init__(self, querylist=None): + self.query_id = -1 + if querylist is None: + self.querylist = [] + else: + self.querylist = querylist + for query in self.querylist: + if self.query_id == -1: + self.query_id = query.query_id + else: + if self.query_id != query.query_id: + raise ValueError("query in list must be same query_id") + + def __iter__(self): + for query in self.querylist: + yield query + + def __len__(self): + return len(self.querylist) + + def __getitem__(self, i): + return self.querylist[i] + + def _correct_ranking_(self): + if self.querylist is None: + return + self.querylist.sort(key=lambda x: x.relevance_score, reverse=True) + + def _add_query(self, query): + if self.query_id == -1: + self.query_id = query.query_id + else: + if self.query_id != query.query_id: + raise ValueError("query in list must be same query_id") + self.querylist.append(query) + + +def gen_plain_txt(querylist): + """ + gen plain text in list for other usage + Parameters: + -------- + querylist : querylist, one query match many document pairs in list, see QueryList + + return : + ------ + query_id : np.array, shape=(samples_num, ) + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + for query in querylist: + yield querylist.query_id, query.relevance_score, np.array( + query.feature_vector) + + +def gen_point(querylist): + """ + gen item in list for point-wise learning to rank algorithm + Parameters: + -------- + querylist : querylist, one query match many document pairs in list, see QueryList + + return : + ------ + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + for query in querylist: + yield query.relevance_score, np.array(query.feature_vector) + + +def
gen_pair(querylist, partial_order="full"): + """ + gen pair for pair-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + pairtial_order : "full" or "neighbour" + there is redudant in all possiable pair combinations, which can be simplifed + gen pairs for neighbour items or the full partial order pairs + + return : + ------ + label : np.array, shape=(1) + query_left : np.array, shape=(1, feature_dimension) + query_right : same as left + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + labels = [] + docpairs = [] + + # C(n,2) + for i in range(len(querylist)): + query_left = querylist[i] + for j in range(i + 1, len(querylist)): + query_right = querylist[j] + if query_left.relevance_score > query_right.relevance_score: + labels.append([1]) + docpairs.append([ + np.array(query_left.feature_vector), + np.array(query_right.feature_vector) + ]) + elif query_left.relevance_score < query_right.relevance_score: + labels.append([1]) + docpairs.append([ + np.array(query_right.feature_vector), + np.array(query_left.feature_vector) + ]) + for label, pair in zip(labels, docpairs): + yield np.array(label), pair[0], pair[1] + + +def gen_list(querylist): + """ + gen item in list for list-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + relevance_score_list = [[query.relevance_score] for query in querylist] + feature_vector_list = [query.feature_vector for query in querylist] + yield np.array(relevance_score_list), np.array(feature_vector_list) + + +def query_filter(querylists): + """ + filter query get only document with label 0. + label 0, 1, 2 means the relevance score document with query + parameters : + querylist : QueyList list + + return : + querylist : QueyList list + """ + filter_query = [] + for querylist in querylists: + relevance_score_list = [query.relevance_score for query in querylist] + if sum(relevance_score_list) != .0: + filter_query.append(querylist) + return filter_query + + +def load_from_text(filepath, shuffle=False, fill_missing=-1): + """ + parse data file into querys + """ + prev_query_id = -1 + querylists = [] + querylist = None + fn = __initialize_meta_info__() + with open(os.path.join(fn, filepath)) as f: + for line in f: + query = Query() + query = query._parse_(line) + if query == None: + continue + if query.query_id != prev_query_id: + if querylist is not None: + querylists.append(querylist) + querylist = QueryList() + prev_query_id = query.query_id + querylist._add_query(query) + if querylist is not None: + querylists.append(querylist) + return querylists + + +def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1): + """ + Parameters + -------- + filename : string + fill_missing : fill the missing value. 
+        default in MQ2007 is -1
+
+    Returns
+    ------
+    yield
+        label, query_left, query_right  # format = "pairwise"
+        label, querylist                # format = "listwise"
+    """
+    querylists = query_filter(
+        load_from_text(
+            filepath, shuffle=shuffle, fill_missing=fill_missing))
+    for querylist in querylists:
+        if format == "plain_txt":
+            yield next(gen_plain_txt(querylist))
+        elif format == "pointwise":
+            yield next(gen_point(querylist))
+        elif format == "pairwise":
+            for pair in gen_pair(querylist):
+                yield pair
+        elif format == "listwise":
+            yield next(gen_list(querylist))
+
+
+train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
+test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
+
+
+def fetch():
+    return download(URL, "MQ2007", MD5)
+
+
+if __name__ == "__main__":
+    fetch()
+    mytest = functools.partial(
+        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
+    for label, query in mytest():
+        print label, query
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
new file mode 100644
index 00000000000..b0b9757c1a7
--- /dev/null
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -0,0 +1,141 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script fetches and preprocesses the movie_reviews data set provided by
+NLTK.
+
+TODO(yuyang18): Complete dataset.
+"""
+
+import collections
+from itertools import chain
+
+import nltk
+from nltk.corpus import movie_reviews
+
+import paddle.v2.dataset.common
+
+__all__ = ['train', 'test', 'get_word_dict', 'convert']
+NUM_TRAINING_INSTANCES = 1600
+NUM_TOTAL_INSTANCES = 2000
+
+
+def download_data_if_not_yet():
+    """
+    Download the data set, if it has not been downloaded yet.
+    """
+    try:
+        # make sure that nltk can find the data
+        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
+            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
+        movie_reviews.categories()
+    except LookupError:
+        print "Downloading movie_reviews data set, please wait....."
+        nltk.download(
+            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
+        print "Download data set success....."
+ print "Path is " + nltk.data.find('corpora/movie_reviews').path + + +def get_word_dict(): + """ + Sorted the words by the frequency of words which occur in sample + :return: + words_freq_sorted + """ + words_freq_sorted = list() + word_freq_dict = collections.defaultdict(int) + download_data_if_not_yet() + + for category in movie_reviews.categories(): + for field in movie_reviews.fileids(category): + for words in movie_reviews.words(field): + word_freq_dict[words] += 1 + words_sort_list = word_freq_dict.items() + words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) + for index, word in enumerate(words_sort_list): + words_freq_sorted.append((word[0], index)) + return words_freq_sorted + + +def sort_files(): + """ + Sorted the sample for cross reading the sample + :return: + files_list + """ + files_list = list() + neg_file_list = movie_reviews.fileids('neg') + pos_file_list = movie_reviews.fileids('pos') + files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list))) + return files_list + + +def load_sentiment_data(): + """ + Load the data set + :return: + data_set + """ + data_set = list() + download_data_if_not_yet() + words_ids = dict(get_word_dict()) + for sample_file in sort_files(): + words_list = list() + category = 0 if 'neg' in sample_file else 1 + for word in movie_reviews.words(sample_file): + words_list.append(words_ids[word.lower()]) + data_set.append((words_list, category)) + return data_set + + +def reader_creator(data): + """ + Reader creator, generate an iterator for data set + :param data: + train data set or test data set + """ + for each in data: + yield each[0], each[1] + + +def train(): + """ + Default training set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[0:NUM_TRAINING_INSTANCES]) + + +def test(): + """ + Default test set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) + + +def fetch(): + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py new file mode 100644 index 00000000000..e0e18229da7 --- /dev/null +++ b/python/paddle/v2/dataset/tests/cifar_test.py @@ -0,0 +1,56 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.v2.dataset.cifar +import unittest + + +class TestCIFAR(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + for l in reader(): + self.assertEqual(l[0].size, 3072) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_test10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test10()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 9) + + def test_train10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train10()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 9) + + def test_test100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test100()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 99) + + def test_train100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train100()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 99) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py new file mode 100644 index 00000000000..cfa194eba38 --- /dev/null +++ b/python/paddle/v2/dataset/tests/common_test.py @@ -0,0 +1,94 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
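+
+# A minimal sketch of the helpers under test: md5file hashes a local file,
+# download fetches a URL into paddle's DATA_HOME cache and verifies its MD5.
+# SOME_URL and EXPECTED_MD5 below are illustrative placeholders, not real
+# values:
+#
+#   import paddle.v2.dataset.common as common
+#   path = common.download(SOME_URL, 'some_module', EXPECTED_MD5)
+#   assert common.md5file(path) == EXPECTED_MD5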
+
+import paddle.v2.dataset.common
+import unittest
+import tempfile
+import glob
+import recordio
+
+
+class TestCommon(unittest.TestCase):
+    def test_md5file(self):
+        _, temp_path = tempfile.mkstemp()
+        with open(temp_path, 'w') as f:
+            f.write("Hello\n")
+        self.assertEqual('09f7e02f1290be211da707a266f153b3',
+                         paddle.v2.dataset.common.md5file(temp_path))
+
+    def test_download(self):
+        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
+        self.assertEqual(
+            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.v2.dataset.common.download(
+                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
+
+    def test_split(self):
+        def test_reader():
+            def reader():
+                for x in xrange(10):
+                    yield x
+
+            return reader
+
+        # mkdtemp (not mkstemp) so the suffix pattern points into a directory
+        temp_path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.split(
+            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
+        files = glob.glob(temp_path + '/test-*.pickle')
+        self.assertEqual(len(files), 3)
+
+    def test_cluster_file_reader(self):
+        temp_path = tempfile.mkdtemp()
+        for x in xrange(5):
+            with open(temp_path + '/%05d.test' % x, 'w') as f:
+                f.write('%d\n' % x)
+        reader = paddle.v2.dataset.common.cluster_files_reader(
+            temp_path + '/*.test', 5, 0)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str("0"))
+
+    def test_convert(self):
+        record_num = 10
+        num_shards = 4
+
+        def test_reader():
+            def reader():
+                for x in xrange(record_num):
+                    yield x
+
+            return reader
+
+        path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.convert(path,
+                                         test_reader(), num_shards,
+                                         'random_images')
+
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+
+        recs.sort()
+        self.assertEqual(len(recs), record_num)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py
new file mode 100644
index 00000000000..a8ae9a07acc
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/flowers_test.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
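+
+# A minimal sketch of the flowers readers tested below (each sample is a
+# (flattened 3x224x224 image array, integer label in 1..102) pair; the
+# underlying archive download is large):
+#
+#   import paddle.v2.dataset.flowers as flowers
+#   for image, label in flowers.valid()():
+#       assert image.size == 3 * 224 * 224
+#       break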
+ +import paddle.v2.dataset.flowers +import unittest + + +class TestFlowers(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + size = 224 * 224 * 3 + for l in reader(): + self.assertEqual(l[0].size, size) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_train(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.train()) + self.assertEqual(instances, 6149) + self.assertEqual(max_label_value, 102) + + def test_test(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.test()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + def test_valid(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.valid()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py new file mode 100644 index 00000000000..c4d82f26895 --- /dev/null +++ b/python/paddle/v2/dataset/tests/imdb_test.py @@ -0,0 +1,57 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.imdb +import unittest +import re + +TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$") +TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$") +TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$") + +TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$") +TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$") +TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$") + + +class TestIMDB(unittest.TestCase): + word_idx = None + + def test_build_dict(self): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + self.assertEqual(len(self.word_idx), 7036) + + def check_dataset(self, dataset, expected_size): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + sum = 0 + for l in dataset(self.word_idx): + self.assertEqual(l[1], sum % 2) + sum += 1 + self.assertEqual(sum, expected_size) + + def test_train(self): + self.check_dataset(paddle.v2.dataset.imdb.train, 25000) + + def test_test(self): + self.check_dataset(paddle.v2.dataset.imdb.test, 25000) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py new file mode 100644 index 00000000000..714a75d6f1f --- /dev/null +++ b/python/paddle/v2/dataset/tests/imikolov_test.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.imikolov
+import unittest
+
+WORD_DICT = paddle.v2.dataset.imikolov.build_dict()
+
+
+class TestMikolov(unittest.TestCase):
+    def check_reader(self, reader, n):
+        for l in reader():
+            self.assertEqual(len(l), n)
+
+    def test_train(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)
+
+        first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
+            'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
+            'rake regatta rubens sim snack-food ssangyong swapo wachter'
+        first_line = [
+            WORD_DICT.get(ch, WORD_DICT['<unk>'])
+            for ch in first_line.split(' ')
+        ]
+        for l in paddle.v2.dataset.imikolov.train(
+                WORD_DICT, n=-1,
+                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
+            read_line = l[0][1:]
+            break
+        self.assertEqual(first_line, read_line)
+
+    def test_test(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
+
+        first_line = 'consumers may want to move their telephones a little '\
+            'closer to the tv set'
+        first_line = [
+            WORD_DICT.get(ch, WORD_DICT['<unk>'])
+            for ch in first_line.split(' ')
+        ]
+        for l in paddle.v2.dataset.imikolov.test(
+                WORD_DICT, n=-1,
+                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
+            read_line = l[0][1:]
+            break
+        self.assertEqual(first_line, read_line)
+
+    def test_total(self):
+        _, idx = zip(*WORD_DICT.items())
+        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py
new file mode 100644
index 00000000000..1d344cac3e7
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mnist_test.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
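+
+# A minimal sketch of the MNIST readers tested below (each sample is a
+# (flattened 28x28 image array, integer label in 0..9) pair):
+#
+#   import paddle.v2.dataset.mnist as mnist
+#   for image, label in mnist.train()():
+#       assert image.size == 784
+#       break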
+
+import paddle.v2.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 784)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.train())
+        self.assertEqual(instances, 60000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.test())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/v2/dataset/tests/mq2007_test.py
new file mode 100644
index 00000000000..59847b6c18e
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mq2007_test.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mq2007
+import unittest
+
+
+class TestMQ2007(unittest.TestCase):
+    def test_pairwise(self):
+        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
+                format="pairwise"):
+            # shape is an attribute of numpy arrays, not a method
+            self.assertEqual(query_left.shape, (46, ))
+            self.assertEqual(query_right.shape, (46, ))
+
+    def test_listwise(self):
+        for label_array, query_array in paddle.v2.dataset.mq2007.test(
+                format="listwise"):
+            self.assertEqual(len(label_array), len(query_array))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
new file mode 100644
index 00000000000..40740529073
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/test_sentiment.py
@@ -0,0 +1,55 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
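+
+# A minimal sketch of the sentiment readers tested below (each sample is a
+# (list of word ids, 0/1 polarity label) pair; the NLTK movie_reviews corpus
+# is downloaded on first use):
+#
+#   import paddle.v2.dataset.sentiment as st
+#   for word_ids, polarity in st.test():
+#       assert polarity in (0, 1)
+#       break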
+
+import unittest
+import nltk
+import paddle.v2.dataset.sentiment as st
+from nltk.corpus import movie_reviews
+
+
+class TestSentimentMethods(unittest.TestCase):
+    def test_get_word_dict(self):
+        word_dict = st.get_word_dict()[0:10]
+        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          (u'is', 8), (u'in', 9)]
+        for idx, each in enumerate(word_dict):
+            self.assertEqual(each, test_word_list[idx])
+        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
+
+    def test_sort_files(self):
+        last_label = ''
+        for sample_file in st.sort_files():
+            current_label = sample_file.split("/")[0]
+            self.assertNotEqual(current_label, last_label)
+            last_label = current_label
+
+    def test_data_set(self):
+        data_set = st.load_sentiment_data()
+        last_label = -1
+        for each in st.test():
+            self.assertNotEqual(each[1], last_label)
+            last_label = each[1]
+        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
+        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
+        self.assertEqual(
+            len(list(st.test())),
+            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
new file mode 100644
index 00000000000..31e72ebf5ea
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.voc2012
+import unittest
+
+
+class TestVOC(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3 * l[1].size)
+            sum += 1
+        return sum
+
+    def test_train(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.train())
+        self.assertEqual(count, 2913)
+
+    def test_test(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.test())
+        self.assertEqual(count, 1464)
+
+    def test_val(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.val())
+        self.assertEqual(count, 1449)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/v2/dataset/tests/wmt16_test.py
new file mode 100644
index 00000000000..cef6c3216e7
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/wmt16_test.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.wmt16
+import unittest
+
+
+class TestWMT16(unittest.TestCase):
+    def checkout_one_sample(self, sample):
+        # train data has 3 fields: source language word indices,
+        # target language word indices, and target next word indices.
+        self.assertEqual(len(sample), 3)
+
+        # test start mark and end mark in source word indices.
+        self.assertEqual(sample[0][0], 0)
+        self.assertEqual(sample[0][-1], 1)
+
+        # test start mark in target word indices
+        self.assertEqual(sample[1][0], 0)
+
+        # test end mark in target next word indices
+        self.assertEqual(sample[2][-1], 1)
+
+    def test_train(self):
+        for idx, sample in enumerate(
+                paddle.v2.dataset.wmt16.train(
+                    src_dict_size=100000, trg_dict_size=100000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_test(self):
+        for idx, sample in enumerate(
+                paddle.v2.dataset.wmt16.test(
+                    src_dict_size=1000, trg_dict_size=1000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_val(self):
+        for idx, sample in enumerate(
+                paddle.v2.dataset.wmt16.validation(
+                    src_dict_size=1000, trg_dict_size=1000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_get_dict(self):
+        dict_size = 1000
+        word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True)
+        self.assertEqual(len(word_dict), dict_size)
+        self.assertEqual(word_dict[0], "<s>")
+        self.assertEqual(word_dict[1], "<e>")
+        self.assertEqual(word_dict[2], "<unk>")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
new file mode 100644
index 00000000000..f10bf7e42a1
--- /dev/null
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+UCI Housing dataset.
+
+This module will download the dataset from
+https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
+parse the training set and test set into paddle reader creators.
+""" + +import numpy as np +import os +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = ['train', 'test'] + +URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' +MD5 = 'd4accdce7a25600298819f8e28e8d593' +feature_names = [ + 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', + 'PTRATIO', 'B', 'LSTAT', 'convert' +] + +UCI_TRAIN_DATA = None +UCI_TEST_DATA = None +URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar' +MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b' + + +def feature_range(maximums, minimums): + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + fig, ax = plt.subplots() + feature_num = len(maximums) + ax.bar(range(feature_num), maximums - minimums, color='r', align='center') + ax.set_title('feature scale') + plt.xticks(range(feature_num), feature_names) + plt.xlim([-1, feature_num]) + fig.set_figheight(6) + fig.set_figwidth(10) + if not os.path.exists('./image'): + os.makedirs('./image') + fig.savefig('image/ranges.png', dpi=48) + plt.close(fig) + + +def load_data(filename, feature_num=14, ratio=0.8): + global UCI_TRAIN_DATA, UCI_TEST_DATA + if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None: + return + + data = np.fromfile(filename, sep=' ') + data = data.reshape(data.shape[0] / feature_num, feature_num) + maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( + axis=0) / data.shape[0] + feature_range(maximums[:-1], minimums[:-1]) + for i in xrange(feature_num - 1): + data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) + offset = int(data.shape[0] * ratio) + UCI_TRAIN_DATA = data[:offset] + UCI_TEST_DATA = data[offset:] + + +def train(): + """ + UCI_HOUSING training set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Training reader creator + :rtype: callable + """ + global UCI_TRAIN_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TRAIN_DATA: + yield d[:-1], d[-1:] + + return reader + + +def test(): + """ + UCI_HOUSING test set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Test reader creator + :rtype: callable + """ + global UCI_TEST_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TEST_DATA: + yield d[:-1], d[-1:] + + return reader + + +def model(): + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', + MD5_MODEL) + with open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'uci_housing', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py new file mode 100644 index 00000000000..617e212d67f --- /dev/null +++ b/python/paddle/v2/dataset/voc2012.py @@ -0,0 +1,85 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image dataset for segmentation. +The 2012 dataset contains images from 2008-2011 for which additional +segmentations have been prepared. As in previous years the assignment +to training/test sets has been maintained. The total number of images +with segmentation has been increased from 7,062 to 9,993. +""" + +import tarfile +import io +import numpy as np +from paddle.v2.dataset.common import download +from paddle.v2.image import * +from PIL import Image + +__all__ = ['train', 'test', 'val'] + +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ +VOCtrainval_11-May-2012.tar' + +VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' +SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' +DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' +LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' + +CACHE_DIR = 'voc2012' + + +def reader_creator(filename, sub_name): + + tarobject = tarfile.open(filename) + name2mem = {} + for ele in tarobject.getmembers(): + name2mem[ele.name] = ele + + def reader(): + set_file = SET_FILE.format(sub_name) + sets = tarobject.extractfile(name2mem[set_file]) + for line in sets: + line = line.strip() + data_file = DATA_FILE.format(line) + label_file = LABEL_FILE.format(line) + data = tarobject.extractfile(name2mem[data_file]).read() + label = tarobject.extractfile(name2mem[label_file]).read() + data = Image.open(io.BytesIO(data)) + label = Image.open(io.BytesIO(label)) + data = np.array(data) + label = np.array(label) + yield data, label + + return reader + + +def train(): + """ + Create a train dataset reader containing 2913 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval') + + +def test(): + """ + Create a test dataset reader containing 1464 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train') + + +def val(): + """ + Create a val dataset reader containing 1449 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val') diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py new file mode 100644 index 00000000000..5104e29051e --- /dev/null +++ b/python/paddle/v2/dataset/wmt14.py @@ -0,0 +1,182 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +WMT14 dataset. +The original WMT14 dataset is too large and a small set of data for set is +provided. This module will download dataset from +http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and +parse training set and test set into paddle reader creators. 
+ +""" +import tarfile +import gzip + +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = [ + 'train', + 'test', + 'get_dict', + 'convert', +] + +URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' + 'cslm_joint_paper/data/dev+test.tgz') +MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' +# this is a small set of data for test. The original data is too large and +# will be add later. +URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' + 'wmt_shrinked_data/wmt14.tgz') +MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' +# BLEU of this trained model is 26.92 +URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' + +START = "" +END = "" +UNK = "" +UNK_IDX = 2 + + +def __read_to_dict(tar_file, dict_size): + def __to_dict(fd, size): + out_dict = dict() + for line_count, line in enumerate(fd): + if line_count < size: + out_dict[line.strip()] = line_count + else: + break + return out_dict + + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith("src.dict") + ] + assert len(names) == 1 + src_dict = __to_dict(f.extractfile(names[0]), dict_size) + names = [ + each_item.name for each_item in f + if each_item.name.endswith("trg.dict") + ] + assert len(names) == 1 + trg_dict = __to_dict(f.extractfile(names[0]), dict_size) + return src_dict, trg_dict + + +def reader_creator(tar_file, file_name, dict_size): + def reader(): + src_dict, trg_dict = __read_to_dict(tar_file, dict_size) + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith(file_name) + ] + for name in names: + for line in f.extractfile(name): + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_words = src_seq.split() + src_ids = [ + src_dict.get(w, UNK_IDX) + for w in [START] + src_words + [END] + ] + + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [trg_dict[END]] + trg_ids = [trg_dict[START]] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def train(dict_size): + """ + WMT14 training set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'train/train', dict_size) + + +def test(dict_size): + """ + WMT14 test set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. 
+
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'test/test', dict_size)
+
+
+def gen(dict_size):
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'gen/gen', dict_size)
+
+
+def model():
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
+    with gzip.open(tar_file, 'r') as f:
+        parameters = Parameters.from_tar(f)
+    return parameters
+
+
+def get_dict(dict_size, reverse=True):
+    # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
+    # else reverse = True, return dict = {'001':'a', '002':'b', ...}
+    tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
+    if reverse:
+        src_dict = {v: k for k, v in src_dict.items()}
+        trg_dict = {v: k for k, v in trg_dict.items()}
+    return src_dict, trg_dict
+
+
+def fetch():
+    paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    dict_size = 30000
+    paddle.v2.dataset.common.convert(path,
+                                     train(dict_size), 1000, "wmt14_train")
+    paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py
new file mode 100644
index 00000000000..c8818f715be
--- /dev/null
+++ b/python/paddle/v2/dataset/wmt16.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ACL2016 Multimodal Machine Translation. Please see this website for more
+details: http://www.statmt.org/wmt16/multimodal-task.html#task1
+
+If you use the dataset created for your task, please cite the following paper:
+Multi30K: Multilingual English-German Image Descriptions.
+
+@article{elliott-EtAl:2016:VL16,
+    author    = {{Elliott}, D. and {Frank}, S. and {Sima'an}, K. and
+                 {Specia}, L.},
+    title     = {Multi30K: Multilingual English-German Image Descriptions},
+    booktitle = {Proceedings of the 6th Workshop on Vision and Language},
+    year      = {2016},
+    pages     = {70--74}
+}
+"""
+
+import os
+import tarfile
+import gzip
+from collections import defaultdict
+
+import paddle.v2.dataset.common
+
+__all__ = [
+    "train",
+    "test",
+    "validation",
+    "convert",
+    "fetch",
+    "get_dict",
+]
+
+DATA_URL = ("http://cloud.dlnel.org/filepub/"
+            "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed")
+DATA_MD5 = "0c38be43600334966403524a40dcd81e"
+
+TOTAL_EN_WORDS = 11250
+TOTAL_DE_WORDS = 19220
+
+START_MARK = "<s>"
+END_MARK = "<e>"
+UNK_MARK = "<unk>"
+
+
+def __build_dict(tar_file, dict_size, save_path, lang):
+    word_dict = defaultdict(int)
+    with tarfile.open(tar_file, mode="r") as f:
+        for line in f.extractfile("wmt16/train"):
+            line_split = line.strip().split("\t")
+            if len(line_split) != 2: continue
+            sen = line_split[0] if lang == "en" else line_split[1]
+            for w in sen.split():
+                word_dict[w] += 1
+
+    with open(save_path, "w") as fout:
+        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
+        for idx, word in enumerate(
+                sorted(
+                    word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
+            if idx + 3 == dict_size: break
+            fout.write("%s\n" % (word[0]))
+
+
+def __load_dict(tar_file, dict_size, lang, reverse=False):
+    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    if not os.path.exists(dict_path) or (
+            len(open(dict_path, "r").readlines()) != dict_size):
+        __build_dict(tar_file, dict_size, dict_path, lang)
+
+    word_dict = {}
+    with open(dict_path, "r") as fdict:
+        for idx, line in enumerate(fdict):
+            if reverse:
+                word_dict[idx] = line.strip()
+            else:
+                word_dict[line.strip()] = idx
+    return word_dict
+
+
+def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
+    src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
+                                        TOTAL_DE_WORDS))
+    trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
+                                        TOTAL_EN_WORDS))
+    return src_dict_size, trg_dict_size
+
+
+def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size,
+                   src_lang):
+    def reader():
+        src_dict = __load_dict(tar_file, src_dict_size, src_lang)
+        trg_dict = __load_dict(tar_file, trg_dict_size,
+                               ("de" if src_lang == "en" else "en"))
+
+        # the indices for the start mark, end mark, and unk are the same in
+        # the source language and target language. Here the source language
+        # dictionary is used to determine their indices.
+        start_id = src_dict[START_MARK]
+        end_id = src_dict[END_MARK]
+        unk_id = src_dict[UNK_MARK]
+
+        src_col = 0 if src_lang == "en" else 1
+        trg_col = 1 - src_col
+
+        with tarfile.open(tar_file, mode="r") as f:
+            for line in f.extractfile(file_name):
+                line_split = line.strip().split("\t")
+                if len(line_split) != 2:
+                    continue
+                src_words = line_split[src_col].split()
+                src_ids = [start_id] + [
+                    src_dict.get(w, unk_id) for w in src_words
+                ] + [end_id]
+
+                trg_words = line_split[trg_col].split()
+                trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
+
+                trg_ids_next = trg_ids + [end_id]
+                trg_ids = [start_id] + trg_ids
+
+                yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def train(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 train set reader.
+
+    This function returns the reader for train data. Each sample the reader
+    returns is made up of three fields: the source language word index
+    sequence, target language word index sequence and next word index
+    sequence.
+
+    NOTE:
+    The original link for the training data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset
+    by using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk>
+                            for unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk>
+                            for unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for German.
+
+    Returns:
+        callable: The train reader.
+    """
+
+    if src_lang not in ["en", "de"]:
+        raise ValueError("Invalid source language type. Only support: "
+                         "en (for English); de (for German).")
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size,
+                                                   trg_dict_size, src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/train",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def test(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 test set reader.
+
+    This function returns the reader for test data. Each sample the reader
+    returns is made up of three fields: the source language word index
+    sequence, target language word index sequence and next word index
+    sequence.
+
+    NOTE:
+    The original link for the test data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset
+    by using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk>
+                            for unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk>
+                            for unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for German.
+
+    Returns:
+        callable: The test reader.
+    """
+
+    if src_lang not in ["en", "de"]:
+        raise ValueError("Invalid source language type. "
+                         "Only support: en (for English); de (for German).")
+
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size,
+                                                   trg_dict_size, src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/test",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def validation(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 validation set reader.
+
+    This function returns the reader for validation data. Each sample the
+    reader returns is made up of three fields: the source language word index
+    sequence, target language word index sequence and next word index
+    sequence.
+
+    NOTE:
+    The original link for the validation data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset
+    by using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk>
+                            for unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk>
+                            for unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for German.
+
+    Returns:
+        callable: The validation reader.
+    """
+    if src_lang not in ["en", "de"]:
+        raise ValueError("Invalid source language type. "
+                         "Only support: en (for English); de (for German).")
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size,
+                                                   trg_dict_size, src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/val",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def get_dict(lang, dict_size, reverse=False):
+    """
+    return the word dictionary for the specified language.
+
+    Args:
+        lang(string): A string indicating which language the dictionary is
+                      for. Available options are: "en" for English and "de"
+                      for German.
+        dict_size(int): Size of the specified language dictionary.
+        reverse(bool): If reverse is set to False, the returned python
+                       dictionary will use word as key and use index as value.
+                       If reverse is set to True, the returned python
+                       dictionary will use index as key and word as value.
+
+    Returns:
+        dict: The word dictionary for the specific language.
+    """
+
+    if lang == "en":
+        dict_size = min(dict_size, TOTAL_EN_WORDS)
+    else:
+        dict_size = min(dict_size, TOTAL_DE_WORDS)
+
+    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    assert os.path.exists(dict_path), (
+        "Word dictionary does not exist. "
+        "Please invoke paddle.dataset.wmt16.train/test/validation first "
+        "to build the dictionary.")
+    tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz")
+    return __load_dict(tar_file, dict_size, lang, reverse)
+
+
+def fetch():
+    """download the entire dataset.
+    """
+    paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                      "wmt16.tar.gz")
+
+
+def convert(path, src_dict_size, trg_dict_size, src_lang):
+    """Converts dataset to recordio format.
+    """
+
+    paddle.v2.dataset.common.convert(
+        path,
+        train(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_train")
+    paddle.v2.dataset.common.convert(
+        path,
+        test(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_test")
+    paddle.v2.dataset.common.convert(
+        path,
+        validation(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_validation")
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
new file mode 100644
index 00000000000..9235c41e9eb
--- /dev/null
+++ b/python/paddle/v2/image.py
@@ -0,0 +1,381 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file contains some common interfaces for image preprocessing.
+Many users are confused about the image layout. We introduce
+the image layout as follows.
+
+- CHW Layout
+
+  - The abbreviations: C=channel, H=Height, W=Width
+  - The default layout of an image opened by cv2 or PIL is HWC.
+    PaddlePaddle only supports the CHW layout, and CHW is simply
+    a transpose of HWC. The input image must therefore be transposed.
+
+- Color format: RGB or BGR
+
+  OpenCV uses the BGR color format. PIL uses the RGB color format. Both
+  formats can be used for training. Note that the format should be kept
+  consistent between the training and inference periods.
+"""
+import numpy as np
+try:
+    import cv2
+except ImportError:
+    cv2 = None
+import os
+import tarfile
+import cPickle
+
+__all__ = [
+    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
+    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+    "batch_images_from_tar"
+]
+
+
+def batch_images_from_tar(data_file,
+                          dataset_name,
+                          img2label,
+                          num_per_batch=1024):
+    """
+    Read images from a tar file and batch them into batch files.
+
+    :param data_file: path of image tar file
+    :type data_file: string
+    :param dataset_name: 'train', 'test' or 'valid'
+    :type dataset_name: string
+    :param img2label: a dict with image file name as key
+                      and image's label as value
+    :type img2label: dict
+    :param num_per_batch: image number per batch file
+    :type num_per_batch: int
+    :return: path of list file containing paths of batch files
+    :rtype: string
+    """
+    batch_dir = data_file + "_batch"
+    out_path = "%s/%s" % (batch_dir, dataset_name)
+    meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
+
+    if os.path.exists(out_path):
+        return meta_file
+    else:
+        os.makedirs(out_path)
+
+    tf = tarfile.open(data_file)
+    mems = tf.getmembers()
+    data = []
+    labels = []
+    file_id = 0
+    for mem in mems:
+        if mem.name in img2label:
+            data.append(tf.extractfile(mem).read())
+            labels.append(img2label[mem.name])
+            if len(data) == num_per_batch:
+                output = {}
+                output['label'] = labels
+                output['data'] = data
+                cPickle.dump(
+                    output,
+                    open('%s/batch_%d' % (out_path, file_id), 'w'),
+                    protocol=cPickle.HIGHEST_PROTOCOL)
+                file_id += 1
+                data = []
+                labels = []
+    if len(data) > 0:
+        output = {}
+        output['label'] = labels
+        output['data'] = data
+        cPickle.dump(
+            output,
+            open('%s/batch_%d' % (out_path, file_id), 'w'),
+            protocol=cPickle.HIGHEST_PROTOCOL)
+
+    with open(meta_file, 'a') as meta:
+        for file in os.listdir(out_path):
+            meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
+    return meta_file
+
+
+def load_image_bytes(bytes, is_color=True):
+    """
+    Load a color or gray image from a bytes array.
+
+    Example usage:
+
+    .. code-block:: python
+
+        with open('cat.jpg') as f:
+            im = load_image_bytes(f.read())
+
+    :param bytes: the input image bytes array.
+    :type bytes: str
+    :param is_color: If is_color is True, it will load and
+                     return a color image.
+                     Otherwise, it will load and return a gray image.
+    :type is_color: bool
+    """
+    flag = 1 if is_color else 0
+    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
+    img = cv2.imdecode(file_bytes, flag)
+    return img
+
+
+def load_image(file, is_color=True):
+    """
+    Load a color or gray image from the file path.
+
+    Example usage:
+
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+
+    :param file: the input image path.
+    :type file: string
+    :param is_color: If is_color is True, it will load and
+                     return a color image. Otherwise, it will
+                     load and return a gray image.
+    :type is_color: bool
+    """
+    # cv2.IMAGE_COLOR for OpenCV3
+    # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV versions
+    # cv2.IMAGE_GRAYSCALE for OpenCV3
+    # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV versions
+    # Here, use constants 1 and 0
+    # 1: COLOR, 0: GRAYSCALE
+    flag = 1 if is_color else 0
+    im = cv2.imread(file, flag)
+    return im
+
+
+def resize_short(im, size):
+    """
+    Resize an image so that the length of the shorter edge is size.
+
+    Example usage:
+
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+        im = resize_short(im, 256)
+
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the shorter edge size of the image after resizing.
+    :type size: int
+    """
+    h, w = im.shape[:2]
+    h_new, w_new = size, size
+    if h > w:
+        h_new = size * h / w
+    else:
+        w_new = size * w / h
+    # cv2.resize expects the target size as (width, height)
+    im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
+    return im
+
+
+def to_chw(im, order=(2, 0, 1)):
+    """
+    Transpose the input image order. The image layout is HWC format
+    opened by cv2 or PIL. Transpose the input image to CHW layout
+    according to the order (2, 0, 1).
+
+    Example usage:
+
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+        im = resize_short(im, 256)
+        im = to_chw(im)
+
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param order: the transposed order.
+    :type order: tuple|list
+    """
+    assert len(im.shape) == len(order)
+    im = im.transpose(order)
+    return im
+
+
+def center_crop(im, size, is_color=True):
+    """
+    Crop the center of the image with size.
+
+    Example usage:
+
+    .. code-block:: python
+
+        im = center_crop(im, 224)
+
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the cropping size.
+    :type size: int
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    """
+    h, w = im.shape[:2]
+    h_start = (h - size) / 2
+    w_start = (w - size) / 2
+    h_end, w_end = h_start + size, w_start + size
+    if is_color:
+        im = im[h_start:h_end, w_start:w_end, :]
+    else:
+        im = im[h_start:h_end, w_start:w_end]
+    return im
+
+
+def random_crop(im, size, is_color=True):
+    """
+    Randomly crop the input image with size.
+
+    Example usage:
+
+    .. code-block:: python
+
+        im = random_crop(im, 224)
+
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the cropping size.
+    :type size: int
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    """
+    h, w = im.shape[:2]
+    h_start = np.random.randint(0, h - size + 1)
+    w_start = np.random.randint(0, w - size + 1)
+    h_end, w_end = h_start + size, w_start + size
+    if is_color:
+        im = im[h_start:h_end, w_start:w_end, :]
+    else:
+        im = im[h_start:h_end, w_start:w_end]
+    return im
+
+
+def left_right_flip(im, is_color=True):
+    """
+    Flip an image along the horizontal direction.
+    Return the flipped image.
+
+    Example usage:
+
+    .. code-block:: python
+def left_right_flip(im, is_color=True):
+    """
+    Flip an image along the horizontal direction.
+    Return the flipped image.
+
+    Example usage:
+
+    .. code-block:: python
+
+        im = left_right_flip(im)
+
+    :param im: input image with HWC layout, or HW layout for a gray image
+    :type im: ndarray
+    :param is_color: whether the input image is color or not
+    :type is_color: bool
+    """
+    if len(im.shape) == 3 and is_color:
+        return im[:, ::-1, :]
+    else:
+        return im[:, ::-1]
+
+
+def simple_transform(im,
+                     resize_size,
+                     crop_size,
+                     is_train,
+                     is_color=True,
+                     mean=None):
+    """
+    Simple data augmentation for training. The operations include
+    resizing, cropping and flipping.
+
+    Example usage:
+
+    .. code-block:: python
+
+        im = simple_transform(im, 256, 224, True)
+
+    :param im: The input image with HWC layout.
+    :type im: ndarray
+    :param resize_size: The shorter edge length of the resized image.
+    :type resize_size: int
+    :param crop_size: The cropping size.
+    :type crop_size: int
+    :param is_train: Whether it is training or not.
+    :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or
+                 mean values per channel.
+    :type mean: numpy array | list
+    """
+    im = resize_short(im, resize_size)
+    if is_train:
+        im = random_crop(im, crop_size, is_color=is_color)
+        if np.random.randint(2) == 0:
+            im = left_right_flip(im, is_color)
+    else:
+        im = center_crop(im, crop_size, is_color=is_color)
+    if len(im.shape) == 3:
+        im = to_chw(im)
+
+    im = im.astype('float32')
+    if mean is not None:
+        mean = np.array(mean, dtype=np.float32)
+        if mean.ndim == 1 and is_color:
+            # one mean value per channel: broadcast over H and W
+            mean = mean[:, np.newaxis, np.newaxis]
+        elif mean.ndim == 1:
+            # gray image: a single scalar mean, nothing to reshape
+            mean = mean
+        else:
+            # element-wise mean: its shape must match the image
+            assert len(mean.shape) == len(im.shape)
+        im -= mean
+
+    return im
+
+
+def load_and_transform(filename,
+                       resize_size,
+                       crop_size,
+                       is_train,
+                       is_color=True,
+                       mean=None):
+    """
+    Load an image from the input file `filename` and transform it for
+    data augmentation. Please refer to the `simple_transform` interface
+    for the transform operations.
+
+    Example usage:
+
+    .. code-block:: python
+
+        im = load_and_transform('cat.jpg', 256, 224, True)
+
+    :param filename: The file name of the input image.
+    :type filename: string
+    :param resize_size: The shorter edge length of the resized image.
+    :type resize_size: int
+    :param crop_size: The cropping size.
+    :type crop_size: int
+    :param is_train: Whether it is training or not.
+    :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or
+                 mean values per channel.
+    :type mean: numpy array | list
+    """
+    im = load_image(filename, is_color)
+    im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
+    return im
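+
+# Illustrative mean subtraction (editor's sketch; the per-channel mean values
+# below are made up, not statistics of any real dataset):
+#
+#   im = load_image('cat.jpg')
+#   im = simple_transform(im, 256, 224, is_train=False,
+#                         mean=[103.94, 116.78, 123.68])
+#   # the 1-d mean is broadcast over H and W before the subtraction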
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py new file mode 100644 index 00000000000..317cf037c69 --- /dev/null +++ b/python/paddle/v2/minibatch.py @@ -0,0 +1,41 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['batch']
+
+
+def batch(reader, batch_size):
+    """
+    Create a batched reader.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param batch_size: size of each mini-batch
+    :type batch_size: int
+    :return: the batched reader.
+    :rtype: callable
+    """
+
+    def batch_reader():
+        r = reader()
+        b = []
+        for instance in r:
+            b.append(instance)
+            if len(b) == batch_size:
+                yield b
+                b = []
+        if b:
+            yield b
+
+    return batch_reader
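+
+# Illustrative usage (editor's sketch, not part of the original patch):
+#
+#   def ints():
+#       for i in range(10):
+#           yield i
+#
+#   for mb in batch(ints, 4)():
+#       print mb      # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]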
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py new file mode 100644 index 00000000000..3b059735a92 --- /dev/null +++ b/python/paddle/v2/reader/__init__.py @@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+At training and testing time, PaddlePaddle programs need to read data. To ease
+the users' work of writing data reading code, we define that
+
+- A *reader* is a function that reads data (from file, network, random number
+  generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader decorator* is a function which accepts one or more readers, and
+  returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network,
+  random number generator, etc) and yields a batch of data items.
+
+#####################
+Data Reader Interface
+#####################
+
+Indeed, a *data reader* doesn't have to be a function that reads and yields data
+items. It can be any function with no parameters that creates an iterable
+(anything that can be used in :code:`for x in iterable`)\:
+
+.. code-block:: python
+
+    iterable = data_reader()
+
+Each element produced by the iterable should be a **single** entry of data,
+**not** a mini batch. That entry of data could be a single item, or a tuple of
+items.
+Items should be of `supported type `_ (e.g., numpy 1d
+array of float32, int, list of int)
+
+An example implementation of a single-item data reader creator:
+
+.. code-block:: python
+
+    def reader_creator_random_image(width, height):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height)
+        return reader
+
+An example implementation of a multiple-item data reader creator:
+
+.. code-block:: python
+
+    def reader_creator_random_image_and_label(width, height, label):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height), label
+        return reader
+
+
+TODO(yuyang18): Should we add the whole design doc here?
+"""
+
+import decorator
+from decorator import *
+
+import creator
+
+__all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py new file mode 100644 index 00000000000..fda5246d74f --- /dev/null +++ b/python/paddle/v2/reader/creator.py @@ -0,0 +1,130 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The creator package contains some simple reader creators, which can
+be used in user programs.
+"""
+
+__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
+
+
+def np_array(x):
+    """
+    Creates a reader that yields elements of x, if it is a
+    numpy vector; or rows of x, if it is a numpy matrix; or, in
+    general, any sub-array indexed by the highest dimension.
+
+    :param x: the numpy array to create reader from.
+    :returns: data reader created from x.
+    """
+
+    def reader():
+        if x.ndim < 1:
+            # a 0-d array is not iterable; yield it as a whole
+            yield x
+        else:
+            for e in x:
+                yield e
+
+    return reader
+
+
+def text_file(path):
+    """
+    Creates a data reader that outputs text line by line from the given
+    text file. The trailing new line ('\\\\n') of each line will be removed.
+
+    :param path: path of the text file.
+    :returns: data reader of the text file.
+    """
+
+    def reader():
+        f = open(path, "r")
+        for l in f:
+            yield l.rstrip('\n')
+        f.close()
+
+    return reader
+
+
+def recordio(paths, buf_size=100):
+    """
+    Creates a data reader from the given RecordIO file paths, separated
+    by ",". Glob patterns are supported.
+
+    :param paths: path of recordio files, can be a string or a string list.
+    :returns: data reader of recordio files.
+    """
+
+    import recordio as rec
+    import paddle.v2.reader.decorator as dec
+    import cPickle as pickle
+
+    def reader():
+        if isinstance(paths, basestring):
+            path = paths
+        else:
+            path = ",".join(paths)
+        f = rec.reader(path)
+        while True:
+            r = f.read()
+            if r is None:
+                break
+            yield pickle.loads(r)
+        f.close()
+
+    return dec.buffered(reader, buf_size)
+
+
+pass_num = 0
+
+
+def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
+    """
+    Create a data reader that yields records one by one from
+    the given paths.
+
+    :param paths: path of recordio files, can be a string or a string list.
+    :param etcd_endpoints: the endpoints of the etcd cluster.
+    :returns: data reader of recordio files.
+
+    .. code-block:: python
+
+        from paddle.v2.reader.creator import cloud_reader
+        etcd_endpoints = "http://127.0.0.1:2379"
+        trainer.train(
+            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"],
+                                etcd_endpoints),
+        )
+    """
+    import os
+    import cPickle as pickle
+    import paddle.v2.master as master
+    c = master.client(etcd_endpoints, timeout_sec, buf_size)
+
+    if isinstance(paths, basestring):
+        path = [paths]
+    else:
+        path = paths
+    c.set_dataset(path)
+
+    def reader():
+        global pass_num
+        c.paddle_start_get_records(pass_num)
+        pass_num += 1
+
+        while True:
+            r, e = c.next_record()
+            if not r:
+                if e != -2:
+                    print "get record error: ", e
+                break
+            yield pickle.loads(r)
+
+    return reader
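+
+# Illustrative usage (editor's sketch; lines.txt is a hypothetical file):
+#
+#   import numpy as np
+#   for row in np_array(np.array([[1, 2], [3, 4]]))():
+#       print row               # [1 2], then [3 4]
+#
+#   for line in text_file('lines.txt')():
+#       print line              # one line per iteration, newline stripped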
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
+]
+
+from threading import Thread
+import subprocess
+
+from Queue import Queue
+import itertools
+import random
+import zlib
+
+
+def map_readers(func, *readers):
+    """
+    Creates a data reader that outputs the return value of a function,
+    using the outputs of the given data readers as its arguments.
+
+    :param func: the function to use. Its type should be (Sample) => Sample.
+    :type func: callable
+    :param readers: readers whose outputs will be used as arguments of func.
+    :return: the created data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        for e in itertools.imap(func, *rs):
+            yield e
+
+    return reader
+
+
+def shuffle(reader, buf_size):
+    """
+    Creates a data reader whose data output is shuffled.
+
+    Output from the iterator created by the original reader will be
+    buffered into a shuffle buffer, and then shuffled. The size of the
+    shuffle buffer is determined by the argument buf_size.
+
+    :param reader: the original reader whose output will be shuffled.
+    :type reader: callable
+    :param buf_size: shuffle buffer size.
+    :type buf_size: int
+
+    :return: the new reader whose output is shuffled.
+    :rtype: callable
+    """
+
+    def data_reader():
+        buf = []
+        for e in reader():
+            buf.append(e)
+            if len(buf) >= buf_size:
+                random.shuffle(buf)
+                for b in buf:
+                    yield b
+                buf = []
+
+        if len(buf) > 0:
+            random.shuffle(buf)
+            for b in buf:
+                yield b
+
+    return data_reader
+
+
+def chain(*readers):
+    """
+    Creates a data reader whose output is the outputs of the input data
+    readers chained together.
+
+    If the input readers output the following data entries:
+    [0, 0, 0]
+    [1, 1, 1]
+    [2, 2, 2]
+    The chained reader will output:
+    [0, 0, 0, 1, 1, 1, 2, 2, 2]
+
+    :param readers: input readers.
+    :return: the new data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+
+        for e in itertools.chain(*rs):
+            yield e
+
+    return reader
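+
+# Illustrative usage (editor's sketch):
+#
+#   def ints():
+#       for i in range(5):
+#           yield i
+#
+#   list(chain(ints, ints)())                     # [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
+#   list(map_readers(lambda x: x * x, ints)())    # [0, 1, 4, 9, 16]
+#   list(shuffle(ints, 100)())                    # the same items, in random order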
+class ComposeNotAligned(ValueError):
+    pass
+
+
+def compose(*readers, **kwargs):
+    """
+    Creates a data reader whose output is the combination of the input readers.
+
+    If the input readers output the following data entries:
+    (1, 2) 3 (4, 5)
+    The composed reader will output:
+    (1, 2, 3, 4, 5)
+
+    :param readers: readers that will be composed together.
+    :param check_alignment: if True, will check if input readers are aligned
+        correctly. If False, will not check alignment and trailing outputs
+        will be discarded. Defaults to True.
+    :type check_alignment: bool
+
+    :return: the new data reader.
+
+    :raises ComposeNotAligned: outputs of readers are not aligned.
+        Will not raise when check_alignment is set to False.
+    """
+    check_alignment = kwargs.pop('check_alignment', True)
+
+    def make_tuple(x):
+        if isinstance(x, tuple):
+            return x
+        else:
+            return (x, )
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        if not check_alignment:
+            for outputs in itertools.izip(*rs):
+                yield sum(map(make_tuple, outputs), ())
+        else:
+            for outputs in itertools.izip_longest(*rs):
+                for o in outputs:
+                    if o is None:
+                        # None will not be present if compose is aligned
+                        raise ComposeNotAligned(
+                            "outputs of readers are not aligned.")
+                yield sum(map(make_tuple, outputs), ())
+
+    return reader
+
+
+def buffered(reader, size):
+    """
+    Creates a buffered data reader.
+
+    The buffered data reader will read and save data entries into a
+    buffer. Reading from the buffered data reader will proceed as long
+    as the buffer is not empty.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param size: max buffer size.
+    :type size: int
+
+    :returns: the buffered data reader.
+    """
+
+    class EndSignal():
+        pass
+
+    end = EndSignal()
+
+    def read_worker(r, q):
+        for d in r:
+            q.put(d)
+        q.put(end)
+
+    def data_reader():
+        r = reader()
+        q = Queue(maxsize=size)
+        t = Thread(
+            target=read_worker, args=(
+                r,
+                q, ))
+        t.daemon = True
+        t.start()
+        e = q.get()
+        while e != end:
+            yield e
+            e = q.get()
+
+    return data_reader
+
+
+def firstn(reader, n):
+    """
+    Limit the max number of samples that the reader could return.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param n: the max number of samples to return.
+    :type n: int
+    :return: the decorated reader.
+    :rtype: callable
+    """
+
+    # TODO(yuyang18): Check whether, if we just drop the reader, the opened
+    # resources get cleaned up or not.
+
+    def firstn_reader():
+        for i, item in enumerate(reader()):
+            if i == n:
+                break
+            yield item
+
+    return firstn_reader
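+
+# Illustrative usage (editor's sketch):
+#
+#   def images():
+#       for i in range(3):
+#           yield 'img_%d' % i
+#
+#   def labels():
+#       for i in range(3):
+#           yield i
+#
+#   list(compose(images, labels)())   # [('img_0', 0), ('img_1', 1), ('img_2', 2)]
+#   fast = buffered(images, 50)       # pre-fetches samples in a background thread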
+class XmapEndSignal():
+    pass
+
+
+def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
+    """
+    Map samples from the reader with a user-defined mapper, using several
+    worker threads (despite the parameter name, threads are used rather
+    than processes). The outputs are buffered, as in the buffered decorator.
+
+    :param mapper: a function to map a sample.
+    :type mapper: callable
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param process_num: number of worker threads handling the samples.
+    :type process_num: int
+    :param buffer_size: max buffer size.
+    :type buffer_size: int
+    :param order: whether to keep the order of the reader.
+    :type order: bool
+    :return: the decorated reader.
+    :rtype: callable
+    """
+    end = XmapEndSignal()
+
+    # define a worker to read samples from reader into in_queue
+    def read_worker(reader, in_queue):
+        for i in reader():
+            in_queue.put(i)
+        in_queue.put(end)
+
+    # define a worker to read samples from reader into in_queue with the
+    # order flag, tagging each sample with its input position
+    def order_read_worker(reader, in_queue):
+        in_order = 0
+        for i in reader():
+            in_queue.put((in_order, i))
+            in_order += 1
+        in_queue.put(end)
+
+    # define a worker to handle samples from in_queue with mapper
+    # and put the mapped samples into out_queue
+    def handle_worker(in_queue, out_queue, mapper):
+        sample = in_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            r = mapper(sample)
+            out_queue.put(r)
+            sample = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    # define a worker to handle samples from in_queue with mapper
+    # and put the mapped samples into out_queue in order
+    def order_handle_worker(in_queue, out_queue, mapper, out_order):
+        ins = in_queue.get()
+        while not isinstance(ins, XmapEndSignal):
+            order, sample = ins
+            r = mapper(sample)
+            # busy-wait until it is this sample's turn to be emitted
+            while order != out_order[0]:
+                pass
+            out_queue.put(r)
+            out_order[0] += 1
+            ins = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    def xreader():
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+        # start several handle workers
+        target = order_handle_worker if order else handle_worker
+        args = (in_queue, out_queue, mapper, out_order) if order else (
+            in_queue, out_queue, mapper)
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(target=target, args=args)
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
+        sample = out_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            yield sample
+            sample = out_queue.get()
+        finish = 1
+        while finish < process_num:
+            sample = out_queue.get()
+            if isinstance(sample, XmapEndSignal):
+                finish += 1
+            else:
+                yield sample
+
+    return xreader
+
+
+def _buf2lines(buf, line_break="\n"):
+    # FIXME: line_break should be automatically configured.
+    lines = buf.split(line_break)
+    return lines[:-1], lines[-1]
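+
+# Illustrative usage (editor's sketch):
+#
+#   def ints():
+#       for i in range(100):
+#           yield i
+#
+#   # square each sample with 4 worker threads, keeping the input order
+#   squared = xmap_readers(lambda x: x * x, ints, 4, 16, order=True)
+#   for v in squared():
+#       pass          # yields 0, 1, 4, 9, ...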
+class PipeReader:
+    """
+    PipeReader reads data as a stream from a command, takes its
+    stdout into a pipe buffer, and redirects it to the parser to
+    parse; it then yields data in your desired format.
+
+    You can use a standard Linux command or call another program
+    to read data, e.g. from HDFS, Ceph, a URL, or AWS S3:
+
+    .. code-block:: python
+
+        cmd = "hadoop fs -cat /path/to/some/file"
+        cmd = "cat sample_file.tar.gz"
+        cmd = "curl http://someurl"
+        cmd = "python print_s3_bucket.py"
+
+    An example:
+
+    .. code-block:: python
+
+        def example_reader():
+            for f in myfiles:
+                pr = PipeReader("cat %s"%f)
+                for l in pr.get_line():
+                    sample = l.split(" ")
+                    yield sample
+    """
+
+    def __init__(self, command, bufsize=8192, file_type="plain"):
+        if not isinstance(command, str):
+            raise TypeError("command must be a string")
+        if file_type == "gzip":
+            self.dec = zlib.decompressobj(
+                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+        self.file_type = file_type
+        self.bufsize = bufsize
+        self.process = subprocess.Popen(
+            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+
+    def get_line(self, cut_lines=True, line_break="\n"):
+        """
+        :param cut_lines: cut the buffer into lines
+        :type cut_lines: bool
+        :param line_break: line break of the file, like \n or \r
+        :type line_break: string
+
+        :return: one line, or a buffer of bytes
+        :rtype: string
+        """
+        remained = ""
+        while True:
+            buff = self.process.stdout.read(self.bufsize)
+            if buff:
+                if self.file_type == "gzip":
+                    decomp_buff = self.dec.decompress(buff)
+                elif self.file_type == "plain":
+                    decomp_buff = buff
+                else:
+                    raise TypeError("file_type %s is not allowed" %
+                                    self.file_type)
+
+                if cut_lines:
+                    lines, remained = _buf2lines(''.join(
+                        [remained, decomp_buff]), line_break)
+                    for line in lines:
+                        yield line
+                else:
+                    yield decomp_buff
+            else:
+                break
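+
+# Illustrative usage (editor's sketch; corpus.txt.gz is a hypothetical
+# gzip-compressed text file):
+#
+#   pr = PipeReader("cat corpus.txt.gz", file_type="gzip")
+#   for line in pr.get_line():
+#       print line    # decompressed content, one line at a time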
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt new file mode 100644 index 00000000000..107d5912e15 --- /dev/null +++ b/python/paddle/v2/reader/tests/CMakeLists.txt @@ -0,0 +1,2 @@
+py_test(creator_test SRCS creator_test.py)
+py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py new file mode 100644 index 00000000000..eca2dce114b --- /dev/null +++ b/python/paddle/v2/reader/tests/__init__.py @@ -0,0 +1,13 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py new file mode 100644 index 00000000000..7fe374e6636 --- /dev/null +++ b/python/paddle/v2/reader/tests/creator_test.py @@ -0,0 +1,74 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+import numpy as np
+import paddle.v2.reader.creator
+
+
+class TestNumpyArray(unittest.TestCase):
+    def test_numpy_array(self):
+        l = [[1, 2, 3], [4, 5, 6]]
+        x = np.array(l, np.int32)
+        reader = paddle.v2.reader.creator.np_array(x)
+        for idx, e in enumerate(reader()):
+            self.assertItemsEqual(e, l[idx])
+
+
+class TestTextFile(unittest.TestCase):
+    def test_text_file(self):
+        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
+        reader = paddle.v2.reader.creator.text_file(path)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
+
+
+class TestRecordIO(unittest.TestCase):
+    def do_test(self, path):
+        reader = paddle.v2.reader.creator.recordio(path)
+        idx = 0
+        for e in reader():
+            if idx == 0:
+                self.assertEqual(e, (1, 2, 3))
+            elif idx == 1:
+                self.assertEqual(e, (4, 5, 6))
+            idx += 1
+        self.assertEqual(idx, 2)
+
+    def test_recordIO(self):
+        self.do_test(
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat"))
+        self.do_test([
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat")
+        ])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py new file mode 100644 index 00000000000..6b680e39f3f --- /dev/null +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -0,0 +1,178 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import unittest
+
+import paddle.v2.reader
+
+
+def reader_creator_10(dur):
+    def reader():
+        for i in range(10):
+            # this sleep helps to test paddle.v2.reader.buffered
+            time.sleep(dur)
+            yield i
+
+    return reader
+
+
+class TestMap(unittest.TestCase):
+    def test_map(self):
+        d = {"h": 0, "i": 1}
+
+        def tokenize(x):
+            return d[x]
+
+        def read():
+            yield "h"
+            yield "i"
+
+        r = paddle.v2.reader.map_readers(tokenize, read)
+        for i, e in enumerate(r()):
+            self.assertEqual(e, i)
+
+
+class TestBuffered(unittest.TestCase):
+    def test_read(self):
+        for size in range(20):
+            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
+            c = 0
+            for i in b():
+                self.assertEqual(i, c)
+                c += 1
+            self.assertEqual(c, 10)
+
+    def test_buffering(self):
+        # each read has a 30ms delay.
+        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
+        last_time = time.time()
+        for idx, i in enumerate(b()):
+            elapsed_time = time.time() - last_time
+            if i == 0:
+                time.sleep(0.3)
+            else:
+                # the read time should be short, meaning the data
+                # is already buffered.
+                self.assertLess(elapsed_time, 0.05)
+            last_time = time.time()
+
+
+class TestCompose(unittest.TestCase):
+    def test_compose(self):
+        reader = paddle.v2.reader.compose(
+            reader_creator_10(0), reader_creator_10(0))
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, (idx, idx))
+
+    def test_compose_not_aligned(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0))
+        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
+            for e in reader():
+                total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+    def test_compose_not_aligned_no_check(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0),
+            check_alignment=False)
+        for e in reader():
+            total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+
+class TestChain(unittest.TestCase):
+    def test_chain(self):
+        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
+        idx = 0
+        for e in c():
+            self.assertEqual(e, idx % 10)
+            idx += 1
+        self.assertEqual(idx, 20)
+
+
+class TestShuffle(unittest.TestCase):
+    def test_shuffle(self):
+        case = [(0, True), (1, True), (10, False), (100, False)]
+        a = reader_creator_10(0)
+        for size, checkEq in case:
+            s = paddle.v2.reader.shuffle(a, size)
+            total = 0
+            for idx, e in enumerate(s()):
+                if checkEq:
+                    self.assertEqual(idx, e)
+                total += 1
+            self.assertEqual(total, 10)
+
+
+class TestXmap(unittest.TestCase):
+    def test_xmap(self):
+        def mapper(x):
+            return (x + 1)
+
+        orders = (True, False)
+        thread_nums = (1, 2, 4, 8, 16)
+        buffered_size = (1, 2, 4, 8, 16)
+        for order in orders:
+            for tNum in thread_nums:
+                for size in buffered_size:
+                    reader = paddle.v2.reader.xmap_readers(
+                        mapper, reader_creator_10(0), tNum, size, order)
+                    for n in xrange(3):
+                        result = []
+                        for i in reader():
+                            result.append(i)
+                        if not order:
+                            result.sort()
+                        for idx, e in enumerate(result):
+                            self.assertEqual(e, mapper(idx))
+
+
+class TestPipeReader(unittest.TestCase):
+    def test_pipe_reader(self):
+        def example_reader(myfiles):
+            for f in myfiles:
+                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
+                for l in pr.get_line():
+                    yield l
+
+        import tempfile
+
+        records = [str(i) for i in xrange(5)]
+        temp = tempfile.NamedTemporaryFile()
+        try:
+            with open(temp.name, 'w') as f:
+                for r in records:
+                    f.write('%s\n' % r)
+
+            result = []
+            for r in example_reader([temp.name]):
+                result.append(r)
+
+            for idx, e in enumerate(records):
+                self.assertEqual(e, result[idx])
+        finally:
+            # delete the temporary file
+            temp.close()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt new file mode 100644 index 00000000000..a2a8d47d438 --- /dev/null +++ b/python/paddle/v2/reader/tests/test_data_creator.txt @@ -0,0 +1,3 @@
+0 1
+2 3
+4 5
diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat new file mode 100644 index 0000000000000000000000000000000000000000..a99a35bb829e066c4845d0b85b96cd1eb3a12491 GIT binary patch literal 76
zcmZQ!W@4P2Bs!asfq}sSh?#)+KN|x>v0q|9K_sIV14Bftj}1RiRKwGd%hQO<)0nHI Tz>rH1B4onlY0Bkk1`z@P(}N7c literal 0 HcmV?d00001 diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat new file mode 100644 index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69 GIT binary patch literal 88 zcmZQ!W@2QOHwAi#!Nv*34S-X=ZL#@+H5Ob$9ML`|P{V`M30M4ZvzJCU& zUI7dS3rebKsVZv9DS;LL=SRrs=;#<37`d33xDwQv@{jUZ1KNlG}1tk?V4J{o#!}SAgtN?N{3JP*c3Mwi}%Il}2ufGRS zvQe=ME9g;k*tpS%1OgS~vnyyt^_%-SVY5HPl-z?7=;&|U;I?-54vtRF9-a@qynPVJ;E>R;C*cv1iAl*RscFwpIk|cH1%*Y$n93?F zuDS+aTlc1=wXMCQv#Wbx@cq#6$mrPk9Fa7?u(D?8Gzz{!@Bz*zXO)={|njw1?>OEwG3dSAiF+13O0Za z;ByTOwm4ChWV29^1G&sL$D?W+5rz7snkNFI#nfYUAyBkLzPqV#6w1I?s2?(ioJO`>i6jf|D{&5RM_ zJ#OwIiUAAW(MtKNQ2VeqUn3pSJNR$YWNun@s8Fk*2+ zmoS(skO#yh!~<33al=t#xTU~d_>&2Vg zGE*_qqSi=wkdqog%^pj~>Rv;GdWN(PZ@>=6ciTl!IZ9L@Mf&rdB1uumG5q++-D=Yt zpFlXIFHC+tRfwxi8il6MhI%cH0dN(Ou(?j^@aiWx2^1o3PPLl34a8>~zWu_3YXVXa zW7FlVj3j5h1?8$H&b8#FK&Uif^fC$sMz_GBRmviQ5U9AYvJnFmHx0}?6b>|rv@tbJ zbC#;aQi6$=XV?*7-?L=h^@b+AKwDZdlN8XslnU;eN*`E_3WW|!lTgyiwe~4}0l+m7 zm3l=Jj}aIZQJssCqhN71FbKsGY($~JPJ)C2T!?Xwh!`k$v#ci8E>VEJa&o~l2`^E) zpnN=M05Kj>Zn$0ud_MQmHsv~NkzSnKbz}-d;UUFN)U!EF$%Ft&!x6eY_@rHuitUIzmdT>SzHfN_rE zYTREK;Va2$tG8?1vaR1-+8!sSO`e#k!>)stt*F&25vm+WW?BVa*)FLJFoeeJLQ7U3COuuf$R{mDpfw4e&WUII2pL4> z0y)bKRmL1g8m4$MvMT$_ldW(XfbsK(wiyG;cvQe=;D z6{>+N8cC_uS#*xVK*@-8wB5}h_P29>L(ukIKMC}*h)B`A(k){n@h&k}Pf{I70VX}) zPKJWENqhn+zyrzYi;>El(kQNxb+k1IDx8ibfU2M1i;}^E3okK<+B7CGJrn&S^RxgU z7KU`&OJhq=Z~u z9WalXK7z3s6Dkj{BVz{g_#^b{zRDB0u~-5mHPPIH`XvV7GRH$DCq+iKT?l?DRZb;> zG1AWEwDI6>M+d^#AV!XC3e{dEq;YCBD`j;|ob3dxFZz~Bp1nJT1^5K2454-}p%SX0 zub>jH%mp(du)yl9&s7&;qen@241@22ohjQB2Nia9`hV9Tuu)skYfxp zfL+*Ky+&4As4-L(gT`3sZ=>gIV8VJejq>mXyk{H?Lq&t}5CJbo!I=Ol7?(hElSo}i zz7*b;RDN`K!~;?R!QX8(v*P3s@F^I`q7MMhfYzIvubI>hhH)#Wg;dAG*q}q?00e0U zL^=eIxaacCp)}@Qu)=({7Wp3dYBmc$)`MCgEPF_L&XalMI?OCdbJ74;K2klDytEe) z_eGBz2wP85RM!V>!gz|Q(srv$5c%Znd7;&FsQR=lo=|z70A*KE+fY$kuSyHo>vS%z zVb04XV=+UN6HV(n`!EJ`If*ceC}OfwuPRR~<;4FwofKgNPeB%GiEc;^(OawJ)Htl# zCUwm$0}T!W&^urbXsxRzPdNU>iaLoh5d#BR2U?0Wiln7sfj{xTNn?&&U(7%>W&zkD z28ao8&%jworW3#C%_#sv+rVC%q3N?=KJIfrv6U5-Mp_7v`YZ+ku)!d}Ue{DqjFpIo z=;;EV;5dEUXN@Qt0m9Y9CD%1lsK|{%@XC6|=?8PSyEw&?NVv44JPwPiV80~{Ze38W zWE}=C!xzvnAHD13p#lMO$ck%f@kPWuB6$EJ3JxUhqCH^lrA8`QOhdwwitaHCYW ziT;{ID2}skmPdHFh>T#moE3`wAgf?v<}!oeaHCtt_(&9E3dkTv!r(8av_`;n)$91M zFo?9lI3xD;c|1sS*^t(~`ochL;B{pn`ndZ9l+)`iluA>%b>fDkfUN`lW=vez%0bpS zfxIp(@)=;z!XLCJ#!3S4B z^)P~%xUpa{a8eDumk*P@$N(|^I>h>Bgjexk<~&=lA}qW*R}cK@EQ`5@-lxW^L3cP8 zoDL;GSg5i0+h~aD;6JhA|OF%#v7_z;xSk#Qby zCfX zhy`i=16-bUH2b;iWw4EL6^U|xx9o4stsYd&$)E^bjvcOtR*wPQjkc5)=c;Y7&n{cn ztsicPR#dZnYViA*wE|lE9YwOhIo7VB2vu}LHTZ$I3FG zjsq?S&C>>8kpa6)BftT*+9)`rvB8EJ#0AfErX|oZvGOQcJ;Z0|RXocD z)#lB6YLfQmMI?Yp>lmH{lB361Igb}&;7yal8nL#rpq5<}PKN^%&HAJ?Np(mz7zj(b zhcLHwol{t^+vvQpT~dR})Fj3a8TNC4YPKukRs^X-g%9hPVKk9z;ST@-<}lU8?@=tg zTrL8ASZ}qUlz3r=sD2ZbW+sJIHbpC0T{GYS-XfqfUJne&d$|i0mlvRkS>1<2-<&M9 z@Jasz+>FI}4V^ddJF~AUzshq04DwO?&*QjiZdtws+ZO$SJ0(og<}#~C#ygFHJER3} zHDF&R1%Ir;YuYOi#=KTGGps~fLmFpB-FLTxXf-`UUd@ZGX5O=DeM~C2W$ZIG;^1IV zA`ojp@cn=cTl7&o{?IAL_bU2NT_;MWvOh&^O646ROLc6;m2?lH`FZ+;_J>rvhoYi0 z8D}oNX-tN^+a&yj5DR02#cMl94N#&TCVXUZ)vx!1e72XX^vQQ}z(BN6^BL0Ov&rvwuY5oT|i!oD%}4UOelECB{R$*D+S$b+yZo>J3q4jERx~!bp{AgbT;kQLWu(&p@kI z-64WQOJ$LNxP-;tm;q4QiGkB-3G}V&oi$SH@k`k>oD!YmAtr+vG2uXLf11w}2nr_o8}RXzFqv2q~4G@lcl7hRKrrs zio8Q;Hb@C{(Zf8r59e2zVtZyjZn~LET4)*^+auWXKueC_J0_@2M3|mY-ck z*;1C))Hd?|B3B@{yl{^y({suQ*z{I?g=}t5)vVm8Vw{7g^>Ob*IWhJ^h;n!umi(D8 
z^E8kHD)9o7hcak^Ae4-e-y|d>^I14KwVh_`2oIthqmO7Jv!bpd zXxjVI=~dKH2`fV=!E1=p(=L1tk73`D6?KJu&32ts#fR!#V+8t58%gb zkkFW&@#ch8YBI8n&ygs0hS256M7y_^_dbc(1X+8YQFlvyeWAtopR^sleGrx3WPsk) zzPl<(Na292N5|lxbY$Pr{tJVYaB<|qeEV9=z5c)irG>VP)GnBcGD$W&sgpj344tLl z(R_M2r|x1RSingNv-cbPq4Os)*)yQLR$=3vX6$Djl3m$>VY^WV>>y6fNO08S>gTJx z^>Z~bY`b(sPJ#5yRE~!bezhb9xD&C+BE})1lWuZxRX4Nnb~3qR{HKhhPFf!}`|Zjb zN`s6FvXM=zA$^cXEYy=_npk>%_fTtEWb|YT7E3BDZx|)a(w(xB~D@YUXXs( z_OMM)r0tV1wpBxigr(s}rM8XDV{pdOW;I!D+r<9r?ySIje9pRn*DgWQy`M7nruMo3 zp7xe6+NL6`bBtvn&Y^%Owc_#l@gu%G%gu9R0l!Y`7oD9)lS#2RcP+uIsPvnUO7iew zpzr88s*6~juaCiDY%i-M@mKo_4W1U{3Is{&VkIn5I-?aE414>Kc0C)eIvtF8!F8;$ z^z83GPa}H%gI;VNa~sb<*xC+vZF|+`6PX6F?5_wj&)n}0NsdDCeaeCbP8sSH@y=on z;;0G*b5z^r8nZg-p+ebsUZwdnW+kXdwcXf|a$fvdE#x^Hy#cJ@VY*x&nRebcK3o{V za1}WSnv$lW%pFl?qN}7BtL2RF`o^THZ=U?!;r@zH|5qLObzSIhb;y(?Z#-Bk{%-Dv z4022yQun1M>j`UQe{(x${97E+$~i)fFVHkIk9p_9c4=zR1iTkwQ|OTae4>?X)^zcr zN&fk#TH`Hzy8(~CKH-<}NqOjNGnY%-K2i`wCq z)cR8oAwKqaQ^0FJihEoWz?O2aJAP4cuJu{M`0W=HR8bUtb-8=iAGcK(ceFY@|3Kk3 z%KHUUljR{_X-JDk>Ii-_w#2xp&&e4=zkG=C`UeP! z{2%^?FGoEJOo=`m^vSrL7p=4?@L$`DC$YV8c2;?A`f{AwrGL-bNcK2se=9?W9@2NQ zThe~K6!anMlecoh8rTmIvsr~d$}r{p)qYfty4+96^45#es2#2ZNC2Ws^8W+9}^vRuquV z_9=mJKv(s^#RjI;+bQ4`Qd&0RLp6mN6WyNhDbW&`WS_>vU5ZdLL$nUL8>*_E*0zl% ze!M$c1t`b6i-yzm-hZ1pN%WA`1LbcXoxZ~GRA+pnY_#M1(_p@xTd?uE4Uux38!mX~ zNA8r?UI}tnXG(?9`|5H&MLfe^WT$2n@(-Ys|H$KWZkN~-@eD7fGx*|<+7?=>tHqee zI!wroJM5h-p5qER_fTsw`7IA0F;en|XKzm{O1RXXQXV<$GaY)U4FT7iS8kXZzKCKe z4n9-cI3_@fvwT}xVk4B=biP;8T|K(z_fi)jahw+vp#J-bO?Cc(aD)iA^j=Rq$E{Li0io?cVOn!Iu zUBxc_|?#6Si{oclK`lNAiZ$}&2e?Lm3ZrI)>i@1 z(v3q{e-f#r_Q5Lwp8({@oz3Z=ax903aWD;ZBb-_Sk*NrF}z4+2~{ z>>d)kwbf4XAxr3mL_@8sjt@Ip(XG_0eOsQ{bvLL^{#m`Vt#G7veiYOY9wL|_PbQy@!s@%~;k@_083`{<_q=~?}#fIWS$R`M+Z zp)DsUSh9Q*vj8Nj*nVMwwKUkz6y$nww1iN%;7I??cM0E;P%j!8*J-?|sJsDESu)ak z$Ss%!OsDiuP~&%}sH|Onmn}HDFq>(J*dDEF6@Is9z@}9yqV%h@)x~+rgS)_V@pp5l zV8zH;rMZ?RV|1~5j)d5JfXcT^kPSCPBcG*P3{+_HAeaYiFfLws@fJ|V zn%n$7AI*%ZXxZ6bpc+MOU#-{$1tGJR*@B>!<)pw@P-nxMaah`9pbbZYk&_InQIg5P zaEz^M$fT3_sklR;dH`|<9>4ygiI@s>DjY}hm<0moWH=a@z{AKu{IoMxxQ{rZ%&H9b z5VKWa7de4%kSDp&2Z!L?=S0B6;8%e*O4dt7*Z%3sEGAA2eyuj)R;v!IVg&Gp}f2JyllBETYRwX`R07u3lxsY z#C>KLVnmM^uFvBU_HP}CsOiGj-|3Rgc8Zk0((7-0#O+WJQy<@#kUG(;M+lL1TW1 zC;vUZwG_n9nOOEd^a{Bgo@wQl-;rLX6B*pE;}WZMKQAJ(^kY!Y+feYWwI>t4y1f(S zHe~^HW0wnA;TskJda?-xX&t#U)aAx_yp#6y$oS~!puop;y;d`eOnmB!V%K1dCZB6G zvMXmP!~X9rrbms2HB@%M4PlV2c!v_&qxz34rD#bMz42~hO7 zneYAo{w77*)#y&@+DY?lE3ZEL;Db6~3qV=wZh64p=^O}reyNZ*!kb`D zG3DZ>aC)dCJ3k)>xc5<%+sl>gv3}r8q!hkLwkAaH=)48Z9vu{0Ch?qO((My@9CsfD zpU{2C&-;l-)gu85{Wl`SlX+j}Y-InP`4+8kmmHWXIsOj-3C`nXkUtaEd9oMic&?7T z$*%PFZsuhtJgVc0;c|nsq>8klXk{GIPSlI`r-$$lYy^Bis|%75O&h)FHHjQ)yK^Jx zMb3MZubJ?iMKrp*j_bNn;|!V-jO5nJikQ3$zmol?N8YDYjNouf>X0rz1lO&07+$S@X4H z|FlLj^SzY9ppXE^%~($8_P1f_1`BGdr$o|WkST6*#8Yc~*cd5%XWTL}48N%(7i~IZ zU%DnnLt=iUGH$k2rC`LRSt>30GJfB-z#)sAY%08##OImu5)bozTFb1+R#H}!Aw7g; z67|fEi&gh)D=YfsTeHPiG8=*C5>*&_qpj*5A!x+6xj@?e?9g1Y<0slBe!BaJLn=#z z?Zkucx90+5A-P99QfyiuFukTkusl|J)%6DeLrCa&0x-ogMHfdtJ5 zA?%-3e@gOYj(|@X1IncDJ~dpmi~7s2BbU-@BRs~<5*&!sgH*_*_a~5ZMIK!Utj+z` z5r9svdRCF;Z)%(Ui|+|%EuZd<2 zCySS87}9>W@FvZXa6P`|hVgAS&D6A=qmc*WmEFwE_1VB#?XWSf;phmi{;5aGZ}*=Z z7ainViuw=r#Fhm*2WeP&a2T{XO(+=5oNfqO&$IbEE17WKXz+M2K=AiWr@-^Au}^$6 zId1qJBD-74dtukplQNi;-B0-xdt6h^vWB?4_hvd;V4hy-MCz;7ko#1rH&0|isNs_P z^KwIe=);oRVt+Yn>pCBe(cZuE#(Dl1>~a>P@^s52LgiYy8Yxqmy{s3Jbr!IH+@Z^ zt&b0{J9*hFnvNRTTP$}|<{|=EDTr*U#pDT9z@@?Cem6AW zPrJLvO6R%F_!Ud&(uHy0luk>wqH4tx7!gwURfElg6Rg>0I4>*7c|&5TDZC;RNFbpH zfsS+UVFE}IpX>|#d7$;EAjaj$X1h?+cFBN5Y_R}(V_GTtYrLv}UDd-nV4^WX36qzS 
zb!2&90ld!OK5)DAO=J|$Yti*up_`LKqNefI+|nvJZ78r&($hAV7hVk- zCkTLk*U^4NMIbGFeTyj*Z7`)qipH-rN(4P@R)kH}18mtY>@Bq{mPI6Lo`bG0TFZEM z-ja+OeEp0u7w!0=@t}lORQz`b+RwK1cbM!bXND9@JUz6}65{>EJ!J|tS(z8EG&t@( z_(1mN;R&6!K@fG|{vv0Y+{=d`-fb4iSUJshEqUmZp*xRCr*6DH{5ILv;zRpDMX%=& zo7F^fOx^8wTh9}1g*W9wyS?O8=$VBG1s>S9)`LU=21OMxI)q@b$I*3Sr#ymgk)@?b zy&do18L03m3y^2Td zD#t^|`kQvAUTx;n=kLa_A`c*Po7M(G!PXz96 z34A)M@P`9lTkSeMVfz-Fx2g7(MFf}fSdN`JT=ml7);Gd(H+Z|>g}DZWNQomKVu028f|u=QgPrEtQ<5mI&ip!r8)r-?_Us69G>7wwrvIyR{3dj^|;0 z4PWIba;H^Q8?a~14R-Sc@!T!#US!+Qgb$@vn;Fa*7m9oJ`O_l*VZ;`GW{w8j=7DG;V zHCgHK&L{M?0(wIZ%4uz5JDtlfOzzL?DovCGPXm4*S&I+4@K>q9%W(J7e3|-IZb~Z4 ze0q2#RUB4+3+=ZSEB)@VAKF|+nYD}6C_^Y4G)-i)E{w(sG-(m*v01=1IUmU0SV>&)JiO(cGw znSb3RCE(8Ip(-AEnV=YfqbAB*QB9m!pK^ zRO@d(Fk9W2;qMWRBp{~hYQ0h9aiF|Sael-`P-%u#XB+D85;j&0Wgz61$ouosd*KDs zN4XnDlB*GJyV?H$`}H!ik56RC;?I+n-BgHOV|xK@SNA;w%hZ6WmbRVzfnUdi{|4H= zJw0~UDAANvgiHm|?Hm!p&h29nXb8SO;B9uP;L+Ks^4MO12vvUH@gUPkz#3VW70Hui z#&6TBUi;M*IsU8;@-g$fs0o0@S_iKjHf+`Mq4jNQU|V#cU3KzzWlB+IT4{*TsXNWC zgQN?8+6xZnC=d!lc{y}NMhyZKzXq};Q*V-M|ow?a;ej7OQlV~`{iMqx-Em7_3p zL;C#$JpP#EqXbW0$2>!dF_qXl)*C4zs?}!=`5+2VSLAe|#qkO3OKlm*PHz?N$5nIT z2*x2d$nzlFNEJ)u6bKjeD7sjjv3rsAZABgkxzi{~6wp)Pp7C zZGP3} zsUoi>&8|)Gb{ge7-uw%7DbjtHkqg)xzXz0i*460)EyDCz^CRI(K?7@!nve7q1p|-bUsvr4^ zhfU6d_tgIZN|7Zo@8jjrJEb6D*FEi!exb-P?e*8olP}`g{27Bh`2Q?k>AVb4`YHpy z9$JW*v+$>;|NCG9Qy%iXsHkeGUz%egb?uJ{CC?mKoTkF1@n%ChZs1LQ?ewP%YXgc4 z*sVrf&o|V{gi1JD4n$ywf*kiIVebwtjC5Pq@(J7S6!d<$OT5%w8=a#8ZM#yi$kon>-9t5UVtctB~^_~wx$>b3YF zH+^N#Ws<(_wdZvBDC(Qo!hk;0AfUJMep$M0>7%eL!?Zb-*m=-~cFp2ZJ)6_yltp7R ziyckwulb@4)Q3R&2#6r%lNS4Yx#C@`VXMBH>hp*^aHu19#00#FOuR4 z){|O1)Ac51x*%#KRDPHfLfOMkE0}MhdJt@EDw-+nLDBK_ZwdYAJJJCyo7Vh5ZlAfQ zx8yMwr-@C5UO{KKuRtAYN#M#uGZI56*cr<+c+5EI=&ICCcgY+(j2oSk2j7oYoVWa_ zkwK$qHXe8iIN7xsjP~p-Rl`0Mr634O=^TWPtOGdn#T=N~sN~88%%!>Rdp=>$mkCpk%zb~ zz7{HTUO&A!^rI|~=Ye4J`(?`D2m8c@?tP7T*%W;;Uyl`U%EW!x2gUSfa*cTkWX5y{ z!&F{H6f!0yIeHsyzSAcOx@#G$^%9aN_U<>&Q&k1WyLR}YZnx+O3^Kl%+|z0`apvsm z@fqrNDQQxDzFSQ!F;pgUU;FF?!;>#d>{Y;>`cFGcgSwWI%Fp~0uYQv-ri>@rdK3Ub z=K}YFcGzeHTg|wZmEPt2Eujl*rkz{~)3d(mUv|p``t#>^K3RaYf18}uIXEB{%YVF$ z0T;4=nG@Gi)qxW{?z_1ARaO22sE3MsLRD7_N1vFs3`N zZ@2#kK%gWgx9sJjk7i=3F~Its?2{WhnHNLe;w{QnA?AKLOPc)eS+?xXj1-+#sI+W5 z{+wihQr1jD(wMa$EHq?cft!bnItrOT>eDg8s~Sa@PdClQp#5Be8B~b(Ize&F$*H?e z^riPJq`q%^|KRLFR#dKLFi`x_8E#2f7$TfWPdOT+6d`LwM)A*)86|%r=dno*=!W2l zK&YVx7R|~Wq{Cr1ZkDy=7nX2@?l|Fp7+KMPRE_Gjy3 z;&k1mpv}l}-CXOBu_=YpRqcvYwV$FVb#D~VSO?Z(4qlr?H5t5dmi!zT(J^*fnYfz^OrZ>TFP6ms&1){v!FlKlXJz~5 z^u71|I3_sxY3~t3{_~jsbs>XMCA&0oOJwY@jdqICBuQ@m~KUH|fQT9x25mYt?#x=^hDapm=SL z3>(}S^t3zQfwMc_o-L(V}3u(Sli;a~*eJw%#T24dp?vyIRZNSuUr%Tye|nz((OUnSh(S zHqyPqT zxu(&210x9+3@5Tb1H*xt1tzBjI{=@yIo+hR!|}m&&AD#%Jy6rYs|Nm!<>~d@reR|E zPLWf!<#r}W;+d40LXO{4kMEQ%+TCW%{&m)9I~SQB+)O;Y_0ZRpS9mz6D)CorvT4MZ z&fBJ#;SlyapNxOMK0DPu8y{@c)J^F~??3bU+ttVr%Tf7isP&=tGtkJFp%7^jwo!ys z2)A6Gwr?tYG`#e!@4H|ce@yQqhl4vG^{n-FPlR>wJEzL|e(PPL_gb7?IKMKCzndh@ zg%8gu-uVb&A?m!M^5fRKH4c@-^#{jZS)(KAXmnDj*iFDm0=9pU4KbjE-GLG2&laRY z=e*`&E`0wkK|8$nCBgM|Q@T6%JTCfum|NKnCY+90FL)olabX|)*=$3@kujE|Y|9D0 zSHV};--$k@5S+qIezksV5VT?KxVUMP35&h_P^b7=?RQ+W7A-vU zx`W!g%h_K;1T>W*wj!1MhO%0}hj%ufU+KbjHLo(GHWZu<(~jUW z8S4{mZn!uDS@iQWuZ5`um;BTWz3zBME-LdER6Gupb{PreX}&P`@X<rG!E{~5;rQVP9ekZ!8fx6L?j&0L_ULp2oISD;h!k%8z@laH|tkLD!Lt$|_ zF$<;_9W=)0LYtW<2acDo(xbftWoM0DRGz56o@CbJZ9e#Xbm@;!J{qubK1pI=Xw3KO`CL_S9$8Zu`7MbMST4=df;UOxh z8}$dt)jr`T@GK5(%{_3By_5-o2C_l;_)v+9iEJ(e1>sn`-To^WK44A{I(+;TfR zw9nM#L?s(|J_T} zcu{t8X}`7j9lYe8DPH4Fb8W|-u`D!~uB3YFL}Uu;Bayy#TFUBC?83w^To^fh~s`nLxXBa~$-fcijm0J_XLj~;D 
zPE18hIKHZfY%I6k+Hl{IjnPXHZ9Ww^a}K}S>F0+0fQX_?gQ9g_TEBvu?}`CR)=Hn3 zu;>+Bi&W(2pUItWZ-obf4vx{$*J_C9Y<%eH@pi+Vxi2z!q}nP*hJANr8LjZ2ordJ3 zrFwzaT8ypljV{tiFt9fNVm(IQs_5N@<*~K;TXs#cAkDW&zapa?BvOF9AM6+yHxo?x zJ2iv%GjUpNAYnI=RW*24MFfKtrhjbdK)9 zs42@s{AJi$WKQ@<@1w!`j~$~qG8?}#xh`YVUAzp-XYogxQj_sDGzsN#n^8Y?(qYsZ zsqc+89t!TQ%9}LP)Q|#@w)tJ*pJnzJPwKf%e~3{%XrV~_#nUDdrtoR$2U3bh11a`t8Ud^7KD1g$v?lM^Gn%fjRz%VoX()wfhHFW4i~I+7JG=*|p6S zn|;lfn^Xio+~__ZdEMFY^0C!}vmbwmx$$#Y@ih&8hLI9H4HZjW>B!akD?RxF@{A{j zD-m(g#61&_J!$7{doS7bv1$*@%)IRK+*bU(@?!!rj>K zh~QVywdAijhV1gZ+LO$U12pIHP~F{jGmM%0koD_00rB-kdD@P*qR<>a(brBCHE0?Vce?-uZQ+zaJLMOtE95}KOH!ZO* z6dRCEm4yJD9nS2zoDxz@-Fi?G=mJ)V;VXLT2cFBPh@yAf)q?&Wc*2 za0txAq(=8)){Rh_-&HmL0M2cpTqaBHl+(XOd#z_1)-~=uFAHMwTQyE#ugWrsdDPBe zP@U7#HRWIICiGtSpu6jj<=V%t2w@Mwx`ZI@l3uR zi{P&9?1!6;Q+yA&wq+Yotfltif)=^C_Eei4rYUW*Rw90=y_)^_KQcJX+?2t#Z(Du_ z+3g#1T8btQMo^WX1Um{~vqPE|Yn%)q&_4j#G{tU~wC@8=Gj^?2Vp94S^S22-DR*)Y z%6bnduL7brNS8#O z(P7;lrE61pq~M{K>CN_|*^%D{*`E^pQ%w-!TPR*X)4_TwQ;#eekKbr{J?*#+tyC_|p;eO#lLhG@?8?t1-{>;~!8`m=-jc`Uq_2O?8 zP7Kip#SU7-U&hPu)dtx-dp0z2W(*%J!(=~4mo5dz8p}OOq1GGB zs=sSmn00D!&gxH@3DUw7cqdL%;W$3eC#tIFUy<*9EFPNP>g(U+$n+n%0x4QgJSuB9 zV4S@(wIofedEF&HCFr^xwp%uNtyB09aQ}5GfKxMB?Q*+Cjgpb9gI^;~?{C|~ zhpVi&KG*WM5DXqVzL^W)&>BJ3YQ88$p2aAfZCrR&ouJ}7H#h$#>U=S$WpB{P6ny<| z7xR3jwZR1crZJmVcXZ0iZU3tNWgDD1Fz0ZSBBMs<@9VjaHqjma8Ug2*--6NcA5RFL zX6VNv&x}>7ce%o*#bVwy8W}#adc-qZ_l7ZuEsR>=PoNFd*YoMYuxTH^Qij4Qrw0_e z$M>f%h+2J<(eC-A0Y!&ssTA@^;-{DPYxN=e5w5YR*)y=($0|AIJ8@)JG^b;AN-_`3 zYE*esw3BwOkdH1LzSYtgv@(YAk6zW?=E^nDHhRNmf} zDR~pjD@CK}t@;n3-8QFh{^3i628|W*3O}I=zz|>xUM&3cX>h@=1hlm zCAoI@ZOqBj?f$#ZNq1j$Aj4|gNIWqj%5)=eb((nP6G(cx_>AL%tX$Th5PI z#%A?h)%C1geHhznN5`7AZA&XWxIm62u(<8oV?iPgPayVEX{#M-`*x@XJ!j+JU2LzR zBRxF6=ae0XSuMf?TTKiewLdBT0p{&I$1Uo^>>UTlw(pR=*xg-3VU5xxJrpN`HQ!(X zto5ycaTYP=i{0JKMvqm?ET=K8F1zPX6#25Ongqjt|8;D+|2w;lM-;(6fd<;Y_U}A0Z#U6i%OsWI zKphbY(=vZb=L>lg6p3k;U9n~el4k7uxbbfj@9Pb=$70S)T@xY z<}W&O!aYelx~%c7vaVqRAp(iu#+6xUh|*PkeR|Ue7gIiI#pc4*+`W-{!QfwH_Eqhj z&ul06g=w}wrW1Z8C|r`NWfHrDQy4Vd6}Ik5Y`Yz<#3YdnhVxT{ zEf=lBxun7P!J48Ee;(N=zq$EGTEgydja-KM(}$`DrIwNTLOgE^vY;m^o`Ii~(0@50 zDOJ&yWmcrMF7AXaahDD0;ERQVgZ+_777v&IvL8~ibBm_iO@H#-(6Mo5iv8&nG<{sU z)F>*^p|&UWS$~xzBo}tmX7RDzKLAg~&*Y8($DEqO(In4IgVJ~Dh`~^*HAZHxz|wps zk{XS$Crtti<@~CWMA|FSj`?p-TOYFHFU%QZ_!dE6#eRvg>}=A3s}U0H_Yl;Tuyzp} zQdi=!ryP$k z?KS6Xscd7IJfzxg;PIA0$US;?4@e%5-Ip#b7f$WK8LPp5r_Fex8_S1M z7`ivGW$M8nH78N7!@5zR~FT;UnVyH9$>4#sCRb4jD=^m!tIR*qn=eFj0!;0OHb)AthhcX)6 zjuA**L@_}v7$XZaO1rczS}$f@ulYD+AfDBPZx6=2v9A(&PHV_1RE@6%R153kgkW<# z01IM%oni!zIz_^&8%=U0#RWpixz5;}nU^#8KlhZ9B@NJ#~`Jo+AJFoC3{vnXB>WJ}nS-z)@u(Ei1s?U*~)7A6i^M%brCC+xQ3F zo02)|-Ya3?gQ=RWEI+N5kuQHaDN<#6udO9>Ny=yw8;cXqoyXHtw8~7a&XBivr(EB! zKjcsS#DASNPYr>Tg4wD5^Lo7XIkvT`(~bAi)`f*#*v{kl|CXNJ={#_|(|*#iSmQDB z4^a1v&Nh4T@L>0JNchh(x%S~9?WuO_`GiSZ%)SEO&ox_g{G!}MS-uHnM)Alm(k#T# z=4fnX<>%Nn?Ozh_3V5Qa?QB_wUz|H}XyscGYimE`opFaL$p#2s_!xirT%j`)6Y3nw z0{*76RpGC9r87N;RoecsqWGf@GW{xcpx#-hXhlGtX8#{xu{Q73Hy68n!bivV_JLAs ziMfAtaC-F#Kcj3t^Sv%l$s0V4IQz8>T%!Ua;g9vx91ljVfgbzLcg?vzf9je_o3t16 z-Wi?={62l6tdG##eVFBa9wB;k-9qe%+!86%1X9KEr|RlhE!3JY+-kO_^P+Wpq9*va zZs1-<(Yfd|dKD{KTD6gm>%XMC=l=iKmJ@Rw>%~evPQDg%tyI}X@{Bu{AA1s7? z+qW_NIQ==QyH+-725^~d`kYZ9iK`5lwa7hNulfG~dZ@`eJhI4sRNOY7Z%@*ti*sqg zM{~jS8%Amsk1R9Fr_2Yh`OQ_^Ng3MAo-zw#I3tzF>Hc$8%a1UCJK+BSpUSJeeqQFs zIl%P>pN3fa4u2u*`qd#aPm&Ze0fHABvCwrqdXrFxSjxWF+(r~&Mv*pVnL$FzX;}0Q!GAnf zWb>`y^2_;4*uXb&@3>KN(8-O`#Ko3&UhJ{_x%31Ml`V|YHL-1gD)E&Z{VA~9TBCs! 
z(+L-*@09-lc{%jT9mvp?Ew4stB%JlS9fy^%3^;q7UX{jt$Q6&ZsRe^yI6N4IR60aS3h+t zbPh*7xT=C_A(duEB0zJRSGLA?I{T|w?iHRRwsHwyn}^45f1&2N$=FC9Foqf9Zc(}) zr=?tjQCXlRl)C-rjZO&1b@v^rJxKkcE$3^>sg;#S`qcG5?FznYpxCJdY&+Nk-NInO=Mtxx9psgL<+2OsSZPpwR3QX3<4Rhtad*PE_sBYd?}MKoHE zfm0!>>e#74s35eamvA*y%d}#b%)-?Sw6EHoyCSK1+|-LEYdJd**)tl6n+G(k!x^g5 zNHA%}Wkj4wNMx#Beml|Ud)A6Zwk??ltvMvB23l2==9`#U)DnzTGRO$3hFnx;Vrw?S z!b)<~%+ne2)P<^BGm-{8)77}B*nHFjgI;<(;$YP%HC!5yHxz=v?bfSEYG&rEN^wF! zLg$)jYHPPO9{y?Aq>G4hMOX)lr4w;h194Rt*wQPgNNNL8zG?60Ynvi?b5LBRX zP|U?qPXdq%QPzMg$5W1#UOJdJy=FsT!lNRjV^I@KR}@r$)FG+rYCx28NWy?=tkp

{4ciM8UKAoy%x!4*r7zKKxkGg$EJ9~<_VU5>!1-lRie@?W`j7Ud` z9CY2*oyU8PhIt)E03$pr4CClWU*}fQno+lQ)*Xqe%VHUu%Kh7SQb_#A^%Q1U=UC@a zpW$^pgYW7+>4>%t8+$62Uf|WB0bQYp$jGX4A#`aMg1nRJr@!Y`ZCJgyW?!BxsUF|f zkOHf#ql|`Ajmmi+l}@OWFb{=&*xon?>N|Z!F;x}UkKH~-b5pbhW&wFt8B^;(3^t7z zbZqho!5s}tY~%$DzcP#!>$qo;{uMkdSxi#oE^w!zT1K~TGbDWKHjqA?F!k+8uBa2} z#`#-f+O4=_gPxspxs*^vL{aBPh`Dy}`iknoSKd z2T-udfOC_^Ju0lL%Q!qPPCrlatv@Uk+n&7R^Zu2cJYXv-G5LWR_dn!Tb4A6Ow=;Qd zh2v=J_!Cw%*ob)){lNMB2pInWKGjVS&od(rodFp3^!zJI1p?HL?o?!Y0LG@8ySa0A zRxj?%2PcP@anGj|>wpSKI5-*hI6Xh{+M)K4NFd}nLG}LtIjsl(097;OoUqTf1#F$n z8nMbq%{SXY8B)EEU-QK}Fv3OwiDg3n0FVGwHk;U7CT3jzB^FGpkY>f8f{{XLAy1;2!RY^GA{(m7+=&@|HS=*ku>g^35rf zo5CK56u3RocdU}^)?qF5HFz^9{3pjD$Gy?W&jQc#Sk)dtU;5g%f=*RjG#;GUu??lYfJFz2Z0020`gVWemSzWMIMgbd`^uQFZI}eiOix~0&?iZ)$RBo=t zqkN<^%rTzGJx};k(0dMZ1 z>V3^IB)YiDENdEeW;oB#cBlD)7#O44y_tqtsf^PfnbT2+FYN!?6`D^)R^9ncdByd7GD>$E{bH>8WZsi6s93Hc2@(Bob8?Vogm85!S8F<+f@oRH6P}wMkzh zpTCMpkjK2)f29U)D!J>LYqkYiVA*6#?^DGtd)8QXCapk7BC@E=*`Upps&X;rtY<4# zqGU;9OWV9vr#isOJbN?H2Hy! z-OVXD6rJf9@rqY5k!6KP-kwb}_^Xa50z<&!f_b1(ift6ojCiV&w>4%?l}ba_nnYNJ zLsi3Nsq(c(qZO)&q!r6Z_!T-nYLc}|uZpfyXB`VgsxmpI2hBWlQqhwfwK~)TgHtKu zrHaI&lnk0`oYaV<)RM^*U{mu*1_dWMtjGpxVd9M`wsx9cfYvwK-6MJ2Y4YPxGcSDC3f0%u)s&(R28ZYMJcC z#APN$+tfbw1%3MTq-$wJq1j1R zVZmkjz53I{E}>>vTmi$V=V|;pkN&k-R0;B98R!oq>FrVZIk@v9A!Gv*MtI2e6b7SA zOsqs)dvw9;`qc4&>SJZY2Fj7f4ND&7KvjuV{uksCM2iX9Lb8H!xej*c)Ou3TE;D8( zRmmH_fP;+vdsNX#>&U_9Di2Tp08!~omNL+lc@&U-`kn#pPsq_=x z-3G*s_onTmk`GnE{$iKQ*(vA)3>-mbz)pu-FAZ`dZeiS_-+AV6* zWD9%dNUEo}Y<&(ZpqgE}Sz(Nk&*lFB)~-WMQ!LMr!Icl?&Obq2P1)O{x}1Z62ewcD z0A7k)HY&+oBQhvdMDh+cw$u4%`I?3m@^ovt{ULpzae`Q{^{K7dTHVtiG>l0dh{s>^ z=}ftdu8k$XLo*`&Y(K3Q*5TxA>F7Sdha;-Drg-$MY1pmwu21@)q&Lu}8fP(P(E zodJd=W&|uFEuN*Zz#l?IJ57;%#0qhT^A0+ZkMYG^*lg_NE%$~oo_XZ{ILSX;R3&wO z#*NzOu^R*n3Xi%zZM`$ztSD2vc5+85AQeeAE$riLj-0?6M12njnBzaK*{Z&YO4*C&gC}~yOv?; z_5Qu9UG1Jp_V?=Lx{q)1sOQ+Bp^898aK!%r5znun^rVK_iF(S9h}Uxux=#v#clIcUIey$NVd!dqwiemn05wJJxaYT%VUEjsgx90QEi3wF6AXOGW^&Cx3UC z?mv*=RLg8v9%fzh(Nt&r1!(=5iw(*Uo`m)Kf!pe7t8EzJGTq5yNorwpIK!4X*n{1- zAJVln>vRMkynUAyEOy`!Ku|uFqi(1$Lvun`6K0*D+Moei4Fb&;4^vt2KTKAPQX1M8 zFrKG3s_3VdMF;6zoXA#2w37n|DBYF4K8C$r&^%;U3*tKsk`srIVZ{z z7!XGU9@NEPYUR{sL=tjXe6Pn}LsWJZAZEFe1+ge>(mHOW1DccqX^lv%(fd%VgT-0c zNv5)5nk_O&`cj&SSoEipni~c<42q4#NDmYh$e~Dr6kcl53{w?&sZ>&sNfRN+rpeNp z4h=i4W`!NdcGOP0TQy{WDhZDqRU@L~RM1XpBG?rhN>-hR7D&ex7;svnk;xRu$sEvK z%OuhRkxVa24-^>4vRaco(+g0i*12MECQ7v^=}{$2I#sgBWEE0n;MLX~)GnE(g<~6^fwYs&zT2wh-LZs;9MEdKyXSJrt8x6>untI#k;+SvFwE zg7Z>=PDts+aZzzxatgAGP(&)qQe?5)c&M0Ttyt|)$yt&_d8V3CN>??kQ9}SUy(!^w zNW;AnLK8gF5k@E{6p|_^a+;ZVs1l?N?@mK#lygYf6oIJk(>|3cU5K}57elvmo`cig zyDb(4C^H!czH6J%?H)^j>|76VUFEb$k%9oO`cbmEtRtx-C{@6!lH^sX8mY=#r%F<1 zR8@-3>N zs)a^;vIk?I{{USHgB*}VP%Li624$Nhlmlzr69^Xop4$uie!#zh@m4d$` zBmJCpswAqrSLHcn9XRD_?F!4W*Co3UdHg+UK@el|oa3=LBAl@W$N++Rqww^kG*tpbZZVF7>FJN@(xy+G zfT~gZW2fQ%6h1d#g;VtYpW#grVRrz#eF5YD0IgAXa_p=;t474~q;u_1%B-YIAAA+` zXQ7BLym-Zs3cH|LCKA~4l!C=LNBSAcHl-Oi{%7hkMrr%@~TlvvdmPTUyywd zU*}e?zD9UlfQ&eP-pBm=)@_(?JjBOuC3^B0bovlMtR<|GymX0sU}tOsRo9$$&-oRb zs-7?j+{jm_Q(E(H3UPvR7&k%d{xzGbL|Ig+<$){pP`pNu13JPQgAnQ_X8aMAJVC7 zVPb(3k{z}c9Y{bhzf#2oX#Nv8{6so?he>h6k@I_d56ITf+11uP-!M~_J(ub4$N9x` z8hnJ@Fj;qDh-Oj4Hhy9~he7nNwlTI9L?CgT_Fmu5rDIF6rCk}8vhBD9u&(EY1BDf* zYQ|>?9~mUE^d6?MH93|!#B2VroE2Vnf$RAGxE-rf%$Va_bRfuo)^or=-TXWI{U~1* zg)fHYz+iU4ZUY{yTR8fk=qka881QhzsUTx_{{UK`g~V8i6MJCio~EaeJALv#fY(ay zbFm-G35h;#+d}@Hn5wvUEXcv%yN-%IGu!Z}XSzi@R0cnw_Vp%|Nb(K`KU@LqDjb%@ za=phm3m(LjAnrovpRG1KR6&p!w`OCEew8{~p00r7pev9lk}?#MOJly@=qaLEPGYP! 
zq79whn*-_wDH;em6C#XpgOYzrkhu+&j41x|a(@r-6p}0K+!S(oXW)G)+zJ;E+qOV* zqqsjVe-WSNDx)lL@Gaag_lHCH)`XBdj1~ZQ!0S+$OdP?GM^-13`BHAFG@l3uZ+>R2clnPvukX!fsKO{_y_*JWzzmp_#wC_|}v%oD9}V=chGd7iIOT zOqPwLlOnOT)k7XisruH03{Em?rmtr6!gkiGO`{n^;AbGpz>pO1YJ)~qh&@5YYTj79 z1eP^MRn8kdct2Xl)Xl3RxI$Y3tSopGM7cFN;8Zgt^&{8-sNWqbLe&c&nyDKmFrXaM zbInLk%TN}hI%_7QzGl~p&t3f%6c9M-(7L4z*H2 zR^&MqRmU~FO`=Aq4M`jv=7`rOuEifTrpc2T99T6cms5!8+iM2Qn<3eOQI)BmiijMV z7OY5maZ^S)s7D5-PAb_=4<{8AoYJ|)M9;-%ZAq});i>&<27YPi-K#`^oOGlkB7si> zG?>U3VAN-mR%A?yghoeN5oi^0NuF^}>BTc{TPB7kJTb*AQkdopG5F9p9@T2sPd84x z4h=XbeF8e2W{rFbC}j;@DoPnR7_1E&NBce+Ndq5B=t6~D0bTdd>4If>AFWqn4stlD zrXA{|$U1ZQRt)GvWXL>Y=}`Ga+m!>{ig``^{KM-?p_jHt)~4m80u~&Azg*H4OksdN zqMl!9EqF_!6F!awft~7!;;zPj#t*GPg4t-RLhCU7pZ9neSg;vH^R{C+xR7D{( zw;&J+@6YQ(mdWP?bs+Ih$>c~6)|wRw7}|dkR7miw!*}XNX~75wE5YlGe=N|E?ci=+ z{2H7B4o(lgX{0NV1@5HeQZl*fFnvcuR!H5!VV^^h{V59ik_hY16gY=5QKMooFvHMx zs1+U~xdVmzXCB6{H<(W+Bh*v`6=8?idnw21{{ZW#aVebwf=iS(zWH!32&8 z91Q(&kHW5M=^Mnv@Z&jePyV%M=oc}`B*nB()%7sS^FdH0>D`{+F4~XUx zR!f%1A2V>?r2ha))s{jmnj^+T4^znh01xL`Vkc>%a!x~V3C;&>dsW%oODeLj%JHcD zNF($h*E}z&(>q-gDk{vFW0wFD2VuebedAEw%v)5k;PEVnk9Q;dDp30+A#uWw$PvwD zeW3pUY1xsS1Z;!b=LfHT->q!)iJH}$;zF0FbE_bH^am$D{bBjhs!Bhz(aSk)#~kD4 zIXV0RC+ZfcJ4)7zaJb&eFfoi0O*dMeQ5~!Z2}aAauVaosK2Ok6zT!&l5oxC0Nj9Qj zZrTS-V0Nyy*>Mncz){ft6~*YPk-DGzsuI`DXZ`6T7&tut057j! zO3JZyR@$Uzxg2NM4$J)N=B&{WcTDMRV^xflBc7~#(`FdQvLlEppQ^_FUnzbB201yxdrfF3qke2fcj@9GV+U>q^&L8M{{Ysgj0)U3@5ew+YSRoKnUF7h9MbMpIZ)63e+smS zxm<={yBI<0asGcQVFZk^+}*G`eJW>^x9|n`-ZNEM8-C+3!R^!eP{9i7fOC!B=Rc)W zkT@SJaC(wOT~&1>c_e!e@u=h^gSWS(A(`xj+YcE0Y2q}5PiZp409Pq28#g9%-n}CJ-^`4ITy_1Qmnk7>NXl{0#8I}9 z+XAMNHd1#CRcQzVn!@^=$qg(3$*S)t&sv7!Vmclwy2x18h}joB&w67V^HG9vPgA>! z3P@DRHF6`;t0xR9`{%`qZjxGIk%f zowp*Kqau`y3Rlz=tVYN&P%x`JnvK_*cQocR=3G^LRoT}fs%<`%ab;TQz6TVlywtmB z#szYtiLx>&F-pYdl}CECE8I{ikKUxHV7oaVAyCser(wMZl+#RM~8= zDdMWL8n-B_j;AyrNRPcrl#3o}RE&dG^bT=K9M$hE z3S+MnBCKZ>=cOjfu<3(LBADz&u(9H{w8%!`nF5N;jB|n3wzRtmCVcUV(vs9^S)FBs zXs9rA?^nne;;LEMn4FyMV^Jn)7?w`(+?M<*MP|lJ06ot&RzG>O>^Q5DgN?up z`*3PeM6q%P&|DtjxT{FA4ns%n$3LxCiC7%5V1L>*bS_sUllXsHxRD|jGW6WP_thXI z@Ji>~nwJVk2k{*$Bd`O4N%aScWRDpGzCRj~TqyaAkHDI&z$os_dm4?OV9bQ5?~_)9 z(V!Vv{pRPfrnpm(u6XyXnYD{sY=@I>J$U2ssx9K1GRMoAN$k>sqw}hAk5MQ~p}V{X z$C!BPGyWo!qk)AP9-XTh{6++U9t(+EyIZi31N7aKQ^Bji$8n#_W1b{Nk3Udb-`N1++P9{&JJxv1@)0t1F8rs7BPs`@;E z5<mNL(gN*Kb2`d-!|NUaIq7C{{Ve+`Tn%qm=)e2 zqiFlX^CbTODv_af0RI5fE>1Iy4nL)51#6p1SFsFi8tLDHk%9_?{5d%OMyTDu!u|jO z$C&DI=uX^zXEmiAs;rlClZAIx>CgF2xWa;Z%G5D#oDh&WxWTlwQ5C2|v!MM)8Rx2YMt+ z<)OeOh~zKRIXUT?=bo(E7Dm34=C0p4$vNGy@CTqjLDst4R%TGo_nQRcx1g?POoB93 z50o3XF&{C`I{yGKO6l#vW*%Nu7mdy7=s(7)lv$lwCdn@C8fMD24i8bBd*k}{tcW9q z8MdZdpIq)I)YjeI)Nw${#|4ft)Sqk`#)jf{n6WGde(J73`mySNPfFTp8A43lg-AeI zOS$zp1OEW6L6MG7I%hw1dB@>Y((7_O1F?j56vWTuW|+l^ZvBfsEt!C zqe+316nDwT{{UL2QyE}eV|)Jq`s)F;+ZeYoodW0Y$luHV0M%9_)*z7;i}1raJFsX; z4cyxTGCn|`&YJvn4Ufkap1OoB(cxk24OvT=vGQ$Sp=JJ6S`(CDeGs zK<({FqdHa#_esxegT+#~XWO{T_+qrCaS_KSzCr3MF6QzR{0_u_bW}*RK^*6KaHN=I7tNX+YTmxyw1rumo1^yYMS`vlfeT zAZ0a?djxws#PY0w@+_iDmu8wLXNO&Tg}S(i;5hyH8*xC z3sPk6;y$k$&I12+}5OO zxEbf@aYRr_mLYi+0b`$R)S;we3lOW=0h3U9>kz;g_a>9=MYvp(>(Eu1*t)@S&g1x0 zPO6K%ug%zh^{R{A>;$vt9kOv$C)6f}bg6X`DN}5{G|r#J$^Pf{tX*2@#;utf zrgPu$AO5Oi-OTPhypz*CaY3+ViOh#{9QDO0%b!BrRCQ;bYnsf~kUYx}`H#xd63zY< zhZd!JmdkfN#%?9%0IQ9I{(5@nu>AcirSV;iM^9;H2g!u)a5|Ho6n%PqGhASm?qo7a zjm^#gz#f2m4wcfO7{>M`&9|XtH4CrexVY<*M5G$i`$nf6YEto~1Z7c)Vt*0O>scUS zG7mKy-Oq0m#}uxp!vt=}y(seD%uUGIvGK&Vb00R{kx$-eGByHRzTJ2gr*vy5Mwy^^seRG_8`q!QKW5n+Bt)1%(kU~m&pJQEF z-30rYxxosE3rtw$uwG{{4Ldk5Q$*F9&bI+&E? 
z;=L$Z);#p|xZ@Ila5~cksK9Ns*zPx0qDEwkfyOFh9%(jZ8D3X1u?t!|gr|d1NoH7s*A-HB98AQuc{w#Q*_G+m zw7jsZjC86q+?b!I6)tYVvblEI+6YK)Y8bDCb27Q%jYOBK$&y7^HJm$5h?-|ZVc>D~~q1>F- zY70X=H{w1M~oHBB@^Xs;!*$oo%2PhCD+XA(eir>LzjDM-!^eJNsWjiDS-9R{7A zg_~2pKbwP6n2A4gBkNMyS~)v_I6aMOrK%1{=CpE8P;TaY8hSU)$Tgsmgct^%Kwbqj zw>*mHoXSxH$0z>)*HJM!=A6x*wHux@ST<;o(%9=!vCn!!bKaQEg(uRY<=A0yAS51@ zUiHi{Iv>8sNzY;F{xp^kv`H!K#3((_{{UK{zJ2?H9F|f(yiv_%a$CAFyy7V)MV2~; zc3aw~+45&Kj2jd+W*x*NHbLN1tcw?I!+}rKW{Eyo0blloL2L4iAue5j7_F1KG^rnT znz69KW*Jq%BiGurUP88!D;&u=GdcYi^dGHdGXRAYj4x6-X{!}F8*Q#A9OlULzSI(1Rhwgpkt^d^okv)X4EdARy0{{R#GszgkI zPo0CvcKy-vuh4X*jC_Rmtvrmp#Av}Ij+J6G80k^=%}*apQ(Kb9B6r6%9Fga(SqF-2 zfB~AT5>fJqH6Y+%Q;$6;E4GtBG>{HE(iWhpCyu6^Av_aOP~7Jjrn3Yd=B8x|ed;Zv zIX!3@6E1l@Jt_%CIVYO6BZXS3=OnQoN{L9)F)2Ji9G*!%O;?jF%onFX2YR+onMdi%cW&mv+MMrRakn%Twx=%s=H3X~!Jj5IxgQ4kCn=Z(!HPZd1 z)F{U8JMmdMg~OOh1mw8{dk$$WWRiQCi;O8?bM2fFO4jD&ODlcn$-wP`PxPR*WJNBj z(gJO%B>JRkTvJ05g1ZUGce?P*Zg3TehfT#@TEx)p2q*-H@NFcW5Oqr08 zjo;om^aB_k{Y=k2T9lmY&;~!7!Rug^zV1FLb6H$L zDkveJYbo!?Ut?QP**wk)BL?^Ae!i6ss|+^LG>sva;a76`W8^?R`}M1YTh#ATQjaN_ z46|PsK@-NYovgm6*NVQD0H7^~$6;A|JaRM=GReo8jk)0Q*0n-OY(VHwJanp+u3U6l z4vmaXayjU7dWx{E9}F@v)Ou4E7L;xs`xvU@E#cK*(TeUEan{Nz}JtvF;dX$@+R#<#6aSpdV_k ziGnuZA4--|(R|p+^rF`lW}Tw9Wa}7lfPS9FqhLn_Q5(lEsyC--Z7g zx+65sx^=o4w-)>oo%>c5^lW0>G94R&E2?xXK3d1WfCYb-2N}g~wmKs#nMo>YspZZ@ zcVJ*hcn|D-{p&JwJ*=%1szFnX8j|4|PU!meAW$!o@ddr(?P0`o5z6~FsHt|A`suYZhh1rCcRA}3=SjsT-A#c8! z$KpK=A%pBrtbj=i@~w`(-pAUi8B}1ddGAlOx-qjO<>6!VAwQ_}H2JNeE;lt`xwm`< z8%Kd<;7YP5(^D;ZsGa#f=ktHrbuvu-@oV!4qR5CIvcIp(Lf zi+9qZ7jHv~Dx-IcEvB?o;i5S9s}bs!(u@`=eZ^Li%rbnq$f|N$73eExIm2?@T9Mq@ z#7@~qQ}i`T_e~cADn4=0ip~*Ar{&1{RG(?NhxpIqN3~CMX&hH{Rhk*%;ei76mf(|eM_vB}D|-)gY-ir^3iM{!L^&>XI6 z!i)-!Xk%9uAF|6aTd)PNNc5_AknUlQ^if{SqLeOkrul4C7VORydz@EKYXYPoA4({w zq40_c6PQCG=~#D`8{GDyirPx$r>RaG285DG+f%9u4il{uQbLn=8uo6@hmC;s6n|g~ zfC8{=^v@JgSIC+b+GEqfwcC7uLqy ze+a1syi3tV6jO3xWIb@50HiWviYgqLN+@Y;ifK6*2Z|`EWmt`cLm}=@6j4yiMQI-& zcA+vBQlsx*$9Ta8XymBe?7c~Z+!rm60TO^YlOYi(g@Emv=x|`MbOY!7@xW^S@5tH?z zik0ymWi2X~p!qZOtqT|Vq?7xL!S_6WQ$-cB-0P7ja4<45dJ3?I6~Emilp|gmPI)Su;U-rszs-}x)mIz zcIW%ub43(fYhr7CA>o4I)xLRxf)5~M<+%Rgf! z%!P|O?c3A}D6Jswc2hZx4>tEwMI&P*sXq15C`9=VHucCkZ~nCuR*!Q;D%(QNUKt5- z*qXYNsX())03Cg(qLq2rg;=mSlixa6t!|>k?m8tl#F*K znA(lIzS$MWZNrg86uGRfZ9#L*x6ofR=E{*G_Bg>6rD*Eo89a`2MHO8Uw4$BH&mPvn z%ChmE#CE1HBOa7dSk#KS$v7(x#wqCBVDzGj=T6T=dsU9t1L+J38A~L?&rwCN?2kS)@%Tb(_p%t+z#s_MC0HTVBE3=KC|Jir} Bp^E?j literal 0 HcmV?d00001 diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py new file mode 100644 index 00000000000..c78bbdc40a2 --- /dev/null +++ b/python/paddle/v2/tests/test_image.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+import numpy as np
+
+import paddle.v2.image as image
+
+
+class Image(unittest.TestCase):
+    def test_resize_flip_chw(self):
+        # resize
+        im = image.load_image('cat.jpg')
+        im = image.resize_short(im, 256)
+        self.assertEqual(256, min(im.shape[:2]))
+        self.assertEqual(3, im.shape[2])
+
+        # flip: left_right_flip should match flipping along the width axis
+        im2 = np.flip(im, 1)
+        im = image.left_right_flip(im)
+        self.assertTrue(np.array_equal(im, im2))
+
+        # to_chw
+        h, w, c = im.shape
+        im = image.to_chw(im)
+        self.assertEqual(c, im.shape[0])
+        self.assertEqual(h, im.shape[1])
+        self.assertEqual(w, im.shape[2])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py
index 8320217da27..264442be182 100644
--- a/python/paddle/v2/tests/test_paramconf_order.py
+++ b/python/paddle/v2/tests/test_paramconf_order.py
@@ -27,7 +27,6 @@
 # limitations under the License.
 import unittest
 import math
-import paddle.dataset as dataset
 import paddle.v2 as paddle
 
 
@@ -41,7 +40,7 @@ def wordemb(inlayer):
 
 
 def train():
-    word_dict = dataset.imikolov.build_dict()
+    word_dict = paddle.dataset.imikolov.build_dict()
     dict_size = len(word_dict)
     # Every layer takes integer value of range [0, dict_size)
     firstword = paddle.layer.data(
diff --git a/python/setup.py.in b/python/setup.py.in
index d73a3a6a1c4..08a448934d3 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -77,6 +77,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
         'paddle.v2',
         'paddle.v2.master',
         'paddle.v2.plot',
+        'paddle.v2.reader',
+        'paddle.v2.dataset',
         'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
-- 
GitLab
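
For context, the snippet below is a minimal usage sketch of the legacy v2 entry
points this patch restores. It is illustrative only (not part of the patch) and
assumes paddle.v2.dataset mirrors paddle.dataset and that reader.shuffle keeps
its old (reader, buf_size) signature.

    # Hedged sketch: exercises the restored paddle.v2.dataset, paddle.v2.reader
    # and paddle.batch entry points; mnist.train and reader.shuffle are assumed
    # to keep their pre-removal behavior.
    import paddle.v2 as paddle

    # Build a shuffled, batched reader from the re-exported dataset package
    # (mnist.train is assumed to download and cache the data on first use).
    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
        batch_size=128)

    for mini_batch in train_reader():
        # Each mini-batch is a list of (image, label) samples.
        break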