Commit e9cd3867, authored by qiaolongfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into rnn

The MNIST demo training script drops its hand-written `train_reader` in favor of a composed v2 reader:

```diff
 import paddle.v2 as paddle
-import mnist_util
-
-
-def train_reader():
-    train_file = './data/raw_data/train'
-    generator = mnist_util.read_from_mnist(train_file)
-    for item in generator:
-        yield item
 
 
 def main():
     paddle.init(use_gpu=False, trainer_count=1)
@@ -40,11 +31,13 @@ def main():
     trainer = paddle.trainer.SGD(update_equation=adam_optimizer)
 
     trainer.train(
-        train_data_reader=train_reader,
+        reader=paddle.reader.batched(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=32),
         cost=cost,
         parameters=parameters,
         event_handler=event_handler,
-        batch_size=32,  # batch size should be refactor in Data reader
         reader_dict={images.name: 0,
                      label.name: 1})
```
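For readers new to the pipeline, here is a minimal sketch of what the composed reader yields (an illustration, not code from the commit):

```python
import paddle.v2 as paddle

# shuffle() buffers 8192 samples and emits them in random order;
# batched() groups the stream into lists of 32 (image, label) tuples.
reader = paddle.reader.batched(
    paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
    batch_size=32)

for data_batch in reader():
    print len(data_batch)  # 32, except possibly for the final batch
    break
```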
The GPU path of `CosSimForward` drops an unused `num_samples` local:

```diff
@@ -92,7 +92,6 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
   CHECK(in1_mat.useGpu_ == true && in2_mat.useGpu_ == true)
       << "Matrix type are not GPU";
 
-  size_t num_samples = out_mat.getHeight();
   size_t dim = in1_mat.getWidth();
   real* out = out_mat.getData();
   const real* x = in1_mat.getData();
```
The Python package CMakeLists now globs `paddle/v2` recursively and points at the relocated reader tests:

```diff
@@ -4,7 +4,7 @@ set(OUTPUT_DIR
 file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
 file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
-file(GLOB V2_PY_FILES . ./paddle/v2/*.py)
+file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
 
 set(PY_FILES paddle/__init__.py
              ${TRAINER_PY_FILES}
@@ -24,7 +24,7 @@ add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
 add_subdirectory(paddle/trainer_config_helpers/tests)
-add_subdirectory(paddle/reader/tests)
+add_subdirectory(paddle/v2/reader/tests)
 add_subdirectory(paddle/v2/tests)
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
```
The old `paddle/reader/tests` CMakeLists goes away, taking its two per-file test targets with it:

```diff
-add_test(NAME reader_decorator_test
-    COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/reader/tests/decorator_test.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
-
-add_test(NAME reader_creator_test
-    COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/reader/tests/creator_test.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
```
`wrap_param_default` in `paddle.trainer_config_helpers.default_decorators` now preserves the wrapped function's argspec:

```diff
@@ -52,6 +52,10 @@ def wrap_param_default(param_names=None,
                 kwargs[name] = default_factory(func)
             return func(*args, **kwargs)
 
+        if hasattr(func, 'argspec'):
+            __wrapper__.argspec = func.argspec
+        else:
+            __wrapper__.argspec = inspect.getargspec(func)
         return __wrapper__
 
     return __impl__
```
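Why this matters: a plain decorator hides the wrapped function's signature, so `inspect.getargspec` on the wrapper only reports `(*args, **kwargs)`. Stashing the original argspec on the wrapper lets the v2 layer auto-conversion later in this commit recover the real parameter names. A self-contained sketch of the pattern (the names here are illustrative, not from the commit):

```python
import functools
import inspect


def preserving(func):
    @functools.wraps(func)
    def __wrapper__(*args, **kwargs):
        return func(*args, **kwargs)

    # Reuse an argspec recorded by an inner decorator, if any, so stacked
    # decorators all report the innermost function's signature.
    if hasattr(func, 'argspec'):
        __wrapper__.argspec = func.argspec
    else:
        __wrapper__.argspec = inspect.getargspec(func)
    return __wrapper__


@preserving
def fc_layer(input, size, act=None):
    pass


print fc_layer.argspec.args  # ['input', 'size', 'act']
```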
`layer_support` in `paddle.trainer_config_helpers.layers` gets the same treatment, along with the `inspect` import it needs:

```diff
@@ -14,6 +14,7 @@
 
 import functools
 import collections
+import inspect
 
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
@@ -316,6 +317,11 @@ def layer_support(*attrs):
                     val.check(method.__name__)
             return method(*args, **kwargs)
 
+        if hasattr(method, 'argspec'):
+            wrapper.argspec = method.argspec
+        else:
+            wrapper.argspec = inspect.getargspec(method)
         return wrapper
 
     return decorator
```
`paddle/v2/__init__.py` wires in the new `dataset` and `reader` subpackages:

```diff
@@ -20,13 +20,16 @@ import event
 import data_type
 import topology
 import data_feeder
+from . import dataset
+from . import reader
 import attr
 import pooling
 import py_paddle.swig_paddle as api
 
 __all__ = [
     'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
-    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'topology'
+    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
+    'topology'
 ]
```
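With these imports in place, both subpackages hang off the package root, which is exactly what the demo script above relies on (illustrative attribute accesses, assuming the package is installed):

```python
import paddle.v2 as paddle

paddle.dataset.mnist.train  # dataset readers
paddle.reader.batched       # reader decorators
```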
""" """
CIFAR Dataset. CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html
URL: https://www.cs.toronto.edu/~kriz/cifar.html
the default train_creator, test_creator used for CIFAR-10 dataset.
""" """
import cPickle import cPickle
import itertools import itertools
import tarfile
import numpy import numpy
import paddle.v2.dataset.common
import tarfile
from config import download __all__ = ['train100', 'test100', 'train10', 'test10']
__all__ = [
'cifar_100_train_creator', 'cifar_100_test_creator', 'train_creator',
'test_creator'
]
CIFAR10_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a' CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
CIFAR100_URL = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz' CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
def __read_batch__(filename, sub_name): def reader_creator(filename, sub_name):
def reader(): def read_batch(batch):
def __read_one_batch_impl__(batch):
data = batch['data'] data = batch['data']
labels = batch.get('labels', batch.get('fine_labels', None)) labels = batch.get('labels', batch.get('fine_labels', None))
assert labels is not None assert labels is not None
for sample, label in itertools.izip(data, labels): for sample, label in itertools.izip(data, labels):
yield (sample / 255.0).astype(numpy.float32), int(label) yield (sample / 255.0).astype(numpy.float32), int(label)
def reader():
with tarfile.open(filename, mode='r') as f: with tarfile.open(filename, mode='r') as f:
names = (each_item.name for each_item in f names = (each_item.name for each_item in f
if sub_name in each_item.name) if sub_name in each_item.name)
for name in names: for name in names:
batch = cPickle.load(f.extractfile(name)) batch = cPickle.load(f.extractfile(name))
for item in __read_one_batch_impl__(batch): for item in read_batch(batch):
yield item yield item
return reader return reader
def cifar_100_train_creator(): def train100():
fn = download(url=CIFAR100_URL, md5=CIFAR100_MD5) return reader_creator(
return __read_batch__(fn, 'train') paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
'train')
def cifar_100_test_creator():
fn = download(url=CIFAR100_URL, md5=CIFAR100_MD5)
return __read_batch__(fn, 'test')
def train_creator():
"""
Default train reader creator. Use CIFAR-10 dataset.
"""
fn = download(url=CIFAR10_URL, md5=CIFAR10_MD5)
return __read_batch__(fn, 'data_batch')
def test_creator(): def test100():
""" return reader_creator(
Default test reader creator. Use CIFAR-10 dataset. paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
""" 'test')
fn = download(url=CIFAR10_URL, md5=CIFAR10_MD5)
return __read_batch__(fn, 'test_batch')
def unittest(): def train10():
for _ in train_creator()(): return reader_creator(
pass paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
for _ in test_creator()(): 'data_batch')
pass
if __name__ == '__main__': def test10():
unittest() return reader_creator(
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
'test_batch')
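Assuming the download succeeds, each reader call returns a fresh generator over `(image, label)` samples; images arrive as flat float32 arrays of 3072 values (3x32x32) scaled into [0, 1]. A quick sketch:

```python
import paddle.v2.dataset.cifar as cifar

# Triggers the cached download on first use, then streams samples.
sample, label = next(cifar.train10()())
print sample.shape, sample.dtype, label  # e.g. (3072,) float32 6
```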
New file `paddle/v2/dataset/common.py` centralizes the download and MD5 helpers that every dataset module shares:

```python
import requests
import hashlib
import os
import shutil

__all__ = ['DATA_HOME', 'download', 'md5file']

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')

if not os.path.exists(DATA_HOME):
    os.makedirs(DATA_HOME)


def md5file(fname):
    hash_md5 = hashlib.md5()
    f = open(fname, "rb")
    for chunk in iter(lambda: f.read(4096), b""):
        hash_md5.update(chunk)
    f.close()
    return hash_md5.hexdigest()


def download(url, module_name, md5sum):
    dirname = os.path.join(DATA_HOME, module_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    filename = os.path.join(dirname, url.split('/')[-1])
    if not (os.path.exists(filename) and md5file(filename) == md5sum):
        r = requests.get(url, stream=True)
        with open(filename, 'w') as f:
            shutil.copyfileobj(r.raw, f)

    return filename
```
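Downloads are cached per dataset module and re-fetched only when the on-disk MD5 mismatches. One caveat worth noting: the file is opened with mode `'w'`; on POSIX that behaves identically to `'wb'`, but binary mode would be safer elsewhere. Usage sketch (URL and checksum taken from the CIFAR module above):

```python
import paddle.v2.dataset.common as common

path = common.download(
    'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz', 'cifar',
    'c58f30108f718f92721af3b95e74349a')
# -> ~/.cache/paddle/dataset/cifar/cifar-10-python.tar.gz, verified by md5file()
print path
```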
The new module replaces the old `config.py` helper, which keyed the cache directory by MD5 and looped on `urllib2` until the checksum matched:

```diff
-import hashlib
-import os
-import shutil
-import urllib2
-
-__all__ = ['DATA_HOME', 'download']
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle_data_set')
-
-if not os.path.exists(DATA_HOME):
-    os.makedirs(DATA_HOME)
-
-
-def download(url, md5):
-    filename = os.path.split(url)[-1]
-    assert DATA_HOME is not None
-    filepath = os.path.join(DATA_HOME, md5)
-    if not os.path.exists(filepath):
-        os.makedirs(filepath)
-    __full_file__ = os.path.join(filepath, filename)
-
-    def __file_ok__():
-        if not os.path.exists(__full_file__):
-            return False
-        md5_hash = hashlib.md5()
-        with open(__full_file__, 'rb') as f:
-            for chunk in iter(lambda: f.read(4096), b""):
-                md5_hash.update(chunk)
-        return md5_hash.hexdigest() == md5
-
-    while not __file_ok__():
-        response = urllib2.urlopen(url)
-        with open(__full_file__, mode='wb') as of:
-            shutil.copyfileobj(fsrc=response, fdst=of)
-
-    return __full_file__
```
`paddle/v2/dataset/mnist.py` no longer round-trips through scikit-learn; it downloads the raw IDX files and streams them through `zcat`:

```diff
-import sklearn.datasets.mldata
-import sklearn.model_selection
-import numpy
-from config import DATA_HOME
-
-__all__ = ['train_creator', 'test_creator']
-
-
-def __mnist_reader_creator__(data, target):
-    def reader():
-        n_samples = data.shape[0]
-        for i in xrange(n_samples):
-            yield (data[i] / 255.0).astype(numpy.float32), int(target[i])
-
-    return reader
-
-
-TEST_SIZE = 10000
-
-data = sklearn.datasets.mldata.fetch_mldata(
-    "MNIST original", data_home=DATA_HOME)
-X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
-    data.data, data.target, test_size=TEST_SIZE, random_state=0)
-
-
-def train_creator():
-    return __mnist_reader_creator__(X_train, y_train)
-
-
-def test_creator():
-    return __mnist_reader_creator__(X_test, y_test)
-
-
-def unittest():
-    assert len(list(test_creator()())) == TEST_SIZE
-
-
-if __name__ == '__main__':
-    unittest()
+"""
+MNIST dataset.
+"""
+import paddle.v2.dataset.common
+import subprocess
+import numpy
+import platform
+
+__all__ = ['train', 'test']
+
+URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
+TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
+TEST_IMAGE_MD5 = '25e3cc63507ef6e98d5dc541e8672bb6'
+TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
+TEST_LABEL_MD5 = '4e9511fe019b2189026bd0421ba7b688'
+TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
+TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
+TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
+TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
+
+
+def reader_creator(image_filename, label_filename, buffer_size):
+    def reader():
+        if platform.system() == 'Darwin':
+            zcat_cmd = 'gzcat'
+        elif platform.system() == 'Linux':
+            zcat_cmd = 'zcat'
+        else:
+            raise NotImplementedError()
+
+        # According to http://stackoverflow.com/a/38061619/724872, we
+        # cannot use standard package gzip here.
+        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
+        m.stdout.read(16)  # skip some magic bytes
+
+        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
+        l.stdout.read(8)  # skip some magic bytes
+
+        while True:
+            labels = numpy.fromfile(
+                l.stdout, 'ubyte', count=buffer_size).astype("int")
+
+            if labels.size != buffer_size:
+                break  # numpy.fromfile returns empty slice after EOF.
+
+            images = numpy.fromfile(
+                m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
+                    (buffer_size, 28 * 28)).astype('float32')
+
+            images = images / 255.0 * 2.0 - 1.0
+
+            for i in xrange(buffer_size):
+                yield images[i, :], int(labels[i])
+
+        m.terminate()
+        l.terminate()
+
+    return reader
+
+
+def train():
+    return reader_creator(
+        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
+                                          TRAIN_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
+                                          TRAIN_LABEL_MD5), 100)
+
+
+def test():
+    return reader_creator(
+        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
+                                          TEST_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
+                                          TEST_LABEL_MD5), 100)
```
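The new reader shells out to `zcat`/`gzcat` (so that binary must be on `PATH`) and yields 784-dim float32 images scaled into [-1, 1], decoded 100 samples per buffered chunk. A quick smoke test, assuming network access:

```python
import paddle.v2.dataset.mnist as mnist

image, label = next(mnist.train()())
print image.shape, image.min(), image.max()  # (784,) values within [-1.0, 1.0]
```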
One more dataset module (a zip-archive one) switches its download import from `config` to `common`:

```diff
 import zipfile
-from config import download
+from common import download
 import re
 import random
 import functools
```
A new unit test sweeps all four CIFAR readers, checking the sample size, instance count, and maximum label value:

```python
import paddle.v2.dataset.cifar
import unittest


class TestCIFAR(unittest.TestCase):
    def check_reader(self, reader):
        sum = 0
        label = 0
        for l in reader():
            self.assertEqual(l[0].size, 3072)
            if l[1] > label:
                label = l[1]
            sum += 1
        return sum, label

    def test_test10(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.cifar.test10())
        self.assertEqual(instances, 10000)
        self.assertEqual(max_label_value, 9)

    def test_train10(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.cifar.train10())
        self.assertEqual(instances, 50000)
        self.assertEqual(max_label_value, 9)

    def test_test100(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.cifar.test100())
        self.assertEqual(instances, 10000)
        self.assertEqual(max_label_value, 99)

    def test_train100(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.cifar.train100())
        self.assertEqual(instances, 50000)
        self.assertEqual(max_label_value, 99)


if __name__ == '__main__':
    unittest.main()
```
Another covers `md5file` and `download` in `common`:

```python
import paddle.v2.dataset.common
import unittest
import tempfile


class TestCommon(unittest.TestCase):
    def test_md5file(self):
        _, temp_path = tempfile.mkstemp()
        with open(temp_path, 'w') as f:
            f.write("Hello\n")
        self.assertEqual('09f7e02f1290be211da707a266f153b3',
                         paddle.v2.dataset.common.md5file(temp_path))

    def test_download(self):
        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
        self.assertEqual(
            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
            paddle.v2.dataset.common.download(
                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))


if __name__ == '__main__':
    unittest.main()
```
And a third exercises the MNIST readers:

```python
import paddle.v2.dataset.mnist
import unittest


class TestMNIST(unittest.TestCase):
    def check_reader(self, reader):
        sum = 0
        label = 0
        for l in reader():
            self.assertEqual(l[0].size, 784)
            if l[1] > label:
                label = l[1]
            sum += 1
        return sum, label

    def test_train(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.mnist.train())
        self.assertEqual(instances, 60000)
        self.assertEqual(max_label_value, 9)

    def test_test(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.mnist.test())
        self.assertEqual(instances, 10000)
        self.assertEqual(max_label_value, 9)


if __name__ == '__main__':
    unittest.main()
```
`paddle/v2/layer.py` trims its hand-written `__all__` and replaces the sixty-entry `layer_list` table with automatic name mapping driven by `dir(conf_helps)` and the argspecs preserved above:

```diff
@@ -72,26 +72,15 @@ import paddle.trainer_config_helpers as conf_helps
 from paddle.trainer_config_helpers.config_parser_utils import \
     parse_network_config as __parse__
 from paddle.trainer_config_helpers.default_decorators import wrap_act_default
-from paddle.trainer_config_helpers.default_decorators import wrap_bias_attr_default
+from paddle.trainer_config_helpers.default_decorators import \
+    wrap_bias_attr_default
 from paddle.trainer_config_helpers.default_decorators import wrap_name_default
 from paddle.trainer_config_helpers.layers import layer_support
 
 import activation
 import data_type
 
-__all__ = [
-    'parse_network', 'data', 'fc', 'conv_shift', 'img_conv', 'img_pool', 'spp',
-    'maxout', 'img_cmrnorm', 'batch_norm', 'sum_to_one_norm', 'recurrent',
-    'lstmemory', 'grumemory', 'pool', 'last_seq', 'first_seq', 'concat',
-    'seq_concat', 'block_expand', 'expand', 'repeat', 'seq_reshape', 'addto',
-    'linear_comb', 'interpolation', 'bilinear_interp', 'power', 'scaling',
-    'slope_intercept', 'tensor', 'cos_sim', 'trans', 'max_id', 'sampling_id',
-    'pad', 'classification_cost', 'cross_entropy_cost',
-    'cross_entropy_with_selfnorm_cost', 'regression_cost',
-    'multi_binary_label_cross_entropy_cost', 'rank_cost', 'lambda_cost',
-    'sum_cost', 'huber_cost', 'crf', 'crf_decoding', 'ctc', 'warp_ctc', 'nce',
-    'hsigmoid', 'eos', 'memory', 'embedding', 'recurrent_group'
-]
+__all__ = ['parse_network', 'data']
 
 __projection_names__ = filter(lambda x: x.endswith('_projection'),
                               dir(conf_helps))
@@ -395,85 +384,51 @@ ExpandLevel = conf_helps.layers.ExpandLevel
 recurrent_group = RecurrentGroupV2
 memory = MemoryV2
 
-layer_list = [
-    # [V2LayerImpl, V1_method_name, parent_names]
-    # fully connected layers
-    ['fc', 'fc_layer', ['input']],
-    ['embedding', 'embedding_layer', ['input']],
-    # conv layers
-    ['conv_shift', 'conv_shift_layer', ['a', 'b']],
-    ['img_conv', 'img_conv_layer', ['input']],
-    # image pooling layers
-    ['img_pool', 'img_pool_layer', ['input']],
-    ['spp', 'spp_layer', ['input']],
-    ['maxout', 'maxout_layer', ['input']],
-    # norm layers
-    ['img_cmrnorm', 'img_cmrnorm_layer', ['input']],
-    ['batch_norm', 'batch_norm_layer', ['input']],
-    ['sum_to_one_norm', 'sum_to_one_norm_layer', ['input']],
-    # recurrent layers
-    ['recurrent', 'recurrent_layer', ['input']],
-    ['lstmemory', 'lstmemory', ['input']],
-    ['grumemory', 'grumemory', ['input']],
-    # aggregate layers
-    ['pool', 'pooling_layer', ['input']],
-    ['last_seq', 'last_seq', ['input']],
-    ['first_seq', 'first_seq', ['input']],
-    ['concat', 'concat_layer', ['input']],
-    ['seq_concat', 'seq_concat_layer', ['a', 'b']],
-    # reshaping layers
-    ['block_expand', 'block_expand_layer', ['input']],
-    ['expand', 'expand_layer', ['input', 'expand_as']],
-    ['repeat', 'repeat_layer', ['input']],
-    ['rotate', 'rotate_layer', ['input']],
-    ['seq_reshape', 'seq_reshape_layer', ['input']],
-    # math layers
-    ['addto', 'addto_layer', ['input']],
-    ['linear_comb', 'linear_comb_layer', ['weights', 'vectors']],
-    ['interpolation', 'interpolation_layer', ['input', 'weight']],
-    ['bilinear_interp', 'bilinear_interp_layer', ['input']],
-    ['power', 'power_layer', ['input', 'weight']],
-    ['scaling', 'scaling_layer', ['input', 'weight']],
-    ['slope_intercept', 'slope_intercept_layer', ['input']],
-    ['tensor', 'tensor_layer', ['a', 'b']],
-    ['cos_sim', 'cos_sim', ['a', 'b']],
-    ['trans', 'trans_layer', ['input']],
-    # sampling layers
-    ['max_id', 'maxid_layer', ['input']],
-    ['sampling_id', 'sampling_id_layer', ['input']],
-    # slicing and joining layers
-    ['pad', 'pad_layer', ['input']],
-    # cost layers
-    [
-        'classification_cost', 'classification_cost',
-        ['input', 'label', 'weight']
-    ],
-    ['regression_cost', 'regression_cost', ['input', 'label', 'weight']],
-    ['cross_entropy_cost', 'cross_entropy', ['input', 'label']],
-    [
-        'cross_entropy_with_selfnorm_cost', 'cross_entropy_with_selfnorm',
-        ['input', 'label']
-    ],
-    [
-        'multi_binary_label_cross_entropy_cost',
-        'multi_binary_label_cross_entropy', ['input', 'label']
-    ],
-    ['rank_cost', 'rank_cost', ['left', 'right', 'label', 'weight']],
-    ['lambda_cost', 'lambda_cost', ['input', 'score']],
-    ['sum_cost', 'sum_cost', ['input']],
-    ['huber_cost', 'huber_cost', ['input', 'label']],
-    ['crf', 'crf_layer', ['input', 'label']],
-    ['crf_decoding', 'crf_decoding_layer', ['input']],
-    ['ctc', 'ctc_layer', ['input', 'label']],
-    ['warp_ctc', 'warp_ctc_layer', ['input', 'label']],
-    ['nce', 'nce_layer', ['input', 'label']],
-    ['hsigmoid', 'hsigmoid', ['input', 'label']],
-    # check layers
-    ['eos', 'eos_layer', ['input']],
-    ['gru_step_layer', 'gru_step_layer', ['input', 'output_mem']]
-]
-for l in layer_list:
-    globals()[l[0]] = __convert_to_v2__(l[1], l[2])
+
+def __layer_name_mapping__(inname):
+    if inname in ['data_layer', 'memory', 'mixed_layer']:
+        # Do Not handle these layers
+        return
+    elif inname == 'maxid_layer':
+        return 'max_id'
+    elif inname.endswith('memory') or inname.endswith(
+            '_seq') or inname.endswith('_sim') or inname == 'hsigmoid':
+        return inname
+    elif inname in [
+            'cross_entropy', 'multi_binary_label_cross_entropy',
+            'cross_entropy_with_selfnorm'
+    ]:
+        return inname + "_cost"
+    elif inname.endswith('_cost'):
+        return inname
+    elif inname.endswith("_layer"):
+        return inname[:-len("_layer")]
+
+
+def __layer_name_mapping_parent_names__(inname):
+    all_args = getattr(conf_helps, inname).argspec.args
+    return filter(
+        lambda x: x in ['input1', 'input2', 'label', 'input', 'a', 'b',
+                        'expand_as', 'weights', 'vectors', 'weight', 'score',
+                        'left', 'right'],
+        all_args)
+
+
+def __convert_layer__(_new_name_, _old_name_, _parent_names_):
+    global __all__
+    __all__.append(_new_name_)
+    globals()[new_name] = __convert_to_v2__(_old_name_, _parent_names_)
+
+
+for each_layer_name in dir(conf_helps):
+    new_name = __layer_name_mapping__(each_layer_name)
+    if new_name is not None:
+        parent_names = __layer_name_mapping_parent_names__(each_layer_name)
+        assert len(parent_names) != 0, each_layer_name
+        __convert_layer__(new_name, each_layer_name, parent_names)
+
+del parent_names
+del new_name
+del each_layer_name
 
 # convert projection
 for prj in __projection_names__:
```
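To make the mapping concrete, here is what `__layer_name_mapping__` returns for a few representative `conf_helps` names (a sketch of expected behavior, not code from the commit). Note also that `__convert_layer__` assigns `globals()[new_name]` rather than its `_new_name_` parameter; this works only because the module-level loop keeps `new_name` in sync at each call site, and `_new_name_` appears to be the intended variable.

```python
assert __layer_name_mapping__('fc_layer') == 'fc'                # '_layer' stripped
assert __layer_name_mapping__('maxid_layer') == 'max_id'         # special case
assert __layer_name_mapping__('cross_entropy') == 'cross_entropy_cost'
assert __layer_name_mapping__('rank_cost') == 'rank_cost'        # already a cost
assert __layer_name_mapping__('lstmemory') == 'lstmemory'        # ends with 'memory'
assert __layer_name_mapping__('data_layer') is None              # handled elsewhere
```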
The v2 reader decorator module exports a new `batched` decorator:

```diff
@@ -14,7 +14,7 @@
 
 __all__ = [
     'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned'
+    'ComposeNotAligned', 'batched'
 ]
 
 from Queue import Queue
@@ -191,3 +191,25 @@ def buffered(reader, size):
                 e = q.get()
 
     return data_reader
+
+
+def batched(reader, batch_size):
+    """
+    Create a batched reader.
+    :param reader: the data reader to read from.
+    :param batch_size: batch_size
+    :return: the batched reader.
+    """
+
+    def batched_reader():
+        r = reader()
+        batch = []
+        for instance in r:
+            batch.append(instance)
+            if len(batch) == batch_size:
+                yield batch
+                batch = []
+        if batch:
+            yield batch
+
+    return batched_reader
```
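`batched` turns a stream of instances into a stream of lists; the trailing partial batch is still yielded, which matches the `len(data_batch)` accounting in the trainer below. A quick illustration:

```python
def read():
    for i in xrange(10):
        yield i

for data_batch in batched(read, 3)():
    print data_batch  # [0, 1, 2], [3, 4, 5], [6, 7, 8], then [9]
```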
A single CMake target now drives the relocated reader tests:

```cmake
add_test(NAME reader_tests
  COMMAND bash ${PROJ_ROOT}/python/paddle/v2/reader/tests/run_tests.sh
  ${PYTHON_EXECUTABLE})
```
`creator_test.py` is updated for the `paddle.v2.reader` namespace:

```diff
@@ -11,17 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 import unittest
-import paddle.reader.creator
 import numpy as np
-import os
+import paddle.v2.reader.creator
 
 
 class TestNumpyArray(unittest.TestCase):
     def test_numpy_array(self):
         l = [[1, 2, 3], [4, 5, 6]]
         x = np.array(l, np.int32)
-        reader = paddle.reader.creator.np_array(x)
+        reader = paddle.v2.reader.creator.np_array(x)
         for idx, e in enumerate(reader()):
             self.assertItemsEqual(e, l[idx])
@@ -29,7 +31,7 @@ class TestNumpyArray(unittest.TestCase):
 class TestTextFile(unittest.TestCase):
     def test_text_file(self):
         path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
-        reader = paddle.reader.creator.text_file(path)
+        reader = paddle.v2.reader.creator.text_file(path)
         for idx, e in enumerate(reader()):
             self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
```
`decorator_test.py` gets the same namespace update throughout:

```diff
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import unittest
-import paddle.reader
 import time
+import unittest
+import paddle.v2.reader
 
 
 def reader_creator_10(dur):
@@ -37,7 +38,7 @@ class TestMap(unittest.TestCase):
             yield "h"
             yield "i"
 
-        r = paddle.reader.map_readers(tokenize, read)
+        r = paddle.v2.reader.map_readers(tokenize, read)
         for i, e in enumerate(r()):
             self.assertEqual(e, i)
@@ -45,7 +46,7 @@
 class TestBuffered(unittest.TestCase):
     def test_read(self):
         for size in range(20):
-            b = paddle.reader.buffered(reader_creator_10(0), size)
+            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
             c = 0
             for i in b():
                 self.assertEqual(i, c)
@@ -54,7 +55,7 @@
     def test_buffering(self):
         # read have 30ms delay.
-        b = paddle.reader.buffered(reader_creator_10(0.03), 10)
+        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
         last_time = time.time()
         for idx, i in enumerate(b()):
             elapsed_time = time.time() - last_time
@@ -68,17 +69,17 @@
 class TestCompose(unittest.TestCase):
     def test_compse(self):
-        reader = paddle.reader.compose(
+        reader = paddle.v2.reader.compose(
             reader_creator_10(0), reader_creator_10(0))
         for idx, e in enumerate(reader()):
             self.assertEqual(e, (idx, idx))
 
     def test_compose_not_aligned(self):
         total = 0
-        reader = paddle.reader.compose(
-            paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
             reader_creator_10(0))
-        with self.assertRaises(paddle.reader.ComposeNotAligned):
+        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
             for e in reader():
                 total += 1
         # expecting 10, not 20
@@ -86,8 +87,8 @@
     def test_compose_not_aligned_no_check(self):
         total = 0
-        reader = paddle.reader.compose(
-            paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
             reader_creator_10(0),
             check_alignment=False)
         for e in reader():
@@ -98,7 +99,7 @@
 class TestChain(unittest.TestCase):
     def test_chain(self):
-        c = paddle.reader.chain(reader_creator_10(0), reader_creator_10(0))
+        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
         idx = 0
         for e in c():
             self.assertEqual(e, idx % 10)
@@ -111,7 +112,7 @@ class TestShuffle(unittest.TestCase):
         case = [(0, True), (1, True), (10, False), (100, False)]
         a = reader_creator_10(0)
         for size, checkEq in case:
-            s = paddle.reader.shuffle(a, size)
+            s = paddle.v2.reader.shuffle(a, size)
             total = 0
             for idx, e in enumerate(s()):
                 if checkEq:
```
A new `run_tests.sh` driver installs the built wheel and runs both test files with the interpreter passed as its first argument:

```bash
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

pushd `dirname $0` > /dev/null
SCRIPTPATH=$PWD
popd > /dev/null
cd $SCRIPTPATH

$1 -m pip install ../../../../../paddle/dist/*.whl

test_list="creator_test.py decorator_test.py"

export PYTHONPATH=$PWD/../../../../../python/

for fn in $test_list
do
  echo "test $fn"
  $1 $fn
  if [ $? -ne 0 ]; then
    exit 1
  fi
done
```
A v2 layer test follows the rename of `layer.pool` to `layer.pooling`, a consequence of the automatic mapping (`pooling_layer` minus the `_layer` suffix):

```diff
@@ -66,7 +66,7 @@ class ImageLayerTest(unittest.TestCase):
 class AggregateLayerTest(unittest.TestCase):
     def test_aggregate_layer(self):
-        pool = layer.pool(
+        pool = layer.pooling(
             input=pixel,
             pooling_type=pooling.Avg(),
             agg_level=layer.AggregateLevel.EACH_SEQUENCE)
```
`paddle/v2/trainer.py` drops the interim batching helpers now that readers arrive pre-batched, and the argument checks raise `TypeError` instead of `ValueError`:

```diff
@@ -27,19 +27,13 @@ class ITrainer(object):
     The interface of Trainer. The only exposed method is `train`.
     """
 
-    def train(self,
-              train_data_reader,
-              cost,
-              parameters,
-              test_data_reader=None,
-              event_handler=None):
+    def train(self, reader, topology, parameters, event_handler=None):
         """
         train method.
 
-        :param train_data_reader:
-        :param cost:
+        :param reader:
+        :param topology:
         :param parameters:
-        :param test_data_reader:
         :param event_handler:
         :return:
         """
@@ -61,26 +55,22 @@ class SGD(ITrainer):
         self.__optimizer__ = update_equation
 
     def train(self,
-              train_data_reader,
+              reader,
               cost,
               parameters,
               num_passes=1,
-              test_data_reader=None,
               event_handler=None,
-              batch_size=32,
               reader_dict=None):
         """
         Training method. Will train num_passes of input data.
 
-        :param train_data_reader:
-        :param cost: cost layers, to be optimized.
+        :param reader:
+        :param topology: Network Topology, use one or more Layers to represent it.
         :param parameters: The parameter pools.
         :param num_passes: The total train passes.
-        :param test_data_reader:
         :param event_handler: Event handler. A method will be invoked when event
                               occurred.
         :type event_handler: (BaseEvent) => None
-        :param batch_size: Not important, will be removed after data refactor.
         :return:
         """
         if event_handler is None:
@@ -112,9 +102,9 @@ class SGD(ITrainer):
             event_handler(v2_event.BeginPass(pass_id))
             pass_evaluator.start()
             updater.startPass()
-            for batch_id, data_batch in enumerate(
-                    __data_reader_to_batch__(train_data_reader, batch_size,
-                                             topology)):
+            for batch_id, data_batch in enumerate(reader()):
+                pass_type = updater.startBatch(len(data_batch))
+                gm.forwardBackward(feeder(data_batch), out_args, pass_type)
                 batch_evaluator.start()
                 event_handler(
                     v2_event.BeginIteration(
@@ -144,56 +134,19 @@ class SGD(ITrainer):
         gm.finish()
 
 
-def __data_reader_to_batch__(reader, batch_size, topology):
-    """
-    This function is not important, and will be removed when data refactored.
-    """
-
-    def input_reorder(func):
-        for item in func():
-            retv = []
-            for __layer_name__ in topology.proto().input_layer_names:
-                retv.append(item[__layer_name__])
-            yield retv
-
-    return __generator_to_batch__(input_reorder(reader), batch_size=batch_size)
-
-
-def __generator_to_batch__(generator, batch_size):
-    """
-    This function is not important, and will be removed when data refactored.
-    """
-    ret_val = list()
-    for each_item in generator:
-        ret_val.append(each_item)
-        if len(ret_val) == batch_size:
-            yield ret_val
-            ret_val = list()
-    if len(ret_val) != 0:
-        yield ret_val
-
-
-def __check_train_args__(train_data_reader, topology, parameters,
-                         test_data_reader, event_handler, **kwargs):
+def __check_train_args__(reader, topology, parameters, event_handler, **kwargs):
     """
     Check train function's argument types
     """
-    if not callable(train_data_reader) or not isinstance(train_data_reader(),
-                                                         collections.Iterator):
-        raise ValueError('train_data_reader should be a function, '
-                         'which can return a iterator')
-
-    if test_data_reader is not None:
-        if not callable(test_data_reader) or not isinstance(
-                test_data_reader(), collections.Iterator):
-            raise ValueError('test_data_reader should be a function, which can '
-                             'return a iterator')
+    if not callable(reader) or not isinstance(reader(), collections.Iterator):
+        raise TypeError('train_data_reader should be a function, '
+                        'which can return a iterator')
 
     if not isinstance(topology, Topology):
-        raise ValueError('topology should be a model config')
+        raise TypeError('topology should be a model config')
 
     if not isinstance(parameters, v2_parameters.Parameters):
-        raise ValueError('parameters should be a parameter pool')
+        raise TypeError('parameters should be a parameter pool')
 
     if not callable(event_handler):
-        raise ValueError('event handler should be a function')
+        raise TypeError('event handler should be a function')
```
Finally, setup.py registers the two new subpackages:

```diff
@@ -5,7 +5,9 @@ packages=['paddle',
           'paddle.trainer',
           'paddle.trainer_config_helpers',
           'paddle.utils',
-          'paddle.v2']
+          'paddle.v2',
+          'paddle.v2.dataset',
+          'paddle.v2.reader']
 
 setup(name='paddle',
       version='${PADDLE_VERSION}',
```