diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index 7c6b83541002071d6e9d00c17be97b6ce4bf8528..edc2e0292378fea0cd904d7f017762c1dade6caf 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -17,6 +17,7 @@ import collections import swig_paddle import numpy import itertools +from functools import reduce __all__ = ['DataProviderConverter'] @@ -65,6 +66,8 @@ class IScanner(object): :param argument: Output arguments object. :type argument: swig_paddle.Arguments + :param dat: Output arguments object. + :type dat: The Python object, numpy.array or List. :return: """ pass @@ -95,17 +98,35 @@ class DenseScanner(IScanner): def __init__(self, input_type, pos): IScanner.__init__(self, input_type, pos) self.__mat__ = None + self.__shape__ = None self.__height__ = 0 + self.__dim__ = 0 def pre_scan(self, dat): self.__height__ += 1 + if self.__shape__ is None: + self.__shape__ = numpy.array(dat).shape + if len(self.__shape__) > 3: + raise ValueError( + "The dimension of input cannot be greater than 3.") + self.__dim__ = reduce(lambda x, y: x * y, self.__shape__) + if len(self.__shape__) == 1 and self.__dim__ != self.input_type.dim: + raise ValueError( + "The data size must be equal to it in data layer.") + else: + if self.__shape__ != numpy.array(dat).shape: + raise ValueError( + "The data shape must be same in one mini-batch.") def finish_pre_scan(self, argument): self.__mat__ = numpy.ndarray( - shape=(self.__height__, self.input_type.dim), dtype=numpy.float32) + shape=(self.__height__, self.__dim__), dtype=numpy.float32) self.__height__ = 0 def scan(self, dat): + # It's better to use NumPy array for speed. + dat = numpy.array(dat) + dat = dat.flatten() self.__mat__[self.__height__] = dat self.__height__ += 1 @@ -116,6 +137,14 @@ class DenseScanner(IScanner): m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, self.data_in_gpu) argument.setSlotValue(self.pos, m) + if len(self.__shape__) > 1: + # The last-two dimenstions are the frame height and width. + # For example, the layout is CHW for 3-D feature of image. + # The H and W are the fram height and width. + h, w = self.__shape__[-2:] + argument.setSlotFrameHeight(self.pos, h) + argument.setSlotFrameWidth(self.pos, w) + self.__shape__ = None class SparseBinaryScanner(IScanner): diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index a36f0ebfdcb9f90f54ba2d688f9f4bcee2939ef3..7e305e2cd9fbe306368a44d08f7f66b4185ae2d2 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -72,9 +72,16 @@ class InputType(object): def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE): """ - Dense Vector. It means the input feature is dense float vector. For example, - if the input is an image with 28*28 pixels, the input of Paddle neural - network should be a dense vector with dimension 784. + Dense Array. It means the input feature is dense array with float type. + For example, if the input is an image with 28*28 pixels, the input of + Paddle neural network could be a dense vector with dimension 784 or a + numpy array with shape (28, 28). + + For the 2-D convolution operation, each sample in one mini-batch must have + the similarly size in PaddlePaddle now. But, it supports variable-dimension + feature across mini-batch. For the variable-dimension, the param dim is not + used. While the data reader must yield numpy array and the data feeder will + set the data shape correctly. :param dim: dimension of this vector. :type dim: int @@ -135,6 +142,10 @@ sparse_binary_vector = sparse_non_value_slot sparse_vector = sparse_value_slot integer_value = index_slot +# dense_array can be used for variable-length input feature. +# Each feature is not a vector, but a multi-dimensional array. +dense_array = dense_slot + def dense_vector_sequence(dim): """ diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py index d582f76ddf01ed3430a1d075624bbb8e0bf3f2a9..226997465f2ec97c6224b248427739592e9694df 100644 --- a/python/paddle/v2/data_type.py +++ b/python/paddle/v2/data_type.py @@ -16,7 +16,8 @@ import paddle.trainer.PyDataProvider2 as pydp2 import_list = [ nm for nm in dir(pydp2) - if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm) + if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm or + 'array' in nm) ] import_list.extend(['InputType']) diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py index 71eb3bf31425c22b47accc11c9550042e077ef12..83da678da387ed1c86868847f140c6c09fbec3b5 100644 --- a/python/paddle/v2/tests/test_data_feeder.py +++ b/python/paddle/v2/tests/test_data_feeder.py @@ -233,6 +233,30 @@ class DataFeederTest(unittest.TestCase): self.assertEqual(out_sparse.getSparseRowCols(i), data[i][1]) self.assertEqual(out_index[i], data[i][0]) + def test_dense_set_shape(self): + # test 2-D data + def gen_data(batch_size, shape): + data = [] + for i in xrange(batch_size): + each_sample = [] + each_sample.append(np.random.random(shape)) + data.append(each_sample) + return data + + feeder = DataFeeder([('image', data_type.dense_array(2352))], + {'image': 0}) + arg = feeder(gen_data(32, (3, 28, 28))) + h = arg.getSlotFrameHeight(0) + w = arg.getSlotFrameWidth(0) + self.assertEqual(h, 28) + self.assertEqual(w, 28) + + arg = feeder(gen_data(32, (3, 30, 32))) + h = arg.getSlotFrameHeight(0) + w = arg.getSlotFrameWidth(0) + self.assertEqual(h, 30) + self.assertEqual(w, 32) + if __name__ == '__main__': api.initPaddle("--use_gpu=0")