diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py
index 6b0cfa0b05c966cb86c133093bb5b3555b76d42a..767dffdad82f59d73b1505586260a61f5008f1f8 100644
--- a/demo/mnist/api_train_v2.py
+++ b/demo/mnist/api_train_v2.py
@@ -44,13 +44,13 @@ def main():
         topology=cost,
         parameters=parameters,
         event_handler=event_handler,
-        num_passes=100,
-        batch_size=200,  # batch size should be refactor in Data reader
-        data_types={  # data_types will be removed, It should be in
+        batch_size=32,  # batch size should be refactored into the Data reader
+        data_types=[  # data_types will be removed; it should be in
             # network topology
-            'pixel': images.type,
-            'label': label.type
-        })
+            ('pixel', images.type),
+            ('label', label.type)],
+        reader_dict={'pixel': 0, 'label': 1}
+    )


 if __name__ == '__main__':
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 21d1cb75f4d40e6ed011b33c6366c9d31c0fcc7c..2690cafe1d8d32bf52cd9e5fa4dc69fbacb2d66c 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -23,7 +23,8 @@ __all__ = ['DataProviderConverter']
 class IScanner(object):
     def __init__(self, input_type, pos):
         self.input_type = input_type
-        assert isinstance(self.input_type, dp2.InputType)
+        if not isinstance(self.input_type, dp2.InputType):
+            raise ValueError("input type should be dataprovider2.InputType")
         self.pos = pos

     def scan(self, dat):
@@ -50,7 +51,6 @@ class DenseScanner(IScanner):

     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
-        assert isinstance(self.input_type, dp2.InputType)
         if self.__mat__.dtype != numpy.float32:
             self.__mat__ = self.__mat__.astype(numpy.float32)
         m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
@@ -63,7 +63,6 @@ class SparseBinaryScanner(IScanner):
         self.__rows__ = [0]
         self.__cols__ = []
         self.__height__ = 0
-        self.__nnz__ = 0
         self.__value__ = []

     def scan(self, dat):
@@ -76,7 +75,6 @@ class SparseBinaryScanner(IScanner):

     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
-        assert isinstance(self.input_type, dp2.InputType)
         m = swig_paddle.Matrix.createSparse(self.__height__,
                                             self.input_type.dim,
                                             len(self.__cols__),
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 0cf7b8e9039f7393ae9fcf73faeeeb8fbf11df31..49d1983a2a422b7e105c66dd92419426f0853212 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -18,12 +18,13 @@ import parameters
 import trainer
 import event
 import data_type
+import data_feeder
 import attr
 import py_paddle.swig_paddle as api

 __all__ = [
     'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
-    'event', 'data_type', 'attr'
+    'event', 'data_type', 'attr', 'data_feeder'
 ]
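A quick note on the `reader_dict` argument introduced in the `demo/mnist/api_train_v2.py` hunk above: it maps each input name listed in `data_types` to a column position in the samples produced by the data reader. The snippet below is an illustrative sketch only; the minibatch values are made up and are not part of this diff.

```python
# With reader_dict={'pixel': 0, 'label': 1}, every sample yielded by the
# reader is indexed by position: column 0 feeds the 'pixel' input and
# column 1 feeds the 'label' input of the network topology.
minibatch = [
    ([0.0] * 784, 3),  # sample 0: 784-dim pixel vector, then its label
    ([0.5] * 784, 7),  # sample 1
]
```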
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a16d46dda47f822dd2d6c168528dd6cec53ab4e
--- /dev/null
+++ b/python/paddle/v2/data_feeder.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from py_paddle import swig_paddle
+from py_paddle import DataProviderConverter
+import data_type
+
+__all__ = ['DataFeeder']
+
+
+class DataFeeder(DataProviderConverter):
+    """
+    DataFeeder converts the data returned by paddle.reader into a data structure
+    of Arguments which is defined in the API. The paddle.reader usually returns
+    a list of mini-batch data entries. Each data entry in the list is one sample.
+    Each sample is a list or a tuple with one feature or multiple features.
+    DataFeeder converts these mini-batch data entries into Arguments in order
+    to feed them to the C++ interface.
+
+    The example usage:
+
+        data_types = [('image', paddle.data_type.dense_vector(784)),
+                      ('label', paddle.data_type.integer_value(10))]
+        reader_dict = {'image': 0, 'label': 1}
+        feeder = DataFeeder(data_types=data_types, reader_dict=reader_dict)
+        minibatch_data = [
+            ([1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]),  # first sample
+            ([1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8])   # second sample
+        ]
+        # or minibatch_data = [
+        #     [[1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]],  # first sample
+        #     [[1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]]   # second sample
+        # ]
+        arg = feeder(minibatch_data)
+    """
+
+    def __init__(self, data_types, reader_dict):
+        """
+        :param data_types: A list to specify data name and type. Each item is
+                           a tuple of (data_name, data_type). For example:
+                           [('image', paddle.data_type.dense_vector(784)),
+                            ('label', paddle.data_type.integer_value(10))]
+
+        :type data_types: A list of tuple
+        :param reader_dict: A dictionary to specify the position of each data
+                            in the input data.
+        :type reader_dict: dict()
+        """
+        self.input_names = []
+        input_types = []
+        self.reader_dict = reader_dict
+        for each in data_types:
+            self.input_names.append(each[0])
+            assert isinstance(each[1], data_type.InputType)
+            input_types.append(each[1])
+        DataProviderConverter.__init__(self, input_types)
+
+    def convert(self, dat, argument=None):
+        """
+        :param dat: A list of mini-batch data. Each sample is a list or a tuple
+                    with one feature or multiple features.
+                    For example:
+                    [
+                        ([0.2, 0.2], ),  # first sample
+                        ([0.8, 0.3], ),  # second sample
+                    ]
+                    or,
+                    [
+                        [[0.2, 0.2], ],  # first sample
+                        [[0.8, 0.3], ],  # second sample
+                    ]
+
+        :type dat: List
+        :param argument: An Arguments object that contains this mini-batch data
+                         with one or multiple features. The Arguments definition
+                         is in the API.
+        :type argument: swig_paddle.Arguments
+        """
+
+        def reorder_data(data):
+            retv = []
+            for each in data:
+                reorder = []
+                for name in self.input_names:
+                    reorder.append(each[self.reader_dict[name]])
+                retv.append(reorder)
+            return retv
+
+        return DataProviderConverter.convert(self, reorder_data(dat), argument)
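For readers skimming the new `DataFeeder` above, the sketch below (hypothetical, not part of the diff) shows the reordering behaviour end to end. It assumes a built `py_paddle` with the swig bindings available; the inspection calls (`getSlotValue`, `getSlotIds`, `copyToNumpyMat`, `copyToNumpyArray`) are the same ones used in the new unit tests further down.

```python
import py_paddle.swig_paddle as api
from paddle.v2 import data_type
from paddle.v2.data_feeder import DataFeeder

api.initPaddle("--use_gpu=0")  # the swig bindings must be initialized first

# The topology expects 'image' then 'label', but this reader happens to yield
# (label, image); reader_dict maps each input name to its column position.
feeder = DataFeeder(
    data_types=[('image', data_type.dense_vector(4)),
                ('label', data_type.integer_value(10))],
    reader_dict={'image': 1, 'label': 0})

minibatch = [
    (3, [0.1, 0.2, 0.3, 0.4]),  # first sample: label first, image second
    (7, [0.5, 0.6, 0.7, 0.8]),  # second sample
]
arg = feeder(minibatch)  # swig_paddle.Arguments; slots follow data_types order

print(arg.getSlotValue(0).copyToNumpyMat())  # 2x4 dense matrix for 'image'
print(arg.getSlotIds(1).copyToNumpyArray())  # [3, 7] for 'label'
```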
diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py
index 5b01ba4cd4866cf7b355fc0a6a667409cf9c4419..dd3ebfcb4267e1bb59011c81cb5a2716b8e45a6d 100644
--- a/python/paddle/v2/data_type.py
+++ b/python/paddle/v2/data_type.py
@@ -14,9 +14,9 @@

 from paddle.trainer.PyDataProvider2 import \
     InputType, dense_vector, sparse_binary_vector,\
-    sparse_vector, integer_value
+    sparse_vector, integer_value, integer_value_sequence

 __all__ = [
     'InputType', 'dense_vector', 'sparse_binary_vector', 'sparse_vector',
-    'integer_value'
+    'integer_value', 'integer_value_sequence'
 ]
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/paddle/v2/dataset/config.py b/python/paddle/v2/dataset/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..69e96d65ef1ef868aff5d46ddf3af250ca11e641
--- /dev/null
+++ b/python/paddle/v2/dataset/config.py
@@ -0,0 +1,8 @@
+import os
+
+__all__ = ['DATA_HOME']
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle_data_set')
+
+if not os.path.exists(DATA_HOME):
+    os.makedirs(DATA_HOME)
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..db84f37aa4fc3477b17599a48a4de9b45cfb6c1f
--- /dev/null
+++ b/python/paddle/v2/dataset/mnist.py
@@ -0,0 +1,39 @@
+import sklearn.datasets.mldata
+import sklearn.model_selection
+import numpy
+from config import DATA_HOME
+
+__all__ = ['train_creator', 'test_creator']
+
+
+def __mnist_reader_creator__(data, target):
+    def reader():
+        n_samples = data.shape[0]
+        for i in xrange(n_samples):
+            yield (data[i] / 255.0).astype(numpy.float32), int(target[i])
+
+    return reader
+
+
+TEST_SIZE = 10000
+
+data = sklearn.datasets.mldata.fetch_mldata(
+    "MNIST original", data_home=DATA_HOME)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+    data.data, data.target, test_size=TEST_SIZE, random_state=0)
+
+
+def train_creator():
+    return __mnist_reader_creator__(X_train, y_train)
+
+
+def test_creator():
+    return __mnist_reader_creator__(X_test, y_test)
+
+
+def unittest():
+    assert len(list(test_creator()())) == TEST_SIZE
+
+
+if __name__ == '__main__':
+    unittest()
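The new `mnist` module pairs naturally with `DataFeeder`: each sample its reader yields is a `(pixel_vector, label)` tuple, which matches the `reader_dict` used in the demo. A rough sketch follows (hypothetical, not part of the diff); it assumes `sklearn` is installed and `py_paddle` is built, and fetching "MNIST original" downloads the data into `DATA_HOME` on first use.

```python
from itertools import islice

import py_paddle.swig_paddle as api
from paddle.v2 import data_type
from paddle.v2.data_feeder import DataFeeder
from paddle.v2.dataset import mnist

api.initPaddle("--use_gpu=0")

feeder = DataFeeder(
    data_types=[('pixel', data_type.dense_vector(784)),
                ('label', data_type.integer_value(10))],
    reader_dict={'pixel': 0, 'label': 1})

reader = mnist.train_creator()           # a function that returns a generator
minibatch = list(islice(reader(), 32))   # first 32 (pixel, label) samples
arg = feeder(minibatch)                  # Arguments for one training batch
```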
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
index 402ad2e664c773dc7a34c46f57ffe70d6039a09a..2f08ceed534c58c3353be7861f45d024b7c60328 100644
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -2,3 +2,5 @@ add_test(NAME test_v2_layer
     COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
     ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_layer.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+add_test(NAME test_v2_api
+  COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE})
diff --git a/python/paddle/v2/tests/run_tests.sh b/python/paddle/v2/tests/run_tests.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b96f54fe9cc78a436bc67e6c542b6e842aba997b
--- /dev/null
+++ b/python/paddle/v2/tests/run_tests.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+pushd `dirname $0` > /dev/null
+SCRIPTPATH=$PWD
+popd > /dev/null
+
+cd $SCRIPTPATH
+
+$1 -m pip install ../../../../paddle/dist/*.whl
+
+test_list="test_data_feeder.py"
+
+export PYTHONPATH=$PWD/../../../../python/
+
+for fn in $test_list
+do
+  echo "test $fn"
+  $1 $fn
+  if [ $? -ne 0 ]; then
+    exit 1
+  fi
+done
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f67da6a5b32d74228d727d94ec79b9f7a06dab7
--- /dev/null
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -0,0 +1,238 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import py_paddle.swig_paddle as api
+import numpy as np
+
+from paddle.v2 import data_type
+from paddle.v2.data_feeder import DataFeeder
+
+
+class DataFeederTest(unittest.TestCase):
+    def dense_reader(self, size):
+        data = np.random.random(size)
+        return data
+
+    def sparse_binary_reader(self, high, size_limit, non_empty=False):
+        num = np.random.randint(size_limit)  # num could be 0
+        while non_empty and num == 0:
+            num = np.random.randint(size_limit)
+        return np.random.randint(high, size=num).tolist()
+
+    def test_dense(self):
+        def compare(input):
+            feeder = DataFeeder([('image', data_type.dense_vector(784))],
+                                {'image': 0})
+            arg = feeder(input)
+            output = arg.getSlotValue(0).copyToNumpyMat()
+            input = np.array(input, dtype='float32')
+            self.assertAlmostEqual(input.all(), output.all())
+
+        # test numpy array
+        batch_size = 32
+        dim = 784
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.dense_reader(dim))
+            data.append(each_sample)
+        compare(data)
+
+        # each feature is a list
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.dense_reader(dim).tolist())
+            data.append(each_sample)
+        compare(data)
+
+        # test tuple
+        data = []
+        for i in xrange(batch_size):
+            each_sample = (self.dense_reader(dim).tolist(), )
+            data.append(each_sample)
+        compare(data)
+
+    def test_sparse_binary(self):
+        dim = 10000
+        batch_size = 32
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.sparse_binary_reader(dim, 50))
+            data.append(each_sample)
+        feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        assert isinstance(output, api.Matrix)
+        for i in xrange(batch_size):
+            self.assertEqual(output.getSparseRowCols(i), data[i][0])
+
+    def test_sparse(self):
+        dim = 10000
+        batch_size = 32
+        v = []
+        w = []
+        data = []
+        for dat in xrange(batch_size):
+            each_sample = []
+            a = self.sparse_binary_reader(dim, 40, non_empty=True)
+            b = self.dense_reader(len(a)).tolist()
+            v.append(a)
+            w.append(np.array(b, dtype="float32"))
+            each_sample.append(zip(a, b))
+            data.append(each_sample)
+
+        feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        assert isinstance(output, api.Matrix)
+        for i in xrange(batch_size):
+            self.assertEqual(output.getSparseRowCols(i), v[i])
+            cols_value = output.getSparseRowColsVal(i)
+            value = [val[1] for val in cols_value]
+            value = np.array(value, dtype="float32")
+            self.assertAlmostEqual(value.all(), w[i].all())
+
+    def test_integer(self):
+        dim = 100
+        batch_size = 32
+        index = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(np.random.randint(dim))
+            index.append(each_sample)
+        feeder = DataFeeder([('input', data_type.integer_value(dim))],
+                            {'input': 0})
+        arg = feeder(index)
+        output = arg.getSlotIds(0).copyToNumpyArray()
+        index = np.array(index, dtype='int')
+        self.assertEqual(output.all(), index.flatten().all())
+
+    def test_integer_sequence(self):
+        dim = 10000
+        batch_size = 32
+        start = [0]
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(
+                self.sparse_binary_reader(
+                    dim, 30, non_empty=True))
+            data.append(each_sample)
+            start.append(len(each_sample[0]) + start[-1])
+        feeder = DataFeeder([('input', data_type.integer_value_sequence(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output_data = arg.getSlotIds(0).copyToNumpyArray()
+        output_start = arg.getSlotSequenceStartPositions(0).copyToNumpyArray()
+
+        index = []
+        for dat in data:
+            index.extend(x for x in dat[0])  # only one feature, so dat[0]
+        index = np.array(index, dtype='int')
+        start = np.array(start, dtype='int')
+        self.assertEqual(output_data.all(), index.all())
+        self.assertEqual(output_start.all(), start.all())
+
+    def test_multiple_features(self):
+        batch_size = 2
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(np.random.randint(10))
+            each_sample.append(
+                self.sparse_binary_reader(
+                    20000, 40, non_empty=True))
+            each_sample.append(self.dense_reader(100))
+            data.append(each_sample)
+
+        # test multiple features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_sparse = arg.getSlotValue(1)
+        output_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(output_dense[i].all(), data[i][2].all())
+            self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 features, but only use 2 features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_index = arg.getSlotIds(1).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(output_dense[i].all(), data[i][2].all())
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 features, one is duplicate data
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10)),
+                      ('fea3', data_type.dense_vector(100))]
+        feeder = DataFeeder(data_types,
+                            {'fea0': 2,
+                             'fea1': 1,
+                             'fea2': 0,
+                             'fea3': 2})
+        arg = feeder(data)
+        fea0 = arg.getSlotValue(0).copyToNumpyMat()
+        fea1 = arg.getSlotValue(1)
+        fea2 = arg.getSlotIds(2).copyToNumpyArray()
+        fea3 = arg.getSlotValue(3).copyToNumpyMat()
+        for i in xrange(batch_size):
+            self.assertEqual(fea0[i].all(), data[i][2].all())
+            self.assertEqual(fea1.getSparseRowCols(i), data[i][1])
+            self.assertEqual(fea2[i], data[i][0])
+            self.assertEqual(fea3[i].all(), data[i][2].all())
+
+    def test_multiple_features_tuple(self):
+        batch_size = 2
+        data = []
+        for i in xrange(batch_size):
+            a = np.random.randint(10)
+            b = self.sparse_binary_reader(20000, 40, non_empty=True)
+            c = self.dense_reader(100)
+            each_sample = (a, b, c)
+            data.append(each_sample)
+
+        # test multiple features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        out_dense = arg.getSlotValue(0).copyToNumpyMat()
+        out_sparse = arg.getSlotValue(1)
+        out_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(out_dense[i].all(), data[i][2].all())
+            self.assertEqual(out_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(out_index[i], data[i][0])
+
+
+if __name__ == '__main__':
+    api.initPaddle("--use_gpu=0")
+    unittest.main()
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 0acfcee2cee9ee3edc623f67283c52060e9f893e..097814d2f4619797470668cbd0ea95f112a1fde6 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -2,7 +2,7 @@ import collections

 import py_paddle.swig_paddle as api
 from paddle.proto.ModelConfig_pb2 import ModelConfig
-from py_paddle import DataProviderConverter
+from data_feeder import DataFeeder

 from . import event as v2_event
 from . import layer as v2_layer
@@ -69,7 +69,8 @@ class SGD(ITrainer):
               test_data_reader=None,
               event_handler=None,
               batch_size=32,
-              data_types=None):
+              data_types=None,
+              reader_dict=None):
         """
         Training method. Will train num_passes of input data.

@@ -107,13 +108,7 @@ class SGD(ITrainer):
             assert isinstance(pass_evaluator, api.Evaluator)
         out_args = api.Arguments.createArguments(0)

-        data_types_lists = []
-        for each in topology.input_layer_names:
-            if each not in data_types:
-                raise ValueError()
-            data_types_lists.append(data_types[each])
-
-        converter = DataProviderConverter(input_types=data_types_lists)
+        feeder = DataFeeder(data_types, reader_dict)

         for pass_id in xrange(num_passes):
             event_handler(v2_event.BeginPass(pass_id))
@@ -127,7 +122,7 @@ class SGD(ITrainer):
                     v2_event.BeginIteration(
                         pass_id=pass_id, batch_id=batch_id))
                 pass_type = updater.startBatch(len(data_batch))
-                gm.forwardBackward(converter(data_batch), out_args, pass_type)
+                gm.forwardBackward(feeder(data_batch), out_args, pass_type)
                 gm.eval(pass_evaluator)
                 gm.eval(batch_evaluator)
                 for each_param in gm.getParameters():
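End to end, the trainer change above simply threads `reader_dict` through to `DataFeeder`. Below is a hedged sketch of the resulting call shape, mirroring the demo hunk at the top of this diff; `trainer`, `cost`, `parameters`, `event_handler`, `images` and `label` are assumed to be set up as in `demo/mnist/api_train_v2.py`, and the data-reader argument is passed exactly as that demo already does (elided here).

```python
# Hypothetical call shape after this change; only keyword arguments that
# appear in the diff are shown.
trainer.train(
    topology=cost,
    parameters=parameters,
    event_handler=event_handler,
    batch_size=32,
    data_types=[('pixel', images.type),
                ('label', label.type)],
    reader_dict={'pixel': 0, 'label': 1})
```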