From 84b423a89a2a6ece21910c83277f6282b80f6be7 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 23 Feb 2017 17:35:06 +0800 Subject: [PATCH] refine data feeder and add unit test --- python/CMakeLists.txt | 1 + python/paddle/v2/data_feeder.py | 110 ++++++++++----- python/paddle/v2/tests/CMakeLists.txt | 2 + python/paddle/v2/tests/run_tests.sh | 36 +++++ python/paddle/v2/tests/test_data_feeder.py | 150 +++++++++++++++++++++ 5 files changed, 264 insertions(+), 35 deletions(-) create mode 100644 python/paddle/v2/tests/CMakeLists.txt create mode 100755 python/paddle/v2/tests/run_tests.sh create mode 100644 python/paddle/v2/tests/test_data_feeder.py diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 357637e2034..71af50a9a4e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,6 +25,7 @@ add_custom_target(paddle_python ALL DEPENDS add_subdirectory(paddle/trainer_config_helpers/tests) add_subdirectory(paddle/reader/tests) +add_subdirectory(paddle/v2/tests) install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/ DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 83a4efef9e7..b594643dda6 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -12,49 +12,89 @@ # See the License for the specific language governing permissions and # limitations under the License. +from py_paddle import swig_paddle from py_paddle import DataProviderConverter +import data_type __all__ = ['DataFeeder'] -""" -DataFeeder converts the data returned by paddle.reader into a data structure -of Arguments which is defined in the API. The paddle.reader usually returns -a list of mini-batch data. Each item in the list is a tuple or list, which is -one sample with multiple features. DataFeeder converts this mini-batch data -into Arguments in order to feed it to C++ interface. - -The example usage: - - data_types = [paddle.data_type.dense_vector(784), - paddle.data_type.integer_value(10)] - feeder = DataFeeder(input_types=data_types) - minibatch_data = [ - ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ), # first sample - ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ) # second sample - ] - - # or - # minibatch_data = [ - # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ], # first sample - # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ] # second sample - # ] - arg = feeder(minibatch_data) - - -Args: - input_types: A list of input data types. It's length is equal to the length - of data returned by paddle.reader. Each item specifies the type - of each feature. - mintbatch_data: A list of mini-batch data. Each item is a list or tuple, + + +class DataFeeder(DataProviderConverter): + """ + DataFeeder converts the data returned by paddle.reader into a data structure + of Arguments which is defined in the API. The paddle.reader usually returns + a list of mini-batch data. Each item in the list is a list or a tuple, + which is one sample with one or multiple features. DataFeeder converts this + mini-batch data into Arguments in order to feed it to C++ interface. + + The example usage: + + data_types = [('image', paddle.data_type.dense_vector(784)), + ('label', paddle.data_type.integer_value(10))] + reader_dict = {'image':0, 'label':1} + feeder = DataFeeder(data_types=data_types, reader_dict=reader_dict) + minibatch_data = [ + ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ), # first sample + ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ) # second sample + ] + arg = feeder(minibatch_data) + """ + + def __init__(self, data_types, reader_dict): + """ + :param data_types: A list to specify data name and type. Each item is + a tuple of (data_name, data_type). For example: + [('image', paddle.data_type.dense_vector(784)), + ('label', paddle.data_type.integer_value(10))] + + :type data_types: A list of tuple + :param reader_dict: A dictionary to specify the position of each data + in the input data. + :type reader_dict: dict() + """ + self.input_names = [] + self.input_types = [] + self.reader_dict = reader_dict + for each in data_types: + self.input_names.append(each[0]) + self.input_types.append(each[1]) + assert isinstance(each[1], data_type.InputType) + DataProviderConverter.__init__(self, self.input_types) + + def convert(self, dat, argument=None): + """ + :param dat: A list of mini-batch data. Each item is a list or tuple, for example: [ (feature_0, feature_1, feature_2, ...), # first sample (feature_0, feature_1, feature_2, ...), # second sample ... ] + :type dat: List + :param argument: An Arguments object contains this mini-batch data with + one or multiple features. The Arguments definition is + in the API. + :type argument: swig_paddle.Arguments + """ + + if argument is None: + argument = swig_paddle.Arguments.createArguments(0) + assert isinstance(argument, swig_paddle.Arguments) + argument.resize(len(self.input_types)) + + scanners = [ + DataProviderConverter.create_scanner(i, each_type) + for i, each_type in enumerate(self.input_types) + ] + + for each_sample in dat: + for name, scanner in zip(self.input_names, scanners): + scanner.scan(each_sample[self.reader_dict[name]]) + + for scanner in scanners: + scanner.finish_scan(argument) -Returns: - An Arguments object contains this mini-batch data with multiple features. - The Arguments definition is in the API. -""" + return argument -DataFeeder = DataProviderConverter + def __call__(self, dat, argument=None): + return self.convert(dat, argument) diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt new file mode 100644 index 00000000000..5842a716caa --- /dev/null +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -0,0 +1,2 @@ +add_test(NAME test_v2_api + COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE}) diff --git a/python/paddle/v2/tests/run_tests.sh b/python/paddle/v2/tests/run_tests.sh new file mode 100755 index 00000000000..b96f54fe9cc --- /dev/null +++ b/python/paddle/v2/tests/run_tests.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +pushd `dirname $0` > /dev/null +SCRIPTPATH=$PWD +popd > /dev/null + +cd $SCRIPTPATH + +$1 -m pip install ../../../../paddle/dist/*.whl + +test_list="test_data_feeder.py" + +export PYTHONPATH=$PWD/../../../../python/ + +for fn in $test_list +do + echo "test $fn" + $1 $fn + if [ $? -ne 0 ]; then + exit 1 + fi +done diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py new file mode 100644 index 00000000000..dcf433d7d8f --- /dev/null +++ b/python/paddle/v2/tests/test_data_feeder.py @@ -0,0 +1,150 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import py_paddle.swig_paddle as api +import numpy as np + +from paddle.v2 import data_type +from paddle.v2.data_feeder import DataFeeder + + +class DataFeederTest(unittest.TestCase): + def dense_reader(self, size): + data = np.random.random(size) + return data + + def sparse_binary_reader(self, high, size_limit, non_empty=False): + num = np.random.randint(size_limit) # num could be 0 + while non_empty and num == 0: + num = np.random.randint(size_limit) + return np.random.randint(high, size=num).tolist() + + def test_dense_vector(self): + def compare(input): + feeder = DataFeeder([('image', data_type.dense_vector(784))], + {'image': 0}) + arg = feeder([input]) + output = arg.getSlotValue(0).copyToNumpyMat() + input = np.array(input, dtype='float32') + self.assertAlmostEqual(input.all(), output.all()) + + # test numpy array + batch_size = 32 + dim = 784 + data = [] + for i in xrange(batch_size): + data.append(self.dense_reader(784)) + compare(data) + + # test list + data = [] + for i in xrange(batch_size): + data.append(self.dense_reader(784).tolist()) + compare(data) + + def test_sparse_binary(self): + dim = 10000 + batch_size = 32 + data = [] + for i in xrange(batch_size): + data.append([self.sparse_binary_reader(dim, 50)]) + feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))], + {'input': 0}) + arg = feeder(data) + output = arg.getSlotValue(0) + assert isinstance(output, api.Matrix) + for i in xrange(batch_size): + self.assertEqual(output.getSparseRowCols(i), data[i][0]) + + def test_sparse(self): + dim = 10000 + batch_size = 32 + v = [] + w = [] + data = [] + for dat in xrange(batch_size): + a = self.sparse_binary_reader(dim, 40, non_empty=True) + b = self.dense_reader(len(a)).tolist() + v.append(a) + w.append(b[0]) + data.append([zip(a, b)]) + + feeder = DataFeeder([('input', data_type.sparse_vector(dim))], + {'input': 0}) + arg = feeder(data) + output = arg.getSlotValue(0) + assert isinstance(output, api.Matrix) + for i in xrange(batch_size): + self.assertEqual(output.getSparseRowCols(i), v[i]) + + def test_integer(self): + dim = 100 + batch_size = 32 + index = [] + for i in xrange(batch_size): + index.append([np.random.randint(dim)]) + feeder = DataFeeder([('input', data_type.integer_value(dim))], + {'input': 0}) + arg = feeder(index) + output = arg.getSlotIds(0).copyToNumpyArray() + index = np.array(index, dtype='int') + self.assertEqual(output.all(), index.flatten().all()) + + def test_multiple_slots(self): + batch_size = 2 + data = [] + for i in xrange(batch_size): + each_sample = [] + each_sample.append(np.random.randint(10)) # size of feature 2: 10 + each_sample.append( + self.sparse_binary_reader( + 20000, 40, non_empty=True)) # size of feature 1: 20000 + each_sample.append(self.dense_reader(100)) # size of feature 0: 100 + data.append(each_sample) + + # test multiple features + data_types = [('fea0', data_type.dense_vector(100)), + ('fea1', data_type.sparse_binary_vector(20000)), + ('fea2', data_type.integer_value(10))] + feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0}) + arg = feeder(data) + output_dense = arg.getSlotValue(0).copyToNumpyMat() + output_sparse = arg.getSlotValue(1) + output_index = arg.getSlotIds(2).copyToNumpyArray() + for i in xrange(batch_size): + self.assertEqual(output_dense[i].all(), data[i][2].all()) + self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1]) + self.assertEqual(output_index[i], data[i][0]) + + # reader returns 3 featreus, but only use 2 features + data_types = [('fea0', data_type.dense_vector(100)), + ('fea2', data_type.integer_value(10))] + feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0}) + arg = feeder(data) + output_dense = arg.getSlotValue(0).copyToNumpyMat() + output_index = arg.getSlotIds(1).copyToNumpyArray() + for i in xrange(batch_size): + self.assertEqual(output_dense[i].all(), data[i][2].all()) + self.assertEqual(output_index[i], data[i][0]) + + +if __name__ == '__main__': + api.initPaddle("--use_gpu=0") + unittest.main() + +if __name__ == '__main__': + api.initPaddle("--use_gpu=0") + unittest.main() -- GitLab