From 67b8150ff4d04552a5a52cb099bf7e935765e69f Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 21 Feb 2017 13:27:21 +0800 Subject: [PATCH] data converter test --- paddle/data_converter_test.py | 92 +++++++++++++++++++++++++ python/paddle/v2/data_converter.py | 19 ++--- python/paddle/v2/data_converter_test.py | 92 +++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 8 deletions(-) create mode 100644 paddle/data_converter_test.py create mode 100644 python/paddle/v2/data_converter_test.py diff --git a/paddle/data_converter_test.py b/paddle/data_converter_test.py new file mode 100644 index 00000000000..d84ee517278 --- /dev/null +++ b/paddle/data_converter_test.py @@ -0,0 +1,92 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import py_paddle.swig_paddle as api +import numpy as np +import paddle.trainer.PyDataProvider2 as dp2 + +from paddle.v2.data_converter import DataConverter + + +class DataConverterTest(unittest.TestCase): + def dense_reader(self, shape): + data = np.random.random(shape) + return data + + def sparse_binary_reader(self, + high, + size_limit, + batch_size, + non_empty=False): + data = [] + for i in xrange(batch_size): + num = np.random.randint(size_limit) # num could be 0 + while non_empty and num == 0: + num = np.random.randint(size_limit) + data.append(np.random.randint(high, size=num).tolist()) + + return data + + def test_dense_vector(self): + def compare(input): + converter = DataConverter([('image', dp2.dense_vector(784))]) + arg = converter([input], {'image': 0}) + output = arg.getSlotValue(0).copyToNumpyMat() + input = np.array(input, dtype='float32') + self.assertAlmostEqual(input.all(), output.all()) + + # test numpy array + data = self.dense_reader(shape=[32, 784]) + compare(data) + + # test list + compare(data.tolist()) + + #def test_sparse_binary(self): + # dim = 100000 + # data = self.sparse_binary_reader(dim, 5, 2) + # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) + # arg = converter([data], {'input':0}) + # output = arg.getSlotValue(0) + + #def test_sparse(self): + # dim = 100000 + # v = self.sparse_binary_reader(dim, 5, 2) + # w = [] + # for dat in data: + # x = self.dense_reader(shape=[1, len(dat)]) + # w.append(x.tolist()) + # data = [] + # for each in zip(v, w): + # data.append(zip(each[0], each[1])) + # + # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) + # arg = converter([data], {'input':0}) + # output = arg.getSlotValue(0) + + def test_integer(self): + dim = 100 + index = np.random.randint(dim, size=32) + print index + converter = DataConverter([('input', dp2.integer_value(dim))]) + arg = converter([index], {'input': 0}) + print arg.getSlotValue(0) + output = arg.getSlotValue(0).copyToNumpyArray() + print 'output=', output + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/data_converter.py b/python/paddle/v2/data_converter.py index afb98a77c5b..fcba43e4ba9 100644 --- a/python/paddle/v2/data_converter.py +++ b/python/paddle/v2/data_converter.py @@ -53,9 +53,9 @@ class DenseConvert(IDataConverter): :type argument: Paddle's Arguments """ assert isinstance(argument, api.Arguments) - if data.dtype != np.float32: - data = data.astype(np.float32) - m = api.Matrix.createDenseFromNumpy(data, True, False) + # TODO: handle data type (float, double, ...) + data = np.array(data, np.float32) + m = api.Matrix.createDenseFromNumpy(data) argument.setSlotValue(self.pos, m) @@ -72,12 +72,12 @@ class SparseBinaryConvert(IDataConverter): self.__height__ = len(data) for x in data: self.__rows__.append(self.__rows__[-1] + len(x)) - self.__cols__ = data.flatten() + self.__cols__.extend(x) def convert(self, data, argument): assert isinstance(argument, api.Arguments) - fill_csr(data) + self.fill_csr(data) m = api.Matrix.createSparse(self.__height__, self.input_type.dim, len(self.__cols__), len(self.__value__) == 0) @@ -94,8 +94,8 @@ class SparseFloatConvert(SparseBinaryConvert): self.__height__ = len(data) for x in data: self.__rows__.append(self.__rows__[-1] + len(x)) - self.__cols__.extend((x[0] for x in data)) - self.__value__.extend((x[1] for x in data)) + self.__cols__.extend(x[0]) + self.__value__.extend(x[1]) class IndexConvert(IDataConverter): @@ -105,7 +105,10 @@ class IndexConvert(IDataConverter): def convert(self, data, argument): assert isinstance(argument, api.Arguments) - self.__ids__ = data.flatten() + #for x in data: + # self.__ids__.append(x) + self.__ids__.extend(x) + ids = api.IVector.create(self.__ids__) argument.setSlotIds(self.pos, ids) diff --git a/python/paddle/v2/data_converter_test.py b/python/paddle/v2/data_converter_test.py new file mode 100644 index 00000000000..d84ee517278 --- /dev/null +++ b/python/paddle/v2/data_converter_test.py @@ -0,0 +1,92 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import py_paddle.swig_paddle as api +import numpy as np +import paddle.trainer.PyDataProvider2 as dp2 + +from paddle.v2.data_converter import DataConverter + + +class DataConverterTest(unittest.TestCase): + def dense_reader(self, shape): + data = np.random.random(shape) + return data + + def sparse_binary_reader(self, + high, + size_limit, + batch_size, + non_empty=False): + data = [] + for i in xrange(batch_size): + num = np.random.randint(size_limit) # num could be 0 + while non_empty and num == 0: + num = np.random.randint(size_limit) + data.append(np.random.randint(high, size=num).tolist()) + + return data + + def test_dense_vector(self): + def compare(input): + converter = DataConverter([('image', dp2.dense_vector(784))]) + arg = converter([input], {'image': 0}) + output = arg.getSlotValue(0).copyToNumpyMat() + input = np.array(input, dtype='float32') + self.assertAlmostEqual(input.all(), output.all()) + + # test numpy array + data = self.dense_reader(shape=[32, 784]) + compare(data) + + # test list + compare(data.tolist()) + + #def test_sparse_binary(self): + # dim = 100000 + # data = self.sparse_binary_reader(dim, 5, 2) + # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) + # arg = converter([data], {'input':0}) + # output = arg.getSlotValue(0) + + #def test_sparse(self): + # dim = 100000 + # v = self.sparse_binary_reader(dim, 5, 2) + # w = [] + # for dat in data: + # x = self.dense_reader(shape=[1, len(dat)]) + # w.append(x.tolist()) + # data = [] + # for each in zip(v, w): + # data.append(zip(each[0], each[1])) + # + # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) + # arg = converter([data], {'input':0}) + # output = arg.getSlotValue(0) + + def test_integer(self): + dim = 100 + index = np.random.randint(dim, size=32) + print index + converter = DataConverter([('input', dp2.integer_value(dim))]) + arg = converter([index], {'input': 0}) + print arg.getSlotValue(0) + output = arg.getSlotValue(0).copyToNumpyArray() + print 'output=', output + + +if __name__ == '__main__': + unittest.main() -- GitLab