From 67b8150ff4d04552a5a52cb099bf7e935765e69f Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Tue, 21 Feb 2017 13:27:21 +0800
Subject: [PATCH] data converter test

---
 paddle/data_converter_test.py           | 92 +++++++++++++++++++++++++
 python/paddle/v2/data_converter.py      | 19 ++---
 python/paddle/v2/data_converter_test.py | 92 +++++++++++++++++++++++++
 3 files changed, 195 insertions(+), 8 deletions(-)
 create mode 100644 paddle/data_converter_test.py
 create mode 100644 python/paddle/v2/data_converter_test.py

diff --git a/paddle/data_converter_test.py b/paddle/data_converter_test.py
new file mode 100644
index 0000000000..d84ee51727
--- /dev/null
+++ b/paddle/data_converter_test.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import py_paddle.swig_paddle as api
+import numpy as np
+import paddle.trainer.PyDataProvider2 as dp2
+
+from paddle.v2.data_converter import DataConverter
+
+
+class DataConverterTest(unittest.TestCase):
+    """Unit tests for converting mini-batches with DataConverter."""
+
+    def dense_reader(self, shape):
+        """Return a random float array of ``shape`` with values in [0, 1)."""
+        return np.random.random(shape)
+
+    def sparse_binary_reader(self,
+                             high,
+                             size_limit,
+                             batch_size,
+                             non_empty=False):
+        """Build ``batch_size`` lists of random indices in [0, high).
+
+        Every list has fewer than ``size_limit`` entries; with ``non_empty``
+        set, a zero length is re-drawn until it is positive.
+        """
+        data = []
+        for _ in xrange(batch_size):
+            num = np.random.randint(size_limit)  # num could be 0
+            while non_empty and num == 0:
+                num = np.random.randint(size_limit)
+            data.append(np.random.randint(high, size=num).tolist())
+
+        return data
+
+    def test_dense_vector(self):
+        """Check dense conversion for both ndarray and nested-list input."""
+
+        def compare(batch):
+            """Convert ``batch`` and compare the result with it as float32."""
+            converter = DataConverter([('image', dp2.dense_vector(784))])
+            arg = converter([batch], {'image': 0})
+            output = arg.getSlotValue(0).copyToNumpyMat()
+            expected = np.array(batch, dtype='float32')
+            # Compare element-wise. Reducing both arrays with .all() first
+            # would only compare two booleans and hide real mismatches.
+            self.assertTrue(np.allclose(expected, output))
+
+        # test numpy array
+        data = self.dense_reader(shape=[32, 784])
+        compare(data)
+
+        # test list
+        compare(data.tolist())
+
+    # TODO: add tests for sparse binary and sparse float inputs
+    # (SparseBinaryConvert / SparseFloatConvert in data_converter.py);
+    # sparse_binary_reader above generates the index lists for them.
+
+    def test_integer(self):
+        """Smoke-test the conversion of a batch of integer ids.
+
+        NOTE(review): nothing is asserted yet. IndexConvert stores the ids
+        with setSlotIds, so reading them back through getSlotValue looks
+        inconsistent -- confirm the right accessor and then compare the
+        result against ``index``.
+        """
+        dim = 100
+        index = np.random.randint(dim, size=32)
+        print index
+        converter = DataConverter([('input', dp2.integer_value(dim))])
+        arg = converter([index], {'input': 0})
+        print arg.getSlotValue(0)
+        output = arg.getSlotValue(0).copyToNumpyArray()
+        print 'output=', output
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/data_converter.py b/python/paddle/v2/data_converter.py
index afb98a77c5..fcba43e4ba 100644
--- a/python/paddle/v2/data_converter.py
+++ b/python/paddle/v2/data_converter.py
@@ -53,9 +53,9 @@ class DenseConvert(IDataConverter):
         :type argument: Paddle's Arguments
         """
         assert isinstance(argument, api.Arguments)
-        if data.dtype != np.float32:
-            data = data.astype(np.float32)
-        m = api.Matrix.createDenseFromNumpy(data, True, False)
+        # TODO: handle data type (float, double, ...)
+        data = np.array(data, np.float32)
+        m = api.Matrix.createDenseFromNumpy(data)
         argument.setSlotValue(self.pos, m)
 
 
@@ -72,12 +72,12 @@ class SparseBinaryConvert(IDataConverter):
         self.__height__ = len(data)
         for x in data:
             self.__rows__.append(self.__rows__[-1] + len(x))
-        self.__cols__ = data.flatten()
+            self.__cols__.extend(x)
 
     def convert(self, data, argument):
         assert isinstance(argument, api.Arguments)
 
-        fill_csr(data)
+        self.fill_csr(data)
         m = api.Matrix.createSparse(self.__height__, self.input_type.dim,
                                     len(self.__cols__),
                                     len(self.__value__) == 0)
@@ -94,8 +94,8 @@ class SparseFloatConvert(SparseBinaryConvert):
         self.__height__ = len(data)
         for x in data:
             self.__rows__.append(self.__rows__[-1] + len(x))
-        self.__cols__.extend((x[0] for x in data))
-        self.__value__.extend((x[1] for x in data))
+            self.__cols__.extend(x[0])
+            self.__value__.extend(x[1])
 
 
 class IndexConvert(IDataConverter):
@@ -105,7 +105,10 @@ class IndexConvert(IDataConverter):
 
     def convert(self, data, argument):
         assert isinstance(argument, api.Arguments)
-        self.__ids__ = data.flatten()
+        # Append the ids in `data` (any iterable) to self.__ids__.
+        # NOTE(review): __ids__ is not cleared in this method -- confirm it
+        # is reset elsewhere, or ids from earlier calls are sent again.
+        self.__ids__.extend(data)
         ids = api.IVector.create(self.__ids__)
         argument.setSlotIds(self.pos, ids)
 
diff --git a/python/paddle/v2/data_converter_test.py b/python/paddle/v2/data_converter_test.py
new file mode 100644
index 0000000000..d84ee51727
--- /dev/null
+++ b/python/paddle/v2/data_converter_test.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import py_paddle.swig_paddle as api
+import numpy as np
+import paddle.trainer.PyDataProvider2 as dp2
+
+from paddle.v2.data_converter import DataConverter
+
+
+class DataConverterTest(unittest.TestCase):
+    """Unit tests for converting mini-batches with DataConverter."""
+
+    def dense_reader(self, shape):
+        """Return a random float array of ``shape`` with values in [0, 1)."""
+        return np.random.random(shape)
+
+    def sparse_binary_reader(self,
+                             high,
+                             size_limit,
+                             batch_size,
+                             non_empty=False):
+        """Build ``batch_size`` lists of random indices in [0, high).
+
+        Every list has fewer than ``size_limit`` entries; with ``non_empty``
+        set, a zero length is re-drawn until it is positive.
+        """
+        data = []
+        for _ in xrange(batch_size):
+            num = np.random.randint(size_limit)  # num could be 0
+            while non_empty and num == 0:
+                num = np.random.randint(size_limit)
+            data.append(np.random.randint(high, size=num).tolist())
+
+        return data
+
+    def test_dense_vector(self):
+        """Check dense conversion for both ndarray and nested-list input."""
+
+        def compare(batch):
+            """Convert ``batch`` and compare the result with it as float32."""
+            converter = DataConverter([('image', dp2.dense_vector(784))])
+            arg = converter([batch], {'image': 0})
+            output = arg.getSlotValue(0).copyToNumpyMat()
+            expected = np.array(batch, dtype='float32')
+            # Compare element-wise. Reducing both arrays with .all() first
+            # would only compare two booleans and hide real mismatches.
+            self.assertTrue(np.allclose(expected, output))
+
+        # test numpy array
+        data = self.dense_reader(shape=[32, 784])
+        compare(data)
+
+        # test list
+        compare(data.tolist())
+
+    # TODO: add tests for sparse binary and sparse float inputs
+    # (SparseBinaryConvert / SparseFloatConvert in data_converter.py);
+    # sparse_binary_reader above generates the index lists for them.
+
+    def test_integer(self):
+        """Smoke-test the conversion of a batch of integer ids.
+
+        NOTE(review): nothing is asserted yet. IndexConvert stores the ids
+        with setSlotIds, so reading them back through getSlotValue looks
+        inconsistent -- confirm the right accessor and then compare the
+        result against ``index``.
+        """
+        dim = 100
+        index = np.random.randint(dim, size=32)
+        print index
+        converter = DataConverter([('input', dp2.integer_value(dim))])
+        arg = converter([index], {'input': 0})
+        print arg.getSlotValue(0)
+        output = arg.getSlotValue(0).copyToNumpyArray()
+        print 'output=', output
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab