Commit 297c6a01 authored by dangqingqing

refine data feeder

......@@ -9,7 +9,6 @@ The user api could be simpler and carefully designed.
import random
import numpy as np
import paddle.trainer.PyDataProvider2 as dp
import paddle.v2 as paddle_v2
import py_paddle.swig_paddle as api
from paddle.trainer_config_helpers import *
......@@ -71,8 +70,10 @@ def main():
assert isinstance(updater, api.ParameterUpdater)
# define network
images = paddle_v2.layer.data(name='pixel', size=784)
label = paddle_v2.layer.data(name='label', size=10)
images = paddle_v2.layer.data(
name='pixel', type=paddle_v2.data_type.dense_vector(784))
label = paddle_v2.layer.data(
name='label', type=paddle_v2.data_type.integer_value(10))
hidden1 = paddle_v2.layer.fc(input=images, size=200)
hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
inference = paddle_v2.layer.fc(input=hidden2,
......@@ -98,8 +99,7 @@ def main():
# DataProviderConverter is a utility that converts Python objects to Paddle's
# C++ input. The input format is the same as Paddle's DataProvider.
converter = DataProviderConverter(
input_types=[dp.dense_vector(784), dp.integer_value(10)])
converter = DataProviderConverter(input_types=[images.type, label.type])
train_file = './data/raw_data/train'
test_file = './data/raw_data/t10k'
......
import numpy
import paddle.v2 as paddle
from paddle.trainer.PyDataProvider2 import dense_vector, integer_value
import mnist_util
......@@ -16,8 +15,10 @@ def main():
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
images = paddle.layer.data(name='pixel', size=784)
label = paddle.layer.data(name='label', size=10)
images = paddle.layer.data(
name='pixel', type=paddle.data_type.dense_vector(784))
label = paddle.layer.data(
name='label', type=paddle.data_type.integer_value(10))
hidden1 = paddle.layer.fc(input=images, size=200)
hidden2 = paddle.layer.fc(input=hidden1, size=200)
inference = paddle.layer.fc(input=hidden2,
......@@ -51,8 +52,8 @@ def main():
batch_size=32, # batch size should be refactored into the data reader
data_types={ # data_types will be removed; it should live in
# the network topology
'pixel': dense_vector(784),
'label': integer_value(10)
'pixel': images.type,
'label': label.type
})
......
......@@ -32,4 +32,6 @@ def process(settings, file_name):
word_slot = [
settings.word_dict[w] for w in words if w in settings.word_dict
]
if not word_slot:
continue
yield word_slot, label
......@@ -138,7 +138,11 @@ def main():
batch = []
for line in sys.stdin:
batch.append([predict.get_index(line)])
words = predict.get_index(line)
if words:
batch.append([words])
else:
print('None of the words in [%s] are in the dictionary.' % line)
if len(batch) == batch_size:
predict.batch_predict(batch)
batch = []
......
......@@ -279,6 +279,12 @@ concat_layer
:members: concat_layer
:noindex:
seq_concat_layer
----------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: seq_concat_layer
:noindex:
Reshaping Layers
================
......
......@@ -107,7 +107,7 @@ We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["imag
### How to create custom data reader
```python
def image_reader(image_path, label_path):
def image_reader(image_path, label_path, n):
f = open(image_path)
l = open(label_path)
images = numpy.fromfile(
......@@ -117,9 +117,10 @@ def image_reader(image_path, label_path):
for i in xrange(n):
yield images[i, :], labels[i] # a single entry of data is created each time
f.close()
l.close()
# use a Python lambda to turn image_reader into a function with no parameters.
reader = lambda : image_reader("/path/to/image_file", "/path/to/label_file")
reader = lambda : image_reader("/path/to/image_file", "/path/to/label_file", 1024)
paddle.train(reader, {"image":0, "label":1}, ...)
```
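The same binding can also be written with `functools.partial` from the standard library; a minimal sketch, using the same placeholder paths and size as above:

```python
import functools

# functools.partial pre-binds image_reader's arguments, producing a
# zero-argument callable equivalent to the lambda above.
reader = functools.partial(image_reader, "/path/to/image_file",
                           "/path/to/label_file", 1024)
paddle.train(reader, {"image": 0, "label": 1}, ...)
```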
......
......@@ -887,32 +887,10 @@ static InitFunction __reg_type_auc_sum__([]() {
*/
class ValuePrinter : public Evaluator {
public:
ValuePrinter() {}
virtual void eval(const NeuralNetwork& nn) {
for (const std::string& name : config_.input_layers()) {
const Argument& argu = nn.getLayer(name)->getOutput();
if (argu.value) {
std::ostringstream os;
argu.value->print(os);
LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
}
if (argu.ids) {
std::ostringstream os;
argu.ids->print(os, argu.ids->getSize());
LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
}
if (auto startPos = argu.sequenceStartPositions) {
std::ostringstream os;
startPos->getVector(false)->print(os, startPos->getSize());
LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
}
if (auto subStartPos = argu.subSequenceStartPositions) {
std::ostringstream os;
subStartPos->getVector(false)->print(os, subStartPos->getSize());
LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
<< os.str();
}
nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
"layer=" + name + " ");
}
}
......@@ -928,8 +906,6 @@ REGISTER_EVALUATOR(value_printer, ValuePrinter);
*/
class GradientPrinter : public Evaluator {
public:
GradientPrinter() {}
virtual void eval(const NeuralNetwork& nn) {
for (const std::string& name : config_.input_layers()) {
const Argument& argu = nn.getLayer(name)->getOutput();
......@@ -938,11 +914,6 @@ public:
argu.grad->print(os);
LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
}
if (auto startPos = argu.sequenceStartPositions) {
std::ostringstream os;
startPos->getVector(false)->print(os, startPos->getSize());
LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
}
}
}
......
......@@ -19,38 +19,17 @@ namespace paddle {
class PrintLayer : public Layer {
public:
explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
void forward(PassType passType) override;
void backward(const UpdateCallback& callback) override {}
};
void PrintLayer::forward(PassType passType) {
Layer::forward(passType);
for (size_t i = 0; i != inputLayers_.size(); ++i) {
const auto& argu = getInput(i);
const std::string& name = inputLayers_[i]->getName();
if (argu.value) {
std::ostringstream os;
argu.value->print(os);
LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
}
if (argu.ids) {
std::ostringstream os;
argu.ids->print(os, argu.ids->getSize());
LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
}
if (auto startPos = argu.sequenceStartPositions) {
std::ostringstream os;
startPos->getVector(false)->print(os, startPos->getSize());
LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
}
if (auto subStartPos = argu.subSequenceStartPositions) {
std::ostringstream os;
subStartPos->getVector(false)->print(os, subStartPos->getSize());
LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
<< os.str();
void forward(PassType passType) override {
Layer::forward(passType);
for (size_t i = 0; i != inputLayers_.size(); ++i) {
getInput(i).printValueString(LOG(INFO),
"layer=" + inputLayers_[i]->getName() + " ");
}
}
}
void backward(const UpdateCallback& callback) override {}
};
REGISTER_LAYER(print, PrintLayer);
......
......@@ -21,9 +21,11 @@ namespace paddle {
/**
* A layer for concatenating the first sequence with the second sequence
* following the first
* Input: two sequences each containing some instances
* Input: two sequences each containing the same number of instances
* seq1 = [a1, a2, ..., an]
* seq2 = [b1, b2, ..., bn]
* Output: a concatenated sequence of the two input sequences
* out = [a1, b1, a2, b2, ..., an, bn]
*/
class SequenceConcatLayer : public Layer {
......
......@@ -602,6 +602,44 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) {
tgtBuf[numSequences] = numSubSequences;
}
void Argument::getValueString(
std::unordered_map<std::string, std::string>* out) const {
if (value) {
std::ostringstream os;
value->print(os);
out->insert({"value", os.str()});
}
if (ids) {
std::ostringstream os;
ids->print(os, ids->getSize());
out->insert({"ids", os.str()});
}
if (sequenceStartPositions) {
std::ostringstream os;
sequenceStartPositions->getVector(false)->print(
os, sequenceStartPositions->getSize());
out->insert({"sequence pos", os.str()});
}
if (subSequenceStartPositions) {
std::ostringstream os;
subSequenceStartPositions->getVector(false)->print(
os, subSequenceStartPositions->getSize());
out->insert({"sub-sequence pos", os.str()});
}
}
void Argument::printValueString(std::ostream& stream,
const std::string& prefix) const {
std::unordered_map<std::string, std::string> out;
getValueString(&out);
for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
auto it = out.find(field);
if (it != out.end()) {
stream << prefix << field << ":\n" << it->second;
}
}
}
void Argument::subArgFrom(const Argument& input,
size_t offset,
size_t height,
......
......@@ -297,6 +297,23 @@ struct Argument {
sequence has sub-sequence degrades to a sequence.
*/
void degradeSequence(const Argument& input, bool useGpu);
/**
* @brief getValueString returns the argument's output as strings. There
* are several kinds of output; the keys of the output dictionary are
* 'value', 'ids', 'sequence pos' and 'sub-sequence pos'.
* @param out [out]: the return values.
*/
void getValueString(std::unordered_map<std::string, std::string>* out) const;
/**
* @brief printValueString prints the argument's output in the order
* 'value', 'ids', 'sequence pos', 'sub-sequence pos'.
* @param stream: Output stream
* @param prefix: line prefix for printing.
*/
void printValueString(std::ostream& stream,
const std::string& prefix = "") const;
};
} // namespace paddle
......@@ -23,7 +23,8 @@ __all__ = ['DataProviderConverter']
class IScanner(object):
def __init__(self, input_type, pos):
self.input_type = input_type
assert isinstance(self.input_type, dp2.InputType)
if not isinstance(self.input_type, dp2.InputType):
raise ValueError("input_type should be a PyDataProvider2.InputType")
self.pos = pos
def scan(self, dat):
......@@ -50,7 +51,6 @@ class DenseScanner(IScanner):
def finish_scan(self, argument):
assert isinstance(argument, swig_paddle.Arguments)
assert isinstance(self.input_type, dp2.InputType)
if self.__mat__.dtype != numpy.float32:
self.__mat__ = self.__mat__.astype(numpy.float32)
m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
......@@ -63,7 +63,6 @@ class SparseBinaryScanner(IScanner):
self.__rows__ = [0]
self.__cols__ = []
self.__height__ = 0
self.__nnz__ = 0
self.__value__ = []
def scan(self, dat):
......@@ -76,7 +75,6 @@ class SparseBinaryScanner(IScanner):
def finish_scan(self, argument):
assert isinstance(argument, swig_paddle.Arguments)
assert isinstance(self.input_type, dp2.InputType)
m = swig_paddle.Matrix.createSparse(self.__height__,
self.input_type.dim,
len(self.__cols__),
......
......@@ -24,6 +24,7 @@ add_custom_target(paddle_python ALL DEPENDS
${OUTPUT_DIR}/.timestamp)
add_subdirectory(paddle/trainer_config_helpers/tests)
add_subdirectory(paddle/reader/tests)
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
DESTINATION opt/paddle/share/wheels
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# It would be too lengthy to require our users to prefix decorators with `decorator`.
# For example, we want the following line
#
# r = paddle.reader.decorator.buffered(paddle.reader.creator.text("hello.txt"))
#
# to be a shorter version:
#
# r = paddle.reader.buffered(paddle.reader.creator.text("hello.txt"))
from decorator import *
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['buffered']
from Queue import Queue
from threading import Thread
def buffered(reader, size):
"""Creates a buffered data reader.
The buffered data reader will read and save data entries into a buffer.
Reading from the buffered data reader will proceed as long as the buffer
is not empty.
Args:
reader: the data reader to read from.
size: max buffer size.
Returns:
The buffered data reader.
"""
class EndSignal():
pass
end = EndSignal()
def read_worker(r, q):
for d in r:
q.put(d)
q.put(end)
def create_reader():
r = reader()
q = Queue(maxsize=size)
t = Thread(
target=read_worker, args=(
r,
q, ))
t.daemon = True
t.start()
e = q.get()
while e != end:
yield e
e = q.get()
return create_reader
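# Usage sketch (illustrative only, not part of this module). `ten_ints` is a
# hypothetical reader creator; `buffered` takes a callable that returns a
# fresh iterator and itself returns such a callable:
#
#     def ten_ints():
#         for i in range(10):
#             yield i
#
#     b = buffered(ten_ints, size=5)
#     for item in b():  # entries are pre-fetched by the worker thread
#         print item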
add_test(NAME reader_decorator_test
COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/reader/tests/decorator_test.py
WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
# Copyright PaddlePaddle contributors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.reader
import time
def reader_10(dur):
for i in range(10):
time.sleep(dur)
yield i
class TestBuffered(unittest.TestCase):
def test_read(self):
for size in range(20):
b = paddle.reader.buffered(lambda: reader_10(0), size)
c = 0
for i in b():
self.assertEqual(i, c)
c += 1
self.assertEqual(c, 10)
def test_buffering(self):
# each read has a 30ms delay.
b = paddle.reader.buffered(lambda: reader_10(0.03), 10)
last_time = time.time()
for idx, i in enumerate(b()):
elapsed_time = time.time() - last_time
if i == 0:
time.sleep(0.3)
else:
# reads should return quickly, meaning the entries were already buffered.
self.assertLess(elapsed_time, 0.01)
last_time = time.time()
if __name__ == '__main__':
unittest.main()
......@@ -59,6 +59,7 @@ __all__ = [
'img_cmrnorm_layer',
'addto_layer',
'concat_layer',
'seq_concat_layer',
'lstm_step_layer',
'recurrent_group',
'memory',
......@@ -144,6 +145,7 @@ class LayerType(object):
CONCAT_LAYER = 'concat'
CONCAT_PROJ_LAYER = 'concat2'
SEQUENCE_CONCAT_LAYER = 'seqconcat'
LSTM_STEP_LAYER = 'lstm_step'
GRU_STEP_LAYER = 'gru_step'
......@@ -2570,6 +2572,59 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
size=sz)
@wrap_name_default("seqconcat")
@wrap_act_default(act=IdentityActivation())
@wrap_bias_attr_default(has_bias=False)
@layer_support()
def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
bias_attr=None):
"""
Concatenate sequence a with sequence b.
Inputs:
- a = [a1, a2, ..., an]
- b = [b1, b2, ..., bn]
- Note that the length of a and b should be the same.
Output: [a1, b1, a2, b2, ..., an, bn]
The example usage is:
.. code-block:: python
concat = seq_concat_layer(a=layer1, b=layer2)
:param name: Layer name.
:type name: basestring
:param a: input sequence layer
:type a: LayerOutput
:param b: input sequence layer
:type b: LayerOutput
:param act: Activation type.
:type act: BaseActivation
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
assert a.size == b.size
Layer(
name=name,
type=LayerType.SEQUENCE_CONCAT_LAYER,
inputs=[a.name, b.name],
active_type=act.name,
bias=ParamAttr.to_bias(bias_attr),
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name,
layer_type=LayerType.SEQUENCE_CONCAT_LAYER,
parents=[a, b],
activation=act,
size=a.size)
def memory(name,
size,
is_seq=False,
......
......@@ -4,6 +4,7 @@ test_sequence_pooling test_lstmemory_layer test_grumemory_layer
last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
test_seq_concat)
export whole_configs=(test_split_datasource)
type: "nn"
layers {
name: "data1"
type: "data"
size: 30
active_type: ""
}
layers {
name: "data2"
type: "data"
size: 30
active_type: ""
}
layers {
name: "__seqconcat_0__"
type: "seqconcat"
size: 30
active_type: ""
inputs {
input_layer_name: "data1"
}
inputs {
input_layer_name: "data2"
}
}
input_layer_names: "data1"
input_layer_names: "data2"
output_layer_names: "__seqconcat_0__"
sub_models {
name: "root"
layer_names: "data1"
layer_names: "data2"
layer_names: "__seqconcat_0__"
input_layer_names: "data1"
input_layer_names: "data2"
output_layer_names: "__seqconcat_0__"
is_recurrent_layer_group: false
}
from paddle.trainer_config_helpers import *
settings(batch_size=1000, learning_rate=1e-5)
din1 = data_layer(name='data1', size=30)
din2 = data_layer(name='data2', size=30)
outputs(seq_concat_layer(a=din1, b=din2))
......@@ -17,11 +17,19 @@ import activation
import parameters
import trainer
import event
import data_type
import py_paddle.swig_paddle as api
__all__ = [
'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
'event', 'data_converter'
'optimizer',
'layer',
'activation',
'parameters',
'init',
'trainer',
'event',
'data_type',
'data_feeder',
]
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import py_paddle.swig_paddle as api
import numpy as np
import paddle.trainer.PyDataProvider2 as dp2
__all__ = ['DataConverter']
class IDataConverter(object):
def __init__(self, input_type, pos):
"""
:param input_type: data type
:type input_type: dp2.InputType
:param pos: which input, start from 0
:type pos: int
"""
self.input_type = input_type
assert isinstance(self.input_type, dp2.InputType)
self.pos = pos
def convert(self, data, argument):
"""
Convert data to Paddle format.
:param data: input data
:param argument: paddle format
"""
pass
class DenseConvert(IDataConverter):
def __init__(self, input_type, pos):
IDataConverter.__init__(self, input_type, pos)
def convert(self, data, argument):
"""
:param data: input data
:type data: list | numpy array
:param argument: the Paddle Arguments object to fill
:type argument: api.Arguments
"""
assert isinstance(argument, api.Arguments)
# TODO: handle data type (float, double, ...)
data = np.array(data, np.float32)
m = api.Matrix.createDenseFromNumpy(data)
argument.setSlotValue(self.pos, m)
class SparseBinaryConvert(IDataConverter):
def __init__(self, input_type, pos):
IDataConverter.__init__(self, input_type, pos)
self.__rows__ = [0]
self.__cols__ = []
self.__height__ = 0
self.__nnz__ = 0
self.__value__ = []
def fill_csr(self, data):
self.__height__ = len(data)
for x in data:
self.__rows__.append(self.__rows__[-1] + len(x))
self.__cols__.extend(x)
def convert(self, data, argument):
assert isinstance(argument, api.Arguments)
self.fill_csr(data)
m = api.Matrix.createSparse(self.__height__, self.input_type.dim,
len(self.__cols__),
len(self.__value__) == 0)
assert isinstance(m, api.Matrix)
m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
argument.setSlotValue(self.pos, m)
class SparseFloatConvert(SparseBinaryConvert):
def __init__(self, input_type, pos):
SparseBinaryConvert.__init__(self, input_type, pos)
def fill_csr(self, data):
self.__height__ = len(data)
for x in data:
self.__rows__.append(self.__rows__[-1] + len(x))
self.__cols__.extend(x[0])
self.__value__.extend(x[1])
class IndexConvert(IDataConverter):
def __init__(self, input_type, pos):
IDataConverter.__init__(self, input_type, pos)
self.__ids__ = []
def convert(self, data, argument):
assert isinstance(argument, api.Arguments)
self.__ids__.extend(data)
ids = api.IVector.create(self.__ids__)
argument.setSlotIds(self.pos, ids)
class SequenceConvert(IDataConverter):
def __init__(self, input_type, pos, inner_convert, setter):
"""
:param input_type: the type of input data
:type input_type: dp2.InputType
:param pos: the position of this input
:type pos: int
:param inner_convert: DataConvert type
:type inner_convert: DenseConvert|SparseBinaryConvert|
SparseFloatConvert|IndexConvert
:param setter: callback used to attach the sequence start positions
to the argument
:type setter: callable(argument, pos, seq)
"""
IDataConverter.__init__(self, input_type, pos)
self.__seq__ = [0]
self.__inner_convert__ = inner_convert
self.__setter__ = setter
def fill_seq(self, data):
for each in data:
self.__seq__.append(self.__seq__[-1] + self.get_size(each))
def convert(self, data, argument):
self.fill_seq(data)
seq = api.IVector.create(self.__seq__, False)
self.__setter__(argument, self.pos, seq)
self.__inner_convert__.convert(list(data), argument)
def get_size(self, data):
if isinstance(self.__inner_convert__, SequenceConvert):
return sum(self.__inner_convert__.get_size(item) for item in data)
else:
return len(data)
class DataConverter(object):
def __init__(self, input):
"""
Usage:
.. code-block:: python
inputs = [('image', dense_vector(784)), ('label', integer_value(10))]
cvt = DataConverter(inputs)
arg = cvt.convert(minibatch_data, {'image': 0, 'label': 1})
:param input: list of (input_name, input_type) pairs
:type input: list
"""
self.input_names = []
self.input_types = []
for each in input:
self.input_names.append(each[0])
self.input_types.append(each[1])
assert isinstance(each[1], dp2.InputType)
def convert(self, data, input_dict=None, argument=None):
"""
Convert minibatch data to Paddle's argument. The data is numpy array
or list.
:param data: input samples, for example, [column0, column1, ...] or
(column0, column1, ...); each column is one minibatch of one
feature. Note that even if there is only one feature column,
data should still be a list or tuple: [column0] or (column0).
:type data: list|tuple
:param input_dict: a dictionary to specify the correspondence
of data_layer and input data. If None,
the feature order in argument and data is the same.
:type input_dict: dict, like {string: integer, ...}|None
:param argument: the converted data will be stored in this argument. If
None, a new Paddle Arguments object is created first.
:type argument: swig_paddle.Arguments|None
"""
if argument is None:
argument = api.Arguments.createArguments(0)
assert isinstance(argument, api.Arguments)
argument.resize(len(self.input_types))
converts = [
DataConverter.create_converter(i, each_type)
for i, each_type in enumerate(self.input_types)
]
for i, cvt in enumerate(converts):
if input_dict is not None:
dat = data[input_dict[self.input_names[i]]]
else:
dat = data[i]
cvt.convert(dat, argument)
return argument
def __call__(self, dat, argument=None):
return self.convert(dat, argument=argument)
@staticmethod
def create_converter(pos, each):
assert isinstance(each, dp2.InputType)
retv = None
if each.type == dp2.DataType.Dense:
retv = DenseConvert(each, pos)
elif each.type == dp2.DataType.Index:
retv = IndexConvert(each, pos)
elif each.type == dp2.DataType.SparseNonValue:
retv = SparseBinaryConvert(each, pos)
elif each.type == dp2.DataType.SparseValue:
retv = SparseFloatConvert(each, pos)
assert retv is not None
if each.seq_type == dp2.SequenceType.SUB_SEQUENCE:
retv = SequenceConvert(
each, pos, retv,
lambda arg, pos, seq: arg.setSlotSubSequenceStartPositions(pos, seq)
)
if each.seq_type in [
dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE
]:
retv = SequenceConvert(
each, pos, retv,
lambda arg, pos, seq: arg.setSlotSequenceStartPositions(pos, seq)
)
return retv
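# End-to-end sketch (hypothetical data, mirroring the unit test below):
#
#     import numpy as np
#     cvt = DataConverter([('image', dp2.dense_vector(784)),
#                          ('label', dp2.integer_value(10))])
#     arg = cvt.convert([np.random.random((32, 784)),
#                        np.random.randint(10, size=32)],
#                       {'image': 0, 'label': 1})
#     mat = arg.getSlotValue(0).copyToNumpyMat()  # dense slot back as numpy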
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import py_paddle.swig_paddle as api
import numpy as np
import paddle.trainer.PyDataProvider2 as dp2
from paddle.v2.data_converter import DataConverter
class DataConverterTest(unittest.TestCase):
def dense_reader(self, shape):
data = np.random.random(shape)
return data
def sparse_binary_reader(self,
high,
size_limit,
batch_size,
non_empty=False):
data = []
for i in xrange(batch_size):
num = np.random.randint(size_limit) # num could be 0
while non_empty and num == 0:
num = np.random.randint(size_limit)
data.append(np.random.randint(high, size=num).tolist())
return data
def test_dense_vector(self):
def compare(input):
converter = DataConverter([('image', dp2.dense_vector(784))])
arg = converter([input], {'image': 0})
output = arg.getSlotValue(0).copyToNumpyMat()
input = np.array(input, dtype='float32')
self.assertTrue(np.allclose(input, output))
# test numpy array
data = self.dense_reader(shape=[32, 784])
compare(data)
# test list
compare(data.tolist())
#def test_sparse_binary(self):
# dim = 100000
# data = self.sparse_binary_reader(dim, 5, 2)
# converter = DataConverter([('input', dp2.sparse_binary_vector(dim))])
# arg = converter([data], {'input':0})
# output = arg.getSlotValue(0)
#def test_sparse(self):
# dim = 100000
# v = self.sparse_binary_reader(dim, 5, 2)
# w = []
# for dat in data:
# x = self.dense_reader(shape=[1, len(dat)])
# w.append(x.tolist())
# data = []
# for each in zip(v, w):
# data.append(zip(each[0], each[1]))
#
# converter = DataConverter([('input', dp2.sparse_binary_vector(dim))])
# arg = converter([data], {'input':0})
# output = arg.getSlotValue(0)
def test_integer(self):
dim = 100
index = np.random.randint(dim, size=32)
print index
converter = DataConverter([('input', dp2.integer_value(dim))])
arg = converter([index], {'input': 0})
print arg.getSlotValue(0)
output = arg.getSlotValue(0).copyToNumpyArray()
print 'output=', output
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from py_paddle import DataProviderConverter
__all__ = ['DataFeeder']
DataFeeder = DataProviderConverter
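# Usage sketch, inferred from the trainer change in this commit: the feeder
# is built from the topology's input types and called on a minibatch to
# produce a swig Arguments object.
#
#     feeder = DataFeeder(input_types=[dense_vector(784), integer_value(10)])
#     arg = feeder(data_batch)  # passed on to gm.forwardBackward(...)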
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import \
InputType, dense_vector, sparse_binary_vector,\
sparse_vector, integer_value
__all__ = [
'InputType', 'dense_vector', 'sparse_binary_vector', 'sparse_vector',
'integer_value'
]
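# Usage sketch (mirrors the mnist example in this commit): attach an input
# type to each data layer; feeders can later reuse `layer.type` directly.
#
#     images = paddle.layer.data(
#         name='pixel', type=paddle.data_type.dense_vector(784))
#     label = paddle.layer.data(
#         name='label', type=paddle.data_type.integer_value(10))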
......@@ -67,6 +67,7 @@ paddle.v2.parameters.create, no longer exposed to users.
"""
import paddle.trainer_config_helpers as conf_helps
from . import data_type as v2_data
from paddle.trainer_config_helpers.config_parser_utils import \
parse_network_config as __parse__
from paddle.trainer_config_helpers.default_decorators import wrap_name_default
......@@ -157,7 +158,33 @@ def __convert_to_v2__(method_name, name_prefix, parent_names):
return V2LayerImpl
data = __convert_to_v2__('data_layer', None, [])
"""
Some layers need special configuration and cannot be converted with
__convert_to_v2__, so we implement them as dedicated LayerV2 classes.
"""
class DataLayerV2(Layer):
def __init__(self, name, type, **kwargs):
assert isinstance(type, v2_data.InputType)
self.type = type
self.__method_name__ = 'data_layer'
self.__kwargs__ = kwargs
super(DataLayerV2, self).__init__(name=name, parent_layers=dict())
def to_proto_impl(self, **kwargs):
args = dict()
args['size'] = self.type.dim
for each in kwargs:
args[each] = kwargs[each]
for each in self.__kwargs__:
args[each] = self.__kwargs__[each]
return getattr(conf_helps, self.__method_name__)(name=self.name, **args)
data = DataLayerV2
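# Illustrative mapping (follows to_proto_impl above): the declared InputType
# supplies the layer's size, so
#
#     data(name='pixel', type=v2_data.dense_vector(784))
#
# resolves to conf_helps.data_layer(name='pixel', size=784).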
fc = __convert_to_v2__('fc_layer', name_prefix='fc', parent_names=['input'])
max_id = __convert_to_v2__(
'maxid_layer', name_prefix='maxid_layer', parent_names=['input'])
......@@ -171,8 +198,8 @@ cross_entropy_cost = __convert_to_v2__(
parent_names=['input', 'label'])
if __name__ == '__main__':
pixel = data(name='pixel', size=784)
label = data(name='label', size=10)
pixel = data(name='pixel', type=v2_data.dense_vector(784))
label = data(name='label', type=v2_data.integer_value(10))
hidden = fc(input=pixel, size=100, act=conf_helps.SigmoidActivation())
inference = fc(input=hidden, size=10, act=conf_helps.SoftmaxActivation())
maxid = max_id(input=inference)
......
......@@ -2,7 +2,7 @@ import collections
import py_paddle.swig_paddle as api
from paddle.proto.ModelConfig_pb2 import ModelConfig
from py_paddle import DataProviderConverter
from data_feeder import DataFeeder
from . import event as v2_event
from . import layer as v2_layer
......@@ -89,6 +89,7 @@ class SGD(ITrainer):
event_handler = default_event_handler
topology = v2_layer.parse_network(topology)
print topology
__check_train_args__(**locals())
......@@ -109,7 +110,7 @@ class SGD(ITrainer):
raise ValueError()
data_types_lists.append(data_types[each])
converter = DataProviderConverter(input_types=data_types_lists)
feeder = DataFeeder(input_types=data_types_lists)
for pass_id in xrange(num_passes):
updater.startPass()
......@@ -117,7 +118,7 @@ class SGD(ITrainer):
__data_reader_to_batch__(train_data_reader, batch_size,
topology)):
pass_type = updater.startBatch(len(data_batch))
gm.forwardBackward(converter(data_batch), out_args, pass_type)
gm.forwardBackward(feeder(data_batch), out_args, pass_type)
for each_param in gm.getParameters():
updater.update(each_param)
# Get cost. We use numpy to calculate total cost for this batch.
......