diff --git a/demo/mnist/api_train.py b/demo/mnist/api_train.py index fe39f0bd23f78e1a9d61f708dc880d9853b7a5f9..ea1caa7dd9653a2cc2860ace736fe3d25a3767e0 100644 --- a/demo/mnist/api_train.py +++ b/demo/mnist/api_train.py @@ -9,7 +9,6 @@ The user api could be simpler and carefully designed. import random import numpy as np -import paddle.trainer.PyDataProvider2 as dp import paddle.v2 as paddle_v2 import py_paddle.swig_paddle as api from paddle.trainer_config_helpers import * @@ -71,8 +70,10 @@ def main(): assert isinstance(updater, api.ParameterUpdater) # define network - images = paddle_v2.layer.data(name='pixel', size=784) - label = paddle_v2.layer.data(name='label', size=10) + images = paddle_v2.layer.data( + name='pixel', type=paddle_v2.data_type.dense_vector(784)) + label = paddle_v2.layer.data( + name='label', type=paddle_v2.data_type.integer_value(10)) hidden1 = paddle_v2.layer.fc(input=images, size=200) hidden2 = paddle_v2.layer.fc(input=hidden1, size=200) inference = paddle_v2.layer.fc(input=hidden2, @@ -98,8 +99,7 @@ def main(): # DataProvider Converter is a utility convert Python Object to Paddle C++ # Input. The input format is as same as Paddle's DataProvider. 
- converter = DataProviderConverter( - input_types=[dp.dense_vector(784), dp.integer_value(10)]) + converter = DataProviderConverter(input_types=[images.type, label.type]) train_file = './data/raw_data/train' test_file = './data/raw_data/t10k' diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py index b5cc74ce67dfc8e1afa65bd52f5ec600260032ce..6fc01ce58be57c77144c6558d039430b22d3a746 100644 --- a/demo/mnist/api_train_v2.py +++ b/demo/mnist/api_train_v2.py @@ -1,6 +1,5 @@ import numpy import paddle.v2 as paddle -from paddle.trainer.PyDataProvider2 import dense_vector, integer_value import mnist_util @@ -16,8 +15,10 @@ def main(): paddle.init(use_gpu=False, trainer_count=1) # define network topology - images = paddle.layer.data(name='pixel', size=784) - label = paddle.layer.data(name='label', size=10) + images = paddle.layer.data( + name='pixel', type=paddle.data_type.dense_vector(784)) + label = paddle.layer.data( + name='label', type=paddle.data_type.integer_value(10)) hidden1 = paddle.layer.fc(input=images, size=200) hidden2 = paddle.layer.fc(input=hidden1, size=200) inference = paddle.layer.fc(input=hidden2, @@ -51,8 +52,8 @@ def main(): batch_size=32, # batch size should be refactor in Data reader data_types={ # data_types will be removed, It should be in # network topology - 'pixel': dense_vector(784), - 'label': integer_value(10) + 'pixel': images.type, + 'label': label.type }) diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py index 00f72cecacb454a0dd1184fa2098be4543007de7..4b7f5d0e504aef3884a04cbed8c16503a4079772 100755 --- a/demo/sentiment/dataprovider.py +++ b/demo/sentiment/dataprovider.py @@ -32,4 +32,6 @@ def process(settings, file_name): word_slot = [ settings.word_dict[w] for w in words if w in settings.word_dict ] + if not word_slot: + continue yield word_slot, label diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py index 
8ec490f64691924013200a3d0038d39aa834b038..64c78e0d6b9297e7a321a4f070517593b0bfe332 100755 --- a/demo/sentiment/predict.py +++ b/demo/sentiment/predict.py @@ -138,7 +138,11 @@ def main(): batch = [] for line in sys.stdin: - batch.append([predict.get_index(line)]) + words = predict.get_index(line) + if words: + batch.append([words]) + else: + print('All the words in [%s] are not in the dictionary.' % line) if len(batch) == batch_size: predict.batch_predict(batch) batch = [] diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/trainer_config_helpers/layers.rst index 8b0e553eacc932bc59062103ac6e6ac4245d03cb..3685868ffcfdf1d17557e5e8bcba6cbc25804147 100644 --- a/doc/api/trainer_config_helpers/layers.rst +++ b/doc/api/trainer_config_helpers/layers.rst @@ -279,6 +279,12 @@ concat_layer :members: concat_layer :noindex: +seq_concat_layer +---------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: seq_concat_layer + :noindex: + Reshaping Layers ================ diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index 8f7abf12f733542734efe91111f365a34aa4b15b..ff3ee758beeb2f5def0eefbd63132d48473f892b 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -107,7 +107,7 @@ We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["imag ### How to create custom data reader ```python -def image_reader(image_path, label_path): +def image_reader(image_path, label_path, n): f = open(image_path) l = open(label_path) images = numpy.fromfile( @@ -117,9 +117,10 @@ def image_reader(image_path, label_path): for i in xrange(n): yield images[i, :], labels[i] # a single entry of data is created each time f.close() + l.close() # use python lambda to change image_reader into a function with no parameters. 
-reader = lambda : image_reader("/path/to/image_file", "/path/to/label_file") +reader = lambda : image_reader("/path/to/image_file", "/path/to/label_file", 1024) paddle.train(reader, {"image":0, "label":1}, ...) ``` diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp index a2a5028e8418fd2884a436394a05903e1fdd795c..2bf6ead0dc382cd74cf64508835b24b8483dc553 100644 --- a/paddle/gserver/evaluators/Evaluator.cpp +++ b/paddle/gserver/evaluators/Evaluator.cpp @@ -887,32 +887,10 @@ static InitFunction __reg_type_auc_sum__([]() { */ class ValuePrinter : public Evaluator { public: - ValuePrinter() {} - virtual void eval(const NeuralNetwork& nn) { for (const std::string& name : config_.input_layers()) { - const Argument& argu = nn.getLayer(name)->getOutput(); - if (argu.value) { - std::ostringstream os; - argu.value->print(os); - LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str(); - } - if (argu.ids) { - std::ostringstream os; - argu.ids->print(os, argu.ids->getSize()); - LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str(); - } - if (auto startPos = argu.sequenceStartPositions) { - std::ostringstream os; - startPos->getVector(false)->print(os, startPos->getSize()); - LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str(); - } - if (auto subStartPos = argu.subSequenceStartPositions) { - std::ostringstream os; - subStartPos->getVector(false)->print(os, subStartPos->getSize()); - LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n" - << os.str(); - } + nn.getLayer(name)->getOutput().printValueString(LOG(INFO), + "layer=" + name + " "); } } @@ -928,8 +906,6 @@ REGISTER_EVALUATOR(value_printer, ValuePrinter); */ class GradientPrinter : public Evaluator { public: - GradientPrinter() {} - virtual void eval(const NeuralNetwork& nn) { for (const std::string& name : config_.input_layers()) { const Argument& argu = nn.getLayer(name)->getOutput(); @@ -938,11 +914,6 @@ public: argu.grad->print(os); 
LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str(); } - if (auto startPos = argu.sequenceStartPositions) { - std::ostringstream os; - startPos->getVector(false)->print(os, startPos->getSize()); - LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str(); - } } } diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp index 85f52ad5debd035c403c73afc7390904428e28a7..de198af111be4200dd1b240f6de9464e3f43b06d 100644 --- a/paddle/gserver/layers/PrintLayer.cpp +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -19,38 +19,17 @@ namespace paddle { class PrintLayer : public Layer { public: explicit PrintLayer(const LayerConfig& config) : Layer(config) {} - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override {} -}; -void PrintLayer::forward(PassType passType) { - Layer::forward(passType); - for (size_t i = 0; i != inputLayers_.size(); ++i) { - const auto& argu = getInput(i); - const std::string& name = inputLayers_[i]->getName(); - if (argu.value) { - std::ostringstream os; - argu.value->print(os); - LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str(); - } - if (argu.ids) { - std::ostringstream os; - argu.ids->print(os, argu.ids->getSize()); - LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str(); - } - if (auto startPos = argu.sequenceStartPositions) { - std::ostringstream os; - startPos->getVector(false)->print(os, startPos->getSize()); - LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str(); - } - if (auto subStartPos = argu.subSequenceStartPositions) { - std::ostringstream os; - subStartPos->getVector(false)->print(os, subStartPos->getSize()); - LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n" - << os.str(); + void forward(PassType passType) override { + Layer::forward(passType); + for (size_t i = 0; i != inputLayers_.size(); ++i) { + getInput(i).printValueString(LOG(INFO), + "layer=" + inputLayers_[i]->getName() + " "); } 
} -} + + void backward(const UpdateCallback& callback) override {} +}; REGISTER_LAYER(print, PrintLayer); diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp index 599706eb419ede72dbd6f4c8c74e57f5f9965388..4b24d8f0c852e1bdc887d4ee1465b9ad05d210bb 100644 --- a/paddle/gserver/layers/SequenceConcatLayer.cpp +++ b/paddle/gserver/layers/SequenceConcatLayer.cpp @@ -21,9 +21,11 @@ namespace paddle { /** * A layer for concatenating the first sequence with the second sequence - * following the first - * Input: two sequences each containing some instances + * Input: two sequences each containing the same number of instances + * seq1 = [a1, a2, ..., an] + * seq2 = [b1, b2, ..., bn] * Output: a concatenated sequence of the two input sequences + * out = [a1, b1, a2, b2, ..., an, bn] */ class SequenceConcatLayer : public Layer { diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 65d01a15718ae2bebd4869eff0e5407524bc0e7c..7a343cca33f5b420be6192231ac73ca1c2da5fb9 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -602,6 +602,44 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) { tgtBuf[numSequences] = numSubSequences; } +void Argument::getValueString( + std::unordered_map* out) const { + if (value) { + std::ostringstream os; + value->print(os); + out->insert({"value", os.str()}); + } + if (ids) { + std::ostringstream os; + ids->print(os, ids->getSize()); + out->insert({"ids", os.str()}); + } + if (sequenceStartPositions) { + std::ostringstream os; + sequenceStartPositions->getVector(false)->print( + os, sequenceStartPositions->getSize()); + out->insert({"sequence pos", os.str()}); + } + if (subSequenceStartPositions) { + std::ostringstream os; + subSequenceStartPositions->getVector(false)->print( + os, subSequenceStartPositions->getSize()); + out->insert({"sub-sequence pos", os.str()}); + } +} + +void Argument::printValueString(std::ostream& 
stream, + const std::string& prefix) const { + std::unordered_map out; + getValueString(&out); + for (auto field : {"value", "id", "sequence pos", "sub-sequence pos"}) { + auto it = out.find(field); + if (it != out.end()) { + stream << prefix << field << ":\n" << it->second; + } + } +} + void Argument::subArgFrom(const Argument& input, size_t offset, size_t height, diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index afd2de0202bf0f14ec3d4c5b856455a3488e41f6..178c068b93ac5fc1e06200984f14da86069cf7e4 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -297,6 +297,23 @@ struct Argument { sequence has sub-sequence degrades to a sequence. */ void degradeSequence(const Argument& input, bool useGpu); + + /** + * @brief getValueString will return the argument's output in string. There + * are several kinds of output. The keys of output dictionary are 'value', + * 'id', 'sequence pos', 'sub-sequence pos'. + * @param out [out]: the return values. + */ + void getValueString(std::unordered_map* out) const; + + /** + * @brief printValueString will print the argument's output in order of + * 'value', 'id', 'sequence pos', 'sub-sequence pos'. + * @param stream: Output stream + * @param prefix: line prefix for printing. 
+ */ + void printValueString(std::ostream& stream, + const std::string& prefix = "") const; }; } // namespace paddle diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index 21d1cb75f4d40e6ed011b33c6366c9d31c0fcc7c..2690cafe1d8d32bf52cd9e5fa4dc69fbacb2d66c 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -23,7 +23,8 @@ __all__ = ['DataProviderConverter'] class IScanner(object): def __init__(self, input_type, pos): self.input_type = input_type - assert isinstance(self.input_type, dp2.InputType) + if not isinstance(self.input_type, dp2.InputType): + raise ValueError("input type should be dataprovider2.InputType") self.pos = pos def scan(self, dat): @@ -50,7 +51,6 @@ class DenseScanner(IScanner): def finish_scan(self, argument): assert isinstance(argument, swig_paddle.Arguments) - assert isinstance(self.input_type, dp2.InputType) if self.__mat__.dtype != numpy.float32: self.__mat__ = self.__mat__.astype(numpy.float32) m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False) @@ -63,7 +63,6 @@ class SparseBinaryScanner(IScanner): self.__rows__ = [0] self.__cols__ = [] self.__height__ = 0 - self.__nnz__ = 0 self.__value__ = [] def scan(self, dat): @@ -76,7 +75,6 @@ class SparseBinaryScanner(IScanner): def finish_scan(self, argument): assert isinstance(argument, swig_paddle.Arguments) - assert isinstance(self.input_type, dp2.InputType) m = swig_paddle.Matrix.createSparse(self.__height__, self.input_type.dim, len(self.__cols__), diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ee7a5bff84ca96ef1010fa7430356722f807fb0f..357637e20346f8e1179d3a28ff580722cdfcccff 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -24,6 +24,7 @@ add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) add_subdirectory(paddle/trainer_config_helpers/tests) +add_subdirectory(paddle/reader/tests) install(DIRECTORY 
${CMAKE_CURRENT_BINARY_DIR}/dist/ DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..493b410e8299ebe167be43ead1401a6ab245a631 --- /dev/null +++ b/python/paddle/reader/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# It would be too lengthy to require our users to prefix decorators with `decorator`. +# For example, we want the following line +# +# r = paddle.reader.decorator.buffered(paddle.reader.creator.text("hello.txt")) +# +# to be a shorter version: +# +# r = paddle.reader.buffered(paddle.reader.creator.text("hello.txt")) +from decorator import * diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ddb0ff812b15ede21e6965c7c8857f12716fa0 --- /dev/null +++ b/python/paddle/reader/decorator.py @@ -0,0 +1,60 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def buffered(reader, size):
    """Creates a buffered data reader.

    A background daemon thread drains the underlying data reader into a
    bounded queue. Reading from the buffered data reader will proceed as
    long as the buffer is not empty.

    Args:
        reader: the data reader to read from: a callable taking no
            arguments that returns an iterable of data entries.
        size: max buffer size, passed to ``Queue(maxsize=size)``. A value
            less than or equal to zero means an unbounded buffer.

    Returns:
        The buffered data reader: a callable taking no arguments that
        returns a generator over the entries of ``reader()``, in order.

    Raises:
        Exception: any exception raised while iterating ``reader()`` is
            re-raised from the returned generator, after the entries read
            before the failure have been yielded.
    """

    # Unique end-of-stream marker. It is only ever compared by identity, so
    # data entries with a custom or element-wise ``__eq__``/``__ne__`` (for
    # example numpy arrays) can neither be mistaken for it nor break the
    # termination check.
    end = object()

    class _WorkerFailure(object):
        """Carries an exception raised by the underlying reader."""

        def __init__(self, error):
            self.error = error

    def read_worker(r, q):
        try:
            for d in r:
                q.put(d)
        except Exception as error:
            # Forward the failure to the consumer. Without this the worker
            # would die before enqueueing the end marker and the consumer
            # would block on q.get() forever.
            q.put(_WorkerFailure(error))
        else:
            q.put(end)

    def create_reader():
        r = reader()
        q = Queue(maxsize=size)
        t = Thread(target=read_worker, args=(r, q))
        # Daemon thread: an abandoned, partially consumed reader must not
        # keep the interpreter alive at exit.
        t.daemon = True
        t.start()
        while True:
            e = q.get()
            if e is end:
                return
            if isinstance(e, _WorkerFailure):
                raise e.error
            yield e

    return create_reader
b/python/paddle/reader/tests/decorator_test.py @@ -0,0 +1,50 @@ +# Copyright PaddlePaddle contributors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import paddle.reader +import time + + +def reader_10(dur): + for i in range(10): + time.sleep(dur) + yield i + + +class TestBuffered(unittest.TestCase): + def test_read(self): + for size in range(20): + b = paddle.reader.buffered(lambda: reader_10(0), size) + c = 0 + for i in b(): + self.assertEqual(i, c) + c += 1 + self.assertEqual(c, 10) + + def test_buffering(self): + # read have 30ms delay. + b = paddle.reader.buffered(lambda: reader_10(0.03), 10) + last_time = time.time() + for idx, i in enumerate(b()): + elapsed_time = time.time() - last_time + if i == 0: + time.sleep(0.3) + else: + # read time should be short, meaning already buffered. 
@wrap_name_default("seqconcat")
@wrap_act_default(act=IdentityActivation())
@wrap_bias_attr_default(has_bias=False)
@layer_support()
def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
                     bias_attr=None):
    """
    Concatenate sequence a with sequence b.

    Inputs:
      - a = [a1, a2, ..., an]
      - b = [b1, b2, ..., bn]
      - Note that the length of a and b should be the same.

    Output: [a1, b1, a2, b2, ..., an, bn]

    The example usage is:

    ..  code-block:: python

        concat = seq_concat_layer(a=layer1, b=layer2)

    :param name: Layer name.
    :type name: basestring
    :param a: input sequence layer
    :type a: LayerOutput
    :param b: input sequence layer; must have the same size as a.
    :type b: LayerOutput
    :param act: Activation type.
    :type act: BaseActivation
    :param layer_attr: Extra Layer Attribute.
    :type layer_attr: ExtraLayerAttribute
    :param bias_attr: Bias attribute, converted with ParamAttr.to_bias.
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    # Both inputs must be layer outputs of identical width; the resulting
    # layer reports that same width.
    for seq_input in (a, b):
        assert isinstance(seq_input, LayerOutput)
    assert a.size == b.size

    layer_type = LayerType.SEQUENCE_CONCAT_LAYER
    input_names = [a.name, b.name]

    # Register the layer in the configuration being built.
    Layer(
        name=name,
        type=layer_type,
        inputs=input_names,
        active_type=act.name,
        bias=ParamAttr.to_bias(bias_attr),
        **ExtraLayerAttribute.to_kwargs(layer_attr))

    return LayerOutput(
        name,
        layer_type=layer_type,
        parents=[a, b],
        activation=act,
        size=a.size)
"__seqconcat_0__" + type: "seqconcat" + size: 30 + active_type: "" + inputs { + input_layer_name: "data1" + } + inputs { + input_layer_name: "data2" + } +} +input_layer_names: "data1" +input_layer_names: "data2" +output_layer_names: "__seqconcat_0__" +sub_models { + name: "root" + layer_names: "data1" + layer_names: "data2" + layer_names: "__seqconcat_0__" + input_layer_names: "data1" + input_layer_names: "data2" + output_layer_names: "__seqconcat_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat.py new file mode 100644 index 0000000000000000000000000000000000000000..193d9d0df9ed80a4bc555095116b42192c7757f0 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat.py @@ -0,0 +1,9 @@ +from paddle.trainer_config_helpers import * + +settings(batch_size=1000, learning_rate=1e-5) + +din1 = data_layer(name='data1', size=30) + +din2 = data_layer(name='data2', size=30) + +outputs(seq_concat_layer(a=din1, b=din2)) diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 4ecd0dafd6bc5a9c2a784fd9aecd6aa3a93fac95..cfdcebceb5fcf5be0a54b62fb76c58ee5be57729 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -17,11 +17,19 @@ import activation import parameters import trainer import event +import data_type import py_paddle.swig_paddle as api __all__ = [ - 'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer', - 'event', 'data_converter' + 'optimizer', + 'layer', + 'activation', + 'parameters', + 'init', + 'trainer', + 'event', + 'data_type', + 'data_feeder', ] diff --git a/python/paddle/v2/data_converter.py b/python/paddle/v2/data_converter.py deleted file mode 100644 index fcba43e4ba99f98d4853e991fb54483a58d3b8d1..0000000000000000000000000000000000000000 --- a/python/paddle/v2/data_converter.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) 
2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -import py_paddle.swig_paddle as api -import numpy as np -import paddle.trainer.PyDataProvider2 as dp2 - -__all__ = ['DataConverter'] - - -class IDataConverter(object): - def __init__(self, input_type, pos): - """ - :param input_type: data type - :type input_type: dp2.InputType - :param pos: which input, start from 0 - :type pos: int - """ - self.input_type = input_type - assert isinstance(self.input_type, dp2.InputType) - self.pos = pos - - def convert(self, data, argument): - """ - Conv data to paddle format. - :param data: input data - :param argument: paddle format - """ - pass - - -class DenseConvert(IDataConverter): - def __init__(self, input_type, pos): - IDataConverter.__init__(self, input_type, pos) - - def convert(self, data, argument): - """ - :param data: input data - :type data: list | numpy array - :param argument: the type which paddle is acceptable - :type argument: Paddle's Arguments - """ - assert isinstance(argument, api.Arguments) - # TODO: handle data type (float, double, ...) 
- data = np.array(data, np.float32) - m = api.Matrix.createDenseFromNumpy(data) - argument.setSlotValue(self.pos, m) - - -class SparseBinaryConvert(IDataConverter): - def __init__(self, input_type, pos): - IDataConverter.__init__(self, input_type, pos) - self.__rows__ = [0] - self.__cols__ = [] - self.__height__ = 0 - self.__nnz__ = 0 - self.__value__ = [] - - def fill_csr(self, data): - self.__height__ = len(data) - for x in data: - self.__rows__.append(self.__rows__[-1] + len(x)) - self.__cols__.extend(x) - - def convert(self, data, argument): - assert isinstance(argument, api.Arguments) - - self.fill_csr(data) - m = api.Matrix.createSparse(self.__height__, self.input_type.dim, - len(self.__cols__), - len(self.__value__) == 0) - assert isinstance(m, api.Matrix) - m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__) - argument.setSlotValue(self.pos, m) - - -class SparseFloatConvert(SparseBinaryConvert): - def __init__(self, input_type, pos): - SparseBinaryConvert.__init__(self, input_type, pos) - - def fill_csr(self, data): - self.__height__ = len(data) - for x in data: - self.__rows__.append(self.__rows__[-1] + len(x)) - self.__cols__.extend(x[0]) - self.__value__.extend(x[1]) - - -class IndexConvert(IDataConverter): - def __init__(self, input_type, pos): - IDataConverter.__init__(self, input_type, pos) - self.__ids__ = [] - - def convert(self, data, argument): - assert isinstance(argument, api.Arguments) - #for x in data: - # self.__ids__.append(x) - self.__ids__.extend(x) - - ids = api.IVector.create(self.__ids__) - argument.setSlotIds(self.pos, ids) - - -class SequenceConvert(IDataConverter): - def __init__(self, input_type, pos, inner_convert, setter): - """ - :param input_type: the type of input data - :type input_type: dp2.InputType - :param pos: the position of this input - :type pos: int - :param inner_convert: DataConvert type - :type inner_convert: DenseConvert|SparseBinaryConvert| - SparseFloatConvert|IndexConvert - :param setter: - :type 
setter: - """ - IDataConverter.__init__(self, input_type, pos) - self.__seq__ = [0] - self.__inner_convert__ = inner_convert - self.__setter__ = setter - - def fill_seq(self, data): - for each in data: - self.__seq__.append(self.__seq__[-1] + self.get_size(each)) - - def convert(self, data, argument): - fill_seq(data) - seq = api.IVector.create(self.__seq__, False) - self.__setter__(argument, self.pos, seq) - - dat = [] - for each in data: - dat.append(each) - self.__inner_scanner__.convert(dat, argument) - - def get_size(self, data): - if isinstance(self.__inner_scanner__, SequenceConvert): - return sum(self.__inner_scanner__.get_size(item) for item in dat) - else: - return len(data) - - -class DataConverter(object): - def __init__(self, input): - """ - Usege: - - .. code-block:: python - inputs = [('image', dense_vector), ('label', integer_value)] - cvt = DataConverter(inputs) - arg = cvt(minibatch_data, {'image':0, 'label':1}) - - :param input_mapper: list of (input_name, input_type) - :type input_mapper: list - """ - self.input_names = [] - self.input_types = [] - for each in input: - self.input_names.append(each[0]) - self.input_types.append(each[1]) - assert isinstance(each[1], dp2.InputType) - - def convert(self, data, input_dict=None, argument=None): - """ - Convert minibatch data to Paddle's argument. The data is numpy array - or list. - - :param data: input samples, for example, [column0, column1, ...] or - (column0, column1, ...) each column is one minibatch - feature. Note, if only one column featrue, data also - shuld be a list or tupe, [column0] or (column0). - :type data: list|tuple - :param input_dict: a dictionary to specify the correspondence - of data_layer and input data. If None, - the feature order in argument and data is the same. - :type input_dict: dict, like {string:integer, string, integer, ...}|None - :param argument: converted data will be saved in this argument. If None, - it will create a Paddle's Arguments firstly. 
- :param type: swig_paddle.Arguments|None - """ - if argument is None: - argument = api.Arguments.createArguments(0) - assert isinstance(argument, api.Arguments) - argument.resize(len(self.input_types)) - - converts = [ - DataConverter.create_converter(i, each_type) - for i, each_type in enumerate(self.input_types) - ] - - for i, cvt in enumerate(converts): - if input_dict is not None: - dat = data[input_dict[self.input_names[i]]] - else: - dat = data[i] - cvt.convert(dat, argument) - - return argument - - def __call__(self, dat, argument=None): - return self.convert(dat, argument) - - @staticmethod - def create_converter(pos, each): - assert isinstance(each, dp2.InputType) - retv = None - if each.type == dp2.DataType.Dense: - retv = DenseConvert(each, pos) - elif each.type == dp2.DataType.Index: - retv = IndexConvert(each, pos) - elif each.type == dp2.DataType.SparseNonValue: - retv = SparseBinaryConvert(each, pos) - elif each.type == dp2.DataType.SparseValue: - retv = SparseFloatConvert(each, pos) - assert retv is not None - - if each.seq_type == dp2.SequenceType.SUB_SEQUENCE: - retv = SequenceConvert( - each, pos, retv, - lambda arg, pos, seq: arg.setSlotSubSequenceStartPositions(pos, seq) - ) - - if each.seq_type in [ - dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE - ]: - retv = SequenceConvert( - each, pos, retv, - lambda arg, pos, seq: arg.setSlotSequenceStartPositions(pos, seq) - ) - return retv diff --git a/python/paddle/v2/data_converter_test.py b/python/paddle/v2/data_converter_test.py deleted file mode 100644 index d84ee5172782d946e8303506f5e9921164dfdc4b..0000000000000000000000000000000000000000 --- a/python/paddle/v2/data_converter_test.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import py_paddle.swig_paddle as api -import numpy as np -import paddle.trainer.PyDataProvider2 as dp2 - -from paddle.v2.data_converter import DataConverter - - -class DataConverterTest(unittest.TestCase): - def dense_reader(self, shape): - data = np.random.random(shape) - return data - - def sparse_binary_reader(self, - high, - size_limit, - batch_size, - non_empty=False): - data = [] - for i in xrange(batch_size): - num = np.random.randint(size_limit) # num could be 0 - while non_empty and num == 0: - num = np.random.randint(size_limit) - data.append(np.random.randint(high, size=num).tolist()) - - return data - - def test_dense_vector(self): - def compare(input): - converter = DataConverter([('image', dp2.dense_vector(784))]) - arg = converter([input], {'image': 0}) - output = arg.getSlotValue(0).copyToNumpyMat() - input = np.array(input, dtype='float32') - self.assertAlmostEqual(input.all(), output.all()) - - # test numpy array - data = self.dense_reader(shape=[32, 784]) - compare(data) - - # test list - compare(data.tolist()) - - #def test_sparse_binary(self): - # dim = 100000 - # data = self.sparse_binary_reader(dim, 5, 2) - # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) - # arg = converter([data], {'input':0}) - # output = arg.getSlotValue(0) - - #def test_sparse(self): - # dim = 100000 - # v = self.sparse_binary_reader(dim, 5, 2) - # w = [] - # for dat in data: - # x = self.dense_reader(shape=[1, len(dat)]) - # w.append(x.tolist()) - # data = [] - # for each in zip(v, w): - # data.append(zip(each[0], 
each[1])) - # - # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) - # arg = converter([data], {'input':0}) - # output = arg.getSlotValue(0) - - def test_integer(self): - dim = 100 - index = np.random.randint(dim, size=32) - print index - converter = DataConverter([('input', dp2.integer_value(dim))]) - arg = converter([index], {'input': 0}) - print arg.getSlotValue(0) - output = arg.getSlotValue(0).copyToNumpyArray() - print 'output=', output - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py new file mode 100644 index 0000000000000000000000000000000000000000..2a0b6bbeb563f7b077706f5fd49306eae292c19a --- /dev/null +++ b/python/paddle/v2/data_feeder.py @@ -0,0 +1,19 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from py_paddle import DataProviderConverter + +__all__ = ['DataFeeder'] + +DataFeeder = DataProviderConverter diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py new file mode 100644 index 0000000000000000000000000000000000000000..5b01ba4cd4866cf7b355fc0a6a667409cf9c4419 --- /dev/null +++ b/python/paddle/v2/data_type.py @@ -0,0 +1,22 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import \ + InputType, dense_vector, sparse_binary_vector,\ + sparse_vector, integer_value + +__all__ = [ + 'InputType', 'dense_vector', 'sparse_binary_vector', 'sparse_vector', + 'integer_value' +] diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 0ce4ecd569aa1dd9ad27c65775d235b969a52905..4d052c983c2c07730fe5111ccb961de68e73fb8f 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -67,6 +67,7 @@ paddle.v2.parameters.create, no longer exposed to users. """ import paddle.trainer_config_helpers as conf_helps +from . import data_type as v2_data from paddle.trainer_config_helpers.config_parser_utils import \ parse_network_config as __parse__ from paddle.trainer_config_helpers.default_decorators import wrap_name_default @@ -157,7 +158,33 @@ def __convert_to_v2__(method_name, name_prefix, parent_names): return V2LayerImpl -data = __convert_to_v2__('data_layer', None, []) +""" +Some layers need special configuration and cannot be converted with __convert_to_v2__. +So we also need to implement some special LayerV2 classes for them. 
+""" + + +class DataLayerV2(Layer): + def __init__(self, name, type, **kwargs): + assert isinstance(type, v2_data.InputType) + + self.type = type + self.__method_name__ = 'data_layer' + self.__kwargs__ = kwargs + + super(DataLayerV2, self).__init__(name=name, parent_layers=dict()) + + def to_proto_impl(self, **kwargs): + args = dict() + args['size'] = self.type.dim + for each in kwargs: + args[each] = kwargs[each] + for each in self.__kwargs__: + args[each] = self.__kwargs__[each] + return getattr(conf_helps, self.__method_name__)(name=self.name, **args) + + +data = DataLayerV2 fc = __convert_to_v2__('fc_layer', name_prefix='fc', parent_names=['input']) max_id = __convert_to_v2__( 'maxid_layer', name_prefix='maxid_layer', parent_names=['input']) @@ -171,8 +198,8 @@ cross_entropy_cost = __convert_to_v2__( parent_names=['input', 'label']) if __name__ == '__main__': - pixel = data(name='pixel', size=784) - label = data(name='label', size=10) + pixel = data(name='pixel', type=v2_data.dense_vector(784)) + label = data(name='label', type=v2_data.integer_value(10)) hidden = fc(input=pixel, size=100, act=conf_helps.SigmoidActivation()) inference = fc(input=hidden, size=10, act=conf_helps.SoftmaxActivation()) maxid = max_id(input=inference) diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 4365bd41e7073bce4112e5813dbf1517856c06f5..7480a3fb84bbd2abe9d7ff4cbed743bb0470e0e8 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -2,7 +2,7 @@ import collections import py_paddle.swig_paddle as api from paddle.proto.ModelConfig_pb2 import ModelConfig -from py_paddle import DataProviderConverter +from data_feeder import DataFeeder from . import event as v2_event from . 
import layer as v2_layer @@ -89,6 +89,7 @@ class SGD(ITrainer): event_handler = default_event_handler topology = v2_layer.parse_network(topology) + print topology __check_train_args__(**locals()) @@ -109,7 +110,7 @@ class SGD(ITrainer): raise ValueError() data_types_lists.append(data_types[each]) - converter = DataProviderConverter(input_types=data_types_lists) + feeder = DataFeeder(input_types=data_types_lists) for pass_id in xrange(num_passes): updater.startPass() @@ -117,7 +118,7 @@ class SGD(ITrainer): __data_reader_to_batch__(train_data_reader, batch_size, topology)): pass_type = updater.startBatch(len(data_batch)) - gm.forwardBackward(converter(data_batch), out_args, pass_type) + gm.forwardBackward(feeder(data_batch), out_args, pass_type) for each_param in gm.getParameters(): updater.update(each_param) # Get cost. We use numpy to calculate total cost for this batch.