diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py index 00f72cecacb454a0dd1184fa2098be4543007de7..4b7f5d0e504aef3884a04cbed8c16503a4079772 100755 --- a/demo/sentiment/dataprovider.py +++ b/demo/sentiment/dataprovider.py @@ -32,4 +32,6 @@ def process(settings, file_name): word_slot = [ settings.word_dict[w] for w in words if w in settings.word_dict ] + if not word_slot: + continue yield word_slot, label diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py index 8ec490f64691924013200a3d0038d39aa834b038..64c78e0d6b9297e7a321a4f070517593b0bfe332 100755 --- a/demo/sentiment/predict.py +++ b/demo/sentiment/predict.py @@ -138,7 +138,11 @@ def main(): batch = [] for line in sys.stdin: - batch.append([predict.get_index(line)]) + words = predict.get_index(line) + if words: + batch.append([words]) + else: + print('All the words in [%s] are not in the dictionary.' % line) if len(batch) == batch_size: predict.batch_predict(batch) batch = [] diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/trainer_config_helpers/layers.rst index 8b0e553eacc932bc59062103ac6e6ac4245d03cb..3685868ffcfdf1d17557e5e8bcba6cbc25804147 100644 --- a/doc/api/trainer_config_helpers/layers.rst +++ b/doc/api/trainer_config_helpers/layers.rst @@ -279,6 +279,12 @@ concat_layer :members: concat_layer :noindex: +seq_concat_layer +---------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: seq_concat_layer + :noindex: + Reshaping Layers ================ diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index 8f7abf12f733542734efe91111f365a34aa4b15b..ff3ee758beeb2f5def0eefbd63132d48473f892b 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -107,7 +107,7 @@ We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["imag ### How to create custom data reader ```python -def image_reader(image_path, label_path): +def image_reader(image_path, label_path, n): f = open(image_path) l = open(label_path) images = numpy.fromfile( @@ -117,9 +117,10 @@ def image_reader(image_path, label_path): for i in xrange(n): yield images[i, :], labels[i] # a single entry of data is created each time f.close() + l.close() # use python lambda to change image_reader into a function with no parameters. -reader = lambda : image_reader("/path/to/image_file", "/path/to/label_file") +reader = lambda : image_reader("/path/to/image_file", "/path/to/label_file", 1024) paddle.train(reader, {"image":0, "label":1}, ...) ``` diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp index ae7508e2bb117a60492e0c28230f2fbb4b14915e..2bf6ead0dc382cd74cf64508835b24b8483dc553 100644 --- a/paddle/gserver/evaluators/Evaluator.cpp +++ b/paddle/gserver/evaluators/Evaluator.cpp @@ -866,21 +866,20 @@ void PnpairEvaluator::calc(std::vector& predictArray) { ClassRegistrar Evaluator::registrar_; Evaluator* Evaluator::create(const EvaluatorConfig& config) { - Evaluator* evaluator = nullptr; - if (config.type() == "classification_error") { - evaluator = new ClassificationErrorEvaluator(); - } else if (config.type() == "sum") { - evaluator = new SumEvaluator(); - } else if (config.type() == "last-column-sum") { - evaluator = new ColumnSumEvaluator(-1); - } else if (config.type() == "last-column-auc") { - evaluator = new AucEvaluator(-1); - } else { - evaluator = registrar_.createByType(config.type()); - } + Evaluator* evaluator = registrar_.createByType(config.type()); evaluator->init(config); return evaluator; } + +REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator); +REGISTER_EVALUATOR(sum, SumEvaluator); +static InitFunction __reg_type_auc_sum__([]() { + Evaluator::registrar_.registerClass( + "last-column-sum", [] { return new ColumnSumEvaluator(-1); }); + Evaluator::registrar_.registerClass("last-column-auc", + [] { return new AucEvaluator(-1); }); +}); + /** * @brief print value of each layer. * @@ -888,32 +887,10 @@ Evaluator* Evaluator::create(const EvaluatorConfig& config) { */ class ValuePrinter : public Evaluator { public: - ValuePrinter() {} - virtual void eval(const NeuralNetwork& nn) { for (const std::string& name : config_.input_layers()) { - const Argument& argu = nn.getLayer(name)->getOutput(); - if (argu.value) { - std::ostringstream os; - argu.value->print(os); - LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str(); - } - if (argu.ids) { - std::ostringstream os; - argu.ids->print(os, argu.ids->getSize()); - LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str(); - } - if (auto startPos = argu.sequenceStartPositions) { - std::ostringstream os; - startPos->getVector(false)->print(os, startPos->getSize()); - LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str(); - } - if (auto subStartPos = argu.subSequenceStartPositions) { - std::ostringstream os; - subStartPos->getVector(false)->print(os, subStartPos->getSize()); - LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n" - << os.str(); - } + nn.getLayer(name)->getOutput().printValueString(LOG(INFO), + "layer=" + name + " "); } } @@ -929,8 +906,6 @@ REGISTER_EVALUATOR(value_printer, ValuePrinter); */ class GradientPrinter : public Evaluator { public: - GradientPrinter() {} - virtual void eval(const NeuralNetwork& nn) { for (const std::string& name : config_.input_layers()) { const Argument& argu = nn.getLayer(name)->getOutput(); @@ -939,11 +914,6 @@ public: argu.grad->print(os); LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str(); } - if (auto startPos = argu.sequenceStartPositions) { - std::ostringstream os; - startPos->getVector(false)->print(os, startPos->getSize()); - LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str(); - } } } diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp index 85f52ad5debd035c403c73afc7390904428e28a7..de198af111be4200dd1b240f6de9464e3f43b06d 100644 --- a/paddle/gserver/layers/PrintLayer.cpp +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -19,38 +19,17 @@ namespace paddle { class PrintLayer : public Layer { public: explicit PrintLayer(const LayerConfig& config) : Layer(config) {} - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override {} -}; -void PrintLayer::forward(PassType passType) { - Layer::forward(passType); - for (size_t i = 0; i != inputLayers_.size(); ++i) { - const auto& argu = getInput(i); - const std::string& name = inputLayers_[i]->getName(); - if (argu.value) { - std::ostringstream os; - argu.value->print(os); - LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str(); - } - if (argu.ids) { - std::ostringstream os; - argu.ids->print(os, argu.ids->getSize()); - LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str(); - } - if (auto startPos = argu.sequenceStartPositions) { - std::ostringstream os; - startPos->getVector(false)->print(os, startPos->getSize()); - LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str(); - } - if (auto subStartPos = argu.subSequenceStartPositions) { - std::ostringstream os; - subStartPos->getVector(false)->print(os, subStartPos->getSize()); - LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n" - << os.str(); + void forward(PassType passType) override { + Layer::forward(passType); + for (size_t i = 0; i != inputLayers_.size(); ++i) { + getInput(i).printValueString(LOG(INFO), + "layer=" + inputLayers_[i]->getName() + " "); } } -} + + void backward(const UpdateCallback& callback) override {} +}; REGISTER_LAYER(print, PrintLayer); diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp index 599706eb419ede72dbd6f4c8c74e57f5f9965388..4b24d8f0c852e1bdc887d4ee1465b9ad05d210bb 100644 --- a/paddle/gserver/layers/SequenceConcatLayer.cpp +++ b/paddle/gserver/layers/SequenceConcatLayer.cpp @@ -21,9 +21,11 @@ namespace paddle { /** * A layer for concatenating the first sequence with the second sequence - * following the first - * Input: two sequences each containing some instances + * Input: two sequences each containing the same number of instances + * seq1 = [a1, a2, ..., an] + * seq2 = [b1, b2, ..., bn] * Output: a concatenated sequence of the two input sequences + * out = [a1, b1, a2, b2, ..., an, bn] */ class SequenceConcatLayer : public Layer { diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 65d01a15718ae2bebd4869eff0e5407524bc0e7c..7a343cca33f5b420be6192231ac73ca1c2da5fb9 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -602,6 +602,44 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) { tgtBuf[numSequences] = numSubSequences; } +void Argument::getValueString( + std::unordered_map* out) const { + if (value) { + std::ostringstream os; + value->print(os); + out->insert({"value", os.str()}); + } + if (ids) { + std::ostringstream os; + ids->print(os, ids->getSize()); + out->insert({"ids", os.str()}); + } + if (sequenceStartPositions) { + std::ostringstream os; + sequenceStartPositions->getVector(false)->print( + os, sequenceStartPositions->getSize()); + out->insert({"sequence pos", os.str()}); + } + if (subSequenceStartPositions) { + std::ostringstream os; + subSequenceStartPositions->getVector(false)->print( + os, subSequenceStartPositions->getSize()); + out->insert({"sub-sequence pos", os.str()}); + } +} + +void Argument::printValueString(std::ostream& stream, + const std::string& prefix) const { + std::unordered_map out; + getValueString(&out); + for (auto field : {"value", "id", "sequence pos", "sub-sequence pos"}) { + auto it = out.find(field); + if (it != out.end()) { + stream << prefix << field << ":\n" << it->second; + } + } +} + void Argument::subArgFrom(const Argument& input, size_t offset, size_t height, diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index afd2de0202bf0f14ec3d4c5b856455a3488e41f6..178c068b93ac5fc1e06200984f14da86069cf7e4 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -297,6 +297,23 @@ struct Argument { sequence has sub-sequence degrades to a sequence. */ void degradeSequence(const Argument& input, bool useGpu); + + /** + * @brief getValueString will return the argument's output in string. There + * are several kinds of output. The keys of output dictionary are 'value', + * 'id', 'sequence pos', 'sub-sequence pos'. + * @param out [out]: the return values. + */ + void getValueString(std::unordered_map* out) const; + + /** + * @brief printValueString will print the argument's output in order of + * 'value', 'id', 'sequence pos', 'sub-sequence pos'. + * @param stream: Output stream + * @param prefix: line prefix for printing. + */ + void printValueString(std::ostream& stream, + const std::string& prefix = "") const; }; } // namespace paddle diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ee7a5bff84ca96ef1010fa7430356722f807fb0f..357637e20346f8e1179d3a28ff580722cdfcccff 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -24,6 +24,7 @@ add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) add_subdirectory(paddle/trainer_config_helpers/tests) +add_subdirectory(paddle/reader/tests) install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/ DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..493b410e8299ebe167be43ead1401a6ab245a631 --- /dev/null +++ b/python/paddle/reader/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# It would be too lengthy to require our users to prefix decorators with `decorator`. +# For example, we want the following line +# +# r = paddle.reader.decorator.bufferd(paddle.reader.creator.text("hello.txt")) +# +# to be a shorter version: +# +# r = paddle.reader.buffered(paddle.reader.creator.text("hello.txt")) +from decorator import * diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ddb0ff812b15ede21e6965c7c8857f12716fa0 --- /dev/null +++ b/python/paddle/reader/decorator.py @@ -0,0 +1,60 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['buffered'] + +from Queue import Queue +from threading import Thread + + +def buffered(reader, size): + """Creates a buffered data reader. + + The buffered data reader will read and save data entries into a buffer. + Reading from the buffered data reader will proceed as long as the buffer + is not empty. + + Args: + reader: the data reader to read from. + size: max buffer size. + + Returns: + The buffered data reader. + """ + + class EndSignal(): + pass + + end = EndSignal() + + def read_worker(r, q): + for d in r: + q.put(d) + q.put(end) + + def create_reader(): + r = reader() + q = Queue(maxsize=size) + t = Thread( + target=read_worker, args=( + r, + q, )) + t.daemon = True + t.start() + e = q.get() + while e != end: + yield e + e = q.get() + + return create_reader diff --git a/python/paddle/reader/tests/CMakeLists.txt b/python/paddle/reader/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..502c897d8946a838847c1c23b1236358c58c088e --- /dev/null +++ b/python/paddle/reader/tests/CMakeLists.txt @@ -0,0 +1,4 @@ +add_test(NAME reader_decorator_test + COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ + ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/reader/tests/decorator_test.py + WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle) diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..879d1d9c1d0e0650d347b5c44e36771a0c15390e --- /dev/null +++ b/python/paddle/reader/tests/decorator_test.py @@ -0,0 +1,50 @@ +# Copyright PaddlePaddle contributors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import paddle.reader +import time + + +def reader_10(dur): + for i in range(10): + time.sleep(dur) + yield i + + +class TestBuffered(unittest.TestCase): + def test_read(self): + for size in range(20): + b = paddle.reader.buffered(lambda: reader_10(0), size) + c = 0 + for i in b(): + self.assertEqual(i, c) + c += 1 + self.assertEqual(c, 10) + + def test_buffering(self): + # read have 30ms delay. + b = paddle.reader.buffered(lambda: reader_10(0.03), 10) + last_time = time.time() + for idx, i in enumerate(b()): + elapsed_time = time.time() - last_time + if i == 0: + time.sleep(0.3) + else: + # read time should be short, meaning already buffered. + self.assertLess(elapsed_time, 0.01) + last_time = time.time() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 1fdc4c462363712e8b5b4dee10d0aaa26f4deffa..0ff29772356af7dcb36b71fa2b1263dc4477d0e4 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -59,6 +59,7 @@ __all__ = [ 'img_cmrnorm_layer', 'addto_layer', 'concat_layer', + 'seq_concat_layer', 'lstm_step_layer', 'recurrent_group', 'memory', @@ -144,6 +145,7 @@ class LayerType(object): CONCAT_LAYER = 'concat' CONCAT_PROJ_LAYER = 'concat2' + SEQUENCE_CONCAT_LAYER = 'seqconcat' LSTM_STEP_LAYER = 'lstm_step' GRU_STEP_LAYER = 'gru_step' @@ -2570,6 +2572,59 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): size=sz) +@wrap_name_default("seqconcat") +@wrap_act_default(act=IdentityActivation()) +@wrap_bias_attr_default(has_bias=False) +@layer_support() +def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, + bias_attr=None): + """ + Concat sequence a with sequence b. + + Inputs: + - a = [a1, a2, ..., an] + - b = [b1, b2, ..., bn] + - Note that the length of a and b should be the same. + + Output: [a1, b1, a2, b2, ..., an, bn] + + The example usage is: + + .. code-block:: python + + concat = seq_concat_layer(a=layer1, b=layer2) + + :param name: Layer name. + :type name: basestring + :param a: input sequence layer + :type a: LayerOutput + :param b: input sequence layer + :type b: LayerOutput + :param act: Activation type. + :type act: BaseActivation + :param layer_attr: Extra Layer Attribute. + :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput) + assert a.size == b.size + Layer( + name=name, + type=LayerType.SEQUENCE_CONCAT_LAYER, + inputs=[a.name, b.name], + active_type=act.name, + bias=ParamAttr.to_bias(bias_attr), + **ExtraLayerAttribute.to_kwargs(layer_attr)) + + return LayerOutput( + name, + layer_type=LayerType.SEQUENCE_CONCAT_LAYER, + parents=[a, b], + activation=act, + size=a.size) + + def memory(name, size, is_seq=False, diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index ea46b557a26ce638742facda3eb6aa2feb4b2563..55cef6be0c630c88c11401682449d8aabdafb124 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -4,6 +4,7 @@ test_sequence_pooling test_lstmemory_layer test_grumemory_layer last_first_seq test_expand_layer test_ntm_layers test_hsigmoid img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight -test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops) +test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops +test_seq_concat) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat.protostr new file mode 100644 index 0000000000000000000000000000000000000000..2fa55e87b5d13c2fc6f8a404733c17cf0a21c875 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat.protostr @@ -0,0 +1,39 @@ +type: "nn" +layers { + name: "data1" + type: "data" + size: 30 + active_type: "" +} +layers { + name: "data2" + type: "data" + size: 30 + active_type: "" +} +layers { + name: "__seqconcat_0__" + type: "seqconcat" + size: 30 + active_type: "" + inputs { + input_layer_name: "data1" + } + inputs { + input_layer_name: "data2" + } +} +input_layer_names: "data1" +input_layer_names: "data2" +output_layer_names: "__seqconcat_0__" +sub_models { + name: "root" + layer_names: "data1" + layer_names: "data2" + layer_names: "__seqconcat_0__" + input_layer_names: "data1" + input_layer_names: "data2" + output_layer_names: "__seqconcat_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat.py new file mode 100644 index 0000000000000000000000000000000000000000..193d9d0df9ed80a4bc555095116b42192c7757f0 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat.py @@ -0,0 +1,9 @@ +from paddle.trainer_config_helpers import * + +settings(batch_size=1000, learning_rate=1e-5) + +din1 = data_layer(name='data1', size=30) + +din2 = data_layer(name='data2', size=30) + +outputs(seq_concat_layer(a=din1, b=din2))