Commit 297c6a01 authored by dangqingqing

refine data feeder

......@@ -9,7 +9,6 @@ The user api could be simpler and carefully designed.
import random
import numpy as np
import paddle.trainer.PyDataProvider2 as dp
import paddle.v2 as paddle_v2
import py_paddle.swig_paddle as api
from paddle.trainer_config_helpers import *
......@@ -71,8 +70,10 @@ def main():
assert isinstance(updater, api.ParameterUpdater)
# define network
images = paddle_v2.layer.data(name='pixel', size=784)
label = paddle_v2.layer.data(name='label', size=10)
images = paddle_v2.layer.data(
name='pixel', type=paddle_v2.data_type.dense_vector(784))
label = paddle_v2.layer.data(
name='label', type=paddle_v2.data_type.integer_value(10))
hidden1 = paddle_v2.layer.fc(input=images, size=200)
hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
inference = paddle_v2.layer.fc(input=hidden2,
......@@ -98,8 +99,7 @@ def main():
# DataProviderConverter is a utility that converts Python objects to Paddle's
# C++ input. The input format is the same as Paddle's DataProvider.
converter = DataProviderConverter(
input_types=[dp.dense_vector(784), dp.integer_value(10)])
converter = DataProviderConverter(input_types=[images.type, label.type])
train_file = './data/raw_data/train'
test_file = './data/raw_data/t10k'
......
import numpy
import paddle.v2 as paddle
from paddle.trainer.PyDataProvider2 import dense_vector, integer_value
import mnist_util
......@@ -16,8 +15,10 @@ def main():
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
images = paddle.layer.data(name='pixel', size=784)
label = paddle.layer.data(name='label', size=10)
images = paddle.layer.data(
name='pixel', type=paddle.data_type.dense_vector(784))
label = paddle.layer.data(
name='label', type=paddle.data_type.integer_value(10))
hidden1 = paddle.layer.fc(input=images, size=200)
hidden2 = paddle.layer.fc(input=hidden1, size=200)
inference = paddle.layer.fc(input=hidden2,
......@@ -51,8 +52,8 @@ def main():
batch_size=32, # batch size should be refactored into the data reader
data_types={ # data_types will be removed; it should live in
# the network topology
'pixel': dense_vector(784),
'label': integer_value(10)
'pixel': images.type,
'label': label.type
})
......
......@@ -32,4 +32,6 @@ def process(settings, file_name):
word_slot = [
settings.word_dict[w] for w in words if w in settings.word_dict
]
if not word_slot:
continue
yield word_slot, label
......@@ -138,7 +138,11 @@ def main():
batch = []
for line in sys.stdin:
batch.append([predict.get_index(line)])
words = predict.get_index(line)
if words:
batch.append([words])
else:
print('None of the words in [%s] are in the dictionary.' % line)
if len(batch) == batch_size:
predict.batch_predict(batch)
batch = []
......
......@@ -279,6 +279,12 @@ concat_layer
:members: concat_layer
:noindex:
seq_concat_layer
----------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: seq_concat_layer
:noindex:
Reshaping Layers
================
......
......@@ -107,7 +107,7 @@ We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["imag
### How to create custom data reader
```python
def image_reader(image_path, label_path):
def image_reader(image_path, label_path, n):
f = open(image_path)
l = open(label_path)
images = numpy.fromfile(
......@@ -117,9 +117,10 @@ def image_reader(image_path, label_path):
for i in xrange(n):
yield images[i, :], labels[i] # a single entry of data is created each time
f.close()
l.close()
# use a Python lambda to turn image_reader into a function with no parameters.
reader = lambda : image_reader("/path/to/image_file", "/path/to/label_file")
reader = lambda : image_reader("/path/to/image_file", "/path/to/label_file", 1024)
paddle.train(reader, {"image":0, "label":1}, ...)
```
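The same binding can also be written with `functools.partial` from the standard library; a minimal sketch, using the same placeholder paths and size as above:

```python
import functools

# functools.partial pre-binds image_reader's arguments, producing a
# zero-argument callable equivalent to the lambda above.
reader = functools.partial(image_reader, "/path/to/image_file",
                           "/path/to/label_file", 1024)
paddle.train(reader, {"image": 0, "label": 1}, ...)
```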
......
......@@ -887,32 +887,10 @@ static InitFunction __reg_type_auc_sum__([]() {
*/
class ValuePrinter : public Evaluator {
public:
ValuePrinter() {}
virtual void eval(const NeuralNetwork& nn) {
for (const std::string& name : config_.input_layers()) {
const Argument& argu = nn.getLayer(name)->getOutput();
if (argu.value) {
std::ostringstream os;
argu.value->print(os);
LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
}
if (argu.ids) {
std::ostringstream os;
argu.ids->print(os, argu.ids->getSize());
LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
}
if (auto startPos = argu.sequenceStartPositions) {
std::ostringstream os;
startPos->getVector(false)->print(os, startPos->getSize());
LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
}
if (auto subStartPos = argu.subSequenceStartPositions) {
std::ostringstream os;
subStartPos->getVector(false)->print(os, subStartPos->getSize());
LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
<< os.str();
}
nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
"layer=" + name + " ");
}
}
......@@ -928,8 +906,6 @@ REGISTER_EVALUATOR(value_printer, ValuePrinter);
*/
class GradientPrinter : public Evaluator {
public:
GradientPrinter() {}
virtual void eval(const NeuralNetwork& nn) {
for (const std::string& name : config_.input_layers()) {
const Argument& argu = nn.getLayer(name)->getOutput();
......@@ -938,11 +914,6 @@ public:
argu.grad->print(os);
LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
}
if (auto startPos = argu.sequenceStartPositions) {
std::ostringstream os;
startPos->getVector(false)->print(os, startPos->getSize());
LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
}
}
}
......
......@@ -19,38 +19,17 @@ namespace paddle {
class PrintLayer : public Layer {
public:
explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
void forward(PassType passType) override;
void backward(const UpdateCallback& callback) override {}
};
void PrintLayer::forward(PassType passType) {
Layer::forward(passType);
for (size_t i = 0; i != inputLayers_.size(); ++i) {
const auto& argu = getInput(i);
const std::string& name = inputLayers_[i]->getName();
if (argu.value) {
std::ostringstream os;
argu.value->print(os);
LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
}
if (argu.ids) {
std::ostringstream os;
argu.ids->print(os, argu.ids->getSize());
LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
}
if (auto startPos = argu.sequenceStartPositions) {
std::ostringstream os;
startPos->getVector(false)->print(os, startPos->getSize());
LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
}
if (auto subStartPos = argu.subSequenceStartPositions) {
std::ostringstream os;
subStartPos->getVector(false)->print(os, subStartPos->getSize());
LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
<< os.str();
void forward(PassType passType) override {
Layer::forward(passType);
for (size_t i = 0; i != inputLayers_.size(); ++i) {
getInput(i).printValueString(LOG(INFO),
"layer=" + inputLayers_[i]->getName() + " ");
}
}
}
void backward(const UpdateCallback& callback) override {}
};
REGISTER_LAYER(print, PrintLayer);
......
......@@ -21,9 +21,11 @@ namespace paddle {
/**
* A layer for concatenating the first sequence with the second sequence
* following the first
* Input: two sequences each containing some instances
* Input: two sequences each containing the same number of instances
* seq1 = [a1, a2, ..., an]
* seq2 = [b1, b2, ..., bn]
* Output: a concatenated sequence of the two input sequences
* out = [a1, b1, a2, b2, ..., an, bn]
*/
class SequenceConcatLayer : public Layer {
......
......@@ -602,6 +602,44 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) {
tgtBuf[numSequences] = numSubSequences;
}
void Argument::getValueString(
std::unordered_map<std::string, std::string>* out) const {
if (value) {
std::ostringstream os;
value->print(os);
out->insert({"value", os.str()});
}
if (ids) {
std::ostringstream os;
ids->print(os, ids->getSize());
out->insert({"ids", os.str()});
}
if (sequenceStartPositions) {
std::ostringstream os;
sequenceStartPositions->getVector(false)->print(
os, sequenceStartPositions->getSize());
out->insert({"sequence pos", os.str()});
}
if (subSequenceStartPositions) {
std::ostringstream os;
subSequenceStartPositions->getVector(false)->print(
os, subSequenceStartPositions->getSize());
out->insert({"sub-sequence pos", os.str()});
}
}
void Argument::printValueString(std::ostream& stream,
const std::string& prefix) const {
std::unordered_map<std::string, std::string> out;
getValueString(&out);
for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
auto it = out.find(field);
if (it != out.end()) {
stream << prefix << field << ":\n" << it->second;
}
}
}
void Argument::subArgFrom(const Argument& input,
size_t offset,
size_t height,
......
......@@ -297,6 +297,23 @@ struct Argument {
sequence has sub-sequence degrades to a sequence.
*/
void degradeSequence(const Argument& input, bool useGpu);
/**
* @brief getValueString returns the argument's output as strings. There
* are several kinds of output; the keys of the output dictionary are
* 'value', 'ids', 'sequence pos' and 'sub-sequence pos'.
* @param out [out]: the return values.
*/
void getValueString(std::unordered_map<std::string, std::string>* out) const;
/**
* @brief printValueString prints the argument's output in the order
* 'value', 'ids', 'sequence pos', 'sub-sequence pos'.
* @param stream: Output stream
* @param prefix: line prefix for printing.
*/
void printValueString(std::ostream& stream,
const std::string& prefix = "") const;
};
} // namespace paddle
......@@ -23,7 +23,8 @@ __all__ = ['DataProviderConverter']
class IScanner(object):
def __init__(self, input_type, pos):
self.input_type = input_type
assert isinstance(self.input_type, dp2.InputType)
if not isinstance(self.input_type, dp2.InputType):
raise ValueError("input_type should be a PyDataProvider2.InputType")
self.pos = pos
def scan(self, dat):
......@@ -50,7 +51,6 @@ class DenseScanner(IScanner):
def finish_scan(self, argument):
assert isinstance(argument, swig_paddle.Arguments)
assert isinstance(self.input_type, dp2.InputType)
if self.__mat__.dtype != numpy.float32:
self.__mat__ = self.__mat__.astype(numpy.float32)
m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
......@@ -63,7 +63,6 @@ class SparseBinaryScanner(IScanner):
self.__rows__ = [0]
self.__cols__ = []
self.__height__ = 0
self.__nnz__ = 0
self.__value__ = []
def scan(self, dat):
......@@ -76,7 +75,6 @@ class SparseBinaryScanner(IScanner):
def finish_scan(self, argument):
assert isinstance(argument, swig_paddle.Arguments)
assert isinstance(self.input_type, dp2.InputType)
m = swig_paddle.Matrix.createSparse(self.__height__,
self.input_type.dim,
len(self.__cols__),
......
......@@ -24,6 +24,7 @@ add_custom_target(paddle_python ALL DEPENDS
${OUTPUT_DIR}/.timestamp)
add_subdirectory(paddle/trainer_config_helpers/tests)
add_subdirectory(paddle/reader/tests)
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
DESTINATION opt/paddle/share/wheels
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# It would be too lengthy to require our users to prefix decorators with `decorator`.
# For example, we want the following line
#
# r = paddle.reader.decorator.buffered(paddle.reader.creator.text("hello.txt"))
#
# to be a shorter version:
#
# r = paddle.reader.buffered(paddle.reader.creator.text("hello.txt"))
from decorator import *
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['buffered']
from Queue import Queue
from threading import Thread
def buffered(reader, size):
"""Creates a buffered data reader.
The buffered data reader will read and save data entries into a buffer.
Reading from the buffered data reader will proceed as long as the buffer
is not empty.
Args:
reader: the data reader to read from.
size: max buffer size.
Returns:
The buffered data reader.
"""
class EndSignal():
pass
end = EndSignal()
def read_worker(r, q):
for d in r:
q.put(d)
q.put(end)
def create_reader():
r = reader()
q = Queue(maxsize=size)
t = Thread(
target=read_worker, args=(
r,
q, ))
t.daemon = True
t.start()
e = q.get()
while e != end:
yield e
e = q.get()
return create_reader
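# Usage sketch (illustrative only, not part of this module). `ten_ints` is a
# hypothetical reader creator; `buffered` takes a callable that returns a
# fresh iterator and itself returns such a callable:
#
#     def ten_ints():
#         for i in range(10):
#             yield i
#
#     b = buffered(ten_ints, size=5)
#     for item in b():  # entries are pre-fetched by the worker thread
#         print item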
add_test(NAME reader_decorator_test
COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/reader/tests/decorator_test.py
WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
# Copyright PaddlePaddle contributors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.reader
import time
def reader_10(dur):
for i in range(10):
time.sleep(dur)
yield i
class TestBuffered(unittest.TestCase):
def test_read(self):
for size in range(20):
b = paddle.reader.buffered(lambda: reader_10(0), size)
c = 0
for i in b():
self.assertEqual(i, c)
c += 1
self.assertEqual(c, 10)
def test_buffering(self):
# each read has a 30ms delay.
b = paddle.reader.buffered(lambda: reader_10(0.03), 10)
last_time = time.time()
for idx, i in enumerate(b()):
elapsed_time = time.time() - last_time
if i == 0:
time.sleep(0.3)
else:
# reads should return quickly, meaning the entries were already buffered.
self.assertLess(elapsed_time, 0.01)
last_time = time.time()
if __name__ == '__main__':
unittest.main()
......@@ -59,6 +59,7 @@ __all__ = [
'img_cmrnorm_layer',
'addto_layer',
'concat_layer',
'seq_concat_layer',
'lstm_step_layer',
'recurrent_group',
'memory',
......@@ -144,6 +145,7 @@ class LayerType(object):
CONCAT_LAYER = 'concat'
CONCAT_PROJ_LAYER = 'concat2'
SEQUENCE_CONCAT_LAYER = 'seqconcat'
LSTM_STEP_LAYER = 'lstm_step'
GRU_STEP_LAYER = 'gru_step'
......@@ -2570,6 +2572,59 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
size=sz)
@wrap_name_default("seqconcat")
@wrap_act_default(act=IdentityActivation())
@wrap_bias_attr_default(has_bias=False)
@layer_support()
def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
bias_attr=None):
"""
Concatenate sequence a with sequence b.
Inputs:
- a = [a1, a2, ..., an]
- b = [b1, b2, ..., bn]
- Note that the length of a and b should be the same.
Output: [a1, b1, a2, b2, ..., an, bn]
The example usage is:
.. code-block:: python
concat = seq_concat_layer(a=layer1, b=layer2)
:param name: Layer name.
:type name: basestring
:param a: input sequence layer
:type a: LayerOutput
:param b: input sequence layer
:type b: LayerOutput
:param act: Activation type.
:type act: BaseActivation
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
assert a.size == b.size
Layer(
name=name,
type=LayerType.SEQUENCE_CONCAT_LAYER,
inputs=[a.name, b.name],
active_type=act.name,
bias=ParamAttr.to_bias(bias_attr),
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name,
layer_type=LayerType.SEQUENCE_CONCAT_LAYER,
parents=[a, b],
activation=act,
size=a.size)
def memory(name,
size,
is_seq=False,
......
......@@ -4,6 +4,7 @@ test_sequence_pooling test_lstmemory_layer test_grumemory_layer
last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
test_seq_concat)
export whole_configs=(test_split_datasource)
type: "nn"
layers {
name: "data1"
type: "data"
size: 30
active_type: ""
}
layers {
name: "data2"
type: "data"
size: 30
active_type: ""
}
layers {
name: "__seqconcat_0__"
type: "seqconcat"
size: 30
active_type: ""
inputs {
input_layer_name: "data1"
}
inputs {
input_layer_name: "data2"
}
}
input_layer_names: "data1"
input_layer_names: "data2"
output_layer_names: "__seqconcat_0__"
sub_models {
name: "root"
layer_names: "data1"
layer_names: "data2"
layer_names: "__seqconcat_0__"
input_layer_names: "data1"
input_layer_names: "data2"
output_layer_names: "__seqconcat_0__"
is_recurrent_layer_group: false
}
from paddle.trainer_config_helpers import *
settings(batch_size=1000, learning_rate=1e-5)
din1 = data_layer(name='data1', size=30)
din2 = data_layer(name='data2', size=30)
outputs(seq_concat_layer(a=din1, b=din2))
......@@ -17,11 +17,19 @@ import activation
import parameters
import trainer
import event
import data_type
import py_paddle.swig_paddle as api
__all__ = [
'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
'event', 'data_converter'
'optimizer',
'layer',
'activation',
'parameters',
'init',
'trainer',
'event',
'data_type',
'data_feeder',
]
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import py_paddle.swig_paddle as api
import numpy as np
import paddle.trainer.PyDataProvider2 as dp2
__all__ = ['DataConverter']
class IDataConverter(object):
def __init__(self, input_type, pos):
"""
:param input_type: data type
:type input_type: dp2.InputType
:param pos: which input, start from 0
:type pos: int
"""
self.input_type = input_type
assert isinstance(self.input_type, dp2.InputType)
self.pos = pos
def convert(self, data, argument):
"""
Convert data to Paddle format.
:param data: input data
:param argument: paddle format
"""
pass
class DenseConvert(IDataConverter):
def __init__(self, input_type, pos):
IDataConverter.__init__(self, input_type, pos)
def convert(self, data, argument):
"""
:param data: input data
:type data: list | numpy array
:param argument: the Paddle Arguments object to fill
:type argument: api.Arguments
"""
assert isinstance(argument, api.Arguments)
# TODO: handle data type (float, double, ...)
data = np.array(data, np.float32)
m = api.Matrix.createDenseFromNumpy(data)
argument.setSlotValue(self.pos, m)
class SparseBinaryConvert(IDataConverter):
def __init__(self, input_type, pos):
IDataConverter.__init__(self, input_type, pos)
self.__rows__ = [0]
self.__cols__ = []
self.__height__ = 0
self.__nnz__ = 0
self.__value__ = []
def fill_csr(self, data):
self.__height__ = len(data)
for x in data:
self.__rows__.append(self.__rows__[-1] + len(x))
self.__cols__.extend(x)
def convert(self, data, argument):
assert isinstance(argument, api.Arguments)
self.fill_csr(data)
m = api.Matrix.createSparse(self.__height__, self.input_type.dim,
len(self.__cols__),
len(self.__value__) == 0)
assert isinstance(m, api.Matrix)
m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
argument.setSlotValue(self.pos, m)
class SparseFloatConvert(SparseBinaryConvert):
def __init__(self, input_type, pos):
SparseBinaryConvert.__init__(self, input_type, pos)
def fill_csr(self, data):
self.__height__ = len(data)
for x in data:
self.__rows__.append(self.__rows__[-1] + len(x))
self.__cols__.extend(x[0])
self.__value__.extend(x[1])
class IndexConvert(IDataConverter):
def __init__(self, input_type, pos):
IDataConverter.__init__(self, input_type, pos)
self.__ids__ = []
def convert(self, data, argument):
assert isinstance(argument, api.Arguments)
self.__ids__.extend(data)
ids = api.IVector.create(self.__ids__)
argument.setSlotIds(self.pos, ids)
class SequenceConvert(IDataConverter):
def __init__(self, input_type, pos, inner_convert, setter):
"""
:param input_type: the type of input data
:type input_type: dp2.InputType
:param pos: the position of this input
:type pos: int
:param inner_convert: DataConvert type
:type inner_convert: DenseConvert|SparseBinaryConvert|
SparseFloatConvert|IndexConvert
:param setter: callback used to attach the sequence start positions
to the argument
:type setter: callable(argument, pos, seq)
"""
IDataConverter.__init__(self, input_type, pos)
self.__seq__ = [0]
self.__inner_convert__ = inner_convert
self.__setter__ = setter
def fill_seq(self, data):
for each in data:
self.__seq__.append(self.__seq__[-1] + self.get_size(each))
def convert(self, data, argument):
self.fill_seq(data)
seq = api.IVector.create(self.__seq__, False)
self.__setter__(argument, self.pos, seq)
self.__inner_convert__.convert(list(data), argument)
def get_size(self, data):
if isinstance(self.__inner_convert__, SequenceConvert):
return sum(self.__inner_convert__.get_size(item) for item in data)
else:
return len(data)
class DataConverter(object):
def __init__(self, input):
"""
Usage:
.. code-block:: python
inputs = [('image', dense_vector(784)), ('label', integer_value(10))]
cvt = DataConverter(inputs)
arg = cvt.convert(minibatch_data, {'image': 0, 'label': 1})
:param input: list of (input_name, input_type) pairs
:type input: list
"""
self.input_names = []
self.input_types = []
for each in input:
self.input_names.append(each[0])
self.input_types.append(each[1])
assert isinstance(each[1], dp2.InputType)
def convert(self, data, input_dict=None, argument=None):
"""
Convert minibatch data to Paddle's argument. The data is numpy array
or list.
:param data: input samples, for example, [column0, column1, ...] or
(column0, column1, ...); each column is one minibatch of one
feature. Note that even if there is only one feature column,
data should still be a list or tuple: [column0] or (column0).
:type data: list|tuple
:param input_dict: a dictionary to specify the correspondence
of data_layer and input data. If None,
the feature order in argument and data is the same.
:type input_dict: dict, like {string: integer, ...}|None
:param argument: the converted data will be stored in this argument. If
None, a new Paddle Arguments object is created first.
:type argument: swig_paddle.Arguments|None
"""
if argument is None:
argument = api.Arguments.createArguments(0)
assert isinstance(argument, api.Arguments)
argument.resize(len(self.input_types))
converts = [
DataConverter.create_converter(i, each_type)
for i, each_type in enumerate(self.input_types)
]
for i, cvt in enumerate(converts):
if input_dict is not None:
dat = data[input_dict[self.input_names[i]]]
else:
dat = data[i]
cvt.convert(dat, argument)
return argument
def __call__(self, dat, argument=None):
return self.convert(dat, argument=argument)
@staticmethod
def create_converter(pos, each):
assert isinstance(each, dp2.InputType)
retv = None
if each.type == dp2.DataType.Dense:
retv = DenseConvert(each, pos)
elif each.type == dp2.DataType.Index:
retv = IndexConvert(each, pos)
elif each.type == dp2.DataType.SparseNonValue:
retv = SparseBinaryConvert(each, pos)
elif each.type == dp2.DataType.SparseValue:
retv = SparseFloatConvert(each, pos)
assert retv is not None
if each.seq_type == dp2.SequenceType.SUB_SEQUENCE:
retv = SequenceConvert(
each, pos, retv,
lambda arg, pos, seq: arg.setSlotSubSequenceStartPositions(pos, seq)
)
if each.seq_type in [
dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE
]:
retv = SequenceConvert(
each, pos, retv,
lambda arg, pos, seq: arg.setSlotSequenceStartPositions(pos, seq)
)
return retv
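# End-to-end sketch (hypothetical data, mirroring the unit test below):
#
#     import numpy as np
#     cvt = DataConverter([('image', dp2.dense_vector(784)),
#                          ('label', dp2.integer_value(10))])
#     arg = cvt.convert([np.random.random((32, 784)),
#                        np.random.randint(10, size=32)],
#                       {'image': 0, 'label': 1})
#     mat = arg.getSlotValue(0).copyToNumpyMat()  # dense slot back as numpy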
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import py_paddle.swig_paddle as api
import numpy as np
import paddle.trainer.PyDataProvider2 as dp2
from paddle.v2.data_converter import DataConverter
class DataConverterTest(unittest.TestCase):
def dense_reader(self, shape):
data = np.random.random(shape)
return data
def sparse_binary_reader(self,
high,
size_limit,
batch_size,
non_empty=False):
data = []
for i in xrange(batch_size):
num = np.random.randint(size_limit) # num could be 0
while non_empty and num == 0:
num = np.random.randint(size_limit)
data.append(np.random.randint(high, size=num).tolist())
return data
def test_dense_vector(self):
def compare(input):
converter = DataConverter([('image', dp2.dense_vector(784))])
arg = converter([input], {'image': 0})
output = arg.getSlotValue(0).copyToNumpyMat()
input = np.array(input, dtype='float32')
self.assertTrue(np.allclose(input, output))
# test numpy array
data = self.dense_reader(shape=[32, 784])
compare(data)
# test list
compare(data.tolist())
#def test_sparse_binary(self):
# dim = 100000
# data = self.sparse_binary_reader(dim, 5, 2)
# converter = DataConverter([('input', dp2.sparse_binary_vector(dim))])
# arg = converter([data], {'input':0})
# output = arg.getSlotValue(0)
#def test_sparse(self):
# dim = 100000
# v = self.sparse_binary_reader(dim, 5, 2)
# w = []
# for dat in data:
# x = self.dense_reader(shape=[1, len(dat)])
# w.append(x.tolist())
# data = []
# for each in zip(v, w):
# data.append(zip(each[0], each[1]))
#
# converter = DataConverter([('input', dp2.sparse_binary_vector(dim))])
# arg = converter([data], {'input':0})
# output = arg.getSlotValue(0)
def test_integer(self):
dim = 100
index = np.random.randint(dim, size=32)
print index
converter = DataConverter([('input', dp2.integer_value(dim))])
arg = converter([index], {'input': 0})
print arg.getSlotValue(0)
output = arg.getSlotValue(0).copyToNumpyArray()
print 'output=', output
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from py_paddle import DataProviderConverter
__all__ = ['DataFeeder']
DataFeeder = DataProviderConverter
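# Usage sketch, inferred from the trainer change in this commit: the feeder
# is built from the topology's input types and called on a minibatch to
# produce a swig Arguments object.
#
#     feeder = DataFeeder(input_types=[dense_vector(784), integer_value(10)])
#     arg = feeder(data_batch)  # passed on to gm.forwardBackward(...)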
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import \
InputType, dense_vector, sparse_binary_vector,\
sparse_vector, integer_value
__all__ = [
'InputType', 'dense_vector', 'sparse_binary_vector', 'sparse_vector',
'integer_value'
]
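# Usage sketch (mirrors the mnist example in this commit): attach an input
# type to each data layer; feeders can later reuse `layer.type` directly.
#
#     images = paddle.layer.data(
#         name='pixel', type=paddle.data_type.dense_vector(784))
#     label = paddle.layer.data(
#         name='label', type=paddle.data_type.integer_value(10))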
......@@ -67,6 +67,7 @@ paddle.v2.parameters.create, no longer exposed to users.
"""
import paddle.trainer_config_helpers as conf_helps
from . import data_type as v2_data
from paddle.trainer_config_helpers.config_parser_utils import \
parse_network_config as __parse__
from paddle.trainer_config_helpers.default_decorators import wrap_name_default
......@@ -157,7 +158,33 @@ def __convert_to_v2__(method_name, name_prefix, parent_names):
return V2LayerImpl
data = __convert_to_v2__('data_layer', None, [])
"""
Some layers need special configuration and cannot be converted with
__convert_to_v2__, so we implement them as dedicated LayerV2 classes.
"""
class DataLayerV2(Layer):
def __init__(self, name, type, **kwargs):
assert isinstance(type, v2_data.InputType)
self.type = type
self.__method_name__ = 'data_layer'
self.__kwargs__ = kwargs
super(DataLayerV2, self).__init__(name=name, parent_layers=dict())
def to_proto_impl(self, **kwargs):
args = dict()
args['size'] = self.type.dim
for each in kwargs:
args[each] = kwargs[each]
for each in self.__kwargs__:
args[each] = self.__kwargs__[each]
return getattr(conf_helps, self.__method_name__)(name=self.name, **args)
data = DataLayerV2
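# Illustrative mapping (follows to_proto_impl above): the declared InputType
# supplies the layer's size, so
#
#     data(name='pixel', type=v2_data.dense_vector(784))
#
# resolves to conf_helps.data_layer(name='pixel', size=784).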
fc = __convert_to_v2__('fc_layer', name_prefix='fc', parent_names=['input'])
max_id = __convert_to_v2__(
'maxid_layer', name_prefix='maxid_layer', parent_names=['input'])
......@@ -171,8 +198,8 @@ cross_entropy_cost = __convert_to_v2__(
parent_names=['input', 'label'])
if __name__ == '__main__':
pixel = data(name='pixel', size=784)
label = data(name='label', size=10)
pixel = data(name='pixel', type=v2_data.dense_vector(784))
label = data(name='label', type=v2_data.integer_value(10))
hidden = fc(input=pixel, size=100, act=conf_helps.SigmoidActivation())
inference = fc(input=hidden, size=10, act=conf_helps.SoftmaxActivation())
maxid = max_id(input=inference)
......
......@@ -2,7 +2,7 @@ import collections
import py_paddle.swig_paddle as api
from paddle.proto.ModelConfig_pb2 import ModelConfig
from py_paddle import DataProviderConverter
from data_feeder import DataFeeder
from . import event as v2_event
from . import layer as v2_layer
......@@ -89,6 +89,7 @@ class SGD(ITrainer):
event_handler = default_event_handler
topology = v2_layer.parse_network(topology)
print topology
__check_train_args__(**locals())
......@@ -109,7 +110,7 @@ class SGD(ITrainer):
raise ValueError()
data_types_lists.append(data_types[each])
converter = DataProviderConverter(input_types=data_types_lists)
feeder = DataFeeder(input_types=data_types_lists)
for pass_id in xrange(num_passes):
updater.startPass()
......@@ -117,7 +118,7 @@ class SGD(ITrainer):
__data_reader_to_batch__(train_data_reader, batch_size,
topology)):
pass_type = updater.startBatch(len(data_batch))
gm.forwardBackward(converter(data_batch), out_args, pass_type)
gm.forwardBackward(feeder(data_batch), out_args, pass_type)
for each_param in gm.getParameters():
updater.update(each_param)
# Get cost. We use numpy to calculate total cost for this batch.
......