From 2a3a1e9a93258a67c8491361d9d83e3181723a3a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 4 Dec 2017 10:56:53 +0800 Subject: [PATCH] Add DataFeeder (#6102) * Add DataFeeder A v2 API like data feeder for book demos. We can feed data directly from reader. * Fix CI * Remove batch_size_dim for feeder Also add __all__ to data_feeder.py * Follow comment --- python/paddle/v2/fluid/__init__.py | 5 +- python/paddle/v2/fluid/data_feeder.py | 98 +++++++++++++++++++ .../v2/fluid/tests/book/test_fit_a_line.py | 7 +- .../book/test_image_classification_train.py | 13 +-- .../tests/book/test_label_semantic_roles.py | 60 +++++------- .../tests/book/test_recognize_digits_conv.py | 10 +- .../tests/book/test_recognize_digits_mlp.py | 28 +----- .../book/test_understand_sentiment_conv.py | 28 ++---- .../test_understand_sentiment_dynamic_lstm.py | 28 +++--- .../v2/fluid/tests/book/test_word2vec.py | 15 +-- .../paddle/v2/fluid/tests/test_data_feeder.py | 13 +++ 11 files changed, 177 insertions(+), 128 deletions(-) create mode 100644 python/paddle/v2/fluid/data_feeder.py create mode 100644 python/paddle/v2/fluid/tests/test_data_feeder.py diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index dd25bc19ec..59986c9f0c 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -14,20 +14,21 @@ import optimizer import backward import regularizer from param_attr import ParamAttr - +from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, GPUPlace Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr', + 'DataFeeder' ] def __read_gflags_from_env__(): """ Enable reading gflags from environment variables. 
- + Returns: None """ diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py new file mode 100644 index 0000000000..3dee0b5b73 --- /dev/null +++ b/python/paddle/v2/fluid/data_feeder.py @@ -0,0 +1,98 @@ +from __future__ import print_function + +import core +import numpy +import six.moves as six + +from framework import Variable + +__all__ = ['DataFeeder'] + + +class DataToLoDTensorConverter(object): + def __init__(self, place, lod_level, shape, dtype): + self.place = place + self.lod_level = lod_level + self.shape = shape + if dtype == core.DataType.FP32: + self.dtype = 'float32' + elif dtype == core.DataType.INT64: + self.dtype = 'int64' + elif dtype == core.DataType.FP64: + self.dtype = 'float64' + elif dtype == core.DataType.INT32: + self.dtype = 'int32' + else: + raise ValueError("dtype must be any of [int32, float32, int64, " + "float64]") + + self.data = [] + self.lod = [] + + for i in six.range(lod_level): + self.lod.append([0]) + + def feed(self, data): + self._feed_impl_(data, self.lod, self.lod_level) + + def _feed_impl_(self, data, lod, lod_level): + if lod_level == 0: + self.data.append(data) + else: + cur_lod_len = len(data) + lod[-1].append(lod[-1][-1] + cur_lod_len) + for each_data in data: + self._feed_impl_(each_data, lod[:-1], lod_level - 1) + + def done(self): + arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape) + t = core.LoDTensor() + t.set(arr, self.place) + if self.lod_level > 0: + t.set_lod(self.lod) + return t + + +class DataFeeder(object): + def __init__(self, feed_list, place): + self.feed_dtypes = [] + self.feed_names = [] + self.feed_shapes = [] + self.feed_lod_level = [] + for each_var in feed_list: + if not isinstance(each_var, Variable): + raise TypeError("Feed list should contain a list of variable") + self.feed_dtypes.append(each_var.dtype) + self.feed_names.append(each_var.name) + shape = each_var.shape + batch_size_dim = -1 + for i, s in enumerate(shape): + if s < 0: + 
batch_size_dim = i + break + if batch_size_dim == -1: + raise ValueError("Variable {0} must has a batch size dimension", + each_var.name) + self.feed_lod_level.append(each_var.lod_level) + self.feed_shapes.append(shape) + + self.place = place + + def feed(self, iterable): + converter = [] + for lod_level, shape, dtype in six.zip( + self.feed_lod_level, self.feed_shapes, self.feed_dtypes): + converter.append( + DataToLoDTensorConverter( + place=self.place, + lod_level=lod_level, + shape=shape, + dtype=dtype)) + + for each_sample in iterable: + for each_converter, each_slot in six.zip(converter, each_sample): + each_converter.feed(each_slot) + ret_dict = {} + for each_name, each_converter in six.zip(self.feed_names, converter): + ret_dict[each_name] = each_converter.done() + return ret_dict diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index 9f98493adb..fbf46ac6cb 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -22,6 +22,7 @@ train_reader = paddle.batch( batch_size=BATCH_SIZE) place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -31,12 +32,8 @@ for pass_id in range(PASS_NUM): fluid.io.save_persistables(exe, "./fit_a_line.model/") fluid.io.load_persistables(exe, "./fit_a_line.model/") for data in train_reader(): - x_data = np.array(map(lambda _: _[0], data)).astype("float32") - y_data = np.array(map(lambda _: _[1], data)).astype("float32") - avg_loss_value, = exe.run(fluid.default_main_program(), - feed={'x': x_data, - 'y': y_data}, + feed=feeder.feed(data), fetch_list=[avg_cost]) if avg_loss_value[0] < 10.0: diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py index 0f0cc5b540..4e71b6f345 100644 --- 
a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py @@ -113,23 +113,14 @@ train_reader = paddle.batch( place = fluid.CPUPlace() exe = fluid.Executor(place) - +feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): - img_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - batch_size = 1 - for i in y_data.shape: - batch_size = batch_size * i - y_data = y_data.reshape([batch_size, 1]) - loss, acc = exe.run(fluid.default_main_program(), - feed={"pixel": img_data, - "label": y_data}, + feed=feeder.feed(data), fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index bcd6f4d6bc..0494c7cdca 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -28,17 +28,9 @@ def load_parameter(file_name, h, w): return np.fromfile(f, dtype=np.float32).reshape(h, w) -def db_lstm(): +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + **ignored): # 8 features - word = fluid.layers.data(name='word_data', shape=[1], dtype='int64') - predicate = fluid.layers.data(name='verb_data', shape=[1], dtype='int64') - ctx_n2 = fluid.layers.data(name='ctx_n2_data', shape=[1], dtype='int64') - ctx_n1 = fluid.layers.data(name='ctx_n1_data', shape=[1], dtype='int64') - ctx_0 = fluid.layers.data(name='ctx_0_data', shape=[1], dtype='int64') - ctx_p1 = fluid.layers.data(name='ctx_p1_data', shape=[1], dtype='int64') - ctx_p2 = 
fluid.layers.data(name='ctx_p2_data', shape=[1], dtype='int64') - mark = fluid.layers.data(name='mark_data', shape=[1], dtype='int64') - predicate_embedding = fluid.layers.embedding( input=predicate, size=[pred_len, word_dim], @@ -120,8 +112,25 @@ def to_lodtensor(data, place): def main(): # define network topology - feature_out = db_lstm() - target = fluid.layers.data(name='target', shape=[1], dtype='int64') + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, @@ -139,6 +148,11 @@ def main(): paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CPUPlace() + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target + ], + place=place) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -150,28 +164,8 @@ def main(): batch_id = 0 for pass_id in xrange(PASS_NUM): for data in train_data(): - word_data = to_lodtensor(map(lambda x: x[0], data), place) - ctx_n2_data = to_lodtensor(map(lambda x: x[1], data), place) - ctx_n1_data = to_lodtensor(map(lambda x: x[2], data), place) - ctx_0_data = to_lodtensor(map(lambda x: x[3], 
data), place) - ctx_p1_data = to_lodtensor(map(lambda x: x[4], data), place) - ctx_p2_data = to_lodtensor(map(lambda x: x[5], data), place) - verb_data = to_lodtensor(map(lambda x: x[6], data), place) - mark_data = to_lodtensor(map(lambda x: x[7], data), place) - target = to_lodtensor(map(lambda x: x[8], data), place) - outs = exe.run(fluid.default_main_program(), - feed={ - 'word_data': word_data, - 'ctx_n2_data': ctx_n2_data, - 'ctx_n1_data': ctx_n1_data, - 'ctx_0_data': ctx_0_data, - 'ctx_p1_data': ctx_p1_data, - 'ctx_p2_data': ctx_p2_data, - 'verb_data': verb_data, - 'mark_data': mark_data, - 'target': target - }, + feed=feeder.feed(data), fetch_list=[avg_cost]) avg_cost_val = np.array(outs[0]) diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py index ba686b56f8..35bf8da924 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py @@ -37,20 +37,14 @@ train_reader = paddle.batch( place = fluid.CPUPlace() exe = fluid.Executor(place) - +feeder = fluid.DataFeeder(feed_list=[images, label], place=place) exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): - img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([BATCH_SIZE, 1]) - loss, acc = exe.run(fluid.default_main_program(), - feed={"pixel": img_data, - "label": y_data}, + feed=feeder.feed(data), fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" + diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py index fa18965aac..4dc2c50e1c 100644 --- 
a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py @@ -48,40 +48,22 @@ test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128) place = fluid.CPUPlace() exe = fluid.Executor(place) - +feeder = fluid.DataFeeder(feed_list=[image, label], place=place) exe.run(fluid.default_startup_program()) PASS_NUM = 100 for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = np.expand_dims(y_data, axis=1) - - tensor_x = fluid.LoDTensor() - tensor_x.set(x_data, place) - - tensor_y = fluid.LoDTensor() - tensor_y.set(y_data, place) - - outs = exe.run(fluid.default_main_program(), - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_cost] + accuracy.metrics) - out = np.array(outs[0]) - acc = np.array(outs[1]) + out, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) test_accuracy.reset(exe) for data in test_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = np.expand_dims(y_data, axis=1) - out, acc = exe.run(inference_program, - feed={'x': x_data, - 'y': y_data}, + feed=feeder.feed(data), fetch_list=[avg_cost] + test_accuracy.metrics) test_pass_acc = test_accuracy.eval(exe) diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py index be875a952b..f103358edc 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py @@ -4,10 +4,8 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid -def convolution_net(input_dim, class_dim=2, emb_dim=32, 
hid_dim=32): - data = fluid.layers.data(name="words", shape=[1], dtype="int64") - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - +def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, + hid_dim=32): emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) conv_3 = fluid.nets.sequence_conv_pool( input=emb, @@ -55,8 +53,11 @@ def main(): dict_dim = len(word_dict) class_dim = 2 + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost, accuracy, acc_out = convolution_net( - input_dim=dict_dim, class_dim=class_dim) + data, label, input_dim=dict_dim, class_dim=class_dim) train_data = paddle.batch( paddle.reader.shuffle( @@ -64,25 +65,16 @@ def main(): batch_size=BATCH_SIZE) place = fluid.CPUPlace() exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) exe.run(fluid.default_startup_program()) for pass_id in xrange(PASS_NUM): accuracy.reset(exe) for data in train_data(): - tensor_words = to_lodtensor(map(lambda x: x[0], data), place) - - label = np.array(map(lambda x: x[1], data)).astype("int64") - label = label.reshape([BATCH_SIZE, 1]) - - tensor_label = fluid.LoDTensor() - tensor_label.set(label, place) - - cost_val, acc_val = exe.run( - fluid.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc_out]) + cost_val, acc_val = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py index 094a3cdcda..cd28f04b85 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py +++ 
b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py @@ -3,14 +3,14 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid -def stacked_lstm_net(input_dim, +def stacked_lstm_net(data, + label, + input_dim, class_dim=2, emb_dim=128, hid_dim=512, stacked_num=3): assert stacked_num % 2 == 1 - data = fluid.layers.data(name="words", shape=[1], dtype="int64") - label = fluid.layers.data(name="label", shape=[1], dtype="int64") emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) # add bias attr @@ -65,8 +65,11 @@ def main(): dict_dim = len(word_dict) class_dim = 2 + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost, accuracy, acc_out = stacked_lstm_net( - input_dim=dict_dim, class_dim=class_dim) + data, label, input_dim=dict_dim, class_dim=class_dim) train_data = paddle.batch( paddle.reader.shuffle( @@ -74,25 +77,16 @@ def main(): batch_size=BATCH_SIZE) place = fluid.CPUPlace() exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) exe.run(fluid.default_startup_program()) for pass_id in xrange(PASS_NUM): accuracy.reset(exe) for data in train_data(): - tensor_words = to_lodtensor(map(lambda x: x[0], data), place) - - label = np.array(map(lambda x: x[1], data)).astype("int64") - label = label.reshape([BATCH_SIZE, 1]) - - tensor_label = fluid.LoDTensor() - tensor_label.set(label, place) - - cost_val, acc_val = exe.run( - fluid.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc_out]) + cost_val, acc_val = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index 
1b441e15c7..8b928ff9ee 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -57,23 +57,16 @@ train_reader = paddle.batch( place = fluid.CPUPlace() exe = fluid.Executor(place) +feeder = fluid.DataFeeder( + feed_list=[first_word, second_word, third_word, forth_word, next_word], + place=place) exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): for data in train_reader(): - input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)] - input_data = map(lambda x: np.array(x).astype("int64"), input_data) - input_data = map(lambda x: np.expand_dims(x, axis=1), input_data) - avg_cost_np = exe.run(fluid.default_main_program(), - feed={ - 'firstw': input_data[0], - 'secondw': input_data[1], - 'thirdw': input_data[2], - 'forthw': input_data[3], - 'nextw': input_data[4] - }, + feed=feeder.feed(data), fetch_list=[avg_cost]) if avg_cost_np[0] < 5.0: exit(0) # if avg cost less than 10.0, we think our code is good. diff --git a/python/paddle/v2/fluid/tests/test_data_feeder.py b/python/paddle/v2/fluid/tests/test_data_feeder.py new file mode 100644 index 0000000000..4549693203 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_data_feeder.py @@ -0,0 +1,13 @@ +import paddle.v2.fluid as fluid + + +def test_converter(): + img = fluid.layers.data(name='image', shape=[1, 28, 28]) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + feeder = fluid.DataFeeder([img, label], fluid.CPUPlace()) + result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]]) + print(result) + + +if __name__ == '__main__': + test_converter() -- GitLab