未验证 提交 70ff9038 编写于 作者: L liuwei1031 提交者: GitHub

improve the doc of data feeder related APIs (#20515)

* improve data feeder related API
上级 057bce4d
...@@ -1100,11 +1100,11 @@ paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', '7b5bf ...@@ -1100,11 +1100,11 @@ paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', '7b5bf
paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', 'ea029ec9e0dea75f136211c433154f25')) paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', 'ea029ec9e0dea75f136211c433154f25'))
paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.DataFeeder ('paddle.fluid.data_feeder.DataFeeder', ('document', 'd9e64be617bd5f49dbb08ac2bc8665e6')) paddle.fluid.DataFeeder ('paddle.fluid.data_feeder.DataFeeder', ('document', '9e83e9c52fe5b234df4e29d07f382995'))
paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'a0ed5ce816b5d603cb595aacb922335a')) paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '982feeee2611898d312fdf12580409d7'))
paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', 'ce65fe1d81dcd7067d5092a5667f35cc')) paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '69ee4aeeb5cd8c8e5922560457d318ba'))
paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '334c6af750941a4397a2dd2ea8a4d76f')) paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '19fe07f2e40f938003f66f39798ec7d6'))
paddle.fluid.clip.set_gradient_clip (ArgSpec(args=['clip', 'param_list', 'program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '7a0f76a77dd88a74f24485a103a22fc1')) paddle.fluid.clip.set_gradient_clip (ArgSpec(args=['clip', 'param_list', 'program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '7a0f76a77dd88a74f24485a103a22fc1'))
paddle.fluid.clip.ErrorClipByValue ('paddle.fluid.clip.ErrorClipByValue', ('document', '629b07558971a8ab5e954d9a77457656')) paddle.fluid.clip.ErrorClipByValue ('paddle.fluid.clip.ErrorClipByValue', ('document', '629b07558971a8ab5e954d9a77457656'))
paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
......
...@@ -152,53 +152,25 @@ class BatchedTensorProvider(object): ...@@ -152,53 +152,25 @@ class BatchedTensorProvider(object):
class DataFeeder(object): class DataFeeder(object):
""" """
DataFeeder converts the data that returned by a reader into a data DataFeeder converts the data that returned by a reader into a data
structure that can feed into Executor and ParallelExecutor. The reader structure that can feed into Executor. The reader is usually a
usually returns a list of mini-batch data entries. Each data entry in python generator that returns a list of mini-batch data entries.
the list is one sample. Each sample is a list or a tuple with one
feature or multiple features. Parameters:
feed_list (list): Variables or names of Variables that need
The simple usage shows below: to feed.
place (:ref:`api_fluid_CPUPlace` | :ref:`api_fluid_CUDAPlace` ):
.. code-block:: python place indicates the device (CPU | GPU) the data will be fed into, if
you want to feed data into GPU, please use :code:`fluid.CUDAPlace(i)`
import paddle.fluid as fluid (:code:`i` represents the GPU id), or if you want to feed data into CPU,
place = fluid.CPUPlace() please use :code:`fluid.CPUPlace()`.
img = fluid.layers.data(name='image', shape=[1, 28, 28]) program (:ref:`api_fluid_Program` , optional): The Program that will
label = fluid.layers.data(name='label', shape=[1], dtype='int64') feed data into, if program is None, it will use default_main_program().
feeder = fluid.DataFeeder([img, label], fluid.CPUPlace()) Default None.
result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
If you want to feed data into GPU side separately in advance when you
use multi-GPU to train a model, you can use `decorate_reader` function.
.. code-block:: python
import paddle
import paddle.fluid as fluid
place=fluid.CUDAPlace(0)
data = fluid.layers.data(name='data', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
reader = feeder.decorate_reader(
paddle.batch(paddle.dataset.flowers.train(), batch_size=16), multi_devices=True)
Args:
feed_list(list): The Variables or Variables'name that will
feed into model.
place(Place): place indicates feed data into CPU or GPU, if you want to
feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents
the GPU id), or if you want to feed data into CPU, please using
`fluid.CPUPlace()`.
program(Program): The Program that will feed data into, if program
is None, it will use default_main_program(). Default None.
Raises: Raises:
ValueError: If some Variable is not in this Program. :code:`ValueError` - If some Variables are not in this Program.
Examples: Example:
.. code-block:: python .. code-block:: python
...@@ -207,27 +179,34 @@ class DataFeeder(object): ...@@ -207,27 +179,34 @@ class DataFeeder(object):
import paddle.fluid as fluid import paddle.fluid as fluid
place = fluid.CPUPlace() place = fluid.CPUPlace()
def reader(): def reader():
yield [np.random.random([4]).astype('float32'), np.random.random([3]).astype('float32')], for _ in range(4):
yield np.random.random([4]).astype('float32'), np.random.random([3]).astype('float32'),
main_program = fluid.Program() main_program = fluid.Program()
startup_program = fluid.Program() startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program): with fluid.program_guard(main_program, startup_program):
data_1 = fluid.layers.data(name='data_1', shape=[1, 2, 2]) data_1 = fluid.layers.data(name='data_1', shape=[-1, 2, 2])
data_2 = fluid.layers.data(name='data_2', shape=[1, 1, 3]) data_2 = fluid.layers.data(name='data_2', shape=[-1, 1, 3])
out = fluid.layers.fc(input=[data_1, data_2], size=2) out = fluid.layers.fc(input=[data_1, data_2], size=2)
# ... # ...
feeder = fluid.DataFeeder([data_1, data_2], place) feeder = fluid.DataFeeder([data_1, data_2], place)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup_program) exe.run(startup_program)
for data in reader():
feed_data = feeder.feed(reader())
# print feed_data to view feed results
# print(feed_data['data_1'])
# print(feed_data['data_2'])
outs = exe.run(program=main_program, outs = exe.run(program=main_program,
feed=feeder.feed(data), feed=feed_data,
fetch_list=[out]) fetch_list=[out])
print(outs)
""" """
...@@ -252,31 +231,42 @@ class DataFeeder(object): ...@@ -252,31 +231,42 @@ class DataFeeder(object):
def feed(self, iterable): def feed(self, iterable):
""" """
According to feed_list and iterable, converters the input into According to :code:`feed_list` of :code:`DataFeeder` and :code:`iterable` , converts
a data structure that can feed into Executor and ParallelExecutor. the input into a data structure that can feed into Executor.
Args: Parameters:
iterable(list|tuple): the input data. iterable (generator): user defined python generator to read the raw input data
Returns: Returns:
dict: the result of conversion. :code:`dict`: a :code:`dict` that contains (variable name - converted tensor) pairs
Examples: Example:
.. code-block:: python .. code-block:: python
import numpy.random as random # In this example, reader - generator will return a list of ndarray of 3 elements
# feed API will convert each ndarray input into a tensor
# the return result is a dict with keys: data_1, data_2, data_3
# result['data_1'] a LoD-Tensor with shape of [5, 2, 1, 3]. 5 is batch size, and [2, 1, 3] is the real shape of data_1.
# result['data_2'], result['data_3'] are similar.
import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
def reader(limit=5): def reader(limit=5):
for i in range(limit): for i in range(1, limit + 1):
yield random.random([784]).astype('float32'), random.random([1]).astype('int64'), random.random([256]).astype('float32') yield np.ones([6]).astype('float32') * i , np.ones([1]).astype('int64') * i, np.random.random([9]).astype('float32')
data_1 = fluid.layers.data(name='data_1', shape=[1, 28, 28]) data_1 = fluid.layers.data(name='data_1', shape=[2, 1, 3])
data_2 = fluid.layers.data(name='data_2', shape=[1], dtype='int64') data_2 = fluid.layers.data(name='data_2', shape=[1], dtype='int64')
data_3 = fluid.layers.data(name='data_3', shape=[16, 16], dtype='float32') data_3 = fluid.layers.data(name='data_3', shape=[3, 3], dtype='float32')
feeder = fluid.DataFeeder(['data_1','data_2', 'data_3'], fluid.CPUPlace()) feeder = fluid.DataFeeder(['data_1','data_2', 'data_3'], fluid.CPUPlace())
result = feeder.feed(reader()) result = feeder.feed(reader())
print(result['data_1'])
print(result['data_2'])
print(result['data_3'])
""" """
converter = [] converter = []
for lod_level, shape, dtype in six.moves.zip( for lod_level, shape, dtype in six.moves.zip(
...@@ -303,33 +293,40 @@ class DataFeeder(object): ...@@ -303,33 +293,40 @@ class DataFeeder(object):
def feed_parallel(self, iterable, num_places=None): def feed_parallel(self, iterable, num_places=None):
""" """
Takes multiple mini-batches. Each mini-batch will be feed on each Similar to the feed function, feed_parallel is used with multiple devices (CPU|GPU).
device in advance. Here :code:`iterable` is a list of python generators. The data returned by each
generator in the list will be fed into a separate device.
Args: Parameters:
iterable(list|tuple): the input data. iterable (list|tuple): list of user-defined python generators. The element
num_places(int): the number of devices. Default None. number should match the :code:`num_places`.
num_places (int, optional): the number of devices. If not provided (None),
all available devices on the machine will be used. Default None.
Returns: Returns:
dict: the result of conversion. :code:`generator`: a :code:`generator` that generate dict which contains (variable name - converted tensor) pairs,
the total number of dicts will be generated matches with the :code:`num_places`
Notes: .. note::
The number of devices and number of mini-batches must be same. The number of devices - :code:`num_places` should equal to the generator (element of :code:`iterable` ) number
Examples: Example:
.. code-block:: python .. code-block:: python
import numpy.random as random
import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
def reader(limit=10): def generate_reader(batch_size, base=0, factor=1):
for i in range(limit): def _reader():
yield [random.random([784]).astype('float32'), random.random([1]).astype('float32')], for i in range(batch_size):
yield np.ones([4]) * factor + base, np.ones([4]) * factor + base + 5
return _reader()
x = fluid.layers.data(name='x', shape=[1, 28, 28]) x = fluid.layers.data(name='x', shape=[-1, 2, 2])
y = fluid.layers.data(name='y', shape=[1], dtype='float32') y = fluid.layers.data(name='y', shape=[-1, 2, 2], dtype='float32')
fluid.layers.elementwise_add(x, y) z = fluid.layers.elementwise_add(x, y)
feeder = fluid.DataFeeder(['x','y'], fluid.CPUPlace()) feeder = fluid.DataFeeder(['x','y'], fluid.CPUPlace())
place_num = 2 place_num = 2
...@@ -338,11 +335,17 @@ class DataFeeder(object): ...@@ -338,11 +335,17 @@ class DataFeeder(object):
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(places=places) program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(places=places)
for item in reader():
data.append(item) # print sample feed_parallel result
if place_num == len(data): # for item in list(feeder.feed_parallel([generate_reader(5, 0, 1), generate_reader(3, 10, 2)], 2)):
exe.run(program=program, feed=list(feeder.feed_parallel(data, place_num)), fetch_list=[]) # print(item['x'])
data = [] # print(item['y'])
reader_list = [generate_reader(5, 0, 1), generate_reader(3, 10, 2)]
res = exe.run(program=program, feed=list(feeder.feed_parallel(reader_list, 2)), fetch_list=[z])
print(res)
""" """
if isinstance(self.place, core.CUDAPlace): if isinstance(self.place, core.CUDAPlace):
places = [ places = [
...@@ -383,52 +386,64 @@ class DataFeeder(object): ...@@ -383,52 +386,64 @@ class DataFeeder(object):
num_places=None, num_places=None,
drop_last=True): drop_last=True):
""" """
Converter the input data into a data that returned by reader into Decorate the reader (generator) to fit multiple devices. The reader generates
multiple mini-batches. Each mini-batch will be feed on each device. multiple mini-batches. Each mini-batch will be fed into a single device.
Args: Parameters:
reader(function): the reader is the function which can generate data. reader(generator): a user defined python generator used to get :code:`mini-batch` of data.
multi_devices(bool): whether to use multiple devices or not. A :code:`mini-batch` can be regarded as a python generator that returns batches of input
num_places(int): if multi_devices is True, you can specify the number entities, just like the below :code:`_mini_batch` in the code example.
of GPU to use, if multi_devices is None, the function will use all the multi_devices(bool): indicate whether to use multiple devices or not.
GPU of the current machine. Default None. num_places(int, optional): if :code:`multi_devices` is True, you can specify the number
drop_last(bool): whether to drop the last batch if the of devices(CPU|GPU) to use, if multi_devices is None, the function will use all the
size of the last batch is less than batch_size. Default True. devices of the current machine. Default None.
drop_last(bool, optional): whether to drop the last round of data if it is not enough to
feed all devices. Default True.
Returns: Returns:
dict: the result of conversion. :code:`generator`: a new :code:`generator` which return converted dicts that can be fed into Executor
Raises: Raises:
ValueError: If drop_last is False and the data batch cannot fit for devices. :code:`ValueError`: If drop_last is False and the data cannot fit devices perfectly.
Examples: Example:
.. code-block:: python .. code-block:: python
import numpy.random as random import numpy as np
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.compiler as compiler import paddle.fluid.compiler as compiler
def reader(limit=10): def reader():
for i in range(limit): def _mini_batch(batch_size):
yield (random.random([784]).astype('float32'), random.random([1]).astype('int64')), for i in range(batch_size):
yield np.random.random([16]).astype('float32'), np.random.randint(10, size=[1])
for _ in range(10):
yield _mini_batch(np.random.randint(1, 10))
place=fluid.CUDAPlace(0) place_num = 3
data = fluid.layers.data(name='data', shape=[1, 28, 28], dtype='float32') places = [fluid.CPUPlace() for _ in range(place_num)]
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# a simple network sample
data = fluid.layers.data(name='data', shape=[-1, 4, 4], dtype='float32')
label = fluid.layers.data(name='label', shape=[-1, 1], dtype='int64')
hidden = fluid.layers.fc(input=data, size=10) hidden = fluid.layers.fc(input=data, size=10)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) feeder = fluid.DataFeeder(place=places[0], feed_list=[data, label])
reader = feeder.decorate_reader(reader, multi_devices=True) reader = feeder.decorate_reader(reader, multi_devices=True, num_places=3, drop_last=True)
exe = fluid.Executor(place) exe = fluid.Executor(places[0])
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
compiled_prog = compiler.CompiledProgram( compiled_prog = compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel() fluid.default_main_program()).with_data_parallel(places=places)
for i,data in enumerate(reader()): for i,data in enumerate(reader()):
print('iteration : ', i + 1) # print data if you like
# print(i, data)
ret = exe.run(compiled_prog, feed=data, fetch_list=[hidden]) ret = exe.run(compiled_prog, feed=data, fetch_list=[hidden])
print(ret)
""" """
def __reader_creator__(): def __reader_creator__():
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册