提交 dc8cf36e 编写于 作者: D dongdaxiang

add more example on datagenerator

test=develop
上级 b8382076
...@@ -91,7 +91,7 @@ cc_library(timer SRCS timer.cc) ...@@ -91,7 +91,7 @@ cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer) cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(lodtensor_printer SRCS lodtensor_printer.cc) cc_library(lodtensor_printer SRCS lodtensor_printer.cc)
cc_test(lodtensor_printer SRCS lodtensor_printer.cc DEPS lodtensor_printer) cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU) if(WITH_GPU)
......
...@@ -38,12 +38,49 @@ class DataGenerator(object): ...@@ -38,12 +38,49 @@ class DataGenerator(object):
self._line_limit = line_limit self._line_limit = line_limit
def set_batch(self, batch_size): def set_batch(self, batch_size):
'''
Set the batch size of the current DataGenerator.
This is necessary only if a user wants to define generate_batch.
Example:
.. code-block:: python
import paddle.fluid.incubate.data_generator as dg
class MyData(dg.DataGenerator):
def generate_sample(self, line):
def local_iter():
int_words = [int(x) for x in line.split()]
yield ("words", int_words)
return local_iter
def generate_batch(self, samples):
def local_iter():
for s in samples:
yield ("words", s[1] + [s[1][0]])
return local_iter
mydata = MyData()
mydata.set_batch(128)
'''
self.batch_size_ = batch_size self.batch_size_ = batch_size
def run_from_memory(self): def run_from_memory(self):
''' '''
This function generates data from memory; it is usually used for This function generates data from memory; it is usually used for
debugging and benchmarking. debugging and benchmarking.
Example:
.. code-block:: python
import paddle.fluid.incubate.data_generator as dg
class MyData(dg.DataGenerator):
def generate_sample(self, line):
def local_iter():
yield ("words", [1, 2, 3, 4])
return local_iter
mydata = MyData()
mydata.run_from_memory()
''' '''
batch_samples = [] batch_samples = []
line_iter = self.generate_sample(None) line_iter = self.generate_sample(None)
...@@ -69,6 +106,21 @@ class DataGenerator(object): ...@@ -69,6 +106,21 @@ class DataGenerator(object):
be written to stdout and the corresponding protofile will be be written to stdout and the corresponding protofile will be
generated. generated.
Example:
.. code-block:: python
import paddle.fluid.incubate.data_generator as dg
class MyData(dg.DataGenerator):
def generate_sample(self, line):
def local_iter():
int_words = [int(x) for x in line.split()]
yield ("words", [int_words])
return local_iter
mydata = MyData()
mydata.run_from_stdin()
''' '''
batch_samples = [] batch_samples = []
for line in sys.stdin: for line in sys.stdin:
...@@ -124,12 +176,58 @@ class DataGenerator(object): ...@@ -124,12 +176,58 @@ class DataGenerator(object):
The type of feasigns must be in int or float. Once the float The type of feasigns must be in int or float. Once the float
element appears in the feasign, the type of that slot will be element appears in the feasign, the type of that slot will be
processed into a float. processed into a float.
Example:
.. code-block:: python
import paddle.fluid.incubate.data_generator as dg
class MyData(dg.DataGenerator):
def generate_sample(self, line):
def local_iter():
int_words = [int(x) for x in line.split()]
yield ("words", [int_words])
return local_iter
''' '''
raise NotImplementedError( raise NotImplementedError(
"Please rewrite this function to return a list or tuple: " + "Please rewrite this function to return a list or tuple: " +
"[(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...)") "[(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...)")
def generate_batch(self, samples): def generate_batch(self, samples):
'''
This function needs to be overridden by the user to process the
samples generated by the generate_sample(self, str) function.
It is typically used for batch-level processing, when a user wants to
preprocess a whole batch of samples at once, e.g. padding every sample
to the maximum sample length in the batch.
Args:
samples(list|tuple): the samples generated by generate_sample
Returns:
a python generator, the same format as return value of generate_sample
Example:
.. code-block:: python
import paddle.fluid.incubate.data_generator as dg
class MyData(dg.DataGenerator):
def generate_sample(self, line):
def local_iter():
int_words = [int(x) for x in line.split()]
yield ("words", int_words)
return local_iter
def generate_batch(self, samples):
def local_iter():
for s in samples:
yield ("words", s[1] + [s[1][0]])
return local_iter
mydata = MyData()
mydata.set_batch(128)
'''
def local_iter(): def local_iter():
for sample in samples: for sample in samples:
yield sample yield sample
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册