From dc8cf36e4b88bca5571f351852804ff57b8e2731 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sat, 23 Mar 2019 08:50:03 +0800 Subject: [PATCH] add more example on datagenerator test=develop --- paddle/fluid/platform/CMakeLists.txt | 2 +- .../fluid/incubate/data_generator/__init__.py | 98 +++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index ba1968e076..70b8c5266f 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -91,7 +91,7 @@ cc_library(timer SRCS timer.cc) cc_test(timer_test SRCS timer_test.cc DEPS timer) cc_library(lodtensor_printer SRCS lodtensor_printer.cc) -cc_test(lodtensor_printer SRCS lodtensor_printer.cc DEPS lodtensor_printer) +cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/python/paddle/fluid/incubate/data_generator/__init__.py index 75fda01c11..0407d67ea4 100644 --- a/python/paddle/fluid/incubate/data_generator/__init__.py +++ b/python/paddle/fluid/incubate/data_generator/__init__.py @@ -38,12 +38,49 @@ class DataGenerator(object): self._line_limit = line_limit def set_batch(self, batch_size): + ''' + Set batch size of current DataGenerator + This is necessary only if a user wants to define generator_batch + + Example: + + .. code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + int_words = [int(x) for x in line.split()] + yield ("words", int_words) + return local_iter + + def generate_batch(self, samples): + def local_iter(): + for s in samples: + yield ("words", s[1].extend([s[1][0]])) + mydata = MyData() + mydata.set_batch(128) + + ''' self.batch_size_ = batch_size def run_from_memory(self): ''' This function generator data from memory, it is usually used for debug and benchmarking + + Example: + .. code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + yield ("words", [1, 2, 3, 4]) + return local_iter + + mydata = MyData() + mydata.run_from_memory() ''' batch_samples = [] line_iter = self.generate_sample(None) @@ -69,6 +106,21 @@ class DataGenerator(object): be wrote to stdout and the corresponding protofile will be generated. + Example: + + .. code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + int_words = [int(x) for x in line.split()] + yield ("words", [int_words]) + return local_iter + + mydata = MyData() + mydata.run_from_stdin() + ''' batch_samples = [] for line in sys.stdin: @@ -124,12 +176,58 @@ class DataGenerator(object): The type of feasigns must be in int or float. Once the float element appears in the feasign, the type of that slot will be processed into a float. + + Example: + + .. code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + int_words = [int(x) for x in line.split()] + yield ("words", [int_words]) + return local_iter + ''' raise NotImplementedError( "Please rewrite this function to return a list or tuple: " + "[(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...)") def generate_batch(self, samples): + ''' + This function needs to be overridden by the user to process the + generated samples from generate_sample(self, str) function + It is usually used as batch processing when a user wants to + do preprocessing on a batch of samples, e.g. padding according to + the max length of a sample in the batch + + Args: + samples(list tuple): generated sample from generate_sample + + Returns: + a python generator, the same format as return value of generate_sample + + Example: + + .. code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + int_words = [int(x) for x in line.split()] + yield ("words", int_words) + return local_iter + + def generate_batch(self, samples): + def local_iter(): + for s in samples: + yield ("words", s[1].extend([s[1][0]])) + mydata = MyData() + mydata.set_batch(128) + ''' + def local_iter(): for sample in samples: yield sample -- GitLab