add more example on datagenerator

test=develop

add more example on datagenerator
test=develop
dc8cf36e · dongdaxiang · b8382076 · dc8cf36e · dc8cf36e
Showing with 99 addition and 1 deletion

paddle/fluid/platform/CMakeLists.txt paddle/fluid/platform/CMakeLists.txt +1 -1

python/paddle/fluid/incubate/data_generator/__init__.py python/paddle/fluid/incubate/data_generator/__init__.py +98 -0

未找到文件。
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -91,7 +91,7 @@ cc_library(timer SRCS timer.cc)
 cc_test(timer_test SRCS timer_test.cc DEPS timer)

 cc_library(lodtensor_printer SRCS lodtensor_printer.cc)
-cc_test(lodtensor_printer SRCS lodtensor_printer.cc DEPS lodtensor_printer)
+cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer)

 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 if(WITH_GPU)

--- a/python/paddle/fluid/incubate/data_generator/__init__.py
+++ b/python/paddle/fluid/incubate/data_generator/__init__.py
@@ -38,12 +38,49 @@ class DataGenerator(object):
        self._line_limit = line_limit

    def set_batch(self, batch_size):
+        '''
+        Set the batch size of the current DataGenerator.
+        This is necessary only if a user wants to define generate_batch.
+        
+        Example:
+
+            .. code-block:: python
+                import paddle.fluid.incubate.data_generator as dg
+                class MyData(dg.DataGenerator):
+
+                    def generate_sample(self, line):
+                        def local_iter():
+                            int_words = [int(x) for x in line.split()]
+                            yield ("words", int_words)
+                        return local_iter
+
+                    def generate_batch(self, samples):
+                        def local_iter():
+                            for s in samples:
+                                yield ("words", s[1] + [s[1][0]])
+                mydata = MyData()
+                mydata.set_batch(128)
+                    
+        '''
        self.batch_size_ = batch_size

    def run_from_memory(self):
        '''
        This function generator data from memory, it is usually used for
        debug and benchmarking
+
+        Example:
+            .. code-block:: python
+                import paddle.fluid.incubate.data_generator as dg
+                class MyData(dg.DataGenerator):
+
+                    def generate_sample(self, line):
+                        def local_iter():
+                            yield ("words", [1, 2, 3, 4])
+                        return local_iter
+
+                mydata = MyData()
+                mydata.run_from_memory()
        '''
        batch_samples = []
        line_iter = self.generate_sample(None)
@@ -69,6 +106,21 @@ class DataGenerator(object):
        be wrote to stdout and the corresponding protofile will be
        generated.

+        Example:
+        
+            .. code-block:: python
+                import paddle.fluid.incubate.data_generator as dg
+                class MyData(dg.DataGenerator):
+
+                    def generate_sample(self, line):
+                        def local_iter():
+                            int_words = [int(x) for x in line.split()]
+                            yield ("words", [int_words])
+                        return local_iter
+
+                mydata = MyData()
+                mydata.run_from_stdin()
+
        '''
        batch_samples = []
        for line in sys.stdin:
@@ -124,12 +176,58 @@ class DataGenerator(object):
            The type of feasigns must be in int or float. Once the float
            element appears in the feasign, the type of that slot will be
            processed into a float.
+
+        Example:
+
+            .. code-block:: python
+                import paddle.fluid.incubate.data_generator as dg
+                class MyData(dg.DataGenerator):
+
+                    def generate_sample(self, line):
+                        def local_iter():
+                            int_words = [int(x) for x in line.split()]
+                            yield ("words", [int_words])
+                        return local_iter
+
        '''
        raise NotImplementedError(
            "Please rewrite this function to return a list or tuple: " +
            "[(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...)")

    def generate_batch(self, samples):
+        '''
+        This function needs to be overridden by the user to process the
+        samples generated by the generate_sample(self, str) function.
+        It is usually used for batch processing when a user wants to
+        preprocess a batch of samples, e.g. padding according to
+        the max length of a sample in the batch.
+
+        Args:
+            samples(list|tuple): samples generated by generate_sample
+
+        Returns:
+            A Python generator in the same format as the return value of generate_sample.
+
+        Example:
+
+            .. code-block:: python
+                import paddle.fluid.incubate.data_generator as dg
+                class MyData(dg.DataGenerator):
+
+                    def generate_sample(self, line):
+                        def local_iter():
+                            int_words = [int(x) for x in line.split()]
+                            yield ("words", int_words)
+                        return local_iter
+
+                    def generate_batch(self, samples):
+                        def local_iter():
+                            for s in samples:
+                                yield ("words", s[1] + [s[1][0]])
+                mydata = MyData()
+                mydata.set_batch(128)
+        '''
+
        def local_iter():
            for sample in samples:
                yield sample