diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index a9b446d8c0b1e5f186a00cbda475e7517ff594ba..fb5f64b2ec48f711a9051fb8875c622d969c1e12 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -15,12 +15,12 @@ paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=N
 paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
 paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f06314a1cb30c96b5808dde2219c2dae'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
+paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a584496aa1343f36eebf3c46b323a74'))
 paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581'))
-paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
+paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', '4cfcd9c15b766a51b584cc46d38f1ad8'))
 paddle.fluid.Executor.train_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', 'd521011d79e71080fe9b5bb179b43518'))
-paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
-paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))
+paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f65788d9ead293ada47551339df12203'))
+paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'e480208ccc0c9abf084867206dab4d2c'))
 paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680'))
 paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8'))
@@ -31,7 +31,7 @@ paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'pr
 paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f'))
 paddle.fluid.DistributeTranspilerConfig.__init__ 
 paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '2cb4bd74481861345c70228a0f57620c'))
+paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '33ce6ec50f8eeb05d340e6b114b026fd'))
 paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff'))
 paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb'))
 paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -40,7 +40,7 @@ paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], v
 paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'eb894b464bbcd1b4bc8038398954f766'))
 paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '415c56600ce4e198c071cad01409a690'))
 paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356'))
+paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '0e17773521634ef798fddd7d2ea3ef96'))
 paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8'))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 624c9934d5392b57526edea68254ddf45bd79f4c..f01a6dd9da2dd518227d0f45bab9a140191d38de 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -58,20 +58,34 @@ class CompiledProgram(object):
     optimizations, for example.
       * Pre-compute some logic once so that each run is faster.
       * Transform the program so that it can run in multiple devices.
-      * TODO: transform the program for optimized inference or distributed
-        training.
+      * Transform the program for optimized inference or distributed
+        training. **Note: this part is not finished yet.**
 
     Example:
         .. code-block:: python
-          place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
-          exe = fluid.Executor(place)
-          exe.run(startup)
-          compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
-              loss_name=loss.name)
-          for i in range(5):
-              test_loss, = exe.run(compiled_prog,
-                                   feed=feed_dict,
-                                   fetch_list=[loss.name])
+
+            import paddle.fluid as fluid
+            import paddle.fluid.compiler as compiler
+            import numpy
+            import os
+
+            place = fluid.CUDAPlace(0) # fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+            hidden = fluid.layers.fc(input=data, size=10)
+            loss = fluid.layers.mean(hidden)
+            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+            fluid.default_startup_program().random_seed=1
+            exe.run(fluid.default_startup_program())
+            compiled_prog = compiler.CompiledProgram(
+                fluid.default_main_program())
+
+            x = numpy.random.random(size=(10, 1)).astype('float32')
+            loss_data, = exe.run(compiled_prog,
+                                 feed={"X": x},
+                                 fetch_list=[loss.name])
 
     Args:
         program_or_graph (Graph|Program): If it's Program, it will be first
@@ -108,6 +122,44 @@ class CompiledProgram(object):
                            places=None):
         """Configs the program to run in data parallel way.
 
+        Example:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+                import paddle.fluid.compiler as compiler
+                import numpy
+                import os
+
+                use_cuda = True
+                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+                # NOTE: If you use CPU to run the program, you need
+                # to specify the CPU_NUM, otherwise fluid will use
+                # all the logical cores as the CPU_NUM. In that case,
+                # the batch size of the input should be greater than
+                # CPU_NUM; if not, the process will fail with an
+                # exception.
+                if not use_cuda:
+                    os.environ['CPU_NUM'] = str(2)
+
+                exe = fluid.Executor(place)
+
+                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                hidden = fluid.layers.fc(input=data, size=10)
+                loss = fluid.layers.mean(hidden)
+                fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+                fluid.default_startup_program().random_seed=1
+                exe.run(fluid.default_startup_program())
+                compiled_prog = compiler.CompiledProgram(
+                    fluid.default_main_program()).with_data_parallel(
+                        loss_name=loss.name)
+
+                x = numpy.random.random(size=(10, 1)).astype('float32')
+                loss_data, = exe.run(compiled_prog,
+                                     feed={"X": x},
+                                     fetch_list=[loss.name])
+
         Args:
             loss_name (str): The loss name must set in training. Default None.
             build_strategy(BuildStrategy): build_strategy is used to
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 0b9a23e6769389715535a4ea9dea77bfd3c2707b..53cf1e6743262afee5a93b708d35ffe296874377 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -38,6 +38,15 @@ def global_scope():
     Get the global/default scope instance. There are a lot of APIs use
     :code:`global_scope` as its default value, e.g., :code:`Executor.run`
 
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import numpy
+
+            fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
+            numpy.array(fluid.global_scope().find_var("data").get_tensor())
+
     Returns:
         Scope: The global/default scope instance.
""" @@ -58,10 +67,15 @@ def scope_guard(scope): variable in runtime will assigned to the new scope. Examples: - >>> import paddle.fluid as fluid - >>> new_scope = fluid.Scope() - >>> with fluid.scope_guard(new_scope): - >>> ... + .. code-block:: python + + import paddle.fluid as fluid + import numpy + + new_scope = fluid.Scope() + with fluid.scope_guard(new_scope): + fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace()) + numpy.array(new_scope.find_var("data").get_tensor()) Args: scope: The new global/default scope. @@ -75,11 +89,18 @@ def as_numpy(tensor): """ Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information. For higher dimensional sequence data, please use LoDTensor directly. + Examples: - >>> import paddle.fluid as fluid - >>> outs = executor.run(...) - >>> np_outs = map(lambda x: as_numpy(x), outs) - >>> ... + .. code-block:: python + + import paddle.fluid as fluid + import numpy + + new_scope = fluid.Scope() + with fluid.scope_guard(new_scope): + fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace()) + tensor = new_scope.find_var("data").get_tensor() + fluid.executor.as_numpy(tensor) # or numpy.array(new_scope.find_var("data").get_tensor()) Args: tensor(Variable): a instance of Tensor @@ -263,42 +284,70 @@ def _as_lodtensor(data, place): class Executor(object): """ - An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running. - Python executor takes a program, adds feed operators and fetch operators to this program according - to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides - the variables(or names) that user wants to get after program runs. Note: the executor will run all - operators in the program but not only the operators dependent by the fetch_list. - It stores the global variables into the global scope, and creates a local scope for the temporary - variables. The contents in local scope may be discarded after every minibatch forward/backward - finished. But the global scope variables will be persistent through different runs. - - - Example: + An Executor in Python, supports single/multiple-GPU running, + and single/multiple-CPU running. Python executor takes a program, + adds feed operators and fetch operators to this program according + to feed map and fetch_list. Feed map provides input data for the + program. fetch_list provides the variables(or names) that user wants + to get after program runs. Note: the executor will run all operators + in the program but not only the operators dependent by the fetch_list. + It stores the global variables into the global scope, and creates a + local scope for the temporary variables. The contents in local scope + may be discarded after every minibatch forward/backward finished. + But the global scope variables will be persistent through different runs. + Examples: .. code-block:: python - # First create the Executor. - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - # Run the startup program once and only once. - # Not need to optimize/compile the startup program. - exe.run(fluid.default_startup_program()) - - # Run the main program directly without compile. - loss, = exe.run(fluid.default_main_program(), - feed=feed_dict, - fetch_list=[loss.name]) - # Or, compiled the program and run. See `CompiledProgram` for more detail. 
-            compiled_prog = compiler.CompiledProgram(
-                            fluid.default_main_program()).with_data_parallel(
-                            loss_name=loss.name)
-            loss, = exe.run(compiled_prog,
-                            feed=feed_dict,
-                            fetch_list=[loss.name])
+            import paddle.fluid as fluid
+            import paddle.fluid.compiler as compiler
+            import numpy
+            import os
+
+            use_cuda = True
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            train_program = fluid.Program()
+            startup_program = fluid.Program()
+            with fluid.program_guard(train_program, startup_program):
+                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                hidden = fluid.layers.fc(input=data, size=10)
+                loss = fluid.layers.mean(hidden)
+                fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+            # Run the startup program once and only once.
+            # No need to optimize/compile the startup program.
+            startup_program.random_seed=1
+            exe.run(startup_program)
+
+            # Run the main program directly without compile.
+            x = numpy.random.random(size=(10, 1)).astype('float32')
+            loss_data, = exe.run(train_program,
+                                 feed={"X": x},
+                                 fetch_list=[loss.name])
+
+            # Or, compile the program before running it. See `CompiledProgram`
+            # for more detail.
+            # NOTE: If you use CPU to run the program, you need
+            # to specify the CPU_NUM, otherwise fluid will use
+            # all the logical cores as the CPU_NUM. In that case,
+            # the batch size of the input should be greater than
+            # CPU_NUM; if not, the process will fail with an
+            # exception.
+            if not use_cuda:
+                os.environ['CPU_NUM'] = str(2)
+
+            compiled_prog = compiler.CompiledProgram(
+                train_program).with_data_parallel(
+                    loss_name=loss.name)
+            loss_data, = exe.run(compiled_prog,
+                                 feed={"X": x},
+                                 fetch_list=[loss.name])
 
     Args:
-        place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
+        place(fluid.CPUPlace|fluid.CUDAPlace(n)): indicates which device the executor runs on.
+
     """
 
     def __init__(self, place):
@@ -392,14 +441,18 @@ class Executor(object):
         Close this executor.
 
         You can no longer use this executor after calling this method.
-        For the distributed training, this method would free the resource on PServers related to
-        the current Trainer.
-
-        Example:
-            >>> cpu = core.CPUPlace()
-            >>> exe = Executor(cpu)
-            >>> ...
-            >>> exe.close()
+        For the distributed training, this method frees the resources
+        on PServers related to the current Trainer.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+
+                cpu = fluid.CPUPlace()
+                exe = fluid.Executor(cpu)
+                # execute training or testing
+                exe.close()
         """
         if not self._closed:
             self._default_executor.close()
@@ -490,13 +543,37 @@
                 return_numpy=True,
                 use_program_cache=False):
         """
-        Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
-        Python executor takes a program, add feed operators and fetch operators to this program according
-        to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
+        Run program by this Executor. Feed data by feed map, fetch result by
+        fetch_list. Python executor takes a program, adds feed operators and
+        fetch operators to this program according to feed map and fetch_list.
+        Feed map provides input data for the program. fetch_list provides
         the variables(or names) that user want to get after program run.
-        Note: the executor will run all
-        operators in the program but not only the operators dependent by the fetch_list
+        Note: the executor will run all operators in the program, not
+        only the operators that the fetch_list depends on.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+                import numpy
+
+                # First create the Executor.
+                place = fluid.CPUPlace() # fluid.CUDAPlace(0)
+                exe = fluid.Executor(place)
+
+                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                hidden = fluid.layers.fc(input=data, size=10)
+                loss = fluid.layers.mean(hidden)
+                adam = fluid.optimizer.Adam()
+                adam.minimize(loss)
+
+                # Run the startup program once and only once.
+                exe.run(fluid.default_startup_program())
+
+                x = numpy.random.random(size=(10, 1)).astype('float32')
+                outs = exe.run(feed={'X': x},
+                               fetch_list=[loss.name])
 
         Args:
             program(Program|CompiledProgram): the program that need to run,
@@ -520,26 +597,6 @@
 
         Returns:
             list(numpy.array): fetch result according to fetch_list.
-
-
-        Examples:
-
-            >>> data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-            >>> out = fluid.layers.create_tensor(dtype='float32')
-            >>> hidden = fluid.layers.fc(input=data, size=10)
-            >>> fluid.layers.assign(hidden,out)
-            >>> loss = fluid.layers.mean(out)
-            >>> adam = fluid.optimizer.Adam()
-            >>> adam.minimize(loss)
-
-            >>> cpu = core.CPUPlace()
-            >>> exe = fluid.Executor(cpu)
-            >>> exe.run(fluid.default_startup_program())
-
-            >>> x = numpy.random.random(size=(10, 1)).astype('float32')
-            >>> outs = exe.run(
-            >>>     feed={'X': x},
-            >>>     fetch_list=[loss.name])
         """
 
         if self._closed:
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index cf10f590ce2c90450047ff046ee3ed206b38322e..ad32f157d0e694477e009b5a9e3650701787d79e 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -37,6 +37,53 @@ class ParallelExecutor(object):
     is not found, ParallelExecutor will call `multiprocessing.cpu_count`
     to get the number of CPUs in the system.
 
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import numpy
+            import os
+
+            use_cuda = True
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+            # NOTE: If you use CPU to run the program, you need
+            # to specify the CPU_NUM, otherwise fluid will use
+            # all the logical cores as the CPU_NUM. In that case,
+            # the batch size of the input should be greater than
+            # CPU_NUM; if not, the process will fail with an
+            # exception.
+            if not use_cuda:
+                os.environ['CPU_NUM'] = str(2)
+
+            exe = fluid.Executor(place)
+
+            train_program = fluid.Program()
+            startup_program = fluid.Program()
+            with fluid.program_guard(train_program, startup_program):
+                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                hidden = fluid.layers.fc(input=data, size=10)
+                loss = fluid.layers.mean(hidden)
+                test_program = fluid.default_main_program().clone(for_test=True)
+                fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+            startup_program.random_seed=1
+            exe.run(startup_program)
+
+            train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                               main_program=train_program,
+                                               loss_name=loss.name)
+            test_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                              main_program=test_program,
+                                              share_vars_from=train_exe)
+
+            x = numpy.random.random(size=(10, 1)).astype('float32')
+            loss_data, = train_exe.run(feed={"X": x},
+                                       fetch_list=[loss.name])
+
+            loss_data, = test_exe.run(feed={"X": x},
+                                      fetch_list=[loss.name])
+
     Args:
         use_cuda (bool): Whether to use CUDA or not.
         loss_name (str): The loss name must set in training. Default None.
@@ -66,16 +113,6 @@ class ParallelExecutor(object):
 
     Raises:
         TypeError: If share_vars_from is provided, but not ParallelExecutor
            object.
 
-    Examples:
-        .. code-block:: python
-
-          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
-          test_exe = fluid.ParallelExecutor(use_cuda=True,
-                                            main_program=test_program,
-                                            share_vars_from=train_exe)
-
-          train_loss, = train_exe.run([loss.name], feed=feed_dict)
-          test_loss, = test_exe.run([loss.name], feed=feed_dict)
     """
 
     def __init__(self,
@@ -152,24 +189,58 @@ class ParallelExecutor(object):
            assume the data has been splitted into multiple devices, the each
            element in the list will be copied to each device directly.
-           For example, if the feed is a dict:
-
-           >>> exe = ParallelExecutor()
-           >>> # the image will be splitted into devices. If there is two devices
-           >>> # each device will process an image with shape (24, 1, 28, 28)
-           >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
-
-           For example, if the feed is a list:
+        Examples:
+            .. code-block:: python
 
-           >>> exe = ParallelExecutor()
-           >>> # each device will process each element in the list.
-           >>> # the 1st device will process an image with shape (48, 1, 28, 28)
-           >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
-           >>> #
-           >>> # you can use exe.device_count to get the device number.
-           >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
-           >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
-           >>>              ])
+                import paddle.fluid as fluid
+                import numpy
+                import os
+
+                use_cuda = True
+                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+                # NOTE: If you use CPU to run the program, you need
+                # to specify the CPU_NUM, otherwise fluid will use
+                # all the logical cores as the CPU_NUM. In that case,
+                # the batch size of the input should be greater than
+                # CPU_NUM; if not, the process will fail with an
+                # exception.
+                if not use_cuda:
+                    os.environ['CPU_NUM'] = str(2)
+
+                exe = fluid.Executor(place)
+
+                train_program = fluid.Program()
+                startup_program = fluid.Program()
+                with fluid.program_guard(train_program, startup_program):
+                    data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                    hidden = fluid.layers.fc(input=data, size=10)
+                    loss = fluid.layers.mean(hidden)
+                    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+                startup_program.random_seed=1
+                exe.run(startup_program)
+
+                train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                                   main_program=train_program,
+                                                   loss_name=loss.name)
+
+                # If the feed is a dict:
+                # the input will be split among the devices. If there are two devices,
+                # each device will process an input with shape (5, 1)
+                x = numpy.random.random(size=(10, 1)).astype('float32')
+                loss_data, = train_exe.run(feed={"X": x},
+                                           fetch_list=[loss.name])
+
+                # If the feed is a list:
+                # each device will process each element in the list.
+                # the 1st device will process an input with shape (10, 1)
+                # the 2nd device will process an input with shape (9, 1)
+                #
+                # you can use exe.device_count to get the device number.
+                x2 = numpy.random.random(size=(9, 1)).astype('float32')
+                loss_data, = train_exe.run(feed=[{"X": x}, {"X": x2}],
+                                           fetch_list=[loss.name])
 
         Args:
             fetch_list(list): The fetched variable names
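
For reference (not part of the diff above), the pieces documented in this change compose into one short end-to-end script: define a toy network, run the startup program once, compile the main program with `CompiledProgram.with_data_parallel`, and execute it with `Executor.run`. The sketch below only reuses calls that already appear in the new docstring examples; the toy network, the batch size of 10, and `CPU_NUM=2` are illustrative choices, not API requirements.

    .. code-block:: python

        import os
        import numpy
        import paddle.fluid as fluid
        import paddle.fluid.compiler as compiler

        use_cuda = False
        # On CPU, set CPU_NUM explicitly so the batch size (10) stays larger than it.
        if not use_cuda:
            os.environ['CPU_NUM'] = str(2)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)

        # A toy network: a single fc layer reduced to a scalar loss (illustrative only).
        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
        hidden = fluid.layers.fc(input=data, size=10)
        loss = fluid.layers.mean(hidden)
        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        # Run the startup program once, then compile the main program for data-parallel execution.
        exe.run(fluid.default_startup_program())
        compiled_prog = compiler.CompiledProgram(
            fluid.default_main_program()).with_data_parallel(loss_name=loss.name)

        x = numpy.random.random(size=(10, 1)).astype('float32')
        loss_data, = exe.run(compiled_prog,
                             feed={"X": x},
                             fetch_list=[loss.name])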