Commit 73f224d0 authored by chengduoZH

Add docs for ParallelExecutor

Parent 5ea039b3
@@ -27,6 +27,40 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy
class ParallelExecutor(object):
"""
ParallelExecutor can run program in parallel.
Args:
use_cuda (bool): Whether to use CUDA or not.
loss_name (str): The name of the loss variable; it must be set for
training. Default None.
main_program (Program): The program that needs to be run; if not
provided, default_main_program will be used. Default None.
share_vars_from (ParallelExecutor): If provided, variables will be
shared from the specified ParallelExecutor. Default None.
num_trainers (int): If greater than 1, NCCL will be initialized with
multiple ranks of nodes, and each node should have the same number
of GPUs. Distributed training will then be enabled. Default 1.
trainer_id (int): Must be used together with num_trainers. trainer_id
is the "rank" of the current node and starts from 0. Default 0.
Returns:
ParallelExecutor: The initialized ParallelExecutor object.
Raises:
TypeError: If share_vars_from is provided but is not a ParallelExecutor
object.
Examples:
.. code-block:: python
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
test_exe = fluid.ParallelExecutor(use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
test_loss, = test_exe.run([loss.name], feed=feed_dict)
"""
def __init__(self,
use_cuda,
loss_name=None,
@@ -37,42 +71,7 @@ class ParallelExecutor(object):
num_trainers=1,
trainer_id=0,
**kwargs):
"""
ParallelExecutor can run program in parallel.
Args:
use_cuda(bool): Whether to use CUDA or not.
loss_name(str, default None): The loss name must set in training.
main_program(Program, default None): The program that need to run,
if not provided, then default_main_program will be used.
share_vars_from(ParallelExecutor, default None): If provided,
it will share variables from the specified ParallelExecutor.
num_trainers(int, default 1): If greater than 1, NCCL will be
initialized with multiple ranks of nodes, each node should have
same number of GPUs. Distributed training will be enabled then.
trainer_id(int, default 0): Must use together with num_trainers.
trainer_id is the "rank" of current node starts from 0.
Returns:
A ParallelExecutor object.
Raises:
TypeError: If share_vars_from is provided, but not ParallelExecutor
object.
Examples:
.. code-block:: python
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
test_loss, = test_exe.run([loss.name], feed=feed_dict)
"""
if len(kwargs) != 0:
err_msg = ""
for key in kwargs:
@@ -135,6 +134,7 @@ class ParallelExecutor(object):
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else []
@@ -166,12 +166,14 @@ class ParallelExecutor(object):
element in the list will be copied to each device directly.
For example, if the feed is a dict:
>>> exe = ParallelExecutor()
>>> # the image will be split across devices. If there are two devices
>>> # each device will process an image with shape (24, 1, 28, 28)
>>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
For example, if the feed is a list:
>>> exe = ParallelExecutor()
>>> # each device will process each element in the list.
>>> # the 1st device will process an image with shape (48, 1, 28, 28)
@@ -182,18 +184,40 @@ class ParallelExecutor(object):
>>> {"image": numpy.random.random(size=(32, 1, 28, 28))},
>>> ])
Args:
fetch_list(list): The fetched variable names.
feed(list|dict|None): The feed variables. If the feed is a dict,
tensors in that dict will be split across the devices. If
the feed is a list, each element of the list will be copied
to each device. Default None.
feed_dict: Alias for the feed parameter, for backward compatibility.
This parameter has been deprecated. Default None.
Returns:
List: The fetched result list.
Raises:
ValueError: If feed is a list but its length is not equal to the
number of active places, or if one of its elements is not a dict.
NOTES:
1. If the feed is a dict, the number of samples fed to
ParallelExecutor must be greater than the number of active places;
otherwise, an exception will be thrown from the C++ side. Pay special
attention to whether the last batch of the dataset is larger
than the number of active places.
2. If there is more than one active place, the fetch result for each
variable is a list, and each element of this list is the value of that
variable on the respective active place.
Examples:
.. code-block:: python
pe = fluid.ParallelExecutor(use_cuda=use_cuda,
loss_name=avg_cost.name,
main_program=fluid.default_main_program())
loss = pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name])
""" """
if feed is None and feed_dict is not None:
feed = feed_dict
@@ -241,6 +265,10 @@ class ParallelExecutor(object):
return [arr[i] for i in range(len(arr))]
def bcast_params(self):
"""
Broadcast the parameters to other devices. It is used during
distributed training.
"""
self.executor.bcast_params(set(self.persistable_vars))
@property
...