Unverified commit 3dfaf44a authored by Chen Weihang, committed by GitHub

Rename dygraph parallel env & add doc (#22925)

* add dygraph parallel env doc, test=develop

* polish details, test=develop, test=document_fix

* fix examples error in other apis, test=develop

* fix more example error in other api, test=develop

* add white list for gpu examples, test=develop, test=document_fix
Parent 08e80d17
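For orientation, here is a minimal sketch of the renamed user-facing API touched by this diff, pieced together from the docstring examples below; it assumes the script is started with `python -m paddle.distributed.launch --selected_gpus=0,1` so that more than one trainer exists:

import paddle.fluid as fluid

# ParallelEnv (formerly Env) exposes the environment variables set by the launcher.
env = fluid.dygraph.ParallelEnv()
place = fluid.CUDAPlace(env.dev_id)
with fluid.dygraph.guard(place=place):
    # prepare_context is referred to as fluid.dygraph.prepare_context in the updated docs.
    strategy = fluid.dygraph.prepare_context()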
@@ -22,7 +22,7 @@ from .. import framework
from ..layers import collective
from . import to_variable, no_grad
-__all__ = ["prepare_context"]
+__all__ = ["prepare_context", "ParallelEnv", "DataParallel"]
ParallelStrategy = core.ParallelStrategy
@@ -37,10 +37,10 @@ def prepare_context(strategy=None):
if strategy.nranks < 2:
return
assert framework.in_dygraph_mode() is True, \
-"dygraph.parallel.prepare_context should be used with dygrahp mode."
+"dygraph.prepare_context should be used with dygraph mode."
place = framework._current_expected_place()
assert place is not None, \
-"dygraph.parallel.prepare_context should be used in fluid.dygraph.guard(place) guard."
+"dygraph.prepare_context should be used in fluid.dygraph.guard(place) guard."
if isinstance(place, core.CUDAPlace):
parallel_helper._set_parallel_ctx(
core.NCCLParallelContext(strategy, place))
@@ -51,7 +51,64 @@ def prepare_context(strategy=None):
return strategy
-class Env(object):
+class ParallelEnv(object):
"""
**Notes**:
**The old class name was Env and it will be deprecated. Please use the new class name ParallelEnv.**
This class is used to obtain the environment variables required for
the parallel execution of dynamic graph models.
The dynamic graph parallel mode needs to be started using paddle.distributed.launch.
By default, the related environment variables are automatically configured by this module.
This class is generally used together with `fluid.dygraph.DataParallel` to configure dynamic graph models
to run in parallel.
Examples:
.. code-block:: python
# This example needs to be run with paddle.distributed.launch. The usage is:
# python -m paddle.distributed.launch --selected_gpus=0,1 example.py
# And the content of `example.py` is the code of the following example.
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.base import to_variable
place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
with fluid.dygraph.guard(place=place):
# prepare the data parallel context
strategy=dygraph.prepare_context()
linear = Linear(1, 10, act="softmax")
adam = fluid.optimizer.AdamOptimizer()
# make the module become the data parallelism module
linear = dygraph.DataParallel(linear, strategy)
x_data = np.random.random(size=[10, 1]).astype(np.float32)
data = to_variable(x_data)
hidden = linear(data)
avg_loss = fluid.layers.mean(hidden)
# scale the loss according to the number of trainers.
avg_loss = linear.scale_loss(avg_loss)
avg_loss.backward()
# collect the gradients of trainers.
linear.apply_collective_grads()
adam.minimize(avg_loss)
linear.clear_gradients()
"""
def __init__(self):
self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
@@ -62,34 +119,124 @@ class Env(object):
@property
def nranks(self):
"""
The number of trainers, which generally refers to the number of GPU cards used in training.
Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1.
Examples:
.. code-block:: python
# execute this command in terminal: export PADDLE_TRAINERS_NUM=4
import paddle.fluid as fluid
env = fluid.dygraph.ParallelEnv()
print("The nranks is %d" % env.nranks)
# The nranks is 4
"""
return self._nranks
@property
def local_rank(self):
"""
The rank of the current trainer.
Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0.
Examples:
.. code-block:: python
# execute this command in terminal: export PADDLE_TRAINER_ID=0
import paddle.fluid as fluid
env = fluid.dygraph.ParallelEnv()
print("The local rank is %d" % env.local_rank)
# The local rank is 0
"""
return self._local_rank
@property
def dev_id(self):
"""
The ID of the GPU card selected for parallel training.
Its value is equal to the value of the environment variable FLAGS_selected_gpus. The default value is 0.
Examples:
.. code-block:: python
# execute this command in terminal: export FLAGS_selected_gpus=1
import paddle.fluid as fluid
env = fluid.dygraph.ParallelEnv()
print("The device id are %d" % env.dev_id)
# The device id are 1
"""
return self._dev_id
@property
def current_endpoint(self):
"""
The endpoint of the current trainer, in the form of node IP + port.
Its value is equal to the value of the environment variable PADDLE_CURRENT_ENDPOINT. The default value is "".
Examples:
.. code-block:: python
# execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170
import paddle.fluid as fluid
env = fluid.dygraph.ParallelEnv()
print("The current endpoint are %s" % env.current_endpoint)
# The current endpoint are 127.0.0.1:6170
"""
return self._current_endpoint
@property
def trainer_endpoints(self):
"""
The endpoints of all trainer nodes in the task,
which are used to broadcast the NCCL ID when NCCL2 is initialized.
Its value is equal to the value of the environment variable PADDLE_TRAINER_ENDPOINTS. The default value is "".
Examples:
.. code-block:: python
# execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171
import paddle.fluid as fluid
env = fluid.dygraph.ParallelEnv()
print("The trainer endpoints are %s" % env.trainer_endpoints)
# The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171']
"""
return self._trainer_endpoints
# NOTE: [ Compatible ] The original name of this class was `Env`. The semantics of the old class
# name are inaccurate and may confuse users, so it is replaced with `ParallelEnv`; however, to stay
# compatible with old examples, the old name still needs to be kept here.
Env = ParallelEnv
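A minimal sketch of what the compatibility alias above gives existing code, assuming paddle.fluid is importable and the distributed environment variables are unset:

from paddle.fluid.dygraph import parallel

# The old name still resolves to the renamed class, so code written against Env keeps working.
assert parallel.Env is parallel.ParallelEnv
print(parallel.Env().nranks)  # 1 when PADDLE_TRAINERS_NUM is not set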
class DataParallel(layers.Layer):
"""
-Runs the module with data parallelism.
+Run the dygraph module with data parallelism.
-Currently, DataParallel only supports to run the dynamic graph
+Currently, the DataParallel class only supports running the dynamic graph
with multi-process. The usage is:
-`python -m paddle.distributed.launch --gpus 2 dynamic_graph_test.py`.
+`python -m paddle.distributed.launch --selected_gpus=0,1 dynamic_graph_test.py`.
And the content of `dynamic_graph_test.py` is the code of examples.
Args:
layers(Layer): The module that should be executed in data parallel mode.
strategy(ParallelStrategy): The strategy of data parallelism, which contains
environment configuration related to parallel execution.
Returns:
Layer: The data-parallel module.
Examples:
.. code-block:: python
@@ -100,17 +247,17 @@ class DataParallel(layers.Layer):
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.base import to_variable
-place = fluid.CUDAPlace(0)
+place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
with fluid.dygraph.guard(place=place):
# prepare the data parallel context
-strategy=dygraph.parallel.prepare_context()
+strategy=dygraph.prepare_context()
linear = Linear(1, 10, act="softmax")
adam = fluid.optimizer.AdamOptimizer()
# make the module become the data parallelism module
-linear = dygraph.parallel.DataParallel(linear, strategy)
+linear = dygraph.DataParallel(linear, strategy)
x_data = np.random.random(size=[10, 1]).astype(np.float32)
data = to_variable(x_data)
@@ -128,13 +275,6 @@ class DataParallel(layers.Layer):
adam.minimize(avg_loss)
linear.clear_gradients()
-Args:
-layers(Layer): The module that should be executed by data parallel.
-strategy(ParallelStrategy): The strategy of data parallelism.
-Returns:
-Layer: The data paralleled module.
""" """
def __init__(self, layers, strategy): def __init__(self, layers, strategy):
...@@ -154,10 +294,41 @@ class DataParallel(layers.Layer): ...@@ -154,10 +294,41 @@ class DataParallel(layers.Layer):
directly. directly.
Args: Args:
loss(Layer): The loss of the current Model. loss(Variable): The loss of the current Model.
Returns: Returns:
Layer: the scaled loss. Variable: the scaled loss.
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.base import to_variable
place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
with fluid.dygraph.guard(place=place):
strategy=dygraph.prepare_context()
linear = Linear(1, 10, act="softmax")
adam = fluid.optimizer.AdamOptimizer()
linear = dygraph.DataParallel(linear, strategy)
x_data = np.random.random(size=[10, 1]).astype(np.float32)
data = to_variable(x_data)
hidden = linear(data)
avg_loss = fluid.layers.mean(hidden)
# scale the loss according to the number of trainers.
avg_loss = linear.scale_loss(avg_loss)
avg_loss.backward()
linear.apply_collective_grads()
adam.minimize(avg_loss)
linear.clear_gradients()
""" """
if not self._is_data_parallel_mode(): if not self._is_data_parallel_mode():
return loss return loss
...@@ -211,6 +382,36 @@ class DataParallel(layers.Layer): ...@@ -211,6 +382,36 @@ class DataParallel(layers.Layer):
def apply_collective_grads(self): def apply_collective_grads(self):
""" """
AllReduce the Parameters' gradient. AllReduce the Parameters' gradient.
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.base import to_variable
place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
with fluid.dygraph.guard(place=place):
strategy=dygraph.prepare_context()
linear = Linear(1, 10, act="softmax")
adam = fluid.optimizer.AdamOptimizer()
linear = dygraph.DataParallel(linear, strategy)
x_data = np.random.random(size=[10, 1]).astype(np.float32)
data = to_variable(x_data)
hidden = linear(data)
avg_loss = fluid.layers.mean(hidden)
avg_loss = linear.scale_loss(avg_loss)
avg_loss.backward()
# collect the gradients of trainers.
linear.apply_collective_grads()
adam.minimize(avg_loss)
linear.clear_gradients()
""" """
if not self._is_data_parallel_mode(): if not self._is_data_parallel_mode():
return return
...@@ -276,9 +477,9 @@ class DataParallel(layers.Layer): ...@@ -276,9 +477,9 @@ class DataParallel(layers.Layer):
import paddle.fluid as fluid import paddle.fluid as fluid
with fluid.dygraph.guard(): with fluid.dygraph.guard():
strategy=dygraph.parallel.prepare_context() strategy=fluid.dygraph.prepare_context()
emb = fluid.dygraph.Embedding([10, 10]) emb = fluid.dygraph.Embedding([10, 10])
emb = dygraph.parallel.DataParallel(emb, strategy) emb = fluid.dygraph.DataParallel(emb, strategy)
state_dict = emb.state_dict() state_dict = emb.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy") fluid.save_dygraph( state_dict, "paddle_dy")
...@@ -310,9 +511,9 @@ class DataParallel(layers.Layer): ...@@ -310,9 +511,9 @@ class DataParallel(layers.Layer):
import paddle.fluid as fluid import paddle.fluid as fluid
with fluid.dygraph.guard(): with fluid.dygraph.guard():
strategy=dygraph.parallel.prepare_context() strategy=fluid.dygraph.prepare_context()
emb = fluid.dygraph.Embedding([10, 10]) emb = fluid.dygraph.Embedding([10, 10])
emb = dygraph.parallel.DataParallel(emb, strategy) emb = fluid.dygraph.DataParallel(emb, strategy)
state_dict = emb.state_dict() state_dict = emb.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy") fluid.save_dygraph( state_dict, "paddle_dy")
...@@ -350,9 +551,9 @@ class DataParallel(layers.Layer): ...@@ -350,9 +551,9 @@ class DataParallel(layers.Layer):
import paddle.fluid as fluid import paddle.fluid as fluid
with fluid.dygraph.guard(): with fluid.dygraph.guard():
strategy=dygraph.parallel.prepare_context() strategy=fluid.dygraph.prepare_context()
emb = fluid.dygraph.Embedding([10, 10]) emb = fluid.dygraph.Embedding([10, 10])
emb = dygraph.parallel.DataParallel(emb, strategy) emb = fluid.dygraph.DataParallel(emb, strategy)
state_dict = emb.state_dict() state_dict = emb.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy") fluid.save_dygraph( state_dict, "paddle_dy")
...
@@ -310,6 +310,10 @@
"Recall.eval",
"FC.forward",
"While.block",
-"DGCMomentumOptimizer"
+"DGCMomentumOptimizer",
"ParallelEnv",
"DataParallel",
"DataParallel.scale_loss",
"DataParallel.apply_collective_grads"
]
}
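The four entries added above white-list APIs whose docstring examples need a GPU (the commit message calls this "add white list for gpu examples"). Below is a hypothetical sketch of how a sample-code checker could use such a list to skip those examples on CPU-only builds; the list literal, function name, and skipping policy are illustrative assumptions, not Paddle's actual checker:

import paddle.fluid as fluid

# Entries copied from the diff above; the surrounding JSON structure is not shown here.
GPU_EXAMPLE_WHITE_LIST = {
    "ParallelEnv",
    "DataParallel",
    "DataParallel.scale_loss",
    "DataParallel.apply_collective_grads",
}

def should_run_example(api_name):
    # Skip GPU-only examples when this build has no CUDA support.
    if api_name in GPU_EXAMPLE_WHITE_LIST and not fluid.is_compiled_with_cuda():
        return False
    return True

print(should_run_example("DataParallel.scale_loss"))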