Unverified commit a7cd61fd, authored by Chen Weihang, committed by GitHub

fix DataParallel code samples, test=document_fix (#26423)

Parent: bcf03273
@@ -242,41 +242,38 @@ class DataParallel(layers.Layer):
     Examples:
         .. code-block:: python
            import numpy as np
            import paddle.fluid as fluid
-           import paddle.fluid.dygraph as dygraph
-           from paddle.fluid.optimizer import AdamOptimizer
-           from paddle.fluid.dygraph.nn import Linear
-           from paddle.fluid.dygraph.base import to_variable
-           place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-           with fluid.dygraph.guard(place=place):
+           place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
+           with fluid.dygraph.guard(place):
               # prepare the data parallel context
-              strategy=dygraph.prepare_context()
-              linear = Linear(1, 10, act="softmax")
-              adam = fluid.optimizer.AdamOptimizer()
+              strategy = fluid.dygraph.prepare_context()
+              linear = fluid.dygraph.Linear(1, 10, act="softmax")
+              adam = fluid.optimizer.AdamOptimizer(
+                  learning_rate=0.001, parameter_list=linear.parameters())
               # make the module become the data parallelism module
-              linear = dygraph.DataParallel(linear, strategy)
+              linear = fluid.dygraph.DataParallel(linear, strategy)
               x_data = np.random.random(size=[10, 1]).astype(np.float32)
-              data = to_variable(x_data)
+              data = fluid.dygraph.to_variable(x_data)
               hidden = linear(data)
               avg_loss = fluid.layers.mean(hidden)
               # scale the loss according to the number of trainers.
               avg_loss = linear.scale_loss(avg_loss)
               avg_loss.backward()
               # collect the gradients of trainers.
               linear.apply_collective_grads()
               adam.minimize(avg_loss)
               linear.clear_gradients()
     """
     def __init__(self, layers, strategy):
@@ -306,20 +303,23 @@ class DataParallel(layers.Layer):
                 import numpy as np
                 import paddle.fluid as fluid
-                import paddle.fluid.dygraph as dygraph
-                from paddle.fluid.optimizer import AdamOptimizer
-                from paddle.fluid.dygraph.nn import Linear
-                from paddle.fluid.dygraph.base import to_variable
-                place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-                with fluid.dygraph.guard(place=place):
-                    strategy=dygraph.prepare_context()
-                    linear = Linear(1, 10, act="softmax")
-                    adam = fluid.optimizer.AdamOptimizer()
-                    linear = dygraph.DataParallel(linear, strategy)
+                place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
+                with fluid.dygraph.guard(place):
+                    # prepare the data parallel context
+                    strategy = fluid.dygraph.prepare_context()
+                    linear = fluid.dygraph.Linear(1, 10, act="softmax")
+                    adam = fluid.optimizer.AdamOptimizer(
+                        learning_rate=0.001, parameter_list=linear.parameters())
+                    # make the module become the data parallelism module
+                    linear = fluid.dygraph.DataParallel(linear, strategy)
                     x_data = np.random.random(size=[10, 1]).astype(np.float32)
-                    data = to_variable(x_data)
+                    data = fluid.dygraph.to_variable(x_data)
                     hidden = linear(data)
                     avg_loss = fluid.layers.mean(hidden)
@@ -327,6 +327,8 @@ class DataParallel(layers.Layer):
                     avg_loss = linear.scale_loss(avg_loss)
                     avg_loss.backward()
+
+                    # collect the gradients of trainers.
                     linear.apply_collective_grads()
                     adam.minimize(avg_loss)
@@ -390,23 +392,29 @@ class DataParallel(layers.Layer):
                 import numpy as np
                 import paddle.fluid as fluid
-                import paddle.fluid.dygraph as dygraph
-                from paddle.fluid.optimizer import AdamOptimizer
-                from paddle.fluid.dygraph.nn import Linear
-                from paddle.fluid.dygraph.base import to_variable
-                place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-                with fluid.dygraph.guard(place=place):
-                    strategy=dygraph.prepare_context()
-                    linear = Linear(1, 10, act="softmax")
-                    adam = fluid.optimizer.AdamOptimizer()
-                    linear = dygraph.DataParallel(linear, strategy)
+                place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
+                with fluid.dygraph.guard(place):
+                    # prepare the data parallel context
+                    strategy = fluid.dygraph.prepare_context()
+                    linear = fluid.dygraph.Linear(1, 10, act="softmax")
+                    adam = fluid.optimizer.AdamOptimizer(
+                        learning_rate=0.001, parameter_list=linear.parameters())
+                    # make the module become the data parallelism module
+                    linear = fluid.dygraph.DataParallel(linear, strategy)
                     x_data = np.random.random(size=[10, 1]).astype(np.float32)
-                    data = to_variable(x_data)
+                    data = fluid.dygraph.to_variable(x_data)
                     hidden = linear(data)
                     avg_loss = fluid.layers.mean(hidden)
+                    # scale the loss according to the number of trainers.
                     avg_loss = linear.scale_loss(avg_loss)
                     avg_loss.backward()
                     # collect the gradients of trainers.
......
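For quick reference, below is the corrected class-level sample from this commit assembled into one runnable script. This is a minimal sketch built only from the added lines of the diff; it assumes a CUDA-capable machine and that the script is started as a multi-process data-parallel job (for example via paddle.distributed.launch, which the diff itself does not show).

    import numpy as np
    import paddle.fluid as fluid

    # Each trainer process uses the GPU assigned to it by the parallel environment.
    place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
    with fluid.dygraph.guard(place):

        # prepare the data parallel context
        strategy = fluid.dygraph.prepare_context()

        linear = fluid.dygraph.Linear(1, 10, act="softmax")
        adam = fluid.optimizer.AdamOptimizer(
            learning_rate=0.001, parameter_list=linear.parameters())

        # make the module become the data parallelism module
        linear = fluid.dygraph.DataParallel(linear, strategy)

        x_data = np.random.random(size=[10, 1]).astype(np.float32)
        data = fluid.dygraph.to_variable(x_data)

        hidden = linear(data)
        avg_loss = fluid.layers.mean(hidden)

        # scale the loss according to the number of trainers.
        avg_loss = linear.scale_loss(avg_loss)

        avg_loss.backward()

        # collect the gradients of trainers.
        linear.apply_collective_grads()

        adam.minimize(avg_loss)
        linear.clear_gradients()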