提交 9dd223cc 编写于 作者: D danleifeng

Add fleet support for dygraph data-parallel training in the mnist/resnet/transformer examples

上级 b87761f8
...@@ -24,6 +24,8 @@ from paddle.fluid.optimizer import AdamOptimizer ...@@ -24,6 +24,8 @@ from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from paddle.distributed import fleet
from paddle.distributed.fleet.base import role_maker
def parse_args(): def parse_args():
parser = argparse.ArgumentParser("Training for Mnist.") parser = argparse.ArgumentParser("Training for Mnist.")
...@@ -174,8 +176,11 @@ def train_mnist(args): ...@@ -174,8 +176,11 @@ def train_mnist(args):
epoch_num = args.epoch epoch_num = args.epoch
BATCH_SIZE = 64 BATCH_SIZE = 64
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ if args.use_data_parallel:
if args.use_data_parallel else fluid.CUDAPlace(0) place_idx = int(os.environ['FLAGS_selected_gpus'])
place = fluid.CUDAPlace(place_idx)
else:
place = fluid.CUDAPlace(0)
with fluid.dygraph.guard(place): with fluid.dygraph.guard(place):
if args.ce: if args.ce:
print("ce mode") print("ce mode")
...@@ -184,12 +189,15 @@ def train_mnist(args): ...@@ -184,12 +189,15 @@ def train_mnist(args):
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
mnist = MNIST() mnist = MNIST()
adam = AdamOptimizer(learning_rate=0.001, parameter_list=mnist.parameters()) adam = AdamOptimizer(learning_rate=0.001, parameter_list=mnist.parameters())
if args.use_data_parallel: if args.use_data_parallel:
mnist = fluid.dygraph.parallel.DataParallel(mnist, strategy) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
dist_strategy = fleet.DistributedStrategy()
adam = fleet.distributed_optimizer(adam, dist_strategy)
# call after distributed_optimizer so as to apply dist_strategy
mnist = fleet.build_distributed_model(mnist)
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=BATCH_SIZE, drop_last=True) paddle.dataset.mnist.train(), batch_size=BATCH_SIZE, drop_last=True)
...@@ -241,7 +249,7 @@ def train_mnist(args): ...@@ -241,7 +249,7 @@ def train_mnist(args):
save_parameters = (not args.use_data_parallel) or ( save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and args.use_data_parallel and
fluid.dygraph.parallel.Env().local_rank == 0) fleet.worker_index() == 0)
if save_parameters: if save_parameters:
fluid.save_dygraph(mnist.state_dict(), "save_temp") fluid.save_dygraph(mnist.state_dict(), "save_temp")
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import numpy as np import numpy as np
import argparse import argparse
import ast import ast
import os
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
...@@ -23,6 +24,8 @@ from paddle.fluid.dygraph.base import to_variable ...@@ -23,6 +24,8 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.fluid import framework from paddle.fluid import framework
from paddle.distributed import fleet
from paddle.distributed.fleet.base import role_maker
import math import math
import sys import sys
import time import time
...@@ -283,8 +286,11 @@ def eval(model, data): ...@@ -283,8 +286,11 @@ def eval(model, data):
def train_resnet(): def train_resnet():
epoch = args.epoch epoch = args.epoch
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ if args.use_data_parallel:
if args.use_data_parallel else fluid.CUDAPlace(0) place_idx = int(os.environ['FLAGS_selected_gpus'])
place = fluid.CUDAPlace(place_idx)
else:
place = fluid.CUDAPlace(0)
with fluid.dygraph.guard(place): with fluid.dygraph.guard(place):
if args.ce: if args.ce:
print("ce mode") print("ce mode")
...@@ -293,14 +299,16 @@ def train_resnet(): ...@@ -293,14 +299,16 @@ def train_resnet():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
resnet = ResNet() resnet = ResNet()
optimizer = optimizer_setting(parameter_list=resnet.parameters()) optimizer = optimizer_setting(parameter_list=resnet.parameters())
if args.use_data_parallel: if args.use_data_parallel:
resnet = fluid.dygraph.parallel.DataParallel(resnet, strategy) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
dist_strategy = fleet.DistributedStrategy()
optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
# call after distributed_optimizer so as to apply dist_strategy
resnet = fleet.build_distributed_model(resnet)
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size) paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size)
......
...@@ -21,6 +21,8 @@ import time ...@@ -21,6 +21,8 @@ import time
import numpy as np import numpy as np
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.distributed import fleet
from paddle.distributed.fleet.base import role_maker
from utils.configure import PDConfig from utils.configure import PDConfig
from utils.check import check_gpu, check_version from utils.check import check_gpu, check_version
...@@ -32,9 +34,9 @@ from model import Transformer, CrossEntropyCriterion, NoamDecay ...@@ -32,9 +34,9 @@ from model import Transformer, CrossEntropyCriterion, NoamDecay
def do_train(args): def do_train(args):
if args.use_cuda: if args.use_cuda:
trainer_count = fluid.dygraph.parallel.Env().nranks trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id place_idx = int(os.getenv('FLAGS_selected_gpus', 0))
) if trainer_count > 1 else fluid.CUDAPlace(0) place = fluid.CUDAPlace(place_idx)
else: else:
trainer_count = 1 trainer_count = 1
place = fluid.CPUPlace() place = fluid.CPUPlace()
...@@ -130,9 +132,12 @@ def do_train(args): ...@@ -130,9 +132,12 @@ def do_train(args):
transformer.load_dict(model_dict) transformer.load_dict(model_dict)
if trainer_count > 1: if trainer_count > 1:
strategy = fluid.dygraph.parallel.prepare_context() role = role_maker.PaddleCloudRoleMaker(is_collective=True)
transformer = fluid.dygraph.parallel.DataParallel( fleet.init(role)
transformer, strategy) dist_strategy = fleet.DistributedStrategy()
optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
# call after distributed_optimizer so as to apply dist_strategy
transformer = fleet.build_distributed_model(transformer)
# the best cross-entropy value with label smoothing # the best cross-entropy value with label smoothing
loss_normalizer = -( loss_normalizer = -(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册