Commit 863897ce authored by guosheng

Merge branch 'master' of https://github.com/PaddlePaddle/hapi into fix-data-train

@@ -16,7 +16,7 @@ import six
 import copy
 from progressbar import ProgressBar
-from distributed import get_local_rank
+from paddle.fluid.dygraph.parallel import ParallelEnv


 def config_callbacks(callbacks=None,
@@ -195,7 +195,7 @@ class ProgBarLogger(Callback):
         self.steps = self.params['steps']
         self.epoch = epoch
         self.train_step = 0
-        if self.verbose and self.epochs and get_local_rank() == 0:
+        if self.verbose and self.epochs and ParallelEnv().local_rank == 0:
             print('Epoch %d/%d' % (epoch + 1, self.epochs))
         self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose)
@@ -213,8 +213,8 @@ class ProgBarLogger(Callback):
         logs = logs or {}
         self.train_step += 1
-        if self.train_step % self.log_freq == 0 and self.verbose and get_local_rank(
-        ) == 0:
+        if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
+        ).local_rank == 0:
             # if steps is not None, last step will update in on_epoch_end
             if self.steps and self.train_step < self.steps:
                 self._updates(logs, 'train')
@@ -223,7 +223,7 @@ class ProgBarLogger(Callback):
     def on_epoch_end(self, epoch, logs=None):
         logs = logs or {}
-        if self.verbose and get_local_rank() == 0:
+        if self.verbose and ParallelEnv().local_rank == 0:
             self._updates(logs, 'train')

     def on_eval_begin(self, logs=None):
@@ -233,7 +233,7 @@ class ProgBarLogger(Callback):
         self.evaled_samples = 0
         self.eval_progbar = ProgressBar(
             num=self.eval_steps, verbose=self.verbose)
-        if get_local_rank() == 0:
+        if ParallelEnv().local_rank == 0:
             print('Eval begin...')

     def on_eval_batch_end(self, step, logs=None):
@@ -242,9 +242,15 @@ class ProgBarLogger(Callback):
         samples = logs.get('batch_size', 1)
         self.evaled_samples += samples
+        if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
+        ).local_rank == 0:
+            # if steps is not None, last step will update in on_epoch_end
+            if self.eval_steps and self.eval_step < self.eval_steps:
+                self._updates(logs, 'eval')

     def on_eval_end(self, logs=None):
         logs = logs or {}
-        if self.verbose and get_local_rank() == 0:
+        if self.verbose and ParallelEnv().local_rank == 0:
             self._updates(logs, 'eval')
             print('Eval samples: %d' % (self.evaled_samples))
@@ -258,7 +264,7 @@ class ModelCheckpoint(Callback):
         self.epoch = epoch

     def _is_save(self):
-        return self.model and self.save_dir and get_local_rank() == 0
+        return self.model and self.save_dir and ParallelEnv().local_rank == 0

     def on_epoch_end(self, epoch, logs=None):
         if self._is_save() and self.epoch % self.save_freq == 0:
...
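The callbacks above now read the trainer rank directly from Paddle's dygraph parallel environment instead of the removed `distributed.get_local_rank()` helper. A minimal sketch of that gating pattern, using only the `ParallelEnv` attributes referenced in this diff (the `log_on_rank0` helper itself is illustrative, not part of the commit):

```python
from paddle.fluid.dygraph.parallel import ParallelEnv


def log_on_rank0(message):
    # ParallelEnv reads the trainer environment set up by the distributed
    # launcher; in a single-process run local_rank defaults to 0, so the
    # message is still printed.
    if ParallelEnv().local_rank == 0:
        print(message)


log_on_rank0('Epoch 1/10')
```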
@@ -13,30 +13,20 @@
 # limitations under the License.

 import os
 import sys
+import six
 import time
 import math
 import socket
 import contextlib
-from contextlib import closing
-from six import string_types

 import numpy as np
-from collections import OrderedDict

 from paddle import fluid
-import paddle.fluid.unique_name as nameGen
-from paddle.fluid import core
-from paddle.fluid import framework
 from paddle.fluid.layers import collective
-from paddle.fluid.dygraph import to_variable, no_grad, layers
-from paddle.fluid.framework import Variable
-from paddle.fluid.executor import global_scope
-from paddle.fluid.dygraph.parallel import Env, DataParallel, ParallelStrategy
-from paddle.fluid.layers.collective import _c_allreduce, _c_allgather, _c_broadcast, \
-    _c_sync_comm_stream, _c_sync_calc_stream
-from paddle.fluid.io import BatchSampler, DataLoader
+from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
+from paddle.fluid.io import BatchSampler

-__parallel_context_init = False
+_parallel_context_initialized = False

 class DistributedBatchSampler(BatchSampler):
     """Sampler that restricts data loading to a subset of the dataset.
@@ -71,11 +61,13 @@ class DistributedBatchSampler(BatchSampler):
         self.shuffle = shuffle
         assert isinstance(drop_last, bool), \
             "drop_last should be a boolean number"

         self.drop_last = drop_last
-        self.nranks = get_nranks()
-        self.local_rank = get_local_rank()
+        self.nranks = ParallelEnv().nranks
+        self.local_rank = ParallelEnv().local_rank
         self.epoch = 0
-        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
+        self.num_samples = int(
+            math.ceil(len(self.dataset) * 1.0 / self.nranks))
         self.total_size = self.num_samples * self.nranks

     def __iter__(self):
@@ -86,9 +78,28 @@ class DistributedBatchSampler(BatchSampler):
         if self.shuffle:
             np.random.RandomState(self.epoch).shuffle(indices)
             self.epoch += 1

         # subsample
-        indices = indices[self.local_rank * self.num_samples:
-                          (self.local_rank + 1) * self.num_samples]
+        def _get_indices_by_batch_size(indices):
+            subsampled_indices = []
+            last_batch_size = self.total_size % (self.batch_size * self.nranks)
+            assert last_batch_size % self.nranks == 0
+            last_local_batch_size = last_batch_size // self.nranks
+
+            for i in range(self.local_rank * self.batch_size,
+                           len(indices) - last_batch_size,
+                           self.batch_size * self.nranks):
+                subsampled_indices.extend(indices[i:i + self.batch_size])
+
+            indices = indices[len(indices) - last_batch_size:]
+            subsampled_indices.extend(indices[
+                self.local_rank * last_local_batch_size:(
+                    self.local_rank + 1) * last_local_batch_size])
+            return subsampled_indices
+
+        if self.nranks > 1:
+            indices = _get_indices_by_batch_size(indices)
+
         assert len(indices) == self.num_samples
         _sample_iter = iter(indices)
@@ -106,46 +117,37 @@ class DistributedBatchSampler(BatchSampler):
         num_samples += int(not self.drop_last) * (self.batch_size - 1)
         return num_samples // self.batch_size

+    def set_epoch(self, epoch):
+        self.epoch = epoch
+

 def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
-    return _c_allgather(x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)
-
-
-def get_local_rank():
-    return Env().local_rank
-
-
-def get_nranks():
-    return Env().nranks
+    return collective._c_allgather(
+        x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)


 def wait_server_ready(endpoints):
-    assert not isinstance(endpoints, string_types)
+    assert not isinstance(endpoints, six.string_types)
     while True:
         all_ok = True
         not_ready_endpoints = []
         for ep in endpoints:
             ip_port = ep.split(":")
-            with closing(
-                    socket.socket(socket.AF_INET,
-                                  socket.SOCK_STREAM)) as sock:
+            with contextlib.closing(
+                    socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
                 sock.settimeout(2)
                 result = sock.connect_ex((ip_port[0], int(ip_port[1])))
                 if result != 0:
                     all_ok = False
                     not_ready_endpoints.append(ep)
         if not all_ok:
-            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
-            sys.stderr.write("not ready endpoints:" + str(
-                not_ready_endpoints) + "\n")
-            sys.stderr.flush()
             time.sleep(3)
         else:
             break


-def init_communicator(program, rank, nranks, wait_port,
-                      current_endpoint, endpoints):
+def init_communicator(program, rank, nranks, wait_port, current_endpoint,
+                      endpoints):
     if nranks < 2:
         return
     other_endpoints = endpoints[:]
@@ -154,9 +156,9 @@ def init_communicator(program, rank, nranks, wait_port,
         wait_server_ready(other_endpoints)

     block = program.global_block()
     nccl_id_var = block.create_var(
-        name=nameGen.generate('nccl_id'),
+        name=fluid.unique_name.generate('nccl_id'),
         persistable=True,
-        type=core.VarDesc.VarType.RAW)
+        type=fluid.core.VarDesc.VarType.RAW)
     block.append_op(
         type='c_gen_nccl_id',
@@ -181,25 +183,28 @@ def init_communicator(program, rank, nranks, wait_port,


 def prepare_distributed_context(place=None):
     if place is None:
-        place = fluid.CUDAPlace(Env().dev_id) if Env().nranks > 1 \
+        place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \
             else fluid.CUDAPlace(0)

     strategy = ParallelStrategy()
-    strategy.nranks = Env().nranks
-    strategy.local_rank = Env().local_rank
-    strategy.trainer_endpoints = Env().trainer_endpoints
-    strategy.current_endpoint = Env().current_endpoint
+    strategy.nranks = ParallelEnv().nranks
+    strategy.local_rank = ParallelEnv().local_rank
+    strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
+    strategy.current_endpoint = ParallelEnv().current_endpoint

     if strategy.nranks < 2:
         return

-    global __parallel_context_init
-    if not __parallel_context_init and isinstance(place, core.CUDAPlace):
+    global _parallel_context_initialized
+    if not _parallel_context_initialized and isinstance(place,
+                                                        fluid.CUDAPlace):

         def _init_context():
-            communicator_prog = framework.Program()
-            init_communicator(communicator_prog, strategy.local_rank, strategy.nranks,
-                              True, strategy.current_endpoint, strategy.trainer_endpoints)
+            communicator_prog = fluid.Program()
+            init_communicator(communicator_prog, strategy.local_rank,
+                              strategy.nranks, True, strategy.current_endpoint,
+                              strategy.trainer_endpoints)
             exe = fluid.Executor(place)
             exe.run(communicator_prog)

@@ -213,57 +218,5 @@ def prepare_distributed_context(place=None):
     else:
         assert ("Only support CUDAPlace for now.")

-    __parallel_context_init = True
+    _parallel_context_initialized = True

     return strategy
-
-
-class DistributedDataParallel(DataParallel):
-    def __init__(self, layers, strategy=None):
-        if strategy is None:
-            strategy = ParallelStrategy()
-            strategy.nranks = Env().nranks
-            strategy.local_rank = Env().local_rank
-            strategy.trainer_endpoints = Env().trainer_endpoints
-            strategy.current_endpoint = Env().current_endpoint
-
-        super(DistributedDataParallel, self).__init__(layers, strategy)
-
-    @no_grad
-    def apply_collective_grads(self):
-        """
-        AllReduce the Parameters' gradient.
-        """
-        if not self._is_data_parallel_mode():
-            return
-
-        grad_var_set = set()
-        grad_vars = []
-        for param in self._layers.parameters():
-            # NOTE(zcd): The grad_ivar maybe no generated.
-            if param.trainable and param._grad_ivar():
-                g_var = param._grad_ivar()
-                grad_vars.append(g_var)
-                assert g_var not in grad_var_set
-                grad_var_set.add(g_var)
-
-        mega_bytes = 128 * 1024 * 1024
-        group_idx = 0
-        memory_counter = 0
-        grad_var_groups = OrderedDict()
-        dtype = grad_vars[0].dtype
-        for g_var in grad_vars:
-            # Note: the dtype of the same group should be the same.
-            bytes = np.prod(g_var.shape) * core.size_of_dtype(g_var.dtype)
-            if memory_counter < mega_bytes and dtype == g_var.dtype:
-                memory_counter += bytes
-            else:
-                memory_counter = bytes
-                group_idx += 1
-            grad_var_groups.setdefault(group_idx, []).append(g_var)
-
-        coalesced_grads_and_vars = self._coalesce_tensors(grad_var_groups)
-
-        for coalesced_grad, _, _ in coalesced_grads_and_vars:
-            collective._c_allreduce(coalesced_grad, coalesced_grad, use_calc_stream=True)
-
-        self._split_tensors(coalesced_grads_and_vars)
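The rewritten DistributedBatchSampler.__iter__ above no longer hands each rank one contiguous chunk of the epoch; it interleaves whole batches across ranks and splits the short tail batch evenly. A standalone sketch of that subsampling arithmetic, mirroring _get_indices_by_batch_size (the function name and worked example below are illustrative, not from the commit):

```python
def subsample(indices, local_rank, nranks, batch_size):
    # Tail that does not fill a full batch on every rank.
    last_batch_size = len(indices) % (batch_size * nranks)
    assert last_batch_size % nranks == 0
    last_local_batch_size = last_batch_size // nranks

    picked = []
    # Each rank takes its own batch_size-wide slice out of every
    # batch_size * nranks block of indices.
    for i in range(local_rank * batch_size,
                   len(indices) - last_batch_size,
                   batch_size * nranks):
        picked.extend(indices[i:i + batch_size])

    # The remaining tail is split evenly, one short slice per rank.
    tail = indices[len(indices) - last_batch_size:]
    picked.extend(tail[local_rank * last_local_batch_size:
                       (local_rank + 1) * last_local_batch_size])
    return picked


# With 10 indices, 2 ranks and batch_size=2:
#   rank 0 -> [0, 1, 4, 5, 8]
#   rank 1 -> [2, 3, 6, 7, 9]
print(subsample(list(range(10)), 0, 2, 2))
print(subsample(list(range(10)), 1, 2, 2))
```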
@@ -26,7 +26,7 @@ from paddle.fluid.optimizer import Momentum
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
 from paddle.fluid.io import MNIST as MnistDataset

-from model import Model, CrossEntropy, Input, init_context
+from model import Model, CrossEntropy, Input, set_device
 from metrics import Accuracy
@@ -106,7 +106,8 @@ class MNIST(Model):

 def main():
-    init_context('dynamic' if FLAGS.dynamic else 'static')
+    device = set_device(FLAGS.device)
+    fluid.enable_dygraph(device) if FLAGS.dynamic else None

     train_dataset = MnistDataset(mode='train')
     val_dataset = MnistDataset(mode='test')
@@ -118,7 +119,13 @@ def main():
     optim = Momentum(
         learning_rate=FLAGS.lr, momentum=.9, parameter_list=model.parameters())

-    model.prepare(optim, CrossEntropy(), Accuracy(topk=(1, 2)), inputs, labels)
+    model.prepare(
+        optim,
+        CrossEntropy(),
+        Accuracy(topk=(1, 2)),
+        inputs,
+        labels,
+        device=FLAGS.device)

     if FLAGS.resume is not None:
         model.load(FLAGS.resume)
@@ -131,6 +138,8 @@ def main():

 if __name__ == '__main__':
     parser = argparse.ArgumentParser("CNN training on MNIST")
+    parser.add_argument(
+        "--device", type=str, default='gpu', help="device to use, gpu or cpu")
     parser.add_argument(
         "-d", "--dynamic", action='store_true', help="enable dygraph mode")
     parser.add_argument(
...
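In the training script the old init_context('dynamic'/'static') call is replaced by an explicit device choice plus an optional switch into dygraph mode. A rough sketch of the new flow under the same assumptions the script makes (set_device is the hapi helper imported in the diff above and is expected to map 'gpu'/'cpu' to a fluid place; the FLAGS stand-ins below are illustrative):

```python
from paddle import fluid
from model import set_device  # hapi helper, as imported in the diff above

device_flag = 'gpu'    # stand-in for FLAGS.device
dynamic_flag = True    # stand-in for FLAGS.dynamic

device = set_device(device_flag)
# Dygraph (imperative) mode is only enabled when --dynamic is passed;
# otherwise the program keeps building a static graph.
if dynamic_flag:
    fluid.enable_dygraph(device)
```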
This diff is collapsed.
@@ -28,7 +28,7 @@ import contextlib
 import paddle
 from paddle import fluid
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
-from model import Model, CrossEntropy, Input, Loss, init_context
+from model import Model, CrossEntropy, Input, Loss, set_device
 from metrics import Accuracy
 from callbacks import ProgBarLogger
 from paddle.fluid.io import BatchSampler, DataLoader
@@ -139,9 +139,30 @@ class MyCrossEntropy(Loss):
         return [loss1, loss2]


+class TestMnistDataset(MnistDataset):
+    def __init__(self):
+        super(TestMnistDataset, self).__init__(mode='test')
+
+    def __getitem__(self, idx):
+        return self.images[idx],
+
+    def __len__(self):
+        return len(self.images)
+
+
+def get_predict_accuracy(pred, gt):
+    pred = np.argmax(pred, -1)
+    gt = np.array(gt)
+
+    correct = pred[:, np.newaxis] == gt
+    return np.sum(correct) / correct.shape[0]
+
+
 class TestModel(unittest.TestCase):
     def fit(self, dynamic, is_mlp=False):
-        init_context('dynamic' if dynamic else 'static')
+        device = set_device('gpu')
+        fluid.enable_dygraph(device) if dynamic else None
         im_shape = (-1, 784)
         batch_size = 128
@@ -151,19 +172,31 @@ class TestModel(unittest.TestCase):
         train_dataset = MnistDataset(mode='train')
         val_dataset = MnistDataset(mode='test')
+        test_dataset = TestMnistDataset()

         model = MNIST() if not is_mlp else MLP()
         optim = fluid.optimizer.Momentum(
             learning_rate=0.01, momentum=.9, parameter_list=model.parameters())
         loss = CrossEntropy() if not is_mlp else MyCrossEntropy()
-        model.prepare(optim, loss, Accuracy(), inputs, labels)
+        model.prepare(optim, loss, Accuracy(), inputs, labels, device=device)
         cbk = ProgBarLogger(50)

         model.fit(train_dataset,
                   val_dataset,
                   epochs=2,
                   batch_size=batch_size,
                   callbacks=cbk)

+        eval_result = model.evaluate(val_dataset, batch_size=batch_size)
+
+        output = model.predict(test_dataset, batch_size=batch_size)
+        np.testing.assert_equal(output[0].shape[0], len(test_dataset))
+
+        acc = get_predict_accuracy(output[0], val_dataset.labels)
+        np.testing.assert_allclose(acc, eval_result['acc'])
+
     def test_fit_static(self):
         self.fit(False)
...
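The new test asserts that predicting on an unlabeled copy of the MNIST test split reproduces the accuracy reported by model.evaluate. The get_predict_accuracy helper added above takes an argmax over the class axis and compares it with the labels; a quick illustrative check (the sample arrays are made up, and the labels are assumed to arrive as a column of shape (N, 1), which is what the pred[:, np.newaxis] comparison implies):

```python
import numpy as np


def get_predict_accuracy(pred, gt):
    # Same logic as in the test: predicted class = argmax over the last axis.
    pred = np.argmax(pred, -1)
    gt = np.array(gt)
    correct = pred[:, np.newaxis] == gt
    return np.sum(correct) / correct.shape[0]


pred = np.array([[0.1, 0.9],   # predicted class 1
                 [0.8, 0.2],   # predicted class 0
                 [0.3, 0.7]])  # predicted class 1
gt = [[1], [0], [0]]           # ground truth as a column, shape (3, 1)
print(get_predict_accuracy(pred, gt))  # 2 of 3 correct -> ~0.667
```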