提交 666339f5 编写于 作者: G guosheng

Refine transformer batch length and move transformer to examples.

Refine len for data_loader in model.py.
上级 ecb6d64c
...@@ -201,7 +201,7 @@ python -u predict.py \ ...@@ -201,7 +201,7 @@ python -u predict.py \
--special_token '<s>' '<e>' '<unk>' \ --special_token '<s>' '<e>' '<unk>' \
--predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
--batch_size 32 \ --batch_size 32 \
--init_from_params base_model_dygraph/step_100000/transformer \ --init_from_params big_model_dygraph/step_100000/transformer \
--beam_size 5 \ --beam_size 5 \
--max_out_len 255 \ --max_out_len 255 \
--output_file predict.txt \ --output_file predict.txt \
......
...@@ -14,9 +14,6 @@ ...@@ -14,9 +14,6 @@
import logging import logging
import os import os
import six
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from functools import partial from functools import partial
import numpy as np import numpy as np
...@@ -28,9 +25,9 @@ from paddle.fluid.layers.utils import flatten ...@@ -28,9 +25,9 @@ from paddle.fluid.layers.utils import flatten
from utils.configure import PDConfig from utils.configure import PDConfig
from utils.check import check_gpu, check_version from utils.check import check_gpu, check_version
from model import Input, set_device from hapi.model import Input, set_device
from reader import prepare_infer_input, Seq2SeqDataset, Seq2SeqBatchSampler from reader import prepare_infer_input, Seq2SeqDataset, Seq2SeqBatchSampler
from transformer import InferTransformer, position_encoding_init from transformer import InferTransformer
def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, def post_process_seq(seq, bos_idx, eos_idx, output_bos=False,
...@@ -132,7 +129,7 @@ def do_predict(args): ...@@ -132,7 +129,7 @@ def do_predict(args):
# TODO: use model.predict when support variant length # TODO: use model.predict when support variant length
f = open(args.output_file, "wb") f = open(args.output_file, "wb")
for data in data_loader(): for data in data_loader():
finished_seq = transformer.test(inputs=flatten(data))[0] finished_seq = transformer.test_batch(inputs=flatten(data))[0]
finished_seq = np.transpose(finished_seq, [0, 2, 1]) finished_seq = np.transpose(finished_seq, [0, 2, 1])
for ins in finished_seq: for ins in finished_seq:
for beam_idx, beam in enumerate(ins): for beam_idx, beam in enumerate(ins):
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
import glob import glob
import six import sys
import os import os
import io import io
import itertools import itertools
...@@ -26,7 +26,7 @@ from paddle.io import BatchSampler, DataLoader, Dataset ...@@ -26,7 +26,7 @@ from paddle.io import BatchSampler, DataLoader, Dataset
def create_data_loader(args, device): def create_data_loader(args, device):
data_loaders = [None, None] data_loaders = [(None, None)] * 2
data_files = [args.training_file, args.validation_file data_files = [args.training_file, args.validation_file
] if args.validation_file else [args.training_file] ] if args.validation_file else [args.training_file]
for i, data_file in enumerate(data_files): for i, data_file in enumerate(data_files):
...@@ -65,7 +65,7 @@ def create_data_loader(args, device): ...@@ -65,7 +65,7 @@ def create_data_loader(args, device):
n_head=args.n_head), n_head=args.n_head),
num_workers=0, # TODO: use multi-process num_workers=0, # TODO: use multi-process
return_list=True) return_list=True)
data_loaders[i] = data_loader data_loaders[i] = (data_loader, batch_sampler.__len__)
return data_loaders return data_loaders
...@@ -476,6 +476,7 @@ class Seq2SeqBatchSampler(BatchSampler): ...@@ -476,6 +476,7 @@ class Seq2SeqBatchSampler(BatchSampler):
for i in range(self._nranks) for i in range(self._nranks)
] for batch in batches] ] for batch in batches]
batches = list(itertools.chain.from_iterable(batches)) batches = list(itertools.chain.from_iterable(batches))
self.batch_number = (len(batches) + self._nranks - 1) // self._nranks
# for multi-device # for multi-device
for batch_id, batch in enumerate(batches): for batch_id, batch in enumerate(batches):
...@@ -489,11 +490,13 @@ class Seq2SeqBatchSampler(BatchSampler): ...@@ -489,11 +490,13 @@ class Seq2SeqBatchSampler(BatchSampler):
yield batch_indices yield batch_indices
def __len__(self): def __len__(self):
if hasattr(self, "batch_number"): #
return self.batch_number
if not self._use_token_batch: if not self._use_token_batch:
batch_number = ( batch_number = (
len(self._dataset) + self._batch_size * self._nranks - 1) // ( len(self._dataset) + self._batch_size * self._nranks - 1) // (
self._batch_size * self._nranks) self._batch_size * self._nranks)
else: else:
# TODO(guosheng): fix the uncertain length # for uncertain batch number, the actual value is self.batch_number
batch_number = 1 batch_number = sys.maxsize
return batch_number return batch_number
...@@ -14,9 +14,6 @@ ...@@ -14,9 +14,6 @@
import logging import logging
import os import os
import six
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import numpy as np import numpy as np
import paddle import paddle
...@@ -26,14 +23,18 @@ from paddle.io import DataLoader ...@@ -26,14 +23,18 @@ from paddle.io import DataLoader
from utils.configure import PDConfig from utils.configure import PDConfig
from utils.check import check_gpu, check_version from utils.check import check_gpu, check_version
from model import Input, set_device from hapi.model import Input, set_device
from callbacks import ProgBarLogger from hapi.callbacks import ProgBarLogger
from reader import create_data_loader from reader import create_data_loader
from transformer import Transformer, CrossEntropyCriterion from transformer import Transformer, CrossEntropyCriterion
class TrainCallback(ProgBarLogger): class TrainCallback(ProgBarLogger):
def __init__(self, args, verbose=2): def __init__(self,
args,
verbose=2,
train_steps_fn=None,
eval_steps_fn=None):
# TODO(guosheng): save according to step # TODO(guosheng): save according to step
super(TrainCallback, self).__init__(args.print_step, verbose) super(TrainCallback, self).__init__(args.print_step, verbose)
# the best cross-entropy value with label smoothing # the best cross-entropy value with label smoothing
...@@ -42,11 +43,17 @@ class TrainCallback(ProgBarLogger): ...@@ -42,11 +43,17 @@ class TrainCallback(ProgBarLogger):
(1. - args.label_smooth_eps)) + args.label_smooth_eps * (1. - args.label_smooth_eps)) + args.label_smooth_eps *
np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
self.loss_normalizer = loss_normalizer self.loss_normalizer = loss_normalizer
self.train_steps_fn = train_steps_fn
self.eval_steps_fn = eval_steps_fn
def on_train_begin(self, logs=None): def on_train_begin(self, logs=None):
super(TrainCallback, self).on_train_begin(logs) super(TrainCallback, self).on_train_begin(logs)
self.train_metrics += ["normalized loss", "ppl"] self.train_metrics += ["normalized loss", "ppl"]
def on_train_batch_begin(self, step, logs=None):
if step == 0 and self.train_steps_fn:
self.train_progbar._num = self.train_steps_fn()
def on_train_batch_end(self, step, logs=None): def on_train_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100)) logs["ppl"] = np.exp(min(logs["loss"][0], 100))
...@@ -57,6 +64,10 @@ class TrainCallback(ProgBarLogger): ...@@ -57,6 +64,10 @@ class TrainCallback(ProgBarLogger):
self.eval_metrics = list( self.eval_metrics = list(
self.eval_metrics) + ["normalized loss", "ppl"] self.eval_metrics) + ["normalized loss", "ppl"]
def on_eval_batch_begin(self, step, logs=None):
if step == 0 and self.eval_steps_fn:
self.eval_progbar._num = self.eval_steps_fn()
def on_eval_batch_end(self, step, logs=None): def on_eval_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100)) logs["ppl"] = np.exp(min(logs["loss"][0], 100))
...@@ -104,7 +115,8 @@ def do_train(args): ...@@ -104,7 +115,8 @@ def do_train(args):
] ]
# def dataloader # def dataloader
train_loader, eval_loader = create_data_loader(args, device) (train_loader, train_steps_fn), (
eval_loader, eval_steps_fn) = create_data_loader(args, device)
# define model # define model
transformer = Transformer( transformer = Transformer(
...@@ -142,7 +154,12 @@ def do_train(args): ...@@ -142,7 +154,12 @@ def do_train(args):
eval_freq=1, eval_freq=1,
save_freq=1, save_freq=1,
save_dir=args.save_model, save_dir=args.save_model,
callbacks=[TrainCallback(args)]) callbacks=[
TrainCallback(
args,
train_steps_fn=train_steps_fn,
eval_steps_fn=eval_steps_fn)
])
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -20,8 +20,8 @@ import paddle.fluid as fluid ...@@ -20,8 +20,8 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
from model import Model, CrossEntropy, Loss from hapi.model import Model, CrossEntropy, Loss
from text import TransformerBeamSearchDecoder, DynamicDecode from hapi.text import TransformerBeamSearchDecoder, DynamicDecode
def position_encoding_init(n_position, d_pos_vec): def position_encoding_init(n_position, d_pos_vec):
......
...@@ -195,13 +195,19 @@ class PDConfig(object): ...@@ -195,13 +195,19 @@ class PDConfig(object):
"Whether to perform predicting.") "Whether to perform predicting.")
self.default_g.add_arg("do_eval", bool, False, self.default_g.add_arg("do_eval", bool, False,
"Whether to perform evaluating.") "Whether to perform evaluating.")
self.default_g.add_arg("do_save_inference_model", bool, False, self.default_g.add_arg(
"Whether to perform model saving for inference.") "do_save_inference_model", bool, False,
"Whether to perform model saving for inference.")
# NOTE: args for profiler # NOTE: args for profiler
self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)") self.default_g.add_arg(
self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)") "is_profiler", int, 0,
self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)") "the switch of profiler tools. (used for benchmark)")
self.default_g.add_arg(
"profiler_path", str, './',
"the profiler output file path. (used for benchmark)")
self.default_g.add_arg("max_iter", int, 0,
"the max train batch num.(used for benchmark)")
self.parser = parser self.parser = parser
......
...@@ -215,13 +215,13 @@ class ProgBarLogger(Callback): ...@@ -215,13 +215,13 @@ class ProgBarLogger(Callback):
if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv( if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
).local_rank == 0: ).local_rank == 0:
# if steps is not None, last step will update in on_epoch_end if self.steps is None or self.train_step < self.steps:
if self.steps and self.train_step < self.steps:
self._updates(logs, 'train') self._updates(logs, 'train')
def on_epoch_end(self, epoch, logs=None): def on_epoch_end(self, epoch, logs=None):
logs = logs or {} logs = logs or {}
if self.verbose and ParallelEnv().local_rank == 0: if self.train_step % self.log_freq != 0 and self.verbose and ParallelEnv(
).local_rank == 0:
self._updates(logs, 'train') self._updates(logs, 'train')
def on_eval_begin(self, logs=None): def on_eval_begin(self, logs=None):
...@@ -242,14 +242,14 @@ class ProgBarLogger(Callback): ...@@ -242,14 +242,14 @@ class ProgBarLogger(Callback):
if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv( if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
).local_rank == 0: ).local_rank == 0:
# if steps is not None, last step will update in on_epoch_end if self.eval_steps is None or self.eval_step < self.eval_steps:
if self.eval_steps and self.eval_step < self.eval_steps:
self._updates(logs, 'eval') self._updates(logs, 'eval')
def on_eval_end(self, logs=None): def on_eval_end(self, logs=None):
logs = logs or {} logs = logs or {}
if self.verbose and ParallelEnv().local_rank == 0: if self.verbose and ParallelEnv().local_rank == 0:
self._updates(logs, 'eval') if self.eval_step % self.log_freq != 0:
self._updates(logs, 'eval')
print('Eval samples: %d' % (self.evaled_samples)) print('Eval samples: %d' % (self.evaled_samples))
......
...@@ -576,14 +576,15 @@ class DynamicGraphAdapter(object): ...@@ -576,14 +576,15 @@ class DynamicGraphAdapter(object):
if labels is not None: if labels is not None:
labels = [to_variable(l) for l in to_list(labels)] labels = [to_variable(l) for l in to_list(labels)]
if self._nranks > 1: if self._nranks > 1:
outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs]) outputs = self.ddp_model.forward(
* [to_variable(x) for x in inputs])
losses = self.model._loss_function(outputs, labels) losses = self.model._loss_function(outputs, labels)
final_loss = fluid.layers.sum(losses) final_loss = fluid.layers.sum(losses)
final_loss = self.ddp_model.scale_loss(final_loss) final_loss = self.ddp_model.scale_loss(final_loss)
final_loss.backward() final_loss.backward()
self.ddp_model.apply_collective_grads() self.ddp_model.apply_collective_grads()
else: else:
outputs = self.model.forward(*[to_variable(x) for x in inputs]) outputs = self.model.forward(* [to_variable(x) for x in inputs])
losses = self.model._loss_function(outputs, labels) losses = self.model._loss_function(outputs, labels)
final_loss = fluid.layers.sum(losses) final_loss = fluid.layers.sum(losses)
final_loss.backward() final_loss.backward()
...@@ -592,9 +593,9 @@ class DynamicGraphAdapter(object): ...@@ -592,9 +593,9 @@ class DynamicGraphAdapter(object):
self.model.clear_gradients() self.model.clear_gradients()
metrics = [] metrics = []
for metric in self.model._metrics: for metric in self.model._metrics:
metric_outs = metric.add_metric_op( metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
*(to_list(outputs) + to_list(labels))) labels)))
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m) metrics.append(m)
return ([to_numpy(l) for l in losses], metrics) \ return ([to_numpy(l) for l in losses], metrics) \
...@@ -606,7 +607,7 @@ class DynamicGraphAdapter(object): ...@@ -606,7 +607,7 @@ class DynamicGraphAdapter(object):
inputs = to_list(inputs) inputs = to_list(inputs)
if labels is not None: if labels is not None:
labels = [to_variable(l) for l in to_list(labels)] labels = [to_variable(l) for l in to_list(labels)]
outputs = self.model.forward(*[to_variable(x) for x in inputs]) outputs = self.model.forward(* [to_variable(x) for x in inputs])
if self.model._loss_function: if self.model._loss_function:
losses = self.model._loss_function(outputs, labels) losses = self.model._loss_function(outputs, labels)
else: else:
...@@ -632,9 +633,9 @@ class DynamicGraphAdapter(object): ...@@ -632,9 +633,9 @@ class DynamicGraphAdapter(object):
self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples self._merge_count[self.mode + '_batch'] = samples
metric_outs = metric.add_metric_op( metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
*(to_list(outputs) + to_list(labels))) labels)))
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m) metrics.append(m)
# To be consistent with static graph # To be consistent with static graph
...@@ -1009,7 +1010,7 @@ class Model(fluid.dygraph.Layer): ...@@ -1009,7 +1010,7 @@ class Model(fluid.dygraph.Layer):
do_eval = eval_loader is not None do_eval = eval_loader is not None
self._test_dataloader = eval_loader self._test_dataloader = eval_loader
metrics_name = self._metrics_name() metrics_name = self._metrics_name()
steps = len(train_loader) if hasattr(train_loader, '__len__') else None steps = self._len_data_loader(train_loader)
cbks = config_callbacks( cbks = config_callbacks(
callbacks, callbacks,
model=self, model=self,
...@@ -1037,8 +1038,7 @@ class Model(fluid.dygraph.Layer): ...@@ -1037,8 +1038,7 @@ class Model(fluid.dygraph.Layer):
if not isinstance(eval_loader, Iterable): if not isinstance(eval_loader, Iterable):
loader = eval_loader() loader = eval_loader()
eval_steps = len(loader) if hasattr(loader, eval_steps = self._len_data_loader(loader)
'__len__') else None
cbks.on_begin('eval', { cbks.on_begin('eval', {
'steps': eval_steps, 'steps': eval_steps,
'metrics_name': metrics_name 'metrics_name': metrics_name
...@@ -1114,7 +1114,7 @@ class Model(fluid.dygraph.Layer): ...@@ -1114,7 +1114,7 @@ class Model(fluid.dygraph.Layer):
if not isinstance(eval_loader, Iterable): if not isinstance(eval_loader, Iterable):
loader = eval_loader() loader = eval_loader()
eval_steps = len(loader) if hasattr(loader, '__len__') else None eval_steps = self._len_data_loader(loader)
cbks.on_begin('eval', cbks.on_begin('eval',
{'steps': eval_steps, {'steps': eval_steps,
'metrics_name': metrics_name}) 'metrics_name': metrics_name})
...@@ -1205,7 +1205,7 @@ class Model(fluid.dygraph.Layer): ...@@ -1205,7 +1205,7 @@ class Model(fluid.dygraph.Layer):
mode, mode,
metrics_name, metrics_name,
epoch=None): epoch=None):
size = len(data_loader) if hasattr(data_loader, '__len__') else None size = self._len_data_loader(data_loader)
logs = { logs = {
'steps': size, 'steps': size,
'metrics_name': metrics_name, 'metrics_name': metrics_name,
...@@ -1280,3 +1280,10 @@ class Model(fluid.dygraph.Layer): ...@@ -1280,3 +1280,10 @@ class Model(fluid.dygraph.Layer):
for m in self._metrics: for m in self._metrics:
metrics_name.extend(to_list(m.name())) metrics_name.extend(to_list(m.name()))
return metrics_name return metrics_name
def _len_data_loader(self, data_loader):
try:
steps = len(data_loader)
except Exception:
steps = None
return steps
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册