Commit 666339f5 authored by: G guosheng

Refine transformer batch length and move transformer to examples.

Refine len for data_loader in model.py.
Parent: ecb6d64c
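For context, the batch-length handling this commit converges on can be illustrated by the following minimal, self-contained sketch (plain Python). Note that `DemoBatchSampler` and `len_data_loader` are illustrative stand-ins for the `Seq2SeqBatchSampler` and `Model._len_data_loader` changes in the diff below, not the actual hapi code:

import sys

class DemoBatchSampler:
    """Toy sampler whose exact batch count is only known after iterating
    (as with token-based batching), so __len__ needs a safe fallback."""

    def __init__(self, n_samples, batch_size, use_token_batch=False):
        self.n_samples = n_samples
        self.batch_size = batch_size
        self.use_token_batch = use_token_batch

    def __iter__(self):
        indices = list(range(self.n_samples))
        batches = [indices[i:i + self.batch_size]
                   for i in range(0, self.n_samples, self.batch_size)]
        # record the true batch count once it is known, analogous to
        # Seq2SeqBatchSampler.__iter__ in the diff
        self.batch_number = len(batches)
        for batch in batches:
            yield batch

    def __len__(self):
        if hasattr(self, "batch_number"):
            return self.batch_number
        if not self.use_token_batch:
            return (self.n_samples + self.batch_size - 1) // self.batch_size
        # length is uncertain until __iter__ has run; report a sentinel
        return sys.maxsize


def len_data_loader(data_loader):
    """Mirrors Model._len_data_loader in the diff: return None instead of
    raising when the loader cannot report a length."""
    try:
        return len(data_loader)
    except Exception:
        return None


sampler = DemoBatchSampler(n_samples=10, batch_size=4, use_token_batch=True)
print(len_data_loader(sampler))   # sys.maxsize before the first pass
list(iter(sampler))
print(len_data_loader(sampler))   # 3 once batch_number has been set

The idea is that the sampler reports a sentinel length until one pass over the data fixes the true count, while the trainer treats a missing length as "unknown" (None) rather than failing, and the progress bar is corrected from the sampler once the real count is available.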
......@@ -201,7 +201,7 @@ python -u predict.py \
--special_token '<s>' '<e>' '<unk>' \
--predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
--batch_size 32 \
--init_from_params base_model_dygraph/step_100000/transformer \
--init_from_params big_model_dygraph/step_100000/transformer \
--beam_size 5 \
--max_out_len 255 \
--output_file predict.txt \
......
......@@ -14,9 +14,6 @@
import logging
import os
import six
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from functools import partial
import numpy as np
......@@ -28,9 +25,9 @@ from paddle.fluid.layers.utils import flatten
from utils.configure import PDConfig
from utils.check import check_gpu, check_version
from model import Input, set_device
from hapi.model import Input, set_device
from reader import prepare_infer_input, Seq2SeqDataset, Seq2SeqBatchSampler
from transformer import InferTransformer, position_encoding_init
from transformer import InferTransformer
def post_process_seq(seq, bos_idx, eos_idx, output_bos=False,
......@@ -132,7 +129,7 @@ def do_predict(args):
# TODO: use model.predict when support variant length
f = open(args.output_file, "wb")
for data in data_loader():
finished_seq = transformer.test(inputs=flatten(data))[0]
finished_seq = transformer.test_batch(inputs=flatten(data))[0]
finished_seq = np.transpose(finished_seq, [0, 2, 1])
for ins in finished_seq:
for beam_idx, beam in enumerate(ins):
......
......@@ -13,7 +13,7 @@
# limitations under the License.
import glob
import six
import sys
import os
import io
import itertools
......@@ -26,7 +26,7 @@ from paddle.io import BatchSampler, DataLoader, Dataset
def create_data_loader(args, device):
data_loaders = [None, None]
data_loaders = [(None, None)] * 2
data_files = [args.training_file, args.validation_file
] if args.validation_file else [args.training_file]
for i, data_file in enumerate(data_files):
......@@ -65,7 +65,7 @@ def create_data_loader(args, device):
n_head=args.n_head),
num_workers=0, # TODO: use multi-process
return_list=True)
data_loaders[i] = data_loader
data_loaders[i] = (data_loader, batch_sampler.__len__)
return data_loaders
......@@ -476,6 +476,7 @@ class Seq2SeqBatchSampler(BatchSampler):
for i in range(self._nranks)
] for batch in batches]
batches = list(itertools.chain.from_iterable(batches))
self.batch_number = (len(batches) + self._nranks - 1) // self._nranks
# for multi-device
for batch_id, batch in enumerate(batches):
......@@ -489,11 +490,13 @@ class Seq2SeqBatchSampler(BatchSampler):
yield batch_indices
def __len__(self):
if hasattr(self, "batch_number"):  # batch_number is set in __iter__
return self.batch_number
if not self._use_token_batch:
batch_number = (
len(self._dataset) + self._batch_size * self._nranks - 1) // (
self._batch_size * self._nranks)
else:
# TODO(guosheng): fix the uncertain length
batch_number = 1
# for uncertain batch number, the actual value is self.batch_number
batch_number = sys.maxsize
return batch_number
......@@ -14,9 +14,6 @@
import logging
import os
import six
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import numpy as np
import paddle
......@@ -26,14 +23,18 @@ from paddle.io import DataLoader
from utils.configure import PDConfig
from utils.check import check_gpu, check_version
from model import Input, set_device
from callbacks import ProgBarLogger
from hapi.model import Input, set_device
from hapi.callbacks import ProgBarLogger
from reader import create_data_loader
from transformer import Transformer, CrossEntropyCriterion
class TrainCallback(ProgBarLogger):
def __init__(self, args, verbose=2):
def __init__(self,
args,
verbose=2,
train_steps_fn=None,
eval_steps_fn=None):
# TODO(guosheng): save according to step
super(TrainCallback, self).__init__(args.print_step, verbose)
# the best cross-entropy value with label smoothing
......@@ -42,11 +43,17 @@ class TrainCallback(ProgBarLogger):
(1. - args.label_smooth_eps)) + args.label_smooth_eps *
np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
self.loss_normalizer = loss_normalizer
self.train_steps_fn = train_steps_fn
self.eval_steps_fn = eval_steps_fn
def on_train_begin(self, logs=None):
super(TrainCallback, self).on_train_begin(logs)
self.train_metrics += ["normalized loss", "ppl"]
def on_train_batch_begin(self, step, logs=None):
if step == 0 and self.train_steps_fn:
self.train_progbar._num = self.train_steps_fn()
def on_train_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100))
......@@ -57,6 +64,10 @@ class TrainCallback(ProgBarLogger):
self.eval_metrics = list(
self.eval_metrics) + ["normalized loss", "ppl"]
def on_eval_batch_begin(self, step, logs=None):
if step == 0 and self.eval_steps_fn:
self.eval_progbar._num = self.eval_steps_fn()
def on_eval_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100))
......@@ -104,7 +115,8 @@ def do_train(args):
]
# def dataloader
train_loader, eval_loader = create_data_loader(args, device)
(train_loader, train_steps_fn), (
eval_loader, eval_steps_fn) = create_data_loader(args, device)
# define model
transformer = Transformer(
......@@ -142,7 +154,12 @@ def do_train(args):
eval_freq=1,
save_freq=1,
save_dir=args.save_model,
callbacks=[TrainCallback(args)])
callbacks=[
TrainCallback(
args,
train_steps_fn=train_steps_fn,
eval_steps_fn=eval_steps_fn)
])
if __name__ == "__main__":
......
......@@ -20,8 +20,8 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
from model import Model, CrossEntropy, Loss
from text import TransformerBeamSearchDecoder, DynamicDecode
from hapi.model import Model, CrossEntropy, Loss
from hapi.text import TransformerBeamSearchDecoder, DynamicDecode
def position_encoding_init(n_position, d_pos_vec):
......
......@@ -195,13 +195,19 @@ class PDConfig(object):
"Whether to perform predicting.")
self.default_g.add_arg("do_eval", bool, False,
"Whether to perform evaluating.")
self.default_g.add_arg("do_save_inference_model", bool, False,
self.default_g.add_arg(
"do_save_inference_model", bool, False,
"Whether to perform model saving for inference.")
# NOTE: args for profiler
self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)")
self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)")
self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)")
self.default_g.add_arg(
"is_profiler", int, 0,
"the switch of profiler tools. (used for benchmark)")
self.default_g.add_arg(
"profiler_path", str, './',
"the profiler output file path. (used for benchmark)")
self.default_g.add_arg("max_iter", int, 0,
"the max train batch num.(used for benchmark)")
self.parser = parser
......
......@@ -215,13 +215,13 @@ class ProgBarLogger(Callback):
if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
).local_rank == 0:
# if steps is not None, last step will update in on_epoch_end
if self.steps and self.train_step < self.steps:
if self.steps is None or self.train_step < self.steps:
self._updates(logs, 'train')
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
if self.verbose and ParallelEnv().local_rank == 0:
if self.train_step % self.log_freq != 0 and self.verbose and ParallelEnv(
).local_rank == 0:
self._updates(logs, 'train')
def on_eval_begin(self, logs=None):
......@@ -242,13 +242,13 @@ class ProgBarLogger(Callback):
if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
).local_rank == 0:
# if steps is not None, last step will update in on_epoch_end
if self.eval_steps and self.eval_step < self.eval_steps:
if self.eval_steps is None or self.eval_step < self.eval_steps:
self._updates(logs, 'eval')
def on_eval_end(self, logs=None):
logs = logs or {}
if self.verbose and ParallelEnv().local_rank == 0:
if self.eval_step % self.log_freq != 0:
self._updates(logs, 'eval')
print('Eval samples: %d' % (self.evaled_samples))
......
......@@ -576,14 +576,15 @@ class DynamicGraphAdapter(object):
if labels is not None:
labels = [to_variable(l) for l in to_list(labels)]
if self._nranks > 1:
outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs])
outputs = self.ddp_model.forward(
* [to_variable(x) for x in inputs])
losses = self.model._loss_function(outputs, labels)
final_loss = fluid.layers.sum(losses)
final_loss = self.ddp_model.scale_loss(final_loss)
final_loss.backward()
self.ddp_model.apply_collective_grads()
else:
outputs = self.model.forward(*[to_variable(x) for x in inputs])
outputs = self.model.forward(* [to_variable(x) for x in inputs])
losses = self.model._loss_function(outputs, labels)
final_loss = fluid.layers.sum(losses)
final_loss.backward()
......@@ -592,9 +593,9 @@ class DynamicGraphAdapter(object):
self.model.clear_gradients()
metrics = []
for metric in self.model._metrics:
metric_outs = metric.add_metric_op(
*(to_list(outputs) + to_list(labels)))
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
labels)))
m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m)
return ([to_numpy(l) for l in losses], metrics) \
......@@ -606,7 +607,7 @@ class DynamicGraphAdapter(object):
inputs = to_list(inputs)
if labels is not None:
labels = [to_variable(l) for l in to_list(labels)]
outputs = self.model.forward(*[to_variable(x) for x in inputs])
outputs = self.model.forward(* [to_variable(x) for x in inputs])
if self.model._loss_function:
losses = self.model._loss_function(outputs, labels)
else:
......@@ -632,9 +633,9 @@ class DynamicGraphAdapter(object):
self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples
metric_outs = metric.add_metric_op(
*(to_list(outputs) + to_list(labels)))
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
labels)))
m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m)
# To be consistent with static graph
......@@ -1009,7 +1010,7 @@ class Model(fluid.dygraph.Layer):
do_eval = eval_loader is not None
self._test_dataloader = eval_loader
metrics_name = self._metrics_name()
steps = len(train_loader) if hasattr(train_loader, '__len__') else None
steps = self._len_data_loader(train_loader)
cbks = config_callbacks(
callbacks,
model=self,
......@@ -1037,8 +1038,7 @@ class Model(fluid.dygraph.Layer):
if not isinstance(eval_loader, Iterable):
loader = eval_loader()
eval_steps = len(loader) if hasattr(loader,
'__len__') else None
eval_steps = self._len_data_loader(loader)
cbks.on_begin('eval', {
'steps': eval_steps,
'metrics_name': metrics_name
......@@ -1114,7 +1114,7 @@ class Model(fluid.dygraph.Layer):
if not isinstance(eval_loader, Iterable):
loader = eval_loader()
eval_steps = len(loader) if hasattr(loader, '__len__') else None
eval_steps = self._len_data_loader(loader)
cbks.on_begin('eval',
{'steps': eval_steps,
'metrics_name': metrics_name})
......@@ -1205,7 +1205,7 @@ class Model(fluid.dygraph.Layer):
mode,
metrics_name,
epoch=None):
size = len(data_loader) if hasattr(data_loader, '__len__') else None
size = self._len_data_loader(data_loader)
logs = {
'steps': size,
'metrics_name': metrics_name,
......@@ -1280,3 +1280,10 @@ class Model(fluid.dygraph.Layer):
for m in self._metrics:
metrics_name.extend(to_list(m.name()))
return metrics_name
def _len_data_loader(self, data_loader):
try:
steps = len(data_loader)
except Exception:
steps = None
return steps