diff --git a/transformer/README.md b/examples/transformer/README.md
similarity index 99%
rename from transformer/README.md
rename to examples/transformer/README.md
index 2c4c22b91788a091fc9c08e303e5bcae7d80a4de..0c785de8a262105a53386c2a6f417e1d499fba34 100644
--- a/transformer/README.md
+++ b/examples/transformer/README.md
@@ -201,7 +201,7 @@ python -u predict.py \
     --special_token '' '' '' \
     --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
     --batch_size 32 \
-    --init_from_params base_model_dygraph/step_100000/transformer \
+    --init_from_params big_model_dygraph/step_100000/transformer \
     --beam_size 5 \
     --max_out_len 255 \
     --output_file predict.txt \
diff --git a/transformer/gen_data.sh b/examples/transformer/gen_data.sh
similarity index 100%
rename from transformer/gen_data.sh
rename to examples/transformer/gen_data.sh
diff --git a/transformer/images/multi_head_attention.png b/examples/transformer/images/multi_head_attention.png
similarity index 100%
rename from transformer/images/multi_head_attention.png
rename to examples/transformer/images/multi_head_attention.png
diff --git a/transformer/images/transformer_network.png b/examples/transformer/images/transformer_network.png
similarity index 100%
rename from transformer/images/transformer_network.png
rename to examples/transformer/images/transformer_network.png
diff --git a/transformer/predict.py b/examples/transformer/predict.py
similarity index 94%
rename from transformer/predict.py
rename to examples/transformer/predict.py
index b83d5403486c1e661a939663bad154735b29b37e..a6e14314f523d78dee2f770e69a21ae808cd8ad1 100644
--- a/transformer/predict.py
+++ b/examples/transformer/predict.py
@@ -14,9 +14,6 @@
 import logging
 import os
-import six
-import sys
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from functools import partial
 
 import numpy as np
@@ -28,9 +25,9 @@ from paddle.fluid.layers.utils import flatten
 
 from utils.configure import PDConfig
 from utils.check import check_gpu, check_version
-from model import Input, set_device
+from hapi.model import Input, set_device
 from reader import prepare_infer_input, Seq2SeqDataset, Seq2SeqBatchSampler
-from transformer import InferTransformer, position_encoding_init
+from transformer import InferTransformer
 
 
 def post_process_seq(seq, bos_idx, eos_idx, output_bos=False,
@@ -132,7 +129,7 @@ def do_predict(args):
     # TODO: use model.predict when support variant length
     f = open(args.output_file, "wb")
     for data in data_loader():
-        finished_seq = transformer.test(inputs=flatten(data))[0]
+        finished_seq = transformer.test_batch(inputs=flatten(data))[0]
         finished_seq = np.transpose(finished_seq, [0, 2, 1])
         for ins in finished_seq:
             for beam_idx, beam in enumerate(ins):
diff --git a/transformer/reader.py b/examples/transformer/reader.py
similarity index 97%
rename from transformer/reader.py
rename to examples/transformer/reader.py
index 2e3fc59e0d3a85e8a674e6b1f6bb4b611ee45057..f6891df960b66fb9b48bb65d36af46f4ec601fc9 100644
--- a/transformer/reader.py
+++ b/examples/transformer/reader.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import glob
-import six
+import sys
 import os
 import io
 import itertools
@@ -26,7 +26,7 @@ from paddle.io import BatchSampler, DataLoader, Dataset
 
 
 def create_data_loader(args, device):
-    data_loaders = [None, None]
+    data_loaders = [(None, None)] * 2
     data_files = [args.training_file, args.validation_file
                   ] if args.validation_file else [args.training_file]
     for i, data_file in enumerate(data_files):
@@ -65,7 +65,7 @@ def create_data_loader(args, device):
                 n_head=args.n_head),
             num_workers=0,  # TODO: use multi-process
             return_list=True)
-        data_loaders[i] = data_loader
+        data_loaders[i] = (data_loader, batch_sampler.__len__)
     return data_loaders
 
 
@@ -476,6 +476,7 @@ class Seq2SeqBatchSampler(BatchSampler):
                 for i in range(self._nranks)
             ] for batch in batches]
             batches = list(itertools.chain.from_iterable(batches))
+        self.batch_number = (len(batches) + self._nranks - 1) // self._nranks
 
         # for multi-device
        for batch_id, batch in enumerate(batches):
@@ -489,11 +490,13 @@ class Seq2SeqBatchSampler(BatchSampler):
             yield batch_indices
 
     def __len__(self):
+        if hasattr(self, "batch_number"):  #
+            return self.batch_number
         if not self._use_token_batch:
             batch_number = (
                 len(self._dataset) + self._batch_size * self._nranks - 1) // (
                     self._batch_size * self._nranks)
         else:
-            # TODO(guosheng): fix the uncertain length
-            batch_number = 1
+            # for uncertain batch number, the actual value is self.batch_number
+            batch_number = sys.maxsize
         return batch_number
diff --git a/transformer/train.py b/examples/transformer/train.py
similarity index 83%
rename from transformer/train.py
rename to examples/transformer/train.py
index 04a61f83a0191a944d9b2611b3bca61f0bcf2a0a..94b52b4423839a0d7e01f0243cbb3d0f5907a4b0 100644
--- a/transformer/train.py
+++ b/examples/transformer/train.py
@@ -14,9 +14,6 @@
 import logging
 import os
-import six
-import sys
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 import numpy as np
 import paddle
 
@@ -26,14 +23,18 @@ from paddle.io import DataLoader
 from utils.configure import PDConfig
 from utils.check import check_gpu, check_version
 
-from model import Input, set_device
-from callbacks import ProgBarLogger
+from hapi.model import Input, set_device
+from hapi.callbacks import ProgBarLogger
 from reader import create_data_loader
 from transformer import Transformer, CrossEntropyCriterion
 
 
 class TrainCallback(ProgBarLogger):
-    def __init__(self, args, verbose=2):
+    def __init__(self,
+                 args,
+                 verbose=2,
+                 train_steps_fn=None,
+                 eval_steps_fn=None):
         # TODO(guosheng): save according to step
         super(TrainCallback, self).__init__(args.print_step, verbose)
         # the best cross-entropy value with label smoothing
@@ -42,11 +43,17 @@ class TrainCallback(ProgBarLogger):
                 (1. - args.label_smooth_eps)) + args.label_smooth_eps *
             np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
         self.loss_normalizer = loss_normalizer
+        self.train_steps_fn = train_steps_fn
+        self.eval_steps_fn = eval_steps_fn
 
     def on_train_begin(self, logs=None):
         super(TrainCallback, self).on_train_begin(logs)
         self.train_metrics += ["normalized loss", "ppl"]
 
+    def on_train_batch_begin(self, step, logs=None):
+        if step == 0 and self.train_steps_fn:
+            self.train_progbar._num = self.train_steps_fn()
+
     def on_train_batch_end(self, step, logs=None):
         logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
         logs["ppl"] = np.exp(min(logs["loss"][0], 100))
@@ -57,6 +64,10 @@ class TrainCallback(ProgBarLogger):
         self.eval_metrics = list(
             self.eval_metrics) + ["normalized loss", "ppl"]
 
+    def on_eval_batch_begin(self, step, logs=None):
+        if step == 0 and self.eval_steps_fn:
+            self.eval_progbar._num = self.eval_steps_fn()
+
     def on_eval_batch_end(self, step, logs=None):
         logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
         logs["ppl"] = np.exp(min(logs["loss"][0], 100))
@@ -104,7 +115,8 @@ def do_train(args):
     ]
 
     # def dataloader
-    train_loader, eval_loader = create_data_loader(args, device)
+    (train_loader, train_steps_fn), (
+        eval_loader, eval_steps_fn) = create_data_loader(args, device)
 
     # define model
     transformer = Transformer(
@@ -142,7 +154,12 @@ def do_train(args):
         eval_freq=1,
         save_freq=1,
         save_dir=args.save_model,
-        callbacks=[TrainCallback(args)])
+        callbacks=[
+            TrainCallback(
+                args,
+                train_steps_fn=train_steps_fn,
+                eval_steps_fn=eval_steps_fn)
+        ])
 
 
 if __name__ == "__main__":
diff --git a/transformer/transformer.py b/examples/transformer/transformer.py
similarity index 99%
rename from transformer/transformer.py
rename to examples/transformer/transformer.py
index 9caf4b04a1a34c5e856a789fbded8a53e917a3da..30bb931d28c3b52467f484f4cb14b5d5601c76d9 100644
--- a/transformer/transformer.py
+++ b/examples/transformer/transformer.py
@@ -20,8 +20,8 @@ import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
 from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
-from model import Model, CrossEntropy, Loss
-from text import TransformerBeamSearchDecoder, DynamicDecode
+from hapi.model import Model, CrossEntropy, Loss
+from hapi.text import TransformerBeamSearchDecoder, DynamicDecode
 
 
 def position_encoding_init(n_position, d_pos_vec):
diff --git a/transformer/transformer.yaml b/examples/transformer/transformer.yaml
similarity index 100%
rename from transformer/transformer.yaml
rename to examples/transformer/transformer.yaml
diff --git a/transformer/utils/__init__.py b/examples/transformer/utils/__init__.py
similarity index 100%
rename from transformer/utils/__init__.py
rename to examples/transformer/utils/__init__.py
diff --git a/transformer/utils/check.py b/examples/transformer/utils/check.py
similarity index 100%
rename from transformer/utils/check.py
rename to examples/transformer/utils/check.py
diff --git a/transformer/utils/configure.py b/examples/transformer/utils/configure.py
similarity index 95%
rename from transformer/utils/configure.py
rename to examples/transformer/utils/configure.py
index 67e601282fee572518435eaed38a4ed8e26fc5f9..17dfaa53d8b44a68a2847c4bc1a1934384bb5f82 100644
--- a/transformer/utils/configure.py
+++ b/examples/transformer/utils/configure.py
@@ -195,13 +195,19 @@ class PDConfig(object):
                                "Whether to perform predicting.")
         self.default_g.add_arg("do_eval", bool, False,
                                "Whether to perform evaluating.")
-        self.default_g.add_arg("do_save_inference_model", bool, False,
-                               "Whether to perform model saving for inference.")
+        self.default_g.add_arg(
+            "do_save_inference_model", bool, False,
+            "Whether to perform model saving for inference.")
 
         # NOTE: args for profiler
-        self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)")
-        self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)")
-        self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)")
+        self.default_g.add_arg(
+            "is_profiler", int, 0,
+            "the switch of profiler tools. (used for benchmark)")
+        self.default_g.add_arg(
+            "profiler_path", str, './',
+            "the profiler output file path. (used for benchmark)")
+        self.default_g.add_arg("max_iter", int, 0,
+                               "the max train batch num.(used for benchmark)")
 
         self.parser = parser
 
diff --git a/hapi/callbacks.py b/hapi/callbacks.py
index f02eec1ac7b20fe3d5ec771493378b4e74cc3796..62d6402941d0ab0e8af1b3efb3dd77d8ad05604d 100644
--- a/hapi/callbacks.py
+++ b/hapi/callbacks.py
@@ -215,13 +215,13 @@ class ProgBarLogger(Callback):
         if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
         ).local_rank == 0:
-            # if steps is not None, last step will update in on_epoch_end
-            if self.steps and self.train_step < self.steps:
+            if self.steps is None or self.train_step < self.steps:
                 self._updates(logs, 'train')
 
     def on_epoch_end(self, epoch, logs=None):
         logs = logs or {}
-        if self.verbose and ParallelEnv().local_rank == 0:
+        if self.train_step % self.log_freq != 0 and self.verbose and ParallelEnv(
+        ).local_rank == 0:
             self._updates(logs, 'train')
 
     def on_eval_begin(self, logs=None):
@@ -242,14 +242,14 @@ class ProgBarLogger(Callback):
         if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
         ).local_rank == 0:
-            # if steps is not None, last step will update in on_epoch_end
-            if self.eval_steps and self.eval_step < self.eval_steps:
+            if self.eval_steps is None or self.eval_step < self.eval_steps:
                 self._updates(logs, 'eval')
 
     def on_eval_end(self, logs=None):
         logs = logs or {}
         if self.verbose and ParallelEnv().local_rank == 0:
-            self._updates(logs, 'eval')
+            if self.eval_step % self.log_freq != 0:
+                self._updates(logs, 'eval')
             print('Eval samples: %d' % (self.evaled_samples))
diff --git a/hapi/model.py b/hapi/model.py
index 3593f00acaa9f2763e01cf139e1ccdb06d339d55..fa8d8f7f9fcedbd216c5f5e5801bad662a42c6e7 100644
--- a/hapi/model.py
+++ b/hapi/model.py
@@ -576,14 +576,15 @@ class DynamicGraphAdapter(object):
         if labels is not None:
             labels = [to_variable(l) for l in to_list(labels)]
         if self._nranks > 1:
-            outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs])
+            outputs = self.ddp_model.forward(
+                * [to_variable(x) for x in inputs])
             losses = self.model._loss_function(outputs, labels)
             final_loss = fluid.layers.sum(losses)
             final_loss = self.ddp_model.scale_loss(final_loss)
             final_loss.backward()
             self.ddp_model.apply_collective_grads()
         else:
-            outputs = self.model.forward(*[to_variable(x) for x in inputs])
+            outputs = self.model.forward(* [to_variable(x) for x in inputs])
             losses = self.model._loss_function(outputs, labels)
             final_loss = fluid.layers.sum(losses)
             final_loss.backward()
@@ -592,9 +593,9 @@ class DynamicGraphAdapter(object):
             self.model.clear_gradients()
         metrics = []
         for metric in self.model._metrics:
-            metric_outs = metric.add_metric_op(
-                *(to_list(outputs) + to_list(labels)))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
+            metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
+                labels)))
+            m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
         return ([to_numpy(l) for l in losses], metrics) \
@@ -606,7 +607,7 @@ class DynamicGraphAdapter(object):
         inputs = to_list(inputs)
         if labels is not None:
             labels = [to_variable(l) for l in to_list(labels)]
-        outputs = self.model.forward(*[to_variable(x) for x in inputs])
+        outputs = self.model.forward(* [to_variable(x) for x in inputs])
         if self.model._loss_function:
             losses = self.model._loss_function(outputs, labels)
         else:
@@ -632,9 +633,9 @@ class DynamicGraphAdapter(object):
                     self._merge_count[self.mode + '_total'] += samples
                     self._merge_count[self.mode + '_batch'] = samples
 
-            metric_outs = metric.add_metric_op(
-                *(to_list(outputs) + to_list(labels)))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
+            metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
+                labels)))
+            m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
         # To be consistent with static graph
@@ -1009,7 +1010,7 @@ class Model(fluid.dygraph.Layer):
         do_eval = eval_loader is not None
         self._test_dataloader = eval_loader
         metrics_name = self._metrics_name()
-        steps = len(train_loader) if hasattr(train_loader, '__len__') else None
+        steps = self._len_data_loader(train_loader)
         cbks = config_callbacks(
             callbacks,
             model=self,
@@ -1037,8 +1038,7 @@ class Model(fluid.dygraph.Layer):
                 if not isinstance(eval_loader, Iterable):
                     loader = eval_loader()
 
-                eval_steps = len(loader) if hasattr(loader,
-                                                    '__len__') else None
+                eval_steps = self._len_data_loader(loader)
                 cbks.on_begin('eval', {
                     'steps': eval_steps,
                     'metrics_name': metrics_name
@@ -1114,7 +1114,7 @@ class Model(fluid.dygraph.Layer):
         if not isinstance(eval_loader, Iterable):
             loader = eval_loader()
 
-        eval_steps = len(loader) if hasattr(loader, '__len__') else None
+        eval_steps = self._len_data_loader(loader)
         cbks.on_begin('eval',
                       {'steps': eval_steps,
                        'metrics_name': metrics_name})
@@ -1205,7 +1205,7 @@ class Model(fluid.dygraph.Layer):
                       mode,
                       metrics_name,
                       epoch=None):
-        size = len(data_loader) if hasattr(data_loader, '__len__') else None
+        size = self._len_data_loader(data_loader)
         logs = {
             'steps': size,
             'metrics_name': metrics_name,
@@ -1280,3 +1280,10 @@ class Model(fluid.dygraph.Layer):
         for m in self._metrics:
             metrics_name.extend(to_list(m.name()))
         return metrics_name
+
+    def _len_data_loader(self, data_loader):
+        try:
+            steps = len(data_loader)
+        except Exception:
+            steps = None
+        return steps