@@ -225,6 +225,10 @@ To save models/checkpoints and logs during training, just call `trainer.set_save
#### Evaluation/Inference
To do prediction/evaluation after a training stage, just create another set of reader, backbone and head instances with `phase='predict'` (repeat steps 1~4 above). Then do predicting with the `predict` method in the trainer (no need to create another trainer). For more implementation details, see [this](https://github.com/PaddlePaddle/PALM/tree/master/examples/predict).
If you want to do evaluation during the training process, use `trainer.train_one_step()` instead of `trainer.train()`. The `trainer.train_one_step(batch)` call trains only one step, so you can insert evaluation code at any point of the training process. The argument `batch` can be fetched from `trainer.get_one_batch`.
PaddlePALM also supports multi-head inference; please refer to `examples/multi-task/joint_predict.py`.
#### Play with Multiple GPUs
If there are multiple GPUs in your environment, you can control the number and indices of these GPUs through the environment variable [CUDA_VISIBLE_DEVICES](https://devblogs.nvidia.com/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/). For example, if there are 4 GPUs in your environment, indexed with 0,1,2,3, you can run with GPU2 only with the following commands
# Build computation graph for evaluation and prediction.
# Arguments:
#   - pred_backbone: a Backbone object with phase == 'predict'. For evaluating the model during training, the predict backbone should be kept the same as the train backbone.
#   - pred_head: a Head object with phase == 'predict'. For evaluating the model during training, the predict head should be kept the same as the train head.
#
# Return:
#   - output_vars: dict type. Each value is a computational graph variable (node) augmented by pred_head outputs_attr.
# """
# for i in self._trainers:
# assert i._predict_vars is not None, "{} need to build_predict_forward before "
# merge dataset iterators and create net input vars
iterator=reader._iterator()
prefix=self.name
...
...
@@ -442,19 +383,8 @@ class Trainer(object):
Args:
model_path: the path of saved checkpoint/parameters.
"""
# load pretrain model (or ckpt)
# assert self._exe is not None, "You need to random_init_params before load checkpoints."
# if phase == 'train' and not self._train_init:
# self._init_exe_prog(for_train=True)
# self._exe.run(self._train_init_prog)
# if phase == 'predict' and not self._predict_init:
# self._init_exe_prog(for_train=False)
# self._exe.run(self._pred_init_prog)
assert self._train_init_prog is not None or self._pred_init_prog is not None, "model graph not built. You should at least build_forward or build_predict_forward to load its checkpoint."
# if phase == 'train':
# assert self._train_init_prog is not None, "train graph not found! You should build_forward first before load checkpoint."
if self._train_init_prog is not None:
saver.init_pretraining_params(
self._exe,
...
...
@@ -462,9 +392,7 @@ class Trainer(object):
convert=False,
main_program=self._train_init_prog,
strict=True)
# elif phase == 'predict':
elif self._pred_init_prog is not None:
# assert self._pred_init_prog is not None, "predict graph not found! You should build_predict_head first before load checkpoint."
saver.init_pretraining_params(
self._exe,
model_path,
...
...
@@ -489,7 +417,6 @@ class Trainer(object):
model_path,
convert=convert,
main_program=self._pred_prog)
# raise NotImplementedError()
def load_pretrain(self, model_path, convert=False):
"""
...
...
@@ -498,8 +425,6 @@ class Trainer(object):
Args:
model_path: the path of saved pretrained parameters.
"""
# load pretrain model (or ckpt)
# assert self._exe is not None, "You need to random_init_params before load pretrain models."
assert self._train_init_prog is not None, "training graph not found. You should at least build_forward to load its pretrained parameters."