@@ -225,6 +225,10 @@ To save models/checkpoints and logs during training, just call `trainer.set_save
...
@@ -225,6 +225,10 @@ To save models/checkpoints and logs during training, just call `trainer.set_save
#### Evaluation/Inference
To do prediction/evaluation after a training stage, just create another three instances — reader, backbone and head — with `phase='predict'` (repeat steps 1~4 above). Then do the predicting with the `predict` method of the trainer (no need to create another trainer). For more implementation details, see [this](https://github.com/PaddlePaddle/PALM/tree/master/examples/predict).
If you want to do evaluation during the training process, use `trainer.train_one_step()` instead of `trainer.train()`. `trainer.train_one_step(batch)` trains for only one step, so you can insert evaluation code at any point of the training process. The argument `batch` can be fetched from `trainer.get_one_batch`.
PaddlePALM also supports multi-head inference, please reference `examples/multi-task/joint_predict.py`.
#### Play with Multiple GPUs
If there are multiple GPUs in your environment, you can control the number and indices of the GPUs used through the environment variable [CUDA_VISIBLE_DEVICES](https://devblogs.nvidia.com/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/). For example, if there are 4 GPUs in your environment, indexed 0,1,2,3, you can run on GPU 2 only with the following commands
# Build computation graph for evaluation and prediction.
# Arguments:
# - pred_backbone: a Backbone object with phase == 'predict'. For evaluating model during training, the predict backbone should keep the same with train backbone.
# - pred_head: a Head object with phase == 'predict'. For evaluating model during training, the predict head should keep the same with train head.
#
# Return:
# - output_vars: dict type. Each value is a computational graph variable(node) argumented by pred_head outputs_attr.
# """
# for i in self._trainers:
# assert i._predict_vars is not None, "{} need to build_predict_forward before "
# merge dataset iterators and create net input vars
# merge dataset iterators and create net input vars
iterator=reader._iterator()
iterator=reader._iterator()
prefix=self.name
prefix=self.name
...
@@ -442,19 +383,8 @@ class Trainer(object):
...
@@ -442,19 +383,8 @@ class Trainer(object):
Args:
Args:
model_path: the path of saved checkpoint/parameters.
model_path: the path of saved checkpoint/parameters.
"""
"""
# load pretrain model (or ckpt)
# assert self._exe is not None, "You need to random_init_params before load checkpoints."
# if phase == 'train' and not self._train_init:
# self._init_exe_prog(for_train=True)
# self._exe.run(self._train_init_prog)
# if phase == 'predict' and not self._predict_init:
# self._init_exe_prog(for_train=False)
# self._exe.run(self._pred_init_prog)
assertself._train_init_progisnotNoneorself._pred_init_progisnotNone,"model graph not built. You should at least build_forward or build_predict_forward to load its checkpoint."
assertself._train_init_progisnotNoneorself._pred_init_progisnotNone,"model graph not built. You should at least build_forward or build_predict_forward to load its checkpoint."
# if phase == 'train':
# assert self._train_init_prog is not None, "train graph not found! You should build_forward first before load checkpoint."
ifself._train_init_progisnotNone:
ifself._train_init_progisnotNone:
saver.init_pretraining_params(
saver.init_pretraining_params(
self._exe,
self._exe,
...
@@ -462,9 +392,7 @@ class Trainer(object):
...
@@ -462,9 +392,7 @@ class Trainer(object):
convert=False,
convert=False,
main_program=self._train_init_prog,
main_program=self._train_init_prog,
strict=True)
strict=True)
# elif phase == 'predict':
elifself._pred_init_progisnotNone:
elifself._pred_init_progisnotNone:
# assert self._pred_init_prog is not None, "predict graph not found! You should build_predict_head first before load checkpoint."
saver.init_pretraining_params(
saver.init_pretraining_params(
self._exe,
self._exe,
model_path,
model_path,
...
@@ -489,7 +417,6 @@ class Trainer(object):
...
@@ -489,7 +417,6 @@ class Trainer(object):
model_path,
model_path,
convert=convert,
convert=convert,
main_program=self._pred_prog)
main_program=self._pred_prog)
# raise NotImplementedError()
defload_pretrain(self,model_path,convert=False):
defload_pretrain(self,model_path,convert=False):
"""
"""
...
@@ -498,8 +425,6 @@ class Trainer(object):
...
@@ -498,8 +425,6 @@ class Trainer(object):
Args:
Args:
model_path: the path of saved pretrained parameters.
model_path: the path of saved pretrained parameters.
"""
"""
# load pretrain model (or ckpt)
# assert self._exe is not None, "You need to random_init_params before load pretrain models."
assertself._train_init_progisnotNone,"training graph not found. You should at least build_forward to load its pretrained parameters."
assertself._train_init_progisnotNone,"training graph not found. You should at least build_forward to load its pretrained parameters."