From 5c6f90d6f7f2d356546daafb0d558d6c467079fc Mon Sep 17 00:00:00 2001
From: zhoujun
Date: Thu, 16 Dec 2021 21:51:24 -0600
Subject: [PATCH] add eval and ips (#4947)

* del unused code

* add eval

* add resume

* fix error
---
 ppstructure/vqa/README.md        |  89 +++++++++++++-
 ppstructure/vqa/eval_re.py       | 125 ++++++++++++++++++++
 ppstructure/vqa/eval_ser.py      | 154 ++++++++++++++++++++++++
 ppstructure/vqa/infer_ser_e2e.py |   4 +-
 ppstructure/vqa/requirements.txt |   1 +
 ppstructure/vqa/train_re.py      | 119 +++++++------------
 ppstructure/vqa/train_ser.py     | 196 ++++++++++---------------------
 ppstructure/vqa/utils.py         |   9 +-
 8 files changed, 481 insertions(+), 216 deletions(-)
 create mode 100644 ppstructure/vqa/eval_re.py
 create mode 100644 ppstructure/vqa/eval_ser.py

diff --git a/ppstructure/vqa/README.md b/ppstructure/vqa/README.md
index 23fe28f8..08db718f 100644
--- a/ppstructure/vqa/README.md
+++ b/ppstructure/vqa/README.md
@@ -98,7 +98,7 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
 # PaddleNLP must be installed from the latest develop-branch code
 git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
 cd PaddleNLP
-pip install -e .
+pip3 install -e .
 ```
 
 
@@ -141,7 +141,6 @@ python3.7 train_ser.py \
     --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
     --num_train_epochs 200 \
     --eval_steps 10 \
-    --save_steps 500 \
     --output_dir "./output/ser/" \
     --learning_rate 5e-5 \
     --warmup_steps 50 \
@@ -151,6 +150,48 @@ python3.7 train_ser.py \
     --evaluate_during_training \
     --seed 2048
 ```
 
 Finally, metrics such as `precision`, `recall`, and `f1` are printed, and the model and training logs are saved in the `./output/ser/` directory.
 
+* Resume training
+
+```shell
+python3.7 train_ser.py \
+    --model_name_or_path "model_path" \
+    --train_data_dir "XFUND/zh_train/image" \
+    --train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
+    --eval_data_dir "XFUND/zh_val/image" \
+    --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
+    --num_train_epochs 200 \
+    --eval_steps 10 \
+    --output_dir "./output/ser/" \
+    --learning_rate 5e-5 \
+    --warmup_steps 50 \
+    --evaluate_during_training \
+    --seed 2048 \
+    --resume
+```
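+
+When resuming, `--model_name_or_path` should point to a checkpoint directory
+saved by an earlier run rather than a pretrained model name. A minimal sketch
+(the path is an assumption based on the default save location used by
+`train_ser.py` in this patch):
+
+```shell
+python3.7 train_ser.py \
+    --model_name_or_path "./output/ser/latest_model" \
+    --resume \
+    ...  # remaining arguments unchanged from the command above
+```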
+
+* Evaluation
+```shell
+export CUDA_VISIBLE_DEVICES=0
+python3 eval_ser.py \
+    --model_name_or_path "PP-Layout_v1.0_ser_pretrained/" \
+    --eval_data_dir "XFUND/zh_val/image" \
+    --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
+    --per_gpu_eval_batch_size 8 \
+    --output_dir "output/ser/" \
+    --seed 2048
+```
+Finally, metrics such as `precision`, `recall`, and `f1` are printed.
+
+* Prediction
+
+```shell
+export CUDA_VISIBLE_DEVICES=0
+python3.7 infer_ser.py \
+    --model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
+    --output_dir "output_res/" \
+    --infer_imgs "XFUND/zh_val/image/" \
+    --ocr_json_path "XFUND/zh_val/xfun_normalize_val.json"
+```
+
 * Predict using the OCR results provided with the evaluation set
 
@@ -188,6 +229,7 @@ python3.7 helper/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_nor
 
 * Start training
 
 ```shell
+export CUDA_VISIBLE_DEVICES=0
 python3 train_re.py \
     --model_name_or_path "layoutxlm-base-uncased" \
     --train_data_dir "XFUND/zh_train/image" \
@@ -197,7 +239,6 @@ python3 train_re.py \
     --label_map_path 'labels/labels_ser.txt' \
     --num_train_epochs 2 \
     --eval_steps 10 \
-    --save_steps 500 \
     --output_dir "output/re/" \
     --learning_rate 5e-5 \
     --warmup_steps 50 \
@@ -208,8 +249,48 @@ python3 train_re.py \
     --per_gpu_train_batch_size 8 \
     --per_gpu_eval_batch_size 8 \
     --evaluate_during_training \
     --seed 2048
 ```
 
+* Resume training
+
+```shell
+export CUDA_VISIBLE_DEVICES=0
+python3 train_re.py \
+    --model_name_or_path "model_path" \
+    --train_data_dir "XFUND/zh_train/image" \
+    --train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
+    --eval_data_dir "XFUND/zh_val/image" \
+    --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
+    --label_map_path 'labels/labels_ser.txt' \
+    --num_train_epochs 2 \
+    --eval_steps 10 \
+    --output_dir "output/re/" \
+    --learning_rate 5e-5 \
+    --warmup_steps 50 \
+    --per_gpu_train_batch_size 8 \
+    --per_gpu_eval_batch_size 8 \
+    --evaluate_during_training \
+    --seed 2048 \
+    --resume
+
+```
+
 Finally, metrics such as `precision`, `recall`, and `f1` are printed, and the model and training logs are saved in the `./output/re/` directory.
 
+* Evaluation
+```shell
+export CUDA_VISIBLE_DEVICES=0
+python3 eval_re.py \
+    --model_name_or_path "output/re/best_model" \
+    --max_seq_length 512 \
+    --eval_data_dir "XFUND/zh_val/image" \
+    --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
+    --label_map_path 'labels/labels_ser.txt' \
+    --output_dir "output/re_test/" \
+    --per_gpu_eval_batch_size 8 \
+    --seed 2048
+```
+Finally, metrics such as `precision`, `recall`, and `f1` are printed.
+
+
 * Predict using the OCR results provided with the evaluation set
 
 ```shell
@@ -231,7 +312,7 @@ python3 infer_re.py \
 
 ```shell
 export CUDA_VISIBLE_DEVICES=0
-# python3.7 infer_ser_re_e2e.py \
+python3.7 infer_ser_re_e2e.py \
     --model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
     --re_model_name_or_path "./PP-Layout_v1.0_re_pretrained/" \
     --max_seq_length 512 \
diff --git a/ppstructure/vqa/eval_re.py b/ppstructure/vqa/eval_re.py
new file mode 100644
index 00000000..45c23660
--- /dev/null
+++ b/ppstructure/vqa/eval_re.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
+
+import paddle
+
+from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction
+
+from xfun import XFUNDataset
+from utils import parse_args, get_bio_label_maps, print_arguments
+from data_collator import DataCollator
+from metric import re_score
+
+from ppocr.utils.logging import get_logger
+
+
+def cal_metric(re_preds, re_labels, entities):
+    gt_relations = []
+    for b in range(len(re_labels)):
+        rel_sent = []
+        for head, tail in zip(re_labels[b]["head"], re_labels[b]["tail"]):
+            rel = {}
+            rel["head_id"] = head
+            rel["head"] = (entities[b]["start"][rel["head_id"]],
+                           entities[b]["end"][rel["head_id"]])
+            rel["head_type"] = entities[b]["label"][rel["head_id"]]
+
+            rel["tail_id"] = tail
+            rel["tail"] = (entities[b]["start"][rel["tail_id"]],
+                           entities[b]["end"][rel["tail_id"]])
+            rel["tail_type"] = entities[b]["label"][rel["tail_id"]]
+
+            rel["type"] = 1
+            rel_sent.append(rel)
+        gt_relations.append(rel_sent)
+    re_metrics = re_score(re_preds, gt_relations, mode="boundaries")
+    return re_metrics
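+
+# A minimal sketch of the inputs cal_metric expects (the concrete values are
+# illustrative assumptions, not real XFUND data): for a batch element b,
+#   re_labels[b] = {"head": [0], "tail": [1]}   # index pairs into entities[b]
+#   entities[b]  = {"start": [3, 9], "end": [7, 15], "label": [1, 2]}
+# builds one ground-truth relation linking token span (3, 7) to (9, 15), which
+# re_score then matches against pred_relations by span boundaries.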
+
+
+def evaluate(model, eval_dataloader, logger, prefix=""):
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = {}".format(len(eval_dataloader.dataset)))
+
+    re_preds = []
+    re_labels = []
+    entities = []
+    eval_loss = 0.0
+    model.eval()
+    for idx, batch in enumerate(eval_dataloader):
+        with paddle.no_grad():
+            outputs = model(**batch)
+            loss = outputs['loss'].mean().item()
+            if paddle.distributed.get_rank() == 0:
+                logger.info("[Eval] process: {}/{}, loss: {:.5f}".format(
+                    idx, len(eval_dataloader), loss))
+
+        eval_loss += loss
+        re_preds.extend(outputs['pred_relations'])
+        re_labels.extend(batch['relations'])
+        entities.extend(batch['entities'])
+    re_metrics = cal_metric(re_preds, re_labels, entities)
+    re_metrics = {
+        "precision": re_metrics["ALL"]["p"],
+        "recall": re_metrics["ALL"]["r"],
+        "f1": re_metrics["ALL"]["f1"],
+    }
+    model.train()
+    return re_metrics
+
+
+def eval(args):
+    logger = get_logger()
+    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
+    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
+
+    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
+
+    model = LayoutXLMForRelationExtraction.from_pretrained(
+        args.model_name_or_path)
+
+    eval_dataset = XFUNDataset(
+        tokenizer,
+        data_dir=args.eval_data_dir,
+        label_path=args.eval_label_path,
+        label2id_map=label2id_map,
+        img_size=(224, 224),
+        max_seq_len=args.max_seq_length,
+        pad_token_label_id=pad_token_label_id,
+        contains_re=True,
+        add_special_ids=False,
+        return_attention_mask=True,
+        load_mode='all')
+
+    eval_dataloader = paddle.io.DataLoader(
+        eval_dataset,
+        batch_size=args.per_gpu_eval_batch_size,
+        num_workers=8,
+        shuffle=False,
+        collate_fn=DataCollator())
+
+    results = evaluate(model, eval_dataloader, logger)
+    logger.info("eval results: {}".format(results))
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    eval(args)
diff --git a/ppstructure/vqa/eval_ser.py b/ppstructure/vqa/eval_ser.py
new file mode 100644
index 00000000..e56aa27c
--- /dev/null
+++ b/ppstructure/vqa/eval_ser.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
+
+import random
+import time
+import copy
+import logging
+
+import argparse
+import paddle
+import numpy as np
+from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
+from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
+from xfun import XFUNDataset
+from utils import parse_args, get_bio_label_maps, print_arguments
+
+from ppocr.utils.logging import get_logger
+
+
+def eval(args):
+    logger = get_logger()
+    print_arguments(args, logger)
+
+    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
+    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
+
+    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
+    model = LayoutXLMForTokenClassification.from_pretrained(
+        args.model_name_or_path)
+
+    eval_dataset = XFUNDataset(
+        tokenizer,
+        data_dir=args.eval_data_dir,
+        label_path=args.eval_label_path,
+        label2id_map=label2id_map,
+        img_size=(224, 224),
+        pad_token_label_id=pad_token_label_id,
+        contains_re=False,
+        add_special_ids=False,
+        return_attention_mask=True,
+        load_mode='all')
+
+    eval_dataloader = paddle.io.DataLoader(
+        eval_dataset,
+        batch_size=args.per_gpu_eval_batch_size,
+        num_workers=0,
+        use_shared_memory=True,
+        collate_fn=None, )
+
+    results, _ = evaluate(args, model, tokenizer, eval_dataloader, label2id_map,
+                          id2label_map, pad_token_label_id, logger)
+
+    logger.info(results)
+
+
+def evaluate(args,
+             model,
+             tokenizer,
+             eval_dataloader,
+             label2id_map,
+             id2label_map,
+             pad_token_label_id,
+             logger,
+             prefix=""):
+
+    eval_loss = 0.0
+    nb_eval_steps = 0
+    preds = None
+    out_label_ids = None
+    model.eval()
+    for idx, batch in enumerate(eval_dataloader):
+        with paddle.no_grad():
+            outputs = model(**batch)
+            tmp_eval_loss, logits = outputs[:2]
+
+            tmp_eval_loss = tmp_eval_loss.mean()
+
+            if paddle.distributed.get_rank() == 0:
+                logger.info("[Eval]process: {}/{}, loss: {:.5f}".format(
+                    idx, len(eval_dataloader), tmp_eval_loss.numpy()[0]))
+
+        eval_loss += tmp_eval_loss.item()
+        nb_eval_steps += 1
+        if preds is None:
+            preds = logits.numpy()
+            out_label_ids = batch["labels"].numpy()
+        else:
+            preds = np.append(preds, logits.numpy(), axis=0)
+            out_label_ids = np.append(
+                out_label_ids, batch["labels"].numpy(), axis=0)
+
+    eval_loss = eval_loss / nb_eval_steps
+    preds = np.argmax(preds, axis=2)
+
+    # label_map = {i: label.upper() for i, label in enumerate(labels)}
+
+    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
+    preds_list = [[] for _ in range(out_label_ids.shape[0])]
+
+    for i in range(out_label_ids.shape[0]):
+        for j in range(out_label_ids.shape[1]):
+            if out_label_ids[i, j] != pad_token_label_id:
+                out_label_list[i].append(id2label_map[out_label_ids[i][j]])
+                preds_list[i].append(id2label_map[preds[i][j]])
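+
+    # Example (the label ids are assumptions for illustration): with
+    # id2label_map = {0: "O", 1: "B-QUESTION", 2: "I-QUESTION"} and
+    # pad_token_label_id = -100, a label row [-100, 1, 2, 0, -100] reduces to
+    # ["B-QUESTION", "I-QUESTION", "O"], so padding and special tokens never
+    # enter the seqeval metrics computed below.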
+
+    results = {
+        "loss": eval_loss,
+        "precision": precision_score(out_label_list, preds_list),
+        "recall": recall_score(out_label_list, preds_list),
+        "f1": f1_score(out_label_list, preds_list),
+    }
+
+    with open(os.path.join(args.output_dir, "test_gt.txt"), "w") as fout:
+        for lbl in out_label_list:
+            for l in lbl:
+                fout.write(l + "\t")
+            fout.write("\n")
+    with open(os.path.join(args.output_dir, "test_pred.txt"), "w") as fout:
+        for lbl in preds_list:
+            for l in lbl:
+                fout.write(l + "\t")
+            fout.write("\n")
+
+    report = classification_report(out_label_list, preds_list)
+    logger.info("\n" + report)
+
+    logger.info("***** Eval results %s *****", prefix)
+    for key in sorted(results.keys()):
+        logger.info("  %s = %s", key, str(results[key]))
+    model.train()
+    return results, preds_list
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    eval(args)
diff --git a/ppstructure/vqa/infer_ser_e2e.py b/ppstructure/vqa/infer_ser_e2e.py
index 3ebb350f..7cd9907d 100644
--- a/ppstructure/vqa/infer_ser_e2e.py
+++ b/ppstructure/vqa/infer_ser_e2e.py
@@ -24,9 +24,9 @@ import paddle
 from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
 
 # relative reference
-from .utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
+from utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
 
-from .utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info
+from utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info
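+# NOTE: plain (non-relative) imports let this file run directly as a script,
+# e.g. `python3.7 infer_ser_e2e.py ...`, not only as part of a package.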
 
 
 def trans_poly_to_bbox(poly):
diff --git a/ppstructure/vqa/requirements.txt b/ppstructure/vqa/requirements.txt
index c259fadc..9c935ae6 100644
--- a/ppstructure/vqa/requirements.txt
+++ b/ppstructure/vqa/requirements.txt
@@ -1,2 +1,3 @@
 sentencepiece
 yacs
+seqeval
\ No newline at end of file
diff --git a/ppstructure/vqa/train_re.py b/ppstructure/vqa/train_re.py
index ed19646c..748c5e11 100644
--- a/ppstructure/vqa/train_re.py
+++ b/ppstructure/vqa/train_re.py
@@ -20,80 +20,20 @@ sys.path.append(__dir__)
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
 
 import random
+import time
 
 import numpy as np
 import paddle
 from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction
 from xfun import XFUNDataset
-from utils import parse_args, get_bio_label_maps, print_arguments
+from utils import parse_args, get_bio_label_maps, print_arguments, set_seed
 from data_collator import DataCollator
-from metric import re_score
+from eval_re import evaluate
 
 from ppocr.utils.logging import get_logger
 
 
-def set_seed(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    paddle.seed(seed)
-
-
-def cal_metric(re_preds, re_labels, entities):
-    gt_relations = []
-    for b in range(len(re_labels)):
-        rel_sent = []
-        for head, tail in zip(re_labels[b]["head"], re_labels[b]["tail"]):
-            rel = {}
-            rel["head_id"] = head
-            rel["head"] = (entities[b]["start"][rel["head_id"]],
-                           entities[b]["end"][rel["head_id"]])
-            rel["head_type"] = entities[b]["label"][rel["head_id"]]
-
-            rel["tail_id"] = tail
-            rel["tail"] = (entities[b]["start"][rel["tail_id"]],
-                           entities[b]["end"][rel["tail_id"]])
-            rel["tail_type"] = entities[b]["label"][rel["tail_id"]]
-
-            rel["type"] = 1
-            rel_sent.append(rel)
-        gt_relations.append(rel_sent)
-    re_metrics = re_score(re_preds, gt_relations, mode="boundaries")
-    return re_metrics
-
-
-def evaluate(model, eval_dataloader, logger, prefix=""):
-    # Eval!
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = {}".format(len(eval_dataloader.dataset)))
-
-    re_preds = []
-    re_labels = []
-    entities = []
-    eval_loss = 0.0
-    model.eval()
-    for idx, batch in enumerate(eval_dataloader):
-        with paddle.no_grad():
-            outputs = model(**batch)
-            loss = outputs['loss'].mean().item()
-            if paddle.distributed.get_rank() == 0:
-                logger.info("[Eval] process: {}/{}, loss: {:.5f}".format(
-                    idx, len(eval_dataloader), loss))
-
-        eval_loss += loss
-        re_preds.extend(outputs['pred_relations'])
-        re_labels.extend(batch['relations'])
-        entities.extend(batch['entities'])
-    re_metrics = cal_metric(re_preds, re_labels, entities)
-    re_metrics = {
-        "precision": re_metrics["ALL"]["p"],
-        "recall": re_metrics["ALL"]["r"],
-        "f1": re_metrics["ALL"]["f1"],
-    }
-    model.train()
-    return re_metrics
-
-
 def train(args):
     logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
     print_arguments(args, logger)
@@ -109,9 +49,14 @@ def train(args):
         paddle.distributed.init_parallel_env()
 
     tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
-
-    model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
-    model = LayoutXLMForRelationExtraction(model, dropout=None)
+    if not args.resume:
+        model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
+        model = LayoutXLMForRelationExtraction(model, dropout=None)
+        logger.info('train from scratch')
+    else:
+        logger.info('resume from {}'.format(args.model_name_or_path))
+        model = LayoutXLMForRelationExtraction.from_pretrained(
+            args.model_name_or_path)
 
     # dist mode
     if paddle.distributed.get_world_size() > 1:
@@ -200,24 +145,45 @@ def train(args):
     best_metirc = {'f1': 0}
     model.train()
 
+    train_reader_cost = 0.0
+    train_run_cost = 0.0
+    total_samples = 0
+    reader_start = time.time()
+
+    print_step = 1
+
     for epoch in range(int(args.num_train_epochs)):
         for step, batch in enumerate(train_dataloader):
+            train_reader_cost += time.time() - reader_start
+            train_start = time.time()
             outputs = model(**batch)
+            train_run_cost += time.time() - train_start
             # model outputs are always tuple in ppnlp (see doc)
             loss = outputs['loss']
             loss = loss.mean()
 
-            logger.info(
-                "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {}, lr: {}".
-                format(epoch, args.num_train_epochs, step, train_dataloader_len,
-                       global_step, np.mean(loss.numpy()), optimizer.get_lr()))
-
             loss.backward()
             optimizer.step()
             optimizer.clear_grad()
             # lr_scheduler.step()  # Update learning rate schedule
             global_step += 1
+            total_samples += batch['image'].shape[0]
+
+            if step % print_step == 0:
+                logger.info(
+                    "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
+                    format(epoch, args.num_train_epochs, step,
+                           train_dataloader_len, global_step,
+                           np.mean(loss.numpy()),
+                           optimizer.get_lr(), train_reader_cost / print_step, (
+                               train_reader_cost + train_run_cost) / print_step,
+                           total_samples / print_step, total_samples / (
+                               train_reader_cost + train_run_cost)))
+
+                train_reader_cost = 0.0
+                train_run_cost = 0.0
+                total_samples = 0
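+
+                # Worked example for the ips figure above (numbers are
+                # illustrative only): if the last print_step batches moved
+                # 8 samples in 0.1 s of data loading plus 0.4 s of
+                # forward/backward time, ips = 8 / (0.1 + 0.4) = 16 images/sec.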
 
             if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
                     global_step % args.eval_steps == 0):
@@ -225,10 +191,9 @@ def train(args):
                 # Log metrics
                 if (paddle.distributed.get_rank() == 0 and args.
                         evaluate_during_training):
                     # Only evaluate when single GPU otherwise metrics may not average well
                     results = evaluate(model, eval_dataloader, logger)
-                    if results['f1'] > best_metirc['f1']:
+                    if results['f1'] >= best_metirc['f1']:
                         best_metirc = results
-                        output_dir = os.path.join(args.output_dir,
-                                                  "checkpoint-best")
+                        output_dir = os.path.join(args.output_dir, "best_model")
                         os.makedirs(output_dir, exist_ok=True)
                         model.save_pretrained(output_dir)
                         tokenizer.save_pretrained(output_dir)
@@ -240,10 +205,9 @@ def train(args):
                 logger.info("eval results: {}".format(results))
                 logger.info("best_metirc: {}".format(best_metirc))
 
-            if (paddle.distributed.get_rank() == 0 and args.save_steps > 0 and
-                    global_step % args.save_steps == 0):
+            if paddle.distributed.get_rank() == 0:
                 # Save model checkpoint
-                output_dir = os.path.join(args.output_dir, "checkpoint-latest")
+                output_dir = os.path.join(args.output_dir, "latest_model")
                 os.makedirs(output_dir, exist_ok=True)
                 if paddle.distributed.get_rank() == 0:
                     model.save_pretrained(output_dir)
                     tokenizer.save_pretrained(output_dir)
                     paddle.save(args,
                                 os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to {}".format(
                         output_dir))
+            reader_start = time.time()
 
     logger.info("best_metirc: {}".format(best_metirc))
diff --git a/ppstructure/vqa/train_ser.py b/ppstructure/vqa/train_ser.py
index d3144e71..a722f921 100644
--- a/ppstructure/vqa/train_ser.py
+++ b/ppstructure/vqa/train_ser.py
@@ -20,6 +20,7 @@ sys.path.append(__dir__)
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
 
 import random
+import time
 import copy
 import logging
 
@@ -29,19 +30,11 @@ import numpy as np
 from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
 from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
 from xfun import XFUNDataset
-from utils import parse_args
-from utils import get_bio_label_maps
-from utils import print_arguments
-
+from utils import parse_args, get_bio_label_maps, print_arguments, set_seed
+from eval_ser import evaluate
 from ppocr.utils.logging import get_logger
 
 
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    paddle.seed(args.seed)
-
-
 def train(args):
     os.makedirs(args.output_dir, exist_ok=True)
     logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
@@ -55,9 +48,15 @@ def train(args):
         paddle.distributed.init_parallel_env()
 
     tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
-    base_model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
-    model = LayoutXLMForTokenClassification(
-        base_model, num_classes=len(label2id_map), dropout=None)
+    if not args.resume:
+        model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
+        model = LayoutXLMForTokenClassification(
+            model, num_classes=len(label2id_map), dropout=None)
+        logger.info('train from scratch')
+    else:
+        logger.info('resume from {}'.format(args.model_name_or_path))
+        model = LayoutXLMForTokenClassification.from_pretrained(
+            args.model_name_or_path)
 
     # dist mode
     if paddle.distributed.get_world_size() > 1:
@@ -74,6 +73,17 @@ def train(args):
         add_special_ids=False,
         return_attention_mask=True,
         load_mode='all')
+    eval_dataset = XFUNDataset(
+        tokenizer,
+        data_dir=args.eval_data_dir,
+        label_path=args.eval_label_path,
+        label2id_map=label2id_map,
+        img_size=(224, 224),
+        pad_token_label_id=pad_token_label_id,
+        contains_re=False,
+        add_special_ids=False,
+        return_attention_mask=True,
+        load_mode='all')
 
     train_sampler = paddle.io.DistributedBatchSampler(
         train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
@@ -88,6 +98,13 @@ def train(args):
         use_shared_memory=True,
         collate_fn=None, )
 
+    eval_dataloader = paddle.io.DataLoader(
+        eval_dataset,
+        batch_size=args.per_gpu_eval_batch_size,
+        num_workers=0,
+        use_shared_memory=True,
+        collate_fn=None, )
+
     t_total = len(train_dataloader) * args.num_train_epochs
 
     # build linear decay with warmup lr sch
@@ -122,28 +139,49 @@ def train(args):
 
     global_step = 0
     tr_loss = 0.0
-    set_seed(args)
+    set_seed(args.seed)
     best_metrics = None
 
+    train_reader_cost = 0.0
+    train_run_cost = 0.0
+    total_samples = 0
+    reader_start = time.time()
+
+    print_step = 1
+
     model.train()
     for epoch_id in range(args.num_train_epochs):
         for step, batch in enumerate(train_dataloader):
-            model.train()
+            train_reader_cost += time.time() - reader_start
+
+            train_start = time.time()
             outputs = model(**batch)
+            train_run_cost += time.time() - train_start
+
             # model outputs are always tuple in ppnlp (see doc)
             loss = outputs[0]
             loss = loss.mean()
 
-            logger.info(
-                "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {}, lr: {}".
-                format(epoch_id, args.num_train_epochs, step,
-                       len(train_dataloader), global_step,
-                       loss.numpy()[0], lr_scheduler.get_lr()))
-
             loss.backward()
             tr_loss += loss.item()
             optimizer.step()
             lr_scheduler.step()  # Update learning rate schedule
             optimizer.clear_grad()
             global_step += 1
+            total_samples += batch['image'].shape[0]
+
+            if step % print_step == 0:
+                logger.info(
+                    "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
+                    format(epoch_id, args.num_train_epochs, step,
+                           len(train_dataloader), global_step,
+                           loss.numpy()[0],
+                           lr_scheduler.get_lr(), train_reader_cost /
+                           print_step, (train_reader_cost + train_run_cost) /
+                           print_step, total_samples / print_step, total_samples
+                           / (train_reader_cost + train_run_cost)))
+
+                train_reader_cost = 0.0
+                train_run_cost = 0.0
+                total_samples = 0
 
             if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
                     global_step % args.eval_steps == 0):
@@ -151,9 +189,9 @@ def train(args):
                 # Only evaluate when single GPU otherwise metrics may not average well
                 if paddle.distributed.get_rank(
                 ) == 0 and args.evaluate_during_training:
-                    results, _ = evaluate(args, model, tokenizer, label2id_map,
-                                          id2label_map, pad_token_label_id,
-                                          logger)
+                    results, _ = evaluate(
+                        args, model, tokenizer, eval_dataloader, label2id_map,
+                        id2label_map, pad_token_label_id, logger)
 
                     if best_metrics is None or results["f1"] >= best_metrics[
                             "f1"]:
@@ -175,11 +213,9 @@ def train(args):
             if best_metrics is not None:
                 logger.info("best metrics: {}".format(best_metrics))
 
-            if paddle.distributed.get_rank(
-            ) == 0 and args.save_steps > 0 and global_step % args.save_steps == 0:
+            if paddle.distributed.get_rank() == 0:
                 # Save model checkpoint
-                output_dir = os.path.join(args.output_dir,
-                                          "checkpoint-{}".format(global_step))
+                output_dir = os.path.join(args.output_dir, "latest_model")
                 os.makedirs(output_dir, exist_ok=True)
                 if paddle.distributed.get_rank() == 0:
                     model.save_pretrained(output_dir)
@@ -187,112 +223,10 @@ def train(args):
                     tokenizer.save_pretrained(output_dir)
                     paddle.save(args,
                                 os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
-
+            reader_start = time.time()
     return global_step, tr_loss / global_step
 
 
-def evaluate(args,
-             model,
-             tokenizer,
-             label2id_map,
-             id2label_map,
-             pad_token_label_id,
-             logger,
prefix=""): - eval_dataset = XFUNDataset( - tokenizer, - data_dir=args.eval_data_dir, - label_path=args.eval_label_path, - label2id_map=label2id_map, - img_size=(224, 224), - pad_token_label_id=pad_token_label_id, - contains_re=False, - add_special_ids=False, - return_attention_mask=True, - load_mode='all') - - args.eval_batch_size = args.per_gpu_eval_batch_size * max( - 1, paddle.distributed.get_world_size()) - - eval_dataloader = paddle.io.DataLoader( - eval_dataset, - batch_size=args.eval_batch_size, - num_workers=0, - use_shared_memory=True, - collate_fn=None, ) - - # Eval! - logger.info("***** Running evaluation %s *****", prefix) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - model.eval() - for idx, batch in enumerate(eval_dataloader): - with paddle.no_grad(): - outputs = model(**batch) - tmp_eval_loss, logits = outputs[:2] - - tmp_eval_loss = tmp_eval_loss.mean() - - if paddle.distributed.get_rank() == 0: - logger.info("[Eval]process: {}/{}, loss: {:.5f}".format( - idx, len(eval_dataloader), tmp_eval_loss.numpy()[0])) - - eval_loss += tmp_eval_loss.item() - nb_eval_steps += 1 - if preds is None: - preds = logits.numpy() - out_label_ids = batch["labels"].numpy() - else: - preds = np.append(preds, logits.numpy(), axis=0) - out_label_ids = np.append( - out_label_ids, batch["labels"].numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - preds = np.argmax(preds, axis=2) - - # label_map = {i: label.upper() for i, label in enumerate(labels)} - - out_label_list = [[] for _ in range(out_label_ids.shape[0])] - preds_list = [[] for _ in range(out_label_ids.shape[0])] - - for i in range(out_label_ids.shape[0]): - for j in range(out_label_ids.shape[1]): - if out_label_ids[i, j] != pad_token_label_id: - out_label_list[i].append(id2label_map[out_label_ids[i][j]]) - preds_list[i].append(id2label_map[preds[i][j]]) - - results = { - "loss": eval_loss, - "precision": precision_score(out_label_list, preds_list), - "recall": recall_score(out_label_list, preds_list), - "f1": f1_score(out_label_list, preds_list), - } - - with open(os.path.join(args.output_dir, "test_gt.txt"), "w") as fout: - for lbl in out_label_list: - for l in lbl: - fout.write(l + "\t") - fout.write("\n") - with open(os.path.join(args.output_dir, "test_pred.txt"), "w") as fout: - for lbl in preds_list: - for l in lbl: - fout.write(l + "\t") - fout.write("\n") - - report = classification_report(out_label_list, preds_list) - logger.info("\n" + report) - - logger.info("***** Eval results %s *****", prefix) - for key in sorted(results.keys()): - logger.info(" %s = %s", key, str(results[key])) - - return results, preds_list - - if __name__ == "__main__": args = parse_args() train(args) diff --git a/ppstructure/vqa/utils.py b/ppstructure/vqa/utils.py index f4db20d5..7e862e97 100644 --- a/ppstructure/vqa/utils.py +++ b/ppstructure/vqa/utils.py @@ -25,6 +25,12 @@ import paddle from PIL import Image, ImageDraw, ImageFont +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + def get_bio_label_maps(label_map_path): with open(label_map_path, "r") as fin: lines = fin.readlines() @@ -375,8 +381,6 @@ def parse_args(): help="Linear warmup over warmup_steps.",) parser.add_argument("--eval_steps", type=int, default=10, help="eval every X updates steps.",) - parser.add_argument("--save_steps", type=int, default=50, - help="Save checkpoint every X updates steps.",) 
parser.add_argument("--seed", type=int, default=2048, help="random seed for initialization",) @@ -385,6 +389,7 @@ def parse_args(): parser.add_argument( "--label_map_path", default="./labels/labels_ser.txt", type=str, required=False, ) parser.add_argument("--infer_imgs", default=None, type=str, required=False) + parser.add_argument("--resume", action='store_true') parser.add_argument("--ocr_json_path", default=None, type=str, required=False, help="ocr prediction results") # yapf: enable -- GitLab