Unverified · Commit 68deaab1 authored by MissPenguin, committed by GitHub

Merge pull request #4967 from WenmuZhou/fix_vqa

SER and RE support distributed training
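Context for the diffs below: both training scripts read the distributed environment that PaddlePaddle's launcher sets up, then gate logging, evaluation, and checkpoint saving on rank 0. The following is a minimal sanity-check sketch of that environment, not part of this PR; the launch command and the `check_env.py` name are assumptions about typical usage.

```python
# check_env.py -- inspect the distributed context (not part of this PR).
# Under a typical multi-GPU launch, e.g.
#   python3 -m paddle.distributed.launch --gpus "0,1" check_env.py
# the launcher starts one process per GPU; these calls report which process
# we are and how many were started.
import paddle

rank = paddle.distributed.get_rank()              # 0 .. world_size - 1
world_size = paddle.distributed.get_world_size()  # 1 when run without the launcher

print("rank: {}, world_size: {}".format(rank, world_size))
```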
......@@ -15,13 +15,12 @@
import os
import re
import sys
# import Polygon
import shapely
from shapely.geometry import Polygon
import numpy as np
from collections import defaultdict
import operator
-import editdistance
+import Levenshtein
import argparse
import json
import copy
......@@ -95,7 +94,7 @@ def ed(args, str1, str2):
if args.ignore_case:
str1 = str1.lower()
str2 = str2.lower()
-return editdistance.eval(str1, str2)
+return Levenshtein.distance(str1, str2)
def convert_bbox_to_polygon(bbox):
......@@ -115,8 +114,6 @@ def eval_e2e(args):
# pred
dt_results = parse_ser_results_fp(args.pred_json_path, "pred",
args.ignore_background)
-assert set(gt_results.keys()) == set(dt_results.keys())
iou_thresh = args.iou_thres
num_gt_chars = 0
gt_count = 0
......@@ -124,7 +121,7 @@ def eval_e2e(args):
hit = 0
ed_sum = 0
-for img_name in gt_results:
+for img_name in dt_results:
gt_info = gt_results[img_name]
gt_count += len(gt_info)
......
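The eval script above swaps `editdistance.eval` for `Levenshtein.distance`. Both compute the same Levenshtein edit distance, so the reported metric does not change; a quick check (assuming both the `editdistance` and `python-Levenshtein` packages are installed) is:

```python
import editdistance
import Levenshtein

# The two libraries agree on the edit distance, so the swap only changes the
# dependency, not the numbers produced by ed().
for a, b in [("hello", "hallo"), ("receipt", "recipt"), ("", "abc")]:
    assert editdistance.eval(a, b) == Levenshtein.distance(a, b)
    print(a, b, Levenshtein.distance(a, b))
```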
......@@ -36,6 +36,9 @@ from ppocr.utils.logging import get_logger
def train(args):
logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
+rank = paddle.distributed.get_rank()
+distributed = paddle.distributed.get_world_size() > 1
print_arguments(args, logger)
# Added here for reproducibility (even between python 2 and 3)
......@@ -45,7 +48,7 @@ def train(args):
pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
# dist mode
-if paddle.distributed.get_world_size() > 1:
+if distributed:
paddle.distributed.init_parallel_env()
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
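The conditional setup above, together with the `paddle.DataParallel` wrap in the next hunk, reduces to the sketch below. The helper name and the placeholder `paddle.nn.Linear` model are ours; the diff applies the same calls to the LayoutXLM model.

```python
import paddle

def setup_distributed(model):
    # paddle.distributed.launch starts one process per GPU; with a single
    # process this is a no-op and the model is returned unwrapped.
    distributed = paddle.distributed.get_world_size() > 1
    if distributed:
        paddle.distributed.init_parallel_env()   # must run before wrapping
        model = paddle.DataParallel(model)       # all-reduces gradients across ranks
    return model, distributed

model, distributed = setup_distributed(paddle.nn.Linear(16, 2))
```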
......@@ -59,8 +62,8 @@ def train(args):
args.model_name_or_path)
# dist mode
-if paddle.distributed.get_world_size() > 1:
-model = paddle.distributed.DataParallel(model)
+if distributed:
+model = paddle.DataParallel(model)
train_dataset = XFUNDataset(
tokenizer,
......@@ -90,8 +93,7 @@ def train(args):
train_sampler = paddle.io.DistributedBatchSampler(
train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
args.train_batch_size = args.per_gpu_train_batch_size * \
max(1, paddle.distributed.get_world_size())
train_dataloader = paddle.io.DataLoader(
train_dataset,
batch_sampler=train_sampler,
......@@ -136,7 +138,8 @@ def train(args):
args.per_gpu_train_batch_size))
logger.info(
" Total train batch size (w. parallel, distributed & accumulation) = {}".
-format(args.train_batch_size * paddle.distributed.get_world_size()))
+format(args.per_gpu_train_batch_size *
+paddle.distributed.get_world_size()))
logger.info(" Total optimization steps = {}".format(t_total))
global_step = 0
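The logging change above reports the total batch size as `per_gpu_train_batch_size * world_size`, because `paddle.io.DistributedBatchSampler` gives each rank its own per-GPU-sized shard of every global batch. A self-contained sketch of that relationship, using a toy dataset in place of XFUNDataset (all names here are illustrative):

```python
import numpy as np
import paddle
from paddle.io import DataLoader, Dataset, DistributedBatchSampler

class ToyDataset(Dataset):
    """Stand-in for XFUNDataset: 32 integer samples."""
    def __init__(self, n=32):
        self.data = np.arange(n, dtype="int64")
    def __getitem__(self, idx):
        return self.data[idx]
    def __len__(self):
        return len(self.data)

per_gpu_train_batch_size = 4
dataset = ToyDataset()
# Each rank iterates only over its own, disjoint shard of the dataset.
sampler = DistributedBatchSampler(
    dataset, batch_size=per_gpu_train_batch_size, shuffle=True)
loader = DataLoader(dataset, batch_sampler=sampler)

world_size = paddle.distributed.get_world_size()
total_batch_size = per_gpu_train_batch_size * max(1, world_size)
print("per-GPU batch:", per_gpu_train_batch_size, "total batch:", total_batch_size)
```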
......@@ -170,7 +173,7 @@ def train(args):
global_step += 1
total_samples += batch['image'].shape[0]
-if step % print_step == 0:
+if rank == 0 and step % print_step == 0:
logger.info(
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
format(epoch, args.num_train_epochs, step,
......@@ -185,38 +188,38 @@ def train(args):
train_run_cost = 0.0
total_samples = 0
-if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
-global_step % args.eval_steps == 0):
+if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
# Log metrics
-if (paddle.distributed.get_rank() == 0 and args.
-evaluate_during_training):  # Only evaluate when single GPU otherwise metrics may not average well
+# Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(model, eval_dataloader, logger)
if results['f1'] >= best_metirc['f1']:
best_metirc = results
output_dir = os.path.join(args.output_dir, "best_model")
os.makedirs(output_dir, exist_ok=True)
if distributed:
model._layers.save_pretrained(output_dir)
else:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
paddle.save(args,
-os.path.join(output_dir,
-"training_args.bin"))
+os.path.join(output_dir, "training_args.bin"))
logger.info("Saving model checkpoint to {}".format(
output_dir))
logger.info("eval results: {}".format(results))
logger.info("best_metirc: {}".format(best_metirc))
+reader_start = time.time()
-if paddle.distributed.get_rank() == 0:
+if rank == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, "latest_model")
os.makedirs(output_dir, exist_ok=True)
-if paddle.distributed.get_rank() == 0:
+if distributed:
model._layers.save_pretrained(output_dir)
else:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
-paddle.save(args,
-os.path.join(output_dir, "training_args.bin"))
-logger.info("Saving model checkpoint to {}".format(
-output_dir))
-reader_start = time.time()
+paddle.save(args, os.path.join(output_dir, "training_args.bin"))
+logger.info("Saving model checkpoint to {}".format(output_dir))
logger.info("best_metirc: {}".format(best_metirc))
......
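Both scripts now save checkpoints only on rank 0 and unwrap `paddle.DataParallel` via `._layers` before calling `save_pretrained`. A condensed sketch of that logic follows; the helper name and argument list are ours, while `save_pretrained` belongs to the LayoutXLM model and tokenizer used in the diff.

```python
import os
import paddle

def save_checkpoint(model, tokenizer, args, output_dir, rank, distributed):
    # Only the first process writes to disk; every other rank returns early.
    if rank != 0:
        return
    os.makedirs(output_dir, exist_ok=True)
    # paddle.DataParallel wraps the real network; the underlying layer with
    # save_pretrained() lives on ._layers.
    layer = model._layers if distributed else model
    layer.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    paddle.save(args, os.path.join(output_dir, "training_args.bin"))
```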
......@@ -37,6 +37,9 @@ from ppocr.utils.logging import get_logger
def train(args):
os.makedirs(args.output_dir, exist_ok=True)
+rank = paddle.distributed.get_rank()
+distributed = paddle.distributed.get_world_size() > 1
logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
print_arguments(args, logger)
......@@ -44,7 +47,7 @@ def train(args):
pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
# dist mode
-if paddle.distributed.get_world_size() > 1:
+if distributed:
paddle.distributed.init_parallel_env()
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
......@@ -59,7 +62,7 @@ def train(args):
args.model_name_or_path)
# dist mode
-if paddle.distributed.get_world_size() > 1:
+if distributed:
model = paddle.DataParallel(model)
train_dataset = XFUNDataset(
......@@ -88,9 +91,6 @@ def train(args):
train_sampler = paddle.io.DistributedBatchSampler(
train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
-args.train_batch_size = args.per_gpu_train_batch_size * max(
-1, paddle.distributed.get_world_size())
train_dataloader = paddle.io.DataLoader(
train_dataset,
batch_sampler=train_sampler,
......@@ -134,7 +134,7 @@ def train(args):
args.per_gpu_train_batch_size)
logger.info(
" Total train batch size (w. parallel, distributed) = %d",
-args.train_batch_size * paddle.distributed.get_world_size(), )
+args.per_gpu_train_batch_size * paddle.distributed.get_world_size(), )
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
......@@ -168,7 +168,7 @@ def train(args):
global_step += 1
total_samples += batch['image'].shape[0]
-if step % print_step == 0:
+if rank == 0 and step % print_step == 0:
logger.info(
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
format(epoch_id, args.num_train_epochs, step,
......@@ -183,47 +183,43 @@ def train(args):
train_run_cost = 0.0
total_samples = 0
-if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
-global_step % args.eval_steps == 0):
+if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
# Log metrics
# Only evaluate when single GPU otherwise metrics may not average well
-if paddle.distributed.get_rank(
-) == 0 and args.evaluate_during_training:
-results, _ = evaluate(
-args, model, tokenizer, eval_dataloader, label2id_map,
-id2label_map, pad_token_label_id, logger)
-if best_metrics is None or results["f1"] >= best_metrics[
-"f1"]:
+results, _ = evaluate(args, model, tokenizer, eval_dataloader,
+label2id_map, id2label_map,
+pad_token_label_id, logger)
+if best_metrics is None or results["f1"] >= best_metrics["f1"]:
best_metrics = copy.deepcopy(results)
output_dir = os.path.join(args.output_dir, "best_model")
os.makedirs(output_dir, exist_ok=True)
-if paddle.distributed.get_rank() == 0:
+if distributed:
model._layers.save_pretrained(output_dir)
else:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
-paddle.save(
-args,
+paddle.save(args,
os.path.join(output_dir, "training_args.bin"))
logger.info("Saving model checkpoint to %s",
output_dir)
logger.info("Saving model checkpoint to %s", output_dir)
logger.info("[epoch {}/{}][iter: {}/{}] results: {}".format(
epoch_id, args.num_train_epochs, step,
len(train_dataloader), results))
if best_metrics is not None:
logger.info("best metrics: {}".format(best_metrics))
-if paddle.distributed.get_rank() == 0:
+reader_start = time.time()
+if rank == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, "latest_model")
os.makedirs(output_dir, exist_ok=True)
-if paddle.distributed.get_rank() == 0:
+if distributed:
model._layers.save_pretrained(output_dir)
else:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
-paddle.save(args,
-os.path.join(output_dir, "training_args.bin"))
+paddle.save(args, os.path.join(output_dir, "training_args.bin"))
logger.info("Saving model checkpoint to %s", output_dir)
-reader_start = time.time()
return global_step, tr_loss / global_step
......