infer_re.py 5.4 KB
Newer Older
文幕地方's avatar
add re  
文幕地方 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
import os
import sys

# Absolute path of the directory that contains this script.
# NOTE(review): a module-level `__dir__` is also the PEP 562 hook consulted by
# dir(module); binding it to a string may break dir() on this module — consider
# renaming if that ever matters.
__dir__ = os.path.dirname(os.path.abspath(__file__))
# Put the script's own directory on the import path so the sibling-module
# imports below (xfun, utils, data_collator) can be resolved.
sys.path.append(__dir__)
# Also expose the directory two levels above this script.
# NOTE(review): presumably the repository root that provides the `ppocr`
# package imported below — confirm against the project layout.
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))

import random

import cv2
import matplotlib.pyplot as plt
import numpy as np
import paddle

from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction

from xfun import XFUNDataset
from utils import parse_args, get_bio_label_maps, draw_re_results
from data_collator import DataCollator

from ppocr.utils.logging import get_logger


def infer(args):
    """Run relation-extraction inference and save one annotated image per batch.

    The function loads the tokenizer and the LayoutXLM relation-extraction
    model from ``args.model_name_or_path``, builds the evaluation dataset and
    dataloader, predicts relations for every batch, maps the predicted
    head/tail ids back to the ground-truth OCR entries and writes a
    visualization into ``args.output_dir``.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``output_dir``, ``label_map_path``, ``model_name_or_path``,
            ``eval_data_dir``, ``eval_label_path``, ``max_seq_length`` and
            ``per_gpu_eval_batch_size``.

    Note:
        The OCR record is looked up by the *batch* index and
        ``filter_bg_by_txt`` is given the whole batch, so the pairing between
        predictions and OCR records appears to be correct only when
        ``per_gpu_eval_batch_size`` is 1 — confirm before using larger
        batches.
    """
    os.makedirs(args.output_dir, exist_ok=True)
    logger = get_logger()
    # Only the label -> id direction is needed to build the dataset.
    label2id_map, _ = get_bio_label_maps(args.label_map_path)
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)

    model = LayoutXLMForRelationExtraction.from_pretrained(
        args.model_name_or_path)
    # Inference must run in evaluation mode; otherwise layers such as dropout
    # keep their training behaviour and make predictions non-deterministic.
    model.eval()

    eval_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=pad_token_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=8,
        shuffle=False,
        collate_fn=DataCollator())

    # Load the ground-truth OCR data.
    ocr_info_list = load_ocr(args.eval_data_dir, args.eval_label_path)

    num_batches = len(eval_dataloader)
    for idx, batch in enumerate(eval_dataloader):
        logger.info("[Infer] process: {}/{}".format(idx, num_batches))
        with paddle.no_grad():
            outputs = model(**batch)
        pred_relations = outputs['pred_relations']

        ocr_record = ocr_info_list[idx]
        image_path = ocr_record['image_path']
        ocr_info = ocr_record['ocr_info']

        # Decode the tokens of every entity and drop the OCR entries whose
        # text does not match any entity.
        ocr_info = filter_bg_by_txt(ocr_info, batch, tokenizer)

        # Convert the predicted relations into (head, tail) OCR entry pairs.
        result = []
        # Each tail may be linked only once; a set keeps the check O(1).
        used_tail_ids = set()
        for relations in pred_relations:
            for relation in relations:
                head_id = relation['head_id']
                tail_id = relation['tail_id']
                if tail_id in used_tail_ids:
                    continue
                # Skip relations that point at entries removed by the filter.
                if head_id not in ocr_info or tail_id not in ocr_info:
                    continue
                used_tail_ids.add(tail_id)
                result.append((ocr_info[head_id], ocr_info[tail_id]))

        img = cv2.imread(image_path)
        img_show = draw_re_results(img, result)
        save_path = os.path.join(args.output_dir, os.path.basename(image_path))
        cv2.imwrite(save_path, img_show)


def load_ocr(img_folder, json_path):
    """Load per-image OCR annotations from a tab-separated label file.

    Every non-blank line of the file is expected to look like
    ``<image name>\\t<JSON object>``. The JSON object is parsed and an
    ``image_path`` key holding ``img_folder`` joined with the image name is
    added to it.

    Args:
        img_folder: Directory that contains the images named in the file.
        json_path: Path of the label file.

    Returns:
        list[dict]: One parsed record per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line has no tab separator or its payload
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    import json

    records = []
    # Read as UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(json_path, "r", encoding="utf-8") as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            # Split on the first tab only: tabs inside the JSON payload are
            # legal whitespace and must stay part of the payload.
            image_name, info_str = line.split("\t", 1)
            info_dict = json.loads(info_str)
            info_dict['image_path'] = os.path.join(img_folder, image_name)
            records.append(info_dict)
    return records


def filter_bg_by_txt(ocr_info, batch, tokenizer):
    """Keep only the OCR entries whose text equals a decoded entity string.

    Only the first sample of ``batch`` is inspected. For every entity span
    (``start``/``end`` offsets into ``input_ids``) the token ids are decoded
    back to a string with ``tokenizer``; every entry of ``ocr_info`` whose
    ``'text'`` equals that string is kept.

    Args:
        ocr_info: Sequence of dicts, each with a ``'text'`` key.
        batch: Mapping with ``'entities'`` and ``'input_ids'`` entries.
        tokenizer: Object providing ``convert_ids_to_tokens`` and
            ``convert_tokens_to_string``.

    Returns:
        dict: Maps the position of each kept entry in ``ocr_info`` to the
        entry itself.
    """
    sample_entities = batch['entities'][0]
    sample_ids = batch['input_ids'][0]

    kept = {}
    for pos, span_begin in enumerate(sample_entities['start']):
        span_end = sample_entities['end'][pos]
        span_ids = sample_ids[span_begin:span_end].numpy().tolist()
        # Decode the entity's token ids back into plain text.
        entity_text = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(span_ids))

        # Record every OCR entry carrying exactly this text.
        kept.update({
            ocr_idx: entry
            for ocr_idx, entry in enumerate(ocr_info)
            if entry['text'] == entity_text
        })
    return kept


def post_process(pred_relations, ocr_info, img):
    """Turn predicted relations into (head entry, tail entry) pairs.

    Args:
        pred_relations: Iterable of relation groups; every relation is a
            mapping with ``'head_id'`` and ``'tail_id'`` keys.
        ocr_info: Container indexed by those ids.
        img: Accepted for interface compatibility; not used.

    Returns:
        list[tuple]: One ``(ocr_info[head_id], ocr_info[tail_id])`` pair per
        relation, in iteration order.
    """
    return [
        (ocr_info[link['head_id']], ocr_info[link['tail_id']])
        for group in pred_relations
        for link in group
    ]


def draw_re(result, image_path, output_folder):
    """Draw linked head/tail boxes on an image and save the figure.

    For every ``(head, tail)`` pair the head box is outlined in red, the tail
    box in blue, and a green line joins the two box centres. The figure is
    written to ``output_folder`` under the image's base name.

    Args:
        result: Iterable of ``(head, tail)`` pairs; each element is a mapping
            whose ``'bbox'`` is read as ``[x1, y1, x2, y2]``.
        image_path: Path of the image to annotate.
        output_folder: Directory the rendered figure is saved into.

    Raises:
        ValueError: If the image cannot be read.
    """
    from matplotlib import pyplot as plt

    img = cv2.imread(image_path)
    if img is None:
        raise ValueError("Failed to read image: {}".format(image_path))
    # OpenCV loads pixels in BGR order while matplotlib renders RGB; convert
    # up front so the photo keeps its real colours and the colour tuples
    # below are plain RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    for ocr_info_head, ocr_info_tail in result:
        head_box = ocr_info_head['bbox']
        tail_box = ocr_info_tail['bbox']
        cv2.rectangle(
            img, tuple(head_box[:2]), tuple(head_box[2:]), (255, 0, 0),
            thickness=2)
        cv2.rectangle(
            img, tuple(tail_box[:2]), tuple(tail_box[2:]), (0, 0, 255),
            thickness=2)
        head_center = ((head_box[0] + head_box[2]) // 2,
                       (head_box[1] + head_box[3]) // 2)
        tail_center = ((tail_box[0] + tail_box[2]) // 2,
                       (tail_box[1] + tail_box[3]) // 2)
        cv2.line(img, head_center, tail_center, (0, 255, 0), thickness=2)

    # Use a dedicated figure and always close it, so repeated calls neither
    # stack images on the same axes nor leak figure memory.
    fig = plt.figure()
    try:
        plt.imshow(img)
        plt.savefig(
            os.path.join(output_folder, os.path.basename(image_path)),
            dpi=600)
    finally:
        plt.close(fig)


if __name__ == "__main__":
    # Script entry point: parse the command-line options and run inference.
    infer(parse_args())