convert_label2html.py 3.0 KB
Newer Older
文幕地方's avatar
文幕地方 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert table labels to HTML.
"""

import json
import argparse
from tqdm import tqdm


def save_pred_txt(key, val, tmp_file_path):
    """
    append one "key<TAB>val" line to a text file
    @param key: value written in the first column
    @param val: value written in the second column
    @param tmp_file_path: file to append to (opened in 'a+' mode, UTF-8)
    @return: None
    """
    with open(tmp_file_path, mode='a+', encoding='utf-8') as out_file:
        record = '{}\t{}\n'.format(key, val)
        out_file.write(record)


def skip_char(text, sp_char_list):
    """
    remove every occurrence of the given substrings from text
    @param text: text in cell
    @param sp_char_list: substrings to delete (style tags and special
                         characters); they are removed one after another
                         in list order
    @return: text with all listed substrings removed; an empty string
             means the cell held nothing but those substrings
    """
    cleaned = text
    for marker in sp_char_list:
        cleaned = cleaned.replace(marker, '')
    return cleaned


def gen_html(img):
    """
    Formats HTML code from tokenized annotation of img
    @param img: annotation dict; reads img['html']['structure']['tokens']
                (list of structure tokens) and img['html']['cells']
                (list of dicts, each with a 'tokens' list)
    @return: the structure tokens with cell text spliced in, joined into
             one string and wrapped in <html><body><table>...</table>
             </body></html>
    """
    annotation = img['html']
    # Work on a copy so the caller's token list is not modified.
    pieces = annotation['structure']['tokens'].copy()
    # Cell text goes right after each '<td>' or '>' token.
    slots = [pos for pos, tag in enumerate(pieces) if tag in ('<td>', '>')]
    # Style tags and blank characters that do not count as cell content.
    sp_char_list = ['<b>', '</b>', '\u2028', ' ', '<i>', '</i>']

    # Walk back-to-front so that inserting text never shifts the positions
    # still waiting to be processed; slots and cells are paired from the end.
    for pos, cell in zip(slots[::-1], annotation['cells'][::-1]):
        cell_tokens = cell['tokens']
        if not cell_tokens:
            continue
        content = ''.join(cell_tokens)
        # Leave the cell empty when it holds only style tags / blanks.
        if skip_char(content, sp_char_list):
            pieces.insert(pos + 1, content)

    return '<html><body><table>{}</table></body></html>'.format(
        ''.join(pieces))


def load_gt_data(gt_path):
    """
    load gt labels from a file holding one JSON object per line
    @param gt_path: path of the label file; lines are decoded as UTF-8
    @return: dict mapping each record's 'filename' value to the whole
             parsed record; when a filename appears more than once the
             last record wins
    """
    data_list = {}
    # Read everything first so the file handle is released before parsing
    # and tqdm knows the total number of lines.
    with open(gt_path, 'rb') as f:
        lines = f.readlines()
    for line in tqdm(lines):
        data_line = line.decode('utf-8').strip()
        # Blank lines (e.g. a stray empty line at the end of the file)
        # carry no record; json.loads('') would abort the whole load.
        if not data_line:
            continue
        info = json.loads(data_line)
        data_list[info['filename']] = info
    return data_list


def convert(origin_gt_path, save_path):
    """
    gen html from label file
    @param origin_gt_path: label file, loaded with load_gt_data
    @param save_path: output text file; one "image name<TAB>html" line is
                      appended per label via save_pred_txt
    @return: None
    """
    labels = load_gt_data(origin_gt_path)
    for name, label in tqdm(labels.items()):
        save_pred_txt(name, gen_html(label), save_path)
    print('conver finish')


def parse_args():
    """
    parse the command line
    @return: argparse namespace with the two required string options
             ori_gt_path and save_path
    """
    cli = argparse.ArgumentParser(description="args for paddleserving")
    # Both options have the same shape: a required string.
    options = (
        ("--ori_gt_path", "label gt path"),
        ("--save_path", "path to save file"),
    )
    for flag, help_text in options:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    return cli.parse_args()


if __name__ == '__main__':
    # Script entry point: read the two paths from the command line and
    # run the conversion.
    cli_args = parse_args()
    convert(origin_gt_path=cli_args.ori_gt_path, save_path=cli_args.save_path)