From 97f7f748085fbe516952d36808735902d305da40 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Wed, 10 Aug 2022 05:33:21 +0000 Subject: [PATCH] add copyright --- ppstructure/table/convert_label2html.py | 102 ++++++++++++++++++++++++ ppstructure/table/matcher.py | 21 ++++- ppstructure/table/predict_table.py | 9 ++- ppstructure/table/table_master_match.py | 18 +++++ 4 files changed, 145 insertions(+), 5 deletions(-) create mode 100644 ppstructure/table/convert_label2html.py diff --git a/ppstructure/table/convert_label2html.py b/ppstructure/table/convert_label2html.py new file mode 100644 index 00000000..be16212a --- /dev/null +++ b/ppstructure/table/convert_label2html.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +convert table label to html +""" + +import json +import argparse +from tqdm import tqdm + + +def save_pred_txt(key, val, tmp_file_path): + with open(tmp_file_path, 'a+', encoding='utf-8') as f: + f.write('{}\t{}\n'.format(key, val)) + + +def skip_char(text, sp_char_list): + """ + skip empty cell + @param text: text in cell + @param sp_char_list: style char and special code + @return: + """ + for sp_char in sp_char_list: + text = text.replace(sp_char, '') + return text + + +def gen_html(img): + ''' + Formats HTML code from tokenized annotation of img + ''' + html_code = img['html']['structure']['tokens'].copy() + to_insert = [i for i, tag in enumerate(html_code) if tag in ('', '>')] + for i, cell in zip(to_insert[::-1], img['html']['cells'][::-1]): + if cell['tokens']: + text = ''.join(cell['tokens']) + # skip empty text + sp_char_list = ['', '', '\u2028', ' ', '', ''] + text_remove_style = skip_char(text, sp_char_list) + if len(text_remove_style) == 0: + continue + html_code.insert(i + 1, text) + html_code = ''.join(html_code) + html_code = '{}
'.format(html_code) + return html_code + + +def load_gt_data(gt_path): + """ + load gt + @param gt_path: + @return: + """ + data_list = {} + with open(gt_path, 'rb') as f: + lines = f.readlines() + for line in tqdm(lines): + data_line = line.decode('utf-8').strip("\n") + info = json.loads(data_line) + data_list[info['filename']] = info + return data_list + + +def convert(origin_gt_path, save_path): + """ + gen html from label file + @param origin_gt_path: + @param save_path: + @return: + """ + data_dict = load_gt_data(origin_gt_path) + for img_name, gt in tqdm(data_dict.items()): + html = gen_html(gt) + save_pred_txt(img_name, html, save_path) + print('conver finish') + + +def parse_args(): + parser = argparse.ArgumentParser(description="args for paddleserving") + parser.add_argument( + "--ori_gt_path", type=str, required=True, help="label gt path") + parser.add_argument( + "--save_path", type=str, required=True, help="path to save file") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + convert(args.ori_gt_path, args.save_path) diff --git a/ppstructure/table/matcher.py b/ppstructure/table/matcher.py index a25f869c..6884ea3c 100755 --- a/ppstructure/table/matcher.py +++ b/ppstructure/table/matcher.py @@ -1,4 +1,18 @@ -import json +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np from ppstructure.table.table_master_match import deal_eb_token, deal_bb @@ -64,6 +78,11 @@ class TableMatch: for i, gt_box in enumerate(dt_boxes): distances = [] for j, pred_box in enumerate(pred_bboxes): + if len(pred_box) == 8: + pred_box = [ + np.min(pred_box[0::2]), np.min(pred_box[1::2]), + np.max(pred_box[0::2]), np.max(pred_box[1::2]) + ] distances.append((distance(gt_box, pred_box), 1. - compute_iou(gt_box, pred_box) )) # compute iou and l1 distance diff --git a/ppstructure/table/predict_table.py b/ppstructure/table/predict_table.py index 6e705123..57f0fec0 100644 --- a/ppstructure/table/predict_table.py +++ b/ppstructure/table/predict_table.py @@ -133,6 +133,7 @@ class TableSystem(object): return structure_res, elapse def _ocr(self, img): + h, w = img.shape[:2] if self.benchmark: self.autolog.times.stamp() dt_boxes, det_elapse = self.text_detector(copy.deepcopy(img)) @@ -140,10 +141,10 @@ class TableSystem(object): r_boxes = [] for box in dt_boxes: - x_min = box[:, 0].min() - 1 - x_max = box[:, 0].max() + 1 - y_min = box[:, 1].min() - 1 - y_max = box[:, 1].max() + 1 + x_min = max(0, box[:, 0].min() - 1) + x_max = min(w, box[:, 0].max() + 1) + y_min = max(0, box[:, 1].min() - 1) + y_max = min(h, box[:, 1].max() + 1) box = [x_min, y_min, x_max, y_max] r_boxes.append(box) dt_boxes = np.array(r_boxes) diff --git a/ppstructure/table/table_master_match.py b/ppstructure/table/table_master_match.py index 069d576b..6a4c4e9d 100644 --- a/ppstructure/table/table_master_match.py +++ b/ppstructure/table/table_master_match.py @@ -1,3 +1,21 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is referenced from: +https://github.com/JiaquanYe/TableMASTER-mmocr/blob/master/table_recognition/match.py +""" + import os import re import cv2 -- GitLab