From 97f7f748085fbe516952d36808735902d305da40 Mon Sep 17 00:00:00 2001
From: WenmuZhou <572459439@qq.com>
Date: Wed, 10 Aug 2022 05:33:21 +0000
Subject: [PATCH] add copyright
---
ppstructure/table/convert_label2html.py | 102 ++++++++++++++++++++++++
ppstructure/table/matcher.py | 21 ++++-
ppstructure/table/predict_table.py | 9 ++-
ppstructure/table/table_master_match.py | 18 +++++
4 files changed, 145 insertions(+), 5 deletions(-)
create mode 100644 ppstructure/table/convert_label2html.py
diff --git a/ppstructure/table/convert_label2html.py b/ppstructure/table/convert_label2html.py
new file mode 100644
index 00000000..be16212a
--- /dev/null
+++ b/ppstructure/table/convert_label2html.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+convert table label to html
+"""
+
+import json
+import argparse
+from tqdm import tqdm
+
+
+def save_pred_txt(key, val, tmp_file_path):
+ """
+ Append one "key<TAB>val" line to a UTF-8 text file.
+ @param key: value written before the tab
+ @param val: value written after the tab
+ @param tmp_file_path: path of the file to append to
+ @return: None
+ """
+ # 'a+' appends, so repeated calls accumulate lines in the same file
+ with open(tmp_file_path, 'a+', encoding='utf-8') as f:
+ f.write('{}\t{}\n'.format(key, val))
+
+
+def skip_char(text, sp_char_list):
+ """
+ Remove every occurrence of the given substrings from text. gen_html
+ uses the result to decide whether a cell is effectively empty.
+ @param text: text in cell
+ @param sp_char_list: substrings (style chars and special codes) to strip
+ @return: text with all listed substrings removed
+ """
+ for sp_char in sp_char_list:
+ text = text.replace(sp_char, '')
+ return text
+
+
+def gen_html(img):
+    """
+    Format HTML code from the tokenized annotation of one image.
+    @param img: record with img['html']['structure']['tokens'] (list of
+        structure tokens) and img['html']['cells'] (list of dicts, each
+        with a 'tokens' list holding the cell content)
+    @return: the whole table as a single HTML string
+    """
+    html_code = img['html']['structure']['tokens'].copy()
+    # Cell content is inserted right after the token that ends a cell's
+    # opening tag: '<td>' for plain cells, '>' presumably for cells whose
+    # opening tag carries attributes (PubTabNet-style tokens).
+    to_insert = [i for i, tag in enumerate(html_code) if tag in ('<td>', '>')]
+    # Walk backwards so an insertion never shifts a position still pending.
+    for i, cell in zip(to_insert[::-1], img['html']['cells'][::-1]):
+        if cell['tokens']:
+            text = ''.join(cell['tokens'])
+            # skip cells that contain nothing but style tags / blank codes
+            sp_char_list = ['<b>', '</b>', '\u2028', ' ', '<i>', '</i>']
+            text_remove_style = skip_char(text, sp_char_list)
+            if len(text_remove_style) == 0:
+                continue
+            html_code.insert(i + 1, text)
+    html_code = ''.join(html_code)
+    html_code = '<html><body><table>{}</table></body></html>'.format(html_code)
+    return html_code
+
+
+def load_gt_data(gt_path):
+ """
+ Load a ground-truth label file that holds one JSON object per line.
+ @param gt_path: path of the label file
+ @return: dict mapping each record's 'filename' value to the parsed record
+ """
+ data_list = {}  # a dict despite the name: filename -> record
+ with open(gt_path, 'rb') as f:
+ lines = f.readlines()
+ for line in tqdm(lines):
+ # the file is read as bytes, so each line is decoded before parsing
+ data_line = line.decode('utf-8').strip("\n")
+ info = json.loads(data_line)
+ # a later record with the same filename replaces the earlier one
+ data_list[info['filename']] = info
+ return data_list
+
+
+def convert(origin_gt_path, save_path):
+ """
+ Generate HTML for every record of a label file and write the results
+ to save_path, one "filename<TAB>html" line per record.
+ @param origin_gt_path: path of the label file read by load_gt_data
+ @param save_path: path of the output text file
+ @return: None
+ """
+ data_dict = load_gt_data(origin_gt_path)
+ for img_name, gt in tqdm(data_dict.items()):
+ html = gen_html(gt)
+ # save_pred_txt opens the file in append mode, so an existing
+ # save_path is extended rather than overwritten
+ save_pred_txt(img_name, html, save_path)
+ print('conver finish')
+
+
+def parse_args():
+ """
+ Parse the command-line arguments of this script.
+ @return: argparse namespace with ori_gt_path and save_path (both required)
+ """
+ # NOTE(review): the description mentions "paddleserving", which looks
+ # copied from another script - confirm before relying on the help text
+ parser = argparse.ArgumentParser(description="args for paddleserving")
+ parser.add_argument(
+ "--ori_gt_path", type=str, required=True, help="label gt path")
+ parser.add_argument(
+ "--save_path", type=str, required=True, help="path to save file")
+ args = parser.parse_args()
+ return args
+
+
+# Script entry point: convert the label file named on the command line.
+if __name__ == '__main__':
+ args = parse_args()
+ convert(args.ori_gt_path, args.save_path)
diff --git a/ppstructure/table/matcher.py b/ppstructure/table/matcher.py
index a25f869c..6884ea3c 100755
--- a/ppstructure/table/matcher.py
+++ b/ppstructure/table/matcher.py
@@ -1,4 +1,18 @@
-import json
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
from ppstructure.table.table_master_match import deal_eb_token, deal_bb
@@ -64,6 +78,11 @@ class TableMatch:
for i, gt_box in enumerate(dt_boxes):
distances = []
for j, pred_box in enumerate(pred_bboxes):
+ if len(pred_box) == 8:
+ pred_box = [
+ np.min(pred_box[0::2]), np.min(pred_box[1::2]),
+ np.max(pred_box[0::2]), np.max(pred_box[1::2])
+ ]
distances.append((distance(gt_box, pred_box),
1. - compute_iou(gt_box, pred_box)
)) # compute iou and l1 distance
diff --git a/ppstructure/table/predict_table.py b/ppstructure/table/predict_table.py
index 6e705123..57f0fec0 100644
--- a/ppstructure/table/predict_table.py
+++ b/ppstructure/table/predict_table.py
@@ -133,6 +133,7 @@ class TableSystem(object):
return structure_res, elapse
def _ocr(self, img):
+ h, w = img.shape[:2]
if self.benchmark:
self.autolog.times.stamp()
dt_boxes, det_elapse = self.text_detector(copy.deepcopy(img))
@@ -140,10 +141,10 @@ class TableSystem(object):
r_boxes = []
for box in dt_boxes:
- x_min = box[:, 0].min() - 1
- x_max = box[:, 0].max() + 1
- y_min = box[:, 1].min() - 1
- y_max = box[:, 1].max() + 1
+ x_min = max(0, box[:, 0].min() - 1)
+ x_max = min(w, box[:, 0].max() + 1)
+ y_min = max(0, box[:, 1].min() - 1)
+ y_max = min(h, box[:, 1].max() + 1)
box = [x_min, y_min, x_max, y_max]
r_boxes.append(box)
dt_boxes = np.array(r_boxes)
diff --git a/ppstructure/table/table_master_match.py b/ppstructure/table/table_master_match.py
index 069d576b..6a4c4e9d 100644
--- a/ppstructure/table/table_master_match.py
+++ b/ppstructure/table/table_master_match.py
@@ -1,3 +1,21 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is adapted from:
+https://github.com/JiaquanYe/TableMASTER-mmocr/blob/master/table_recognition/match.py
+"""
+
import os
import re
import cv2
--
GitLab
|