diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index d0d2bb721be41fe2c4042fbea1b55e4e76bdd664..390c2b159575bf1c60387e42b5be3d917ba845f7 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -2449,13 +2449,6 @@ class MainWindow(QMainWindow): export PPLabel and CSV to JSON (PubTabNet) ''' import pandas as pd - from libs.dataPartitionDialog import DataPartitionDialog - - # data partition user input - partitionDialog = DataPartitionDialog(parent=self) - partitionDialog.exec() - if partitionDialog.getStatus() == False: - return # automatically save annotations self.saveFilestate() @@ -2478,28 +2471,19 @@ class MainWindow(QMainWindow): labeldict[file] = eval(label) else: labeldict[file] = [] + + # read table recognition output + TableRec_excel_dir = os.path.join( + self.lastOpenDir, 'tableRec_excel_output') - train_split, val_split, test_split = partitionDialog.getDataPartition() - # check validate - if train_split + val_split + test_split > 100: - msg = "The sum of training, validation and testing data should be less than 100%" - QMessageBox.information(self, "Information", msg) - return - print(train_split, val_split, test_split) - train_split, val_split, test_split = float(train_split) / 100., float(val_split) / 100., float(test_split) / 100. - train_id = int(len(labeldict) * train_split) - val_id = int(len(labeldict) * (train_split + val_split)) - print('Data partition: train:', train_id, - 'validation:', val_id - train_id, - 'test:', len(labeldict) - val_id) - - TableRec_excel_dir = os.path.join(self.lastOpenDir, 'tableRec_excel_output') - json_results = [] - imgid = 0 + # save txt + fid = open( + "{}/gt.txt".format(self.lastOpenDir), "w", encoding='utf-8') for image_path in labeldict.keys(): # load csv annotations filename, _ = os.path.splitext(os.path.basename(image_path)) - csv_path = os.path.join(TableRec_excel_dir, filename + '.xlsx') + csv_path = os.path.join( + TableRec_excel_dir, filename + '.xlsx') if not os.path.exists(csv_path): continue @@ -2518,28 +2502,31 @@ class MainWindow(QMainWindow): cells = [] for anno in labeldict[image_path]: tokens = list(anno['transcription']) - obb = anno['points'] - hbb = OBB2HBB(np.array(obb)).tolist() - cells.append({'tokens': tokens, 'bbox': hbb}) - - # data split - if imgid < train_id: - split = 'train' - elif imgid < val_id: - split = 'val' - else: - split = 'test' - - # save dict - html = {'structure': {'tokens': token_list}, 'cells': cells} - json_results.append({'filename': os.path.basename(image_path), 'split': split, 'imgid': imgid, 'html': html}) - imgid += 1 - - # save json - with open("{}/annotation.json".format(self.lastOpenDir), "w", encoding='utf-8') as fid: - fid.write(json.dumps(json_results, ensure_ascii=False)) - - msg = 'JSON sucessfully saved in {}/annotation.json'.format(self.lastOpenDir) + cells.append({ + 'tokens': tokens, + 'bbox': anno['points'] + }) + + # 构造标注信息 + html = { + 'structure': { + 'tokens': token_list + }, + 'cells': cells + } + d = { + 'filename': os.path.basename(image_path), + 'html': html + } + # 重构HTML + d['gt'] = rebuild_html_from_ppstructure_label(d) + fid.write('{}\n'.format( + json.dumps( + d, ensure_ascii=False))) + + # convert to PP-Structure label format + fid.close() + msg = 'JSON sucessfully saved in {}/gt.txt'.format(self.lastOpenDir) QMessageBox.information(self, "Information", msg) def autolcm(self): diff --git a/PPOCRLabel/libs/dataPartitionDialog.py b/PPOCRLabel/libs/dataPartitionDialog.py deleted file mode 100644 index 33bd491552fe773bd07020d82f7ea9bab76e7557..0000000000000000000000000000000000000000 --- a/PPOCRLabel/libs/dataPartitionDialog.py +++ /dev/null @@ -1,113 +0,0 @@ -try: - from PyQt5.QtGui import * - from PyQt5.QtCore import * - from PyQt5.QtWidgets import * -except ImportError: - from PyQt4.QtGui import * - from PyQt4.QtCore import * - -from libs.utils import newIcon - -import time -import datetime -import json -import cv2 -import numpy as np - - -BB = QDialogButtonBox - -class DataPartitionDialog(QDialog): - def __init__(self, parent=None): - super().__init__() - self.parnet = parent - self.title = 'DATA PARTITION' - - self.train_ratio = 70 - self.val_ratio = 15 - self.test_ratio = 15 - - self.initUI() - - def initUI(self): - self.setWindowTitle(self.title) - self.setWindowModality(Qt.ApplicationModal) - - self.flag_accept = True - - if self.parnet.lang == 'ch': - msg = "导出JSON前请保存所有图像的标注且关闭EXCEL!" - else: - msg = "Please save all the annotations and close the EXCEL before exporting JSON!" - - info_msg = QLabel(msg, self) - info_msg.setWordWrap(True) - info_msg.setStyleSheet("color: red") - info_msg.setFont(QFont('Arial', 12)) - - train_lbl = QLabel('Train split: ', self) - train_lbl.setFont(QFont('Arial', 15)) - val_lbl = QLabel('Valid split: ', self) - val_lbl.setFont(QFont('Arial', 15)) - test_lbl = QLabel('Test split: ', self) - test_lbl.setFont(QFont('Arial', 15)) - - self.train_input = QLineEdit(self) - self.train_input.setFont(QFont('Arial', 15)) - self.val_input = QLineEdit(self) - self.val_input.setFont(QFont('Arial', 15)) - self.test_input = QLineEdit(self) - self.test_input.setFont(QFont('Arial', 15)) - - self.train_input.setText(str(self.train_ratio)) - self.val_input.setText(str(self.val_ratio)) - self.test_input.setText(str(self.test_ratio)) - - validator = QIntValidator(0, 100) - self.train_input.setValidator(validator) - self.val_input.setValidator(validator) - self.test_input.setValidator(validator) - - gridlayout = QGridLayout() - gridlayout.addWidget(info_msg, 0, 0, 1, 2) - gridlayout.addWidget(train_lbl, 1, 0) - gridlayout.addWidget(val_lbl, 2, 0) - gridlayout.addWidget(test_lbl, 3, 0) - gridlayout.addWidget(self.train_input, 1, 1) - gridlayout.addWidget(self.val_input, 2, 1) - gridlayout.addWidget(self.test_input, 3, 1) - - bb = BB(BB.Ok | BB.Cancel, Qt.Horizontal, self) - bb.button(BB.Ok).setIcon(newIcon('done')) - bb.button(BB.Cancel).setIcon(newIcon('undo')) - bb.accepted.connect(self.validate) - bb.rejected.connect(self.cancel) - gridlayout.addWidget(bb, 4, 0, 1, 2) - - self.setLayout(gridlayout) - - self.show() - - def validate(self): - self.flag_accept = True - self.accept() - - def cancel(self): - self.flag_accept = False - self.reject() - - def getStatus(self): - return self.flag_accept - - def getDataPartition(self): - self.train_ratio = int(self.train_input.text()) - self.val_ratio = int(self.val_input.text()) - self.test_ratio = int(self.test_input.text()) - - return self.train_ratio, self.val_ratio, self.test_ratio - - def closeEvent(self, event): - self.flag_accept = False - self.reject() - - diff --git a/PPOCRLabel/libs/utils.py b/PPOCRLabel/libs/utils.py index e397f139e0cf34de4fd517f920dd3fef12cc2cd7..1bd46ab4dac65f4e63e4ac4b2af5a8d295d89671 100644 --- a/PPOCRLabel/libs/utils.py +++ b/PPOCRLabel/libs/utils.py @@ -176,18 +176,6 @@ def boxPad(box, imgShape, pad : int) -> np.array: return box -def OBB2HBB(obb) -> np.array: - """ - Convert Oriented Bounding Box to Horizontal Bounding Box. - """ - hbb = np.zeros(4, dtype=np.int32) - hbb[0] = min(obb[:, 0]) - hbb[1] = min(obb[:, 1]) - hbb[2] = max(obb[:, 0]) - hbb[3] = max(obb[:, 1]) - return hbb - - def expand_list(merged, html_list): ''' Fill blanks according to merged cells @@ -232,6 +220,26 @@ def convert_token(html_list): return token_list +def rebuild_html_from_ppstructure_label(label_info): + from html import escape + html_code = label_info['html']['structure']['tokens'].copy() + to_insert = [ + i for i, tag in enumerate(html_code) if tag in ('', '>') + ] + for i, cell in zip(to_insert[::-1], label_info['html']['cells'][::-1]): + if cell['tokens']: + cell = [ + escape(token) if len(token) == 1 else token + for token in cell['tokens'] + ] + cell = ''.join(cell) + html_code.insert(i + 1, cell) + html_code = ''.join(html_code) + html_code = '{}
'.format( + html_code) + return html_code + + def stepsInfo(lang='en'): if lang == 'ch': msg = "1. 安装与运行:使用上述命令安装与运行程序。\n" \