Merge pull request #1 from Evezerest/table2

add excel2html

Merge pull request #1 from Evezerest/table2
add excel2html
2838ac70 · whjdark · GitHub · 6413fb1f · 650bad88 · 2838ac70
隐藏空白更改
内联并排

Showing 2 changed files with 83 additions and 21 deletions

PPOCRLabel/PPOCRLabel.py PPOCRLabel/PPOCRLabel.py +39 -21

PPOCRLabel/libs/utils.py PPOCRLabel/libs/utils.py +44 -0

未找到文件。
--- a/PPOCRLabel/PPOCRLabel.py
+++ b/PPOCRLabel/PPOCRLabel.py
@@ -21,6 +21,7 @@ import os.path
 import platform
 import subprocess
 import sys
+import xlrd
 from functools import partial
 from PyQt5.QtCore import QSize, Qt, QPoint, QByteArray, QTimer, QFileInfo, QPointF, QProcess
@@ -611,7 +612,7 @@ class MainWindow(QMainWindow):
            zoomIn, zoomOut, zoomOrg, None,
            fitWindow, fitWidth))
-        addActions(self.menus.autolabel, (AutoRec, reRec, alcm, None, help))
+        addActions(self.menus.autolabel, (AutoRec, reRec, cellreRec, alcm, None, help))
        self.menus.file.aboutToShow.connect(self.updateFileMenu)
@@ -2131,7 +2132,8 @@ class MainWindow(QMainWindow):
        TableRec_excel_dir = self.lastOpenDir + '/tableRec_excel_output/'
        os.makedirs(TableRec_excel_dir, exist_ok=True)
-        filename = os.path.basename(self.filePath)
+        filename, _ = os.path.splitext(os.path.basename(self.filePath))
        excel_path = TableRec_excel_dir + '{}.xlsx'.format(filename)
        if res is None:
@@ -2203,19 +2205,26 @@ class MainWindow(QMainWindow):
            return
        # automatically open excel annotation file
-        try:
+        if platform.system() == 'Windows':
-            import win32com.client
+            try:
-        except:
+                import win32com.client
-            print("CANNOT OPEN .xlsx. It could be one of the following reasons: " \
+            except:
-                "Only support Windows | No python win32com")
+                print("CANNOT OPEN .xlsx. It could be one of the following reasons: " \
+                    "Only support Windows | No python win32com")
-        try:
+            try:
-            xl = win32com.client.Dispatch("Excel.Application")
+                xl = win32com.client.Dispatch("Excel.Application")
-            xl.Visible = True
+                xl.Visible = True
-            xl.Workbooks.Open(excel_path)
+                xl.Workbooks.Open(excel_path)
-        except:
+                # excelEx = "You need to show the excel executable at this point"
-            print("CANNOT OPEN .xlsx. It could be the following reasons: " \
+                # subprocess.Popen([excelEx, excel_path])
-                ".xlsx is not existed")
+                # os.startfile(excel_path)
+            except:
+                print("CANNOT OPEN .xlsx. It could be the following reasons: " \
+                    ".xlsx is not existed")
+        else:
+            os.system('open ' + os.path.normpath(excel_path))
        print('time cost: ', time.time() - start)
@@ -2313,8 +2322,6 @@ class MainWindow(QMainWindow):
        #           'Please check the label.txt and tableRec_excel_output\n'
        #     QMessageBox.information(self, "Information", msg)
        #     return
        train_split, val_split, test_split = partitionDialog.getDataPartition()
        # check validate
        if train_split + val_split + test_split > 100:
@@ -2334,7 +2341,7 @@ class MainWindow(QMainWindow):
        imgid = 0
        for image_path in labeldict.keys():
            # load csv annotations
-            filename = os.path.basename(image_path)
+            filename, _ = os.path.splitext(os.path.basename(image_path))
            csv_path = os.path.join(TableRec_excel_dir, filename + '.xlsx')
            if not os.path.exists(csv_path):
                msg = 'ERROR, Can not find ' + csv_path
@@ -2342,9 +2349,20 @@ class MainWindow(QMainWindow):
                return
            # read xlsx file, convert to HTML
-            xd = pd.ExcelFile(csv_path)
+            # xd = pd.ExcelFile(csv_path)
-            df = xd.parse()
+            # df = xd.parse()
-            structure = df.to_html()
+            # structure = df.to_html(index = False)
+            excel = xlrd.open_workbook(csv_path)
+            sheet0 = excel.sheet_by_index(0)  # only sheet 0
+            merged_cells = sheet0.merged_cells # (0,1,1,3) start row, end row, start col, end col
+            html_list = [['td'] * sheet0.ncols for i in range(sheet0.nrows)]
+            for merged in merged_cells:
+                html_list = expand_list(merged, html_list)
+            token_list = convert_token(html_list)
            # load box annotations
            cells = []
@@ -2363,7 +2381,7 @@ class MainWindow(QMainWindow):
                split = 'test'
            #  save dict
-            html = {'structure': {'tokens': structure}, 'cell': cells}
+            html = {'structure': {'tokens': token_list}, 'cell': cells}
            json_results.append({'filename': filename, 'split': split, 'imgid': imgid, 'html': html})
            imgid += 1

--- a/PPOCRLabel/libs/utils.py
+++ b/PPOCRLabel/libs/utils.py
@@ -188,6 +188,50 @@ def OBB2HBB(obb) -> np.array:
    return hbb
+def expand_list(merged, html_list):
+    '''
+    Mark one merged region inside the cell grid.
+
+    merged is a 4-tuple (start row, end row, start col, end col); the loops
+    below treat both end values as exclusive. Every cell of the region is
+    set to None, then the top-left cell receives a string holding
+    " colspan=N" and/or " rowspan=N" for each span larger than one (an
+    empty string when neither applies).
+
+    html_list is modified in place and also returned.
+    '''
+    row_start, row_end, col_start, col_end = merged
+    row_span = row_end - row_start
+    col_span = col_end - col_start
+    # Blank out the whole region; the anchor cell is overwritten afterwards.
+    for row_idx in range(row_start, row_end):
+        for col_idx in range(col_start, col_end):
+            html_list[row_idx][col_idx] = None
+    # Build the attribute string for the anchor (top-left) cell,
+    # colspan first, then rowspan.
+    pieces = []
+    if col_span > 1:
+        pieces.append(" colspan={}".format(col_span))
+    if row_span > 1:
+        pieces.append(" rowspan={}".format(row_span))
+    html_list[row_start][col_start] = ''.join(pieces)
+    return html_list
+def convert_token(html_list):
+    '''
+    Convert the cell grid into a flat list of HTML structure tokens.
+
+    Each entry of a row in html_list is one of:
+      * None  - the cell is covered by a merged region and emits nothing;
+      * 'td'  - a plain cell, emitted as "<td>", "</td>";
+      * any other string - a spanning cell whose whitespace-separated
+        "colspan=N" / "rowspan=N" items are emitted as
+        "<td", ' colspan="N"', ' rowspan="N"', ">", "</td>".
+
+    Returns the token list, wrapped in "<tbody>" ... "</tbody>" with one
+    "<tr>" ... "</tr>" pair per row.
+    '''
+    token_list = ["<tbody>"]
+    for row in html_list:
+        token_list.append("<tr>")
+        for col in row:
+            if col is None:
+                continue
+            if col == 'td':
+                token_list.extend(["<td>", "</td>"])
+                continue
+            # Parse every "name=value" item on its own, so that a cell
+            # carrying both attributes does not leak the rowspan text into
+            # the colspan value.
+            spans = dict(
+                item.split('=', 1) for item in col.split() if '=' in item)
+            token_list.append("<td")
+            for name in ('colspan', 'rowspan'):
+                if name in spans:
+                    token_list.append(" {}=\"{}\"".format(name, spans[name]))
+            # Close the spanning cell as well, matching the plain-cell case.
+            token_list.extend([">", "</td>"])
+        token_list.append("</tr>")
+    token_list.append("</tbody>")
+    return token_list
 def stepsInfo(lang='en'):
    if lang == 'ch':
        msg = "1. 安装与运行：使用上述命令安装与运行程序。\n" \