Fix bugs on PPOCRLabel during whl package installation

246da08a · qq_25193841 · c9128ba7 · 246da08a · 246da08a · 246da08a
8 changed files
--- a/PPOCRLabel/PPOCRLabel.py
+++ b/PPOCRLabel/PPOCRLabel.py
@@ -114,7 +114,7 @@ class MainWindow(QMainWindow, WindowMixin):
        getStr = lambda strId: self.stringBundle.getString(strId)
        self.defaultSaveDir = defaultSaveDir
-        self.ocr = PaddleOCR(use_pdserving=False, use_angle_cls=True, det=True, cls=True, use_gpu=gpu, lang=lang)
+        self.ocr = PaddleOCR(use_pdserving=False, use_angle_cls=True, det=True, cls=True, use_gpu=gpu, lang=lang, show_log=False)
        if os.path.exists('./data/paddle.png'):
            result = self.ocr.ocr('./data/paddle.png', cls=True, det=True)
@@ -1389,7 +1389,6 @@ class MainWindow(QMainWindow, WindowMixin):
        for box in self.PPlabel[imgidx]:
            shapes.append((box['transcription'], box['points'], None, None, box['difficult']))
-        print(shapes)
        self.loadLabels(shapes)
        self.canvas.verified = False

--- a/PPOCRLabel/README.md
+++ b/PPOCRLabel/README.md
@@ -8,7 +8,10 @@ PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, w
 ### Recent Update
- 2021.8.11：
+- 2021.11.17:
+  - Support installing and starting PPOCRLabel through the whl package (by [d2623587501](https://github.com/d2623587501))
+  - Dataset segmentation: Divide the annotation file into training, validation and test sets (refer to section 3.5 below, by [MrCuiHao](https://github.com/MrCuiHao))
+- 2021.8.11:
  - New functions: Open the dataset folder, image rotation (Note: Please delete the label box before rotating the image) (by [Wei-JL](https://github.com/Wei-JL))
  - Added shortcut key description (Help-Shortcut Key), repaired the direction shortcut key movement function under batch processing (by [d2623587501](https://github.com/d2623587501))
 - 2021.2.5: New batch processing and undo functions (by [Evezerest](https://github.com/Evezerest)):
@@ -21,11 +24,11 @@ PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, w
  - Click to modify the recognition result.(If you can't change the result, please switch to the system default input method, or switch back to the original input method again)
 - 2020.12.18: Support re-recognition of a single label box (by [ninetailskim](https://github.com/ninetailskim) ), perfect shortcut keys.
-## 1. Installation
-### 1.1 Environment Preparation
-#### **Install PaddlePaddle 2.0**
+## 1. Installation and Run
+### 1.1 Install PaddlePaddle
 ```bash
 pip3 install --upgrade pip
@@ -39,58 +42,54 @@ python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
 For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation.
-#### **Install PaddleOCR**
+### 1.2 Install and Run PPOCRLabel
-```bash
-# Recommend
-git clone https://github.com/PaddlePaddle/PaddleOCR
-# If you cannot pull successfully due to network problems, you can also choose to use the code hosting on the cloud:
-git clone https://gitee.com/paddlepaddle/PaddleOCR
-# Note: The cloud-hosting code may not be able to synchronize the update with this GitHub project in real time. There might be a delay of 3-5 days. Please give priority to the recommended method.
+PPOCRLabel can be started in two ways: from the whl package or from the Python script. Starting from the whl package is more convenient, while starting from the Python script is better suited to secondary development.
-```
-#### **Install Third-party Libraries**
+#### Windows
 ```bash
-cd PaddleOCR
+pip install PPOCRLabel  # install
-pip3 install -r requirements.txt
+PPOCRLabel  # run
 ```
-If you getting this error `OSError: [WinError 126] The specified module could not be found` when you install shapely on windows. Please try to download Shapely whl file using http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely.
+> If you get the error `OSError: [WinError 126] The specified module could not be found` when installing shapely on Windows, please try downloading the Shapely whl file from http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely.
+>
-Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found)
+> Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found)
+>
-### 1.2 Install PPOCRLabel
+#### Ubuntu Linux
-#### Windows
+```bash
+pip3 install PPOCRLabel
+pip3 install trash-cli
+PPOCRLabel
+```
+#### MacOS
 ```bash
-pip install pyqt5
+pip3 install PPOCRLabel
-cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder
+pip3 install opencv-contrib-python-headless==4.2.0.32
-python PPOCRLabel.py
+PPOCRLabel # run
 ```
-#### Ubuntu Linux
+#### 1.2.2 Build and Install the Whl Package Locally
 ```bash
-pip3 install pyqt5
+cd PaddleOCR/PPOCRLabel
-pip3 install trash-cli
+python3 setup.py bdist_wheel 
-cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder
+pip3 install dist/PPOCRLabel-1.0.0-py2.py3-none-any.whl
-python3 PPOCRLabel.py
 ```
-#### MacOS
+#### 1.2.3 Run PPOCRLabel by Python Script
 ```bash
-pip3 install pyqt5
+cd ./PPOCRLabel  # Switch to the PPOCRLabel directory
-pip3 uninstall opencv-python # Uninstall opencv manually as it conflicts with pyqt
+python PPOCRLabel.py --lang ch
-pip3 install opencv-contrib-python-headless==4.2.0.32 # Install the headless version of opencv
-cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder
-python3 PPOCRLabel.py
 ```
 ## 2. Usage
 ### 2.1 Steps
@@ -119,7 +118,7 @@ python3 PPOCRLabel.py
 10. Labeling result: the user can export the label result manually through the menu "File - Export Label", while the program will also export automatically if "File - Auto export Label Mode" is selected. The manually checked label will be stored in *Label.txt* under the opened picture folder. Click "File"-"Export Recognition Results" in the menu bar, the recognition training data of such pictures will be saved in the *crop_img* folder, and the recognition label will be saved in *rec_gt.txt*<sup>[4]</sup>.
-### Note
+### 2.2 Note
 [1] PPOCRLabel uses the opened folder as the project. After opening the image folder, the picture will not be displayed in the dialog. Instead, the pictures under the folder will be directly imported into the program after clicking "Open Dir".
@@ -137,6 +136,8 @@ python3 PPOCRLabel.py
 |  rec_gt.txt   | The recognition label file, which can be directly used for PPOCR identification model training, is generated after the user clicks on the menu bar "File"-"Export recognition result". |
 |   crop_img    | The recognition data, generated at the same time with *rec_gt.txt* |
 ## 3. Explanation
 ### 3.1 Shortcut keys
@@ -189,7 +190,26 @@ For some data that are difficult to recognize, the recognition results will not
 > *Note: The status of the checkboxes in the recognition results still needs to be saved manually by clicking Save Button.*
-### 3.5 Error message
+### 3.5 Dataset division
+- Enter the following command in the terminal to execute the dataset division script:
+  ```
+  cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder
+  python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec
+  ```
+  Parameter Description:
+  - `trainValTestRatio` is the division ratio of the number of images in the training set, validation set, and test set. Set it according to your actual situation; the default is `6:2:2`
+  - `labelRootPath` is the storage path of the dataset labeled by PPOCRLabel, the default is `../train_data/label`
+  - `detRootPath` is the path where the text detection dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/det`
+  - `recRootPath` is the path where the character recognition dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/rec`
+### 3.6 Error message
 - If paddleocr is installed with whl, it has a higher priority than calling PaddleOCR class with paddleocr.py, which may cause an exception if whl package is not updated.
@@ -207,24 +227,8 @@ For some data that are difficult to recognize, the recognition results will not
    pip install opencv-contrib-python-headless==4.2.0.32
    ```
-### Dataset division
- Enter the following command in the terminal to execute the dataset division script:
-    ```
-    cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder
-    python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec
-    ```
- Parameter Description:
-    trainValTestRatio is the division ratio of the number of images in the training set, validation set, and test set, set according to your actual situation, the default is 6:2:2
-    labelRootPath is the storage path of the dataset labeled by PPOCRLabel, the default is ../train_data/label
-    detRootPath is the path where the text detection dataset is divided according to the dataset marked by PPOCRLabel. The default is ../train_data/det
-    recRootPath is the path where the character recognition dataset is divided according to the dataset marked by PPOCRLabel. The default is ../train_data/rec
-### Related
+### 4. Related
 1.[Tzutalin. LabelImg. Git code (2015)](https://github.com/tzutalin/labelImg)
\ No newline at end of file
--- a/PPOCRLabel/README_ch.md
+++ b/PPOCRLabel/README_ch.md
@@ -2,14 +2,17 @@
 # PPOCRLabel
-PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具，内置PPOCR模型对数据自动标注和重新识别。使用python3和pyqt5编写，支持矩形框标注和四点标注模式，导出格式可直接用于PPOCR检测和识别模型的训练。
+PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具，内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写，支持矩形框标注和四点标注模式，导出格式可直接用于PaddleOCR检测和识别模型的训练。
 <img src="./data/gif/steps.gif" width="100%"/>
 #### 近期更新
+- 2021.11.17：
+  - 新增支持通过whl包安装和启动PPOCRLabel（by [d2623587501](https://github.com/d2623587501)）
+  - 标注数据集切分：对标注数据进行训练、验证与测试集划分（参考下方3.5节，by [MrCuiHao](https://github.com/MrCuiHao)）
 - 2021.8.11：
-  - 新增功能：打开数据所在文件夹、图像旋转（注意：旋转前的图片上不能存在标记框）（by [Wei-JL](https://github.com/Wei-JL)）
+  - 新增功能：打开数据所在文件夹、右键图像旋转90度（注意：旋转前的图片上不能存在标记框，by [Wei-JL](https://github.com/Wei-JL)）
  - 新增快捷键说明（帮助-快捷键）、修复批处理下的方向快捷键移动功能（by [d2623587501](https://github.com/d2623587501)）
 - 2021.2.5：新增批处理与撤销功能（by [Evezerest](https://github.com/Evezerest))
  - **批处理功能**：按住Ctrl键选择标记框后可批量移动、复制、删除、重新识别。
@@ -24,73 +27,69 @@ PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具，内置P
 如果您对以上内容感兴趣或对完善工具有不一样的想法，欢迎加入我们的SIG队伍与我们共同开发。可以在[此处](https://github.com/PaddlePaddle/PaddleOCR/issues/1728)完成问卷和前置任务，经过我们确认相关内容后即可正式加入，享受SIG福利，共同为OCR开源事业贡献（特别说明：针对PPOCRLabel的改进也属于PaddleOCR前置任务）
-## 1. 安装
-### 1.1 环境搭建
+## 1. 安装与运行
-#### 安装PaddlePaddle
+### 1.1 安装PaddlePaddle
 ```bash
 pip3 install --upgrade pip
-如果您的机器安装的是CUDA9或CUDA10，请运行以下命令安装
+# 如果您的机器安装的是CUDA9或CUDA10，请运行以下命令安装
 python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple
-如果您的机器是CPU，请运行以下命令安装
+# 如果您的机器是CPU，请运行以下命令安装
 python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
 ```
 更多的版本需求，请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。
-#### **安装PaddleOCR**
+### 1.2 安装与运行PPOCRLabel
-```bash
+PPOCRLabel可通过whl包与Python脚本两种方式启动，whl包形式启动更加方便，python脚本启动便于二次开发
-【推荐】git clone https://github.com/PaddlePaddle/PaddleOCR
-如果因为网络问题无法pull成功，也可选择使用码云上的托管：
+#### 1.2.1 通过whl包安装与运行
-git clone https://gitee.com/paddlepaddle/PaddleOCR
+##### Windows
-注：码云托管代码可能无法实时同步本github项目更新，存在3~5天延时，请优先使用推荐方式。
+```bash
+pip install PPOCRLabel  # 安装
+PPOCRLabel --lang ch  # 运行
 ```
+> 注意：通过whl包安装PPOCRLabel会自动下载 `paddleocr` whl包，其中shapely依赖可能会出现 `[WinError 126] 找不到指定模块` 的错误，建议从[这里](https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely)下载并安装
-#### 安装第三方库
+##### Ubuntu Linux
 ```bash
-cd PaddleOCR
+pip3 install PPOCRLabel
-pip3 install -r requirements.txt
+pip3 install trash-cli
+PPOCRLabel --lang ch
 ```
-注意，windows环境下，建议从[这里](https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely)下载shapely安装包完成安装， 直接通过pip安装的shapely库可能出现`[winRrror 126] 找不到指定模块的问题`。
+##### MacOS
-### 1.2 安装PPOCRLabel
-#### Windows
 ```bash
-pip install pyqt5
+pip3 install PPOCRLabel
-cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下
+pip3 install opencv-contrib-python-headless==4.2.0.32 # 如果下载过慢请添加"-i https://mirror.baidu.com/pypi/simple"
-python PPOCRLabel.py --lang ch
+PPOCRLabel --lang ch # 启动
 ```
-#### Ubuntu Linux
+#### 1.2.2 本地构建whl包并安装
 ```bash
-pip3 install pyqt5
+cd PaddleOCR/PPOCRLabel
-pip3 install trash-cli
+python3 setup.py bdist_wheel 
-cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下
+pip3 install dist/PPOCRLabel-1.0.0-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
-python3 PPOCRLabel.py --lang ch
 ```
-#### MacOS
+#### 1.2.3 通过Python脚本运行PPOCRLabel
+如果您对PPOCRLabel文件有所更改，通过Python脚本运行会更加方便地看到更改的结果
 ```bash
-pip3 install pyqt5
+cd ./PPOCRLabel  # 切换到PPOCRLabel目录
-pip3 uninstall opencv-python # 由于mac版本的opencv与pyqt有冲突，需先手动卸载opencv
+python PPOCRLabel.py --lang ch
-pip3 install opencv-contrib-python-headless==4.2.0.32 # 安装headless版本的open-cv
-cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下
-python3 PPOCRLabel.py --lang ch
 ```
 ## 2. 使用
 ### 2.1 操作步骤
@@ -106,7 +105,7 @@ python3 PPOCRLabel.py --lang ch
 9. 删除：点击 “删除图像”，图片将会被删除至回收站。
 10. 导出结果：用户可以通过菜单中“文件-导出标记结果”手动导出，同时也可以点击“文件 - 自动导出标记结果”开启自动导出。手动确认过的标记将会被存放在所打开图片文件夹下的*Label.txt*中。在菜单栏点击 “文件” - "导出识别结果"后，会将此类图片的识别训练数据保存在*crop_img*文件夹下，识别标签保存在*rec_gt.txt*中<sup>[4]</sup>。
-### 注意
+### 2.2 注意
 [1] PPOCRLabel以文件夹为基本标记单位，打开待标记的图片文件夹后，不会在窗口栏中显示图片，而是在点击 "选择文件夹" 之后直接将文件夹下的图片导入到程序中。
@@ -124,6 +123,8 @@ python3 PPOCRLabel.py --lang ch
 |  rec_gt.txt   | 识别标签。可直接用于PPOCR识别模型训练。需用户手动点击菜单栏“文件” - "导出识别结果"后产生。 |
 |   crop_img    |   识别数据。按照检测框切割后的图片。与rec_gt.txt同时产生。   |
 ## 3. 说明
 ### 3.1 快捷键
@@ -152,9 +153,7 @@ python3 PPOCRLabel.py --lang ch
 - 模型语言切换：用户可通过菜单栏中 "PaddleOCR" - "选择模型" 切换内置模型语言，目前支持的语言包括法文、德文、韩文、日文。具体模型下载链接可参考[PaddleOCR模型列表](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md).
- - **自定义模型**：如果用户想将内置模型更换为自己的推理模型，可根据[自定义模型代码使用](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/whl.md#%E8%87%AA%E5%AE%9A%E4%B9%89%E6%A8%A1%E5%9E%8B)，通过修改PPOCRLabel.py中针对[PaddleOCR类的实例化](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/PPOCRLabel/PPOCRLabel.py#L116) :
+ - **自定义模型**：如果用户想将内置模型更换为自己的推理模型，可根据[自定义模型代码使用](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/whl.md#%E8%87%AA%E5%AE%9A%E4%B9%89%E6%A8%A1%E5%9E%8B)，通过修改PPOCRLabel.py中针对[PaddleOCR类的实例化](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/PPOCRLabel/PPOCRLabel.py#L116) 实现，例如指定检测模型：`self.ocr = PaddleOCR(det=True, cls=True, use_gpu=gpu, lang=lang) `，在 `det_model_dir` 中传入  自己的模型即可。 
-   `self.ocr = PaddleOCR(use_pdserving=False, use_angle_cls=True, det=True, cls=True, use_gpu=gpu, lang=lang) `，在 `det_model_dir` 中传入  自己的模型即可。 
 ### 3.3 导出标记结果
@@ -174,7 +173,27 @@ PPOCRLabel支持三种导出方式：
 > *注意：识别结果中的复选框状态仍需用户手动点击确认后才能保留*
-### 3.5 错误提示
+### 3.5 数据集划分
+在终端中输入以下命令执行数据集划分脚本：
+```
+cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下
+python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec
+```
+参数说明：
+- `trainValTestRatio` 是训练集、验证集、测试集的图像数量划分比例，根据实际情况设定，默认是`6:2:2`
+- `labelRootPath` 是PPOCRLabel标注的数据集存放路径，默认是`../train_data/label`
+- `detRootPath` 是根据PPOCRLabel标注的数据集划分后的文本检测数据集存放的路径，默认是`../train_data/det `
+- `recRootPath` 是根据PPOCRLabel标注的数据集划分后的字符识别数据集存放的路径，默认是`../train_data/rec`
+### 3.6 错误提示
 - 如果同时使用whl包安装了paddleocr，其优先级大于通过paddleocr.py调用PaddleOCR类，whl包未更新时会导致程序异常。
 - PPOCRLabel**不支持对中文文件名**的图片进行自动标注。
@@ -193,21 +212,7 @@ PPOCRLabel支持三种导出方式：
    ```
    pip install opencv-contrib-python-headless==4.2.0.32
    ```
-### 数据集划分
- 在终端中输入以下命令执行数据集划分脚本：
-    ```
-    cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下
-    python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec
-    ```
- 参数说明：
-    trainValTestRatio是训练集、验证集、测试集的图像数量划分比例，根据你的实际情况设定，默认是6:2:2
-    labelRootPath是PPOCRLabel标注的数据集存放路径，默认是../train_data/label
-    detRootPath是根据PPOCRLabel标注的数据集划分后的文本检测数据集存放的路径，默认是../train_data/det 
-    recRootPath是根据PPOCRLabel标注的数据集划分后的字符识别数据集存放的路径，默认是../train_data/rec
 ### 4. 参考资料

--- a/PPOCRLabel/libs/resources.py
+++ b/PPOCRLabel/libs/resources.py
--- a/PPOCRLabel/libs/stringBundle.py
+++ b/PPOCRLabel/libs/stringBundle.py
@@ -60,14 +60,14 @@ class StringBundle:
    def __createLookupFallbackList(self, localeStr):
        resultPaths = []
-        basePath = "\strings"
+        basePath = "\strings" if os.name == 'nt' else ":/strings"
        resultPaths.append(basePath)
        if localeStr is not None:
            # Don't follow standard BCP47. Simple fallback
            tags = re.split('[^a-zA-Z]', localeStr)
            for tag in tags:
                lastPath = resultPaths[-1]
-                resultPaths[-1] = lastPath + '-' + tag
+                resultPaths.append(lastPath + '-' + tag)
            resultPaths[-1] = __dirpath__ + resultPaths[-1] + ".properties"
        return resultPaths

--- a/PPOCRLabel/libs/utils.py
+++ b/PPOCRLabel/libs/utils.py
@@ -33,9 +33,9 @@ except ImportError:
 def newIcon(icon, iconSize=None):
    if iconSize is not None:
-        return QIcon(QIcon(__iconpath__ + "\\" + icon + ".png").pixmap(iconSize,iconSize))
+        return QIcon(QIcon(__iconpath__ + "/" + icon + ".png").pixmap(iconSize,iconSize))
    else:
-        return QIcon(__iconpath__ + "\\" + icon + ".png")
+        return QIcon(__iconpath__ + "/" + icon + ".png")
 def newButton(text, icon=None, slot=None):

--- a/PPOCRLabel/requirements.txt
+++ b/PPOCRLabel/requirements.txt
-shapely
+pyqt5
-scikit-image==0.17.2
+paddleocr
-imgaug==0.4.0
\ No newline at end of file
-pyclipper
-lmdb
-tqdm
-numpy
-visualdl
-python-Levenshtein
-opencv-contrib-python==4.2.0.32
-PaddleOCR
\ No newline at end of file
--- a/PPOCRLabel/resources.qrc
+++ b/PPOCRLabel/resources.qrc
@@ -35,7 +35,7 @@
 <file alias="prev">resources/icons/prev.png</file>
 <file alias="resetall">resources/icons/resetall.png</file>
 <file alias="verify">resources/icons/verify.png</file>
-<file alias="strings">resources/strings/strings.properties</file>
+<file alias="strings">resources/strings/strings-en.properties</file>
 <file alias="strings-zh-CN">resources/strings/strings-zh-CN.properties</file>
 </qresource>
 </RCC>