diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py
index 390c2b159575bf1c60387e42b5be3d917ba845f7..0a3ae1cb3b8fc004aa7c48dc86b6546a80e17a0f 100644
--- a/PPOCRLabel/PPOCRLabel.py
+++ b/PPOCRLabel/PPOCRLabel.py
@@ -2715,6 +2715,9 @@ class MainWindow(QMainWindow):
         self._update_shape_color(shape)
         self.keyDialog.addLabelHistory(key_text)
+
+        # save changed shape
+        self.setDirty()

     def undoShapeEdit(self):
         self.canvas.restoreShape()
diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md
index 3ea684a3f09a6084403fa0b91e2511b7fd790f4b..767102fb1aaa696667b8e584ba8d2d1a17faa82e 100644
--- a/PPOCRLabel/README_ch.md
+++ b/PPOCRLabel/README_ch.md
@@ -2,7 +2,7 @@

 # PPOCRLabelv2

-PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写,支持矩形框标注和四点标注模式,导出格式可直接用于PaddleOCR检测和识别模型的训练。
+PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写,支持矩形框标注、表格标注、不规则文本标注、关键信息标注模式,导出格式可直接用于PaddleOCR检测和识别模型的训练。

 | 常规标注 | 表格标注 |
 | :-------------------------------------------------: | :--------------------------------------------: |
diff --git a/PPOCRLabel/libs/canvas.py b/PPOCRLabel/libs/canvas.py
index 81f37995126140b03650f5ddea37ea282d5ceb09..44d899cbc9f21793f89c498cf844c95e418b08a1 100644
--- a/PPOCRLabel/libs/canvas.py
+++ b/PPOCRLabel/libs/canvas.py
@@ -611,8 +611,8 @@ class Canvas(QWidget):
         if self.drawing() and not self.prevPoint.isNull() and not self.outOfPixmap(self.prevPoint):
             p.setPen(QColor(0, 0, 0))
-            p.drawLine(self.prevPoint.x(), 0, self.prevPoint.x(), self.pixmap.height())
-            p.drawLine(0, self.prevPoint.y(), self.pixmap.width(), self.prevPoint.y())
+            p.drawLine(int(self.prevPoint.x()), 0, int(self.prevPoint.x()), self.pixmap.height())
+            p.drawLine(0, int(self.prevPoint.y()), self.pixmap.width(), int(self.prevPoint.y()))

         self.setAutoFillBackground(True)
         if self.verified:
@@ -909,4 +909,4 @@ class Canvas(QWidget):
     def updateShapeIndex(self):
         for i in range(len(self.shapes)):
             self.shapes[i].idx = i
-        self.update()
\ No newline at end of file
+        self.update()
diff --git a/PPOCRLabel/resources/strings/strings-en.properties b/PPOCRLabel/resources/strings/strings-en.properties
index 1b628016c079ad1c5eb5514c7d6eb2cba842b7e3..3dfc34e001ad098ffcf1f4528218befd692281b9 100644
--- a/PPOCRLabel/resources/strings/strings-en.properties
+++ b/PPOCRLabel/resources/strings/strings-en.properties
@@ -113,4 +113,4 @@ keyDialogTip=Enter object label
 keyChange=Change Box Key
 TableRecognition=Table Recognition
 cellreRecognition=Cell Re-Recognition
-exportJSON=Export Excel Label(PubTabNet)
+exportJSON=Export Table Label
diff --git a/PPOCRLabel/resources/strings/strings-zh-CN.properties b/PPOCRLabel/resources/strings/strings-zh-CN.properties
index 0758729a8ca0cae862a4bf5bcf2e5b24f2d95822..00dfd31da2ce7a5b922fb5e4b15a84d657be0db6 100644
--- a/PPOCRLabel/resources/strings/strings-zh-CN.properties
+++ b/PPOCRLabel/resources/strings/strings-zh-CN.properties
@@ -113,4 +113,4 @@ keyDialogTip=请输入类型名称
 keyChange=更改Box关键字类别
 TableRecognition=表格识别
 cellreRecognition=单元格重识别
-exportJSON=导出表格JSON标注
\ No newline at end of file
+exportJSON=导出表格标注
\ No newline at end of file
diff --git a/doc/doc_ch/distributed_training.md b/doc/doc_ch/distributed_training.md
index 6afa4a5b9f77ce238cb18fcb4160e49f7b465369..dbbc4dc8b70953430147240f2bb0939d5af9f1e7 100644
--- a/doc/doc_ch/distributed_training.md
+++ b/doc/doc_ch/distributed_training.md
@@ -41,16 +41,30 @@ python3 -m paddle.distributed.launch \

 ## 性能效果测试

-* 在2机8卡P40的机器上,基于26W公开识别数据集(LSVT, RCTW, MTWI)上进行训练,最终耗时如下。
+* 在2机8卡P40的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。

-| 模型 | 配置 | 精度 | 单机8卡耗时 | 2机8卡耗时 | 加速比 |
-|------|-----|--------|--------|--------|-----|
-| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 67.0% | 2.50d | 1.67d | **1.5** |
+| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 2机8卡耗时/精度 | 加速比 |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 26W中文数据集 | 2.50d/66.7% | 1.67d/67.0% | **1.5** |

-* 在4机8卡V100的机器上,基于全量数据训练,最终耗时如下
+* 在3机8卡V100的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。

+| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 3机8卡耗时/精度 | 加速比 |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** |

-| 模型 | 配置 | 精度 | 单机8卡耗时 | 4机8卡耗时 | 加速比 |
-|------|-----|--------|--------|--------|-----|
-| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 74.0% | 10d | 2.84d | **3.5** |
+> 注意:这里3机8卡训练时,单卡batch size相比于单机8卡不变,学习率乘以2(默认乘以3的话,精度仅有73.42%)
+
+* 在4机8卡V100的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。
+
+| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 4机8卡耗时/精度 | 加速比 |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** |
+
+* **注意**
+    * 在训练的GPU卡数过多时,精度会稍微有所损失(1%左右),此时可以尝试通过添加warmup或者适当增加迭代轮数来弥补精度损失。
diff --git a/doc/doc_ch/inference_ppocr.md b/doc/doc_ch/inference_ppocr.md
index 514f905393984e2189b4c9c920ca4aeb91ac6da1..6723fcc12001426048dff26fcfb053ebe5f07ad7 100644
--- a/doc/doc_ch/inference_ppocr.md
+++ b/doc/doc_ch/inference_ppocr.md
@@ -144,7 +144,7 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982]

 **注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 如果使用其他识别模型,则需根据模型设置参数`--rec_image_shape`。此外,`PP-OCRv3`的识别模型默认使用的`rec_algorithm`为`SVTR_LCNet`,注意和原始`SVTR`的区别。

-以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。
+以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像、图像集合或者PDF文件的路径,参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。

 ```shell
 # 使用方向分类器
@@ -153,8 +153,11 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false
 # 使用多进程
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
+# 使用PDF文件,可以通过`page_num`参数来控制推理前几页,默认为0,表示推理所有页
+python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2
 ```
+
 执行命令后,识别结果图像如下:

 ![](../imgs_results/system_res_00018069_v3.jpg)
diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md
index e425cdd8a87d320554e61c72e05001875d022e43..cac7664c2fb38b91efa4b3f2daa388b90e1ee1f8 100644
--- a/doc/doc_ch/quickstart.md
+++ b/doc/doc_ch/quickstart.md
@@ -75,6 +75,11 @@ cd /path/to/ppocr_img
       ......
     ```

+  此外,paddleocr也支持输入pdf文件,并且可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。
+  ```bash
+  paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+  ```
+
 - 单独使用检测:设置`--rec`为`false`

   ```bash
@@ -165,12 +170,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
 img_path = './imgs/11.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # 显示结果
 from PIL import Image
-
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -196,6 +203,50 @@ im_show.save('result.jpg')

+如果输入是PDF文件,那么可以参考下面代码进行可视化
+
+```python
+from paddleocr import PaddleOCR, draw_ocr
+
+# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换
+# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan`
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2)  # need to run only once to download and load model into memory
+img_path = './xxx.pdf'
+result = ocr.ocr(img_path, cls=True)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
+
+# 显示结果
+import fitz
+from PIL import Image
+import cv2
+import numpy as np
+imgs = []
+with fitz.open(img_path) as pdf:
+    for pg in range(0, pdf.pageCount):
+        page = pdf[pg]
+        mat = fitz.Matrix(2, 2)
+        pm = page.getPixmap(matrix=mat, alpha=False)
+        # if width or height > 2000 pixels, don't enlarge the image
+        if pm.width > 2000 or pm.height > 2000:
+            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        imgs.append(img)
+for idx in range(len(result)):
+    res = result[idx]
+    image = imgs[idx]
+    boxes = [line[0] for line in res]
+    txts = [line[1][0] for line in res]
+    scores = [line[1][1] for line in res]
+    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
+    im_show = Image.fromarray(im_show)
+    im_show.save('result_page_{}.jpg'.format(idx))
+```
+
 ## 3. 小结

 通过本节内容,相信您已经熟练掌握PaddleOCR whl包的使用方法并获得了初步效果。
diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md
index 315329464f15aa1127e34a38d3407a9c81dbc627..83f062801a343289f11681995549dded97982397 100644
--- a/doc/doc_ch/whl.md
+++ b/doc/doc_ch/whl.md
@@ -33,12 +33,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs/11.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # 显示结果
 from PIL import Image
-
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -71,12 +73,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR()  # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs/11.jpg'
 result = ocr.ocr(img_path, cls=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # 显示结果
 from PIL import Image
-
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -109,8 +113,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(use_angle_cls=True)  # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
 result = ocr.ocr(img_path, det=False, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 结果是一个list,每个item只包含识别结果和识别置信度
@@ -127,12 +133,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR()  # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs/11.jpg'
 result = ocr.ocr(img_path, rec=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # 显示结果
 from PIL import Image
-
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf')
 im_show = Image.fromarray(im_show)
@@ -163,8 +171,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR()  # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
 result = ocr.ocr(img_path, det=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 结果是一个list,每个item只包含识别结果和识别置信度
@@ -181,8 +191,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(use_angle_cls=True)  # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
 result = ocr.ocr(img_path, det=False, rec=False, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 结果是一个list,每个item只包含分类结果和分类置信度
@@ -212,6 +224,11 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true
   ......
 ```

+此外,paddleocr也支持输入pdf文件,并且可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+
 * 检测+识别

 ```bash
@@ -290,12 +307,14 @@ ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_m
                 use_angle_cls=True)
 img_path = 'PaddleOCR/doc/imgs/11.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # 显示结果
 from PIL import Image
-
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -325,12 +344,14 @@ from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar
 ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
 img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # 显示结果
 from PIL import Image
-
+result = result[0]
 download_with_progressbar(img_path, 'tmp.jpg')
 image = Image.open('tmp.jpg').convert('RGB')
 boxes = [line[0] for line in result]
@@ -362,12 +383,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg'
 img = cv2.imread(img_path)
 # img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), 如果你自己训练的模型支持灰度图,可以将这句话的注释取消
 result = ocr.ocr(img, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # 显示结果
 from PIL import Image
-
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -376,14 +399,65 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc
 im_show = Image.fromarray(im_show)
 im_show.save('result.jpg')
 ```
+## 5 PDF文件作为输入
+- 命令行模式
+
+可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+- 代码使用
+
+```python
+from paddleocr import PaddleOCR, draw_ocr
-## 5 参数说明
+
+# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换
+# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan`
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2)  # need to run only once to download and load model into memory
+img_path = './xxx.pdf'
+result = ocr.ocr(img_path, cls=True)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
+
+# 显示结果
+import fitz
+from PIL import Image
+import cv2
+import numpy as np
+imgs = []
+with fitz.open(img_path) as pdf:
+    for pg in range(0, pdf.pageCount):
+        page = pdf[pg]
+        mat = fitz.Matrix(2, 2)
+        pm = page.getPixmap(matrix=mat, alpha=False)
+        # if width or height > 2000 pixels, don't enlarge the image
+        if pm.width > 2000 or pm.height > 2000:
+            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        imgs.append(img)
+for idx in range(len(result)):
+    res = result[idx]
+    image = imgs[idx]
+    boxes = [line[0] for line in res]
+    txts = [line[1][0] for line in res]
+    scores = [line[1][1] for line in res]
+    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
+    im_show = Image.fromarray(im_show)
+    im_show.save('result_page_{}.jpg'.format(idx))
+```
+
+## 6 参数说明

 | 字段 | 说明 | 默认值 |
 |-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|
 | use_gpu | 是否使用GPU | TRUE |
 | gpu_mem | 初始化占用的GPU内存大小 | 8000M |
-| image_dir | 通过命令行调用时执行预测的图片或文件夹路径 |  |
+| image_dir | 通过命令行调用时执行预测的图片或文件夹路径 |  |
+| page_num | 当输入类型为pdf文件时有效,指定预测前面page_num页,默认预测所有页 | 0 |
 | det_algorithm | 使用的检测算法类型 | DB |
 | det_model_dir | 检测模型所在文件夹。传参方式有两种,1. None: 自动下载内置模型到 `~/.paddleocr/det`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None |
 | det_max_side_len | 检测算法前向时图片长边的最大尺寸,当长边超出这个值时会将长边resize到这个大小,短边等比例缩放 | 960 |
diff --git a/doc/doc_en/distributed_training_en.md b/doc/doc_en/distributed_training_en.md
index 5a219ed2b494d6239096ff634dfdc702c4be9419..a9db354ad46751dc1320b48d68fe8025edb651d3 100644
--- a/doc/doc_en/distributed_training_en.md
+++ b/doc/doc_en/distributed_training_en.md
@@ -40,17 +40,29 @@ python3 -m paddle.distributed.launch \

 ## Performance comparison

-* On two 8-card P40 graphics cards, the final time consumption and speedup ratio for public recognition dataset (LSVT, RCTW, MTWI) containing 260k images are as follows.
+* We conducted model training on 2x8 P40 GPUs. The accuracy, training time, and multi-machine speedup ratio of different models are shown below.

-| Model | Config file | Recognition acc | single 8-card training time | two 8-card training time | Speedup ratio |
-|------|-----|--------|--------|--------|-----|
-| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 67.0% | 2.50d | 1.67d | **1.5** |
+| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 2x8 GPU training time / Accuracy | Speedup ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k Chinese dataset | 2.50d/66.7% | 1.67d/67.0% | **1.5** |

-* On four 8-card V100 graphics cards, the final time consumption and speedup ratio for full data are as follows.
+* We conducted model training on 3x8 V100 GPUs. The accuracy, training time, and multi-machine speedup ratio of different models are shown below.

-| Model | Config file | Recognition acc | single 8-card training time | four 8-card training time | Speedup ratio |
-|------|-----|--------|--------|--------|-----|
-| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 74.0% | 10d | 2.84d | **3.5** |
+| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Speedup ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** |
+
+> Note: when training with 3x8 GPUs, the per-card batch size is unchanged compared with 1x8 GPU training, and the learning rate is multiplied by 2 (with the default multiplier of 3, the accuracy is only 73.42%).
+
+* We conducted model training on 4x8 V100 GPUs. The accuracy, training time, and multi-machine speedup ratio of different models are shown below.
+
+| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 4x8 GPU training time / Accuracy | Speedup ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** |
diff --git a/doc/doc_en/inference_ppocr_en.md b/doc/doc_en/inference_ppocr_en.md
index 4c9db51e1d23e5ac05cfcb3ec43748df75c0b36c..4c3576983aba11b73b1e0300089014866bc0d2fc 100755
--- a/doc/doc_en/inference_ppocr_en.md
+++ b/doc/doc_en/inference_ppocr_en.md
@@ -144,16 +144,17 @@ After executing the command, the prediction results (classification angle and sc

 **Note**: The input shape used by the recognition model of `PP-OCRv3` is `3, 48, 320`. If you use other recognition models, you need to set the parameter `--rec_image_shape` according to the model. In addition, the `rec_algorithm` used by the recognition model of `PP-OCRv3` is `SVTR_LCNet` by default. Note the difference from the original `SVTR`.

-When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default.
+When performing prediction, specify the path of a single image, a folder of images, or a PDF file through the parameter `image_dir`. The parameter `det_model_dir` specifies the path to the detection inference model, `cls_model_dir` specifies the path to the angle classification inference model, and `rec_model_dir` specifies the path to the recognition inference model. The parameter `use_angle_cls` controls whether to enable the angle classification model. `use_mp` specifies whether to use multiple processes for inference, and `total_process_num` specifies the number of processes. The visualized recognition results are saved to the `./inference_results` folder by default.

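For readers who prefer driving the same pipeline from Python, a rough whl-package counterpart of the flags described above looks like this (a sketch, not part of the patch; the model directories are the same placeholder paths used in the commands below):

```python
from paddleocr import PaddleOCR

# rough Python-API equivalent of the tools/infer/predict_system.py flags above;
# the three model directories are placeholder paths to exported inference models
ocr = PaddleOCR(det_model_dir="./ch_PP-OCRv3_det_infer/",
                cls_model_dir="./cls/",
                rec_model_dir="./ch_PP-OCRv3_rec_infer/",
                use_angle_cls=True)
result = ocr.ocr("./doc/imgs/00018069.jpg", cls=True)
print(result)
```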
 ```shell
 # use direction classifier
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true
-
 # not use use direction classifier
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false
 # use multi-process
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
+# use PDF files: infer only the first few pages with the `page_num` parameter; the default is 0, which means all pages are processed
+python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2
 ```
diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md
index 9e1de839ff0ed8291f1822186f43cb24c9f9ebce..ea38845f503192705a4d87f3faacdaf25bb27ba9 100644
--- a/doc/doc_en/quickstart_en.md
+++ b/doc/doc_en/quickstart_en.md
@@ -86,6 +86,12 @@ If you do not use the provided test image, you can replace the following `--imag
     ......
     ```

+  PDF files are also supported; you can limit inference to the first few pages with the `page_num` parameter. The default is 0, which means all pages are processed.
+
+  ```bash
+  paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+  ```
+
 * Only detection: set `--rec` to `false`

   ```bash
@@ -176,12 +182,15 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang='en')  # need to run only once to download and load model into memory
 img_path = './imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -206,6 +215,50 @@ Visualization of results

+If the input is a PDF file, you can refer to the following code for visualization
+
+```python
+from paddleocr import PaddleOCR, draw_ocr
+
+# PaddleOCR supports Chinese, English, French, German, Korean and Japanese.
+# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan`
+# to switch the language model.
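+# page_num=2 below limits inference to the first two pages; the default of 0 processes every page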
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2)  # need to run only once to download and load model into memory
+img_path = './xxx.pdf'
+result = ocr.ocr(img_path, cls=True)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
+
+# draw result
+import fitz
+from PIL import Image
+import cv2
+import numpy as np
+imgs = []
+with fitz.open(img_path) as pdf:
+    for pg in range(0, pdf.pageCount):
+        page = pdf[pg]
+        mat = fitz.Matrix(2, 2)
+        pm = page.getPixmap(matrix=mat, alpha=False)
+        # if width or height > 2000 pixels, don't enlarge the image
+        if pm.width > 2000 or pm.height > 2000:
+            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        imgs.append(img)
+for idx in range(len(result)):
+    res = result[idx]
+    image = imgs[idx]
+    boxes = [line[0] for line in res]
+    txts = [line[1][0] for line in res]
+    scores = [line[1][1] for line in res]
+    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
+    im_show = Image.fromarray(im_show)
+    im_show.save('result_page_{}.jpg'.format(idx))
+```
diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md
index da2dff67c16b4a9a0a653934b1f1df64cb6e9707..77e80faa688392db5b2959f4fd1705275cb37d6b 100644
--- a/doc/doc_en/whl_en.md
+++ b/doc/doc_en/whl_en.md
@@ -25,12 +25,14 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang='en')  # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
-
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -60,11 +62,14 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR(lang='en')  # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -94,8 +99,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(use_angle_cls=True, lang='en')  # need to run only once to load model into memory
 img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
 result = ocr.ocr(img_path, det=False, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 Output will be a list, each item contains recognition text and confidence
@@ -109,12 +116,14 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR()  # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path,rec=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
-
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf')
 im_show = Image.fromarray(im_show)
@@ -141,8 +150,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(lang='en')  # need to run only once to load model into memory
 img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
 result = ocr.ocr(img_path, det=False, cls=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 Output will be a list, each item contains recognition text and confidence
@@ -156,8 +167,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(use_angle_cls=True)  # need to run only once to load model into memory
 img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
 result = ocr.ocr(img_path, det=False, rec=False, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 Output will be a list, each item contains classification result and confidence
@@ -185,6 +198,11 @@ Output will be a list, each item contains bounding box, text and recognition con
     ......
 ```

+PDF files are also supported; you can limit inference to the first few pages with the `page_num` parameter. The default is 0, which means all pages are processed.
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+
 * detection and recognition
 ```bash
 paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en
 ```
@@ -253,11 +271,14 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True)
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -283,11 +304,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
 img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -312,12 +336,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg'
 img = cv2.imread(img_path)
 # img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), If your own training model supports grayscale images, you can uncomment this line
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show result
 from PIL import Image
-
+result = result[0]
 download_with_progressbar(img_path, 'tmp.jpg')
 image = Image.open('tmp.jpg').convert('RGB')
 boxes = [line[0] for line in result]
@@ -327,15 +353,66 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc
 im_show = Image.fromarray(im_show)
 im_show.save('result.jpg')
 ```
+## 5 PDF file
+- Use by command line
+
+You can limit inference to the first few pages with the `page_num` parameter; the default is 0, which means all pages are processed.
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+- Use by code
+```python
+from paddleocr import PaddleOCR, draw_ocr
-## 5 Parameter Description
+
+# PaddleOCR supports Chinese, English, French, German, Korean and Japanese.
+# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan`
+# to switch the language model.
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2)  # need to run only once to download and load model into memory
+img_path = './xxx.pdf'
+result = ocr.ocr(img_path, cls=True)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
+
+# draw result
+import fitz
+from PIL import Image
+import cv2
+import numpy as np
+imgs = []
+with fitz.open(img_path) as pdf:
+    for pg in range(0, pdf.pageCount):
+        page = pdf[pg]
+        mat = fitz.Matrix(2, 2)
+        pm = page.getPixmap(matrix=mat, alpha=False)
+        # if width or height > 2000 pixels, don't enlarge the image
+        if pm.width > 2000 or pm.height > 2000:
+            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        imgs.append(img)
+for idx in range(len(result)):
+    res = result[idx]
+    image = imgs[idx]
+    boxes = [line[0] for line in res]
+    txts = [line[1][0] for line in res]
+    scores = [line[1][1] for line in res]
+    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
+    im_show = Image.fromarray(im_show)
+    im_show.save('result_page_{}.jpg'.format(idx))
+```
+
+## 6 Parameter Description

 | Parameter | Description | Default value |
 |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|
 | use_gpu | use GPU or not | TRUE |
 | gpu_mem | GPU memory size used for initialization | 8000M |
 | image_dir | The images path or folder path for predicting when used by the command line |  |
+| page_num | Valid only when the input is a PDF file; only the first `page_num` pages are predicted. With the default value 0, all pages are predicted | 0 |
 | det_algorithm | Type of detection algorithm selected | DB |
 | det_model_dir | the text detection inference model folder. There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/det`; 2. The path of the inference model converted by yourself, the model and params files must be included in the model path | None |
 | det_max_side_len | The maximum size of the long side of the image. When the long side exceeds this value, the long side will be resized to this size, and the short side will be scaled proportionally | 960 |
diff --git a/paddleocr.py b/paddleocr.py
index d34b8f78a56a8d8d5455c18e7e1cf1e75df8f3f9..95a19147fee6dff30af2264d26aceac85b114289 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -47,7 +47,7 @@ __all__ = [
 ]

 SUPPORT_DET_MODEL = ['DB']
-VERSION = '2.6.0.1'
+VERSION = '2.6.0.2'
 SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet']
 BASE_DIR = os.path.expanduser("~/.paddleocr/")

@@ -428,8 +428,8 @@ def check_img(img):
         download_with_progressbar(img, 'tmp.jpg')
         img = 'tmp.jpg'
         image_file = img
-        img, flag, _ = check_and_read(image_file)
-        if not flag:
+        img, flag_gif, flag_pdf = check_and_read(image_file)
+        if not flag_gif and not flag_pdf:
             with open(image_file, 'rb') as f:
                 img = img_decode(f.read())
         if img is None:
@@ -500,6 +500,7 @@ class PaddleOCR(predict_system.TextSystem):
         logger.debug(params)
         # init det_model and rec_model
         super().__init__(params)
+        self.page_num = params.page_num

     def ocr(self, img, det=True, rec=True, cls=True):
         """
@@ -520,24 +521,43 @@ class PaddleOCR(predict_system.TextSystem):
             )

         img = check_img(img)
-
+        # for infer pdf file
+        if isinstance(img, list):
+            if self.page_num > len(img) or self.page_num == 0:
+                self.page_num = len(img)
+            imgs = img[:self.page_num]
+        else:
+            imgs = [img]
         if det and rec:
-            dt_boxes, rec_res, _ = self.__call__(img, cls)
-            return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
+            ocr_res = []
+            for idx, img in enumerate(imgs):
+                dt_boxes, rec_res, _ = self.__call__(img, cls)
+                tmp_res = [[box.tolist(), res]
+                           for box, res in zip(dt_boxes, rec_res)]
+                ocr_res.append(tmp_res)
+            return ocr_res
         elif det and not rec:
-            dt_boxes, elapse = self.text_detector(img)
-            if dt_boxes is None:
-                return None
-            return [box.tolist() for box in dt_boxes]
+            ocr_res = []
+            for idx, img in enumerate(imgs):
+                dt_boxes, elapse = self.text_detector(img)
+                tmp_res = [box.tolist() for box in dt_boxes]
+                ocr_res.append(tmp_res)
+            return ocr_res
         else:
-            if not isinstance(img, list):
-                img = [img]
-            if self.use_angle_cls and cls:
-                img, cls_res, elapse = self.text_classifier(img)
-                if not rec:
-                    return cls_res
-            rec_res, elapse = self.text_recognizer(img)
-            return rec_res
+            ocr_res = []
+            cls_res = []
+            for idx, img in enumerate(imgs):
+                if not isinstance(img, list):
+                    img = [img]
+                if self.use_angle_cls and cls:
+                    img, cls_res_tmp, elapse = self.text_classifier(img)
+                    if not rec:
+                        cls_res.append(cls_res_tmp)
+                rec_res, elapse = self.text_recognizer(img)
+                ocr_res.append(rec_res)
+            if not rec:
+                return cls_res
+            return ocr_res


 class PPStructure(StructureSystem):
@@ -633,8 +653,10 @@ def main():
                          rec=args.rec,
                          cls=args.use_angle_cls)
         if result is not None:
-            for line in result:
-                logger.info(line)
+            for idx in range(len(result)):
+                res = result[idx]
+                for line in res:
+                    logger.info(line)
     elif args.type == 'structure':
         img, flag_gif, flag_pdf = check_and_read(img_path)
         if not flag_gif and not flag_pdf:
@@ -682,7 +704,7 @@ def main():
                     "error in layout recovery image:{}, err msg: {}".format(
                         img_name, ex))
                 continue
-
+
     for item in all_res:
         item.pop('img')
         item.pop('res')
diff --git a/ppocr/modeling/backbones/rec_efficientb3_pren.py b/ppocr/modeling/backbones/rec_efficientb3_pren.py
index 57eef178869fc7f5ff55b3548674c741fb4f3ead..701e436c1e0e29f42cc9c7ce6e66552d4005f6b0 100644
--- a/ppocr/modeling/backbones/rec_efficientb3_pren.py
+++ b/ppocr/modeling/backbones/rec_efficientb3_pren.py
@@ -21,124 +21,165 @@ from __future__ import division
 from __future__ import print_function

 import math
-from collections import namedtuple
+import re
+import collections

 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

 __all__ = ['EfficientNetb3']

+GlobalParams = collections.namedtuple('GlobalParams', [
+    'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 'num_classes',
+    'width_coefficient', 'depth_coefficient', 'depth_divisor', 'min_depth',
+    'drop_connect_rate', 'image_size'
+])

-class EffB3Params:
+BlockArgs = collections.namedtuple('BlockArgs', [
+    'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
+    'expand_ratio', 'id_skip', 'stride', 'se_ratio'
+])
+
+
+class BlockDecoder:
     @staticmethod
-    def get_global_params():
-        """
-        The fllowing are efficientnetb3's arch superparams, but to fit for scene
-        text recognition task, the resolution(image_size) here is changed
-        from 300 to 64.
-        """
-        GlobalParams = namedtuple('GlobalParams', [
-            'drop_connect_rate', 'width_coefficient', 'depth_coefficient',
-            'depth_divisor', 'image_size'
-        ])
-        global_params = GlobalParams(
-            drop_connect_rate=0.3,
-            width_coefficient=1.2,
-            depth_coefficient=1.4,
-            depth_divisor=8,
-            image_size=64)
-        return global_params
+    def _decode_block_string(block_string):
+        assert isinstance(block_string, str)
+
+        ops = block_string.split('_')
+        options = {}
+        for op in ops:
+            splits = re.split(r'(\d.*)', op)
+            if len(splits) >= 2:
+                key, value = splits[:2]
+                options[key] = value
+
+        assert (('s' in options and len(options['s']) == 1) or
+                (len(options['s']) == 2 and options['s'][0] == options['s'][1]))
+
+        return BlockArgs(
+            kernel_size=int(options['k']),
+            num_repeat=int(options['r']),
+            input_filters=int(options['i']),
+            output_filters=int(options['o']),
+            expand_ratio=int(options['e']),
+            id_skip=('noskip' not in block_string),
+            se_ratio=float(options['se']) if 'se' in options else None,
+            stride=[int(options['s'][0])])

     @staticmethod
-    def get_block_params():
-        BlockParams = namedtuple('BlockParams', [
-            'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
-            'expand_ratio', 'id_skip', 'se_ratio', 'stride'
-        ])
-        block_params = [
-            BlockParams(3, 1, 32, 16, 1, True, 0.25, 1),
-            BlockParams(3, 2, 16, 24, 6, True, 0.25, 2),
-            BlockParams(5, 2, 24, 40, 6, True, 0.25, 2),
-            BlockParams(3, 3, 40, 80, 6, True, 0.25, 2),
-            BlockParams(5, 3, 80, 112, 6, True, 0.25, 1),
-            BlockParams(5, 4, 112, 192, 6, True, 0.25, 2),
-            BlockParams(3, 1, 192, 320, 6, True, 0.25, 1)
-        ]
-        return block_params
+    def decode(string_list):
+        assert isinstance(string_list, list)
+        blocks_args = []
+        for block_string in string_list:
+            blocks_args.append(BlockDecoder._decode_block_string(block_string))
+        return blocks_args
+
+
+def efficientnet(width_coefficient=None,
+                 depth_coefficient=None,
+                 dropout_rate=0.2,
+                 drop_connect_rate=0.2,
+                 image_size=None,
+                 num_classes=1000):
+    blocks_args = [
+        'r1_k3_s11_e1_i32_o16_se0.25',
+        'r2_k3_s22_e6_i16_o24_se0.25',
+        'r2_k5_s22_e6_i24_o40_se0.25',
+        'r3_k3_s22_e6_i40_o80_se0.25',
+        'r3_k5_s11_e6_i80_o112_se0.25',
+        'r4_k5_s22_e6_i112_o192_se0.25',
+        'r1_k3_s11_e6_i192_o320_se0.25',
+    ]
+    blocks_args = BlockDecoder.decode(blocks_args)
+
+    global_params = GlobalParams(
+        batch_norm_momentum=0.99,
+        batch_norm_epsilon=1e-3,
+        dropout_rate=dropout_rate,
+        drop_connect_rate=drop_connect_rate,
+        num_classes=num_classes,
+        width_coefficient=width_coefficient,
+        depth_coefficient=depth_coefficient,
+        depth_divisor=8,
+        min_depth=None,
+        image_size=image_size, )
+    return blocks_args, global_params


 class EffUtils:
     @staticmethod
     def round_filters(filters, global_params):
-        """Calculate and round number of filters based on depth multiplier."""
+        """ Calculate and round number of filters based on depth multiplier. """
         multiplier = global_params.width_coefficient
         if not multiplier:
             return filters
         divisor = global_params.depth_divisor
+        min_depth = global_params.min_depth
         filters *= multiplier
-        new_filters = int(filters + divisor / 2) // divisor * divisor
+        min_depth = min_depth or divisor
+        new_filters = max(min_depth,
+                          int(filters + divisor / 2) // divisor * divisor)
         if new_filters < 0.9 * filters:
             new_filters += divisor
         return int(new_filters)

     @staticmethod
     def round_repeats(repeats, global_params):
-        """Round number of filters based on depth multiplier."""
+        """ Round number of filters based on depth multiplier. """
         multiplier = global_params.depth_coefficient
         if not multiplier:
             return repeats
         return int(math.ceil(multiplier * repeats))


-class ConvBlock(nn.Layer):
-    def __init__(self, block_params):
-        super(ConvBlock, self).__init__()
-        self.block_args = block_params
-        self.has_se = (self.block_args.se_ratio is not None) and \
-            (0 < self.block_args.se_ratio <= 1)
-        self.id_skip = block_params.id_skip
+class MbConvBlock(nn.Layer):
+    def __init__(self, block_args):
+        super(MbConvBlock, self).__init__()
+        self._block_args = block_args
+        self.has_se = (self._block_args.se_ratio is not None) and \
+            (0 < self._block_args.se_ratio <= 1)
+        self.id_skip = block_args.id_skip

         # expansion phase
-        self.input_filters = self.block_args.input_filters
-        output_filters = \
-            self.block_args.input_filters * self.block_args.expand_ratio
-        if self.block_args.expand_ratio != 1:
-            self.expand_conv = nn.Conv2D(
-                self.input_filters, output_filters, 1, bias_attr=False)
-            self.bn0 = nn.BatchNorm(output_filters)
+        self.inp = self._block_args.input_filters
+        oup = self._block_args.input_filters * self._block_args.expand_ratio
+        if self._block_args.expand_ratio != 1:
+            self._expand_conv = nn.Conv2D(self.inp, oup, 1, bias_attr=False)
+            self._bn0 = nn.BatchNorm(oup)

         # depthwise conv phase
-        k = self.block_args.kernel_size
-        s = self.block_args.stride
-        self.depthwise_conv = nn.Conv2D(
-            output_filters,
-            output_filters,
-            groups=output_filters,
+        k = self._block_args.kernel_size
+        s = self._block_args.stride
+        if isinstance(s, list):
+            s = s[0]
+        self._depthwise_conv = nn.Conv2D(
+            oup,
+            oup,
+            groups=oup,
             kernel_size=k,
             stride=s,
             padding='same',
             bias_attr=False)
-        self.bn1 = nn.BatchNorm(output_filters)
+        self._bn1 = nn.BatchNorm(oup)

         # squeeze and excitation layer, if desired
         if self.has_se:
             num_squeezed_channels = max(1,
-                                        int(self.block_args.input_filters *
-                                            self.block_args.se_ratio))
-            self.se_reduce = nn.Conv2D(output_filters, num_squeezed_channels, 1)
-            self.se_expand = nn.Conv2D(num_squeezed_channels, output_filters, 1)
-
-        # output phase
-        self.final_oup = self.block_args.output_filters
-        self.project_conv = nn.Conv2D(
-            output_filters, self.final_oup, 1, bias_attr=False)
-        self.bn2 = nn.BatchNorm(self.final_oup)
-        self.swish = nn.Swish()
-
-    def drop_connect(self, inputs, p, training):
+                                        int(self._block_args.input_filters *
+                                            self._block_args.se_ratio))
+            self._se_reduce = nn.Conv2D(oup, num_squeezed_channels, 1)
+            self._se_expand = nn.Conv2D(num_squeezed_channels, oup, 1)
+
+        # output phase and some util class
+        self.final_oup = self._block_args.output_filters
+        self._project_conv = nn.Conv2D(oup, self.final_oup, 1, bias_attr=False)
+        self._bn2 = nn.BatchNorm(self.final_oup)
+        self._swish = nn.Swish()
+
+    def _drop_connect(self, inputs, p, training):
         if not training:
             return inputs
-
         batch_size = inputs.shape[0]
         keep_prob = 1 - p
         random_tensor = keep_prob
@@ -151,22 +192,23 @@ class MbConvBlock(nn.Layer):
     def forward(self, inputs, drop_connect_rate=None):
         # expansion and depthwise conv
         x = inputs
-        if self.block_args.expand_ratio != 1:
-            x = self.swish(self.bn0(self.expand_conv(inputs)))
-        x = self.swish(self.bn1(self.depthwise_conv(x)))
+        if self._block_args.expand_ratio != 1:
+            x = self._swish(self._bn0(self._expand_conv(inputs)))
+        x = self._swish(self._bn1(self._depthwise_conv(x)))

         # squeeze and excitation
         if self.has_se:
             x_squeezed = F.adaptive_avg_pool2d(x, 1)
-            x_squeezed = self.se_expand(self.swish(self.se_reduce(x_squeezed)))
+            x_squeezed = self._se_expand(
+                self._swish(self._se_reduce(x_squeezed)))
             x = F.sigmoid(x_squeezed) * x
-        x = self.bn2(self.project_conv(x))
+        x = self._bn2(self._project_conv(x))

         # skip conntection and drop connect
-        if self.id_skip and self.block_args.stride == 1 and \
-                self.input_filters == self.final_oup:
+        if self.id_skip and self._block_args.stride == 1 and \
+                self.inp == self.final_oup:
             if drop_connect_rate:
-                x = self.drop_connect(
+                x = self._drop_connect(
                     x, p=drop_connect_rate, training=self.training)
             x = x + inputs
         return x
@@ -175,54 +217,63 @@ class EfficientNetb3_PREN(nn.Layer):
     def __init__(self, in_channels):
         super(EfficientNetb3_PREN, self).__init__()
-        self.blocks_params = EffB3Params.get_block_params()
-        self.global_params = EffB3Params.get_global_params()
+        """
+        The following are EfficientNet-B3's hyperparameters: the network's
+        width, depth, resolution and dropout rate, respectively. To fit the
+        text recognition task, the resolution here is changed from 300 to 64.
+ """ + w, d, s, p = 1.2, 1.4, 64, 0.3 + self._blocks_args, self._global_params = efficientnet( + width_coefficient=w, + depth_coefficient=d, + dropout_rate=p, + image_size=s) self.out_channels = [] # stem - stem_channels = EffUtils.round_filters(32, self.global_params) - self.conv_stem = nn.Conv2D( - in_channels, stem_channels, 3, 2, padding='same', bias_attr=False) - self.bn0 = nn.BatchNorm(stem_channels) + out_channels = EffUtils.round_filters(32, self._global_params) + self._conv_stem = nn.Conv2D( + in_channels, out_channels, 3, 2, padding='same', bias_attr=False) + self._bn0 = nn.BatchNorm(out_channels) - self.blocks = [] + # build blocks + self._blocks = [] # to extract three feature maps for fpn based on efficientnetb3 backbone - self.concerned_block_idxes = [7, 17, 25] - concerned_idx = 0 - for i, block_params in enumerate(self.blocks_params): - block_params = block_params._replace( - input_filters=EffUtils.round_filters(block_params.input_filters, - self.global_params), - output_filters=EffUtils.round_filters( - block_params.output_filters, self.global_params), - num_repeat=EffUtils.round_repeats(block_params.num_repeat, - self.global_params)) - self.blocks.append( - self.add_sublayer("{}-0".format(i), ConvBlock(block_params))) - concerned_idx += 1 - if concerned_idx in self.concerned_block_idxes: - self.out_channels.append(block_params.output_filters) - if block_params.num_repeat > 1: - block_params = block_params._replace( - input_filters=block_params.output_filters, stride=1) - for j in range(block_params.num_repeat - 1): - self.blocks.append( - self.add_sublayer('{}-{}'.format(i, j + 1), - ConvBlock(block_params))) - concerned_idx += 1 - if concerned_idx in self.concerned_block_idxes: - self.out_channels.append(block_params.output_filters) - - self.swish = nn.Swish() + self._concerned_block_idxes = [7, 17, 25] + _concerned_idx = 0 + for i, block_args in enumerate(self._blocks_args): + block_args = block_args._replace( + input_filters=EffUtils.round_filters(block_args.input_filters, + self._global_params), + output_filters=EffUtils.round_filters(block_args.output_filters, + self._global_params), + num_repeat=EffUtils.round_repeats(block_args.num_repeat, + self._global_params)) + self._blocks.append( + self.add_sublayer(f"{i}-0", MbConvBlock(block_args))) + _concerned_idx += 1 + if _concerned_idx in self._concerned_block_idxes: + self.out_channels.append(block_args.output_filters) + if block_args.num_repeat > 1: + block_args = block_args._replace( + input_filters=block_args.output_filters, stride=1) + for j in range(block_args.num_repeat - 1): + self._blocks.append( + self.add_sublayer(f'{i}-{j+1}', MbConvBlock(block_args))) + _concerned_idx += 1 + if _concerned_idx in self._concerned_block_idxes: + self.out_channels.append(block_args.output_filters) + + self._swish = nn.Swish() def forward(self, inputs): outs = [] - - x = self.swish(self.bn0(self.conv_stem(inputs))) - for idx, block in enumerate(self.blocks): - drop_connect_rate = self.global_params.drop_connect_rate + x = self._swish(self._bn0(self._conv_stem(inputs))) + for idx, block in enumerate(self._blocks): + drop_connect_rate = self._global_params.drop_connect_rate if drop_connect_rate: - drop_connect_rate *= float(idx) / len(self.blocks) + drop_connect_rate *= float(idx) / len(self._blocks) x = block(x, drop_connect_rate=drop_connect_rate) - if idx in self.concerned_block_idxes: + if idx in self._concerned_block_idxes: outs.append(x) return outs diff --git a/ppocr/modeling/heads/table_att_head.py 
b/ppocr/modeling/heads/table_att_head.py index 50910c5b73aa2a41f329d7222fc8c632509b4c91..e3fc8436e78bf3959eec8cb89efc66500fa56bdc 100644 --- a/ppocr/modeling/heads/table_att_head.py +++ b/ppocr/modeling/heads/table_att_head.py @@ -82,7 +82,8 @@ class TableAttentionHead(nn.Layer): batch_size = fea.shape[0] hidden = paddle.zeros((batch_size, self.hidden_size)) - output_hiddens = paddle.zeros((batch_size, self.max_text_length + 1, self.hidden_size)) + output_hiddens = paddle.zeros( + (batch_size, self.max_text_length + 1, self.hidden_size)) if self.training and targets is not None: structure = targets[0] for i in range(self.max_text_length + 1): @@ -91,19 +92,13 @@ class TableAttentionHead(nn.Layer): (outputs, hidden), alpha = self.structure_attention_cell( hidden, fea, elem_onehots) output_hiddens[:, i, :] = outputs - # output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) - output = paddle.concat(output_hiddens, axis=1) - structure_probs = self.structure_generator(output) - if self.loc_type == 1: - loc_preds = self.loc_generator(output) - loc_preds = F.sigmoid(loc_preds) - else: - loc_fea = fea.transpose([0, 2, 1]) - loc_fea = self.loc_fea_trans(loc_fea) - loc_fea = loc_fea.transpose([0, 2, 1]) - loc_concat = paddle.concat([output, loc_fea], axis=2) - loc_preds = self.loc_generator(loc_concat) - loc_preds = F.sigmoid(loc_preds) + structure_probs = self.structure_generator(output_hiddens) + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) else: temp_elem = paddle.zeros(shape=[batch_size], dtype="int32") structure_probs = None @@ -118,17 +113,15 @@ class TableAttentionHead(nn.Layer): (outputs, hidden), alpha = self.structure_attention_cell( hidden, fea, elem_onehots) output_hiddens[:, i, :] = outputs - # output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) structure_probs_step = self.structure_generator(outputs) temp_elem = structure_probs_step.argmax(axis=1, dtype="int32") - output = output_hiddens - structure_probs = self.structure_generator(output) + structure_probs = self.structure_generator(output_hiddens) structure_probs = F.softmax(structure_probs) loc_fea = fea.transpose([0, 2, 1]) loc_fea = self.loc_fea_trans(loc_fea) loc_fea = loc_fea.transpose([0, 2, 1]) - loc_concat = paddle.concat([output, loc_fea], axis=2) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) loc_preds = self.loc_generator(loc_concat) loc_preds = F.sigmoid(loc_preds) return {'structure_probs': structure_probs, 'loc_preds': loc_preds} @@ -203,8 +196,10 @@ class SLAHead(nn.Layer): fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) hidden = paddle.zeros((batch_size, self.hidden_size)) - structure_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.num_embeddings)) - loc_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.loc_reg_num)) + structure_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, self.num_embeddings)) + loc_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, self.loc_reg_num)) structure_preds.stop_gradient = True loc_preds.stop_gradient = True if self.training and targets is not None: diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 40ba5c2087879e4d277b67e78596c52434c06fd6..74f4e880bf87d12a4db31e489a4527f830e8a8d6 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ 
b/ppocr/postprocess/rec_postprocess.py @@ -650,7 +650,8 @@ class PRENLabelDecode(BaseRecLabelDecode): return result_list def __call__(self, preds, label=None, *args, **kwargs): - preds = preds.numpy() + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() preds_idx = preds.argmax(axis=2) preds_prob = preds.max(axis=2) text = self.decode(preds_idx, preds_prob) diff --git a/tools/export_model.py b/tools/export_model.py index 9c23060ee0e09f41df82661bf290c660a1fbbb03..52f05bfcba0487d1c5abd0f7d7221c2feca40ae9 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -77,7 +77,7 @@ def export_single_model(model, elif arch_config["algorithm"] == "PREN": other_shape = [ paddle.static.InputSpec( - shape=[None, 3, 64, 512], dtype="float32"), + shape=[None, 3, 64, 256], dtype="float32"), ] model = to_static(model, input_spec=other_shape) elif arch_config["model_type"] == "sr": diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 697e9da437581b139034415143839f9940d6cb7f..bffeb25534068691fee21bbf946cc7cda7326d27 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -106,6 +106,8 @@ class TextRecognizer(object): "character_dict_path": None, "use_space_char": args.use_space_char } + elif self.rec_algorithm == "PREN": + postprocess_params = {'name': 'PRENLabelDecode'} self.postprocess_op = build_post_process(postprocess_params) self.predictor, self.input_tensor, self.output_tensors, self.config = \ utility.create_predictor(args, 'rec', logger) @@ -400,7 +402,7 @@ class TextRecognizer(object): self.rec_image_shape) norm_img = norm_img[np.newaxis, :] norm_img_batch.append(norm_img) - elif self.rec_algorithm == "VisionLAN": + elif self.rec_algorithm in ["VisionLAN", "PREN"]: norm_img = self.resize_norm_img_vl(img_list[indices[ino]], self.rec_image_shape) norm_img = norm_img[np.newaxis, :]
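Taken together, the paddleocr.py changes above make `ocr()` always return a list with one entry per input page (for PDF files) or per image. A minimal sketch of consuming the new nested structure, assuming a two-page PDF at a placeholder path:

```python
from paddleocr import PaddleOCR

# page_num and the nested result layout follow the PaddleOCR.ocr() hunk above;
# './sample.pdf' is a placeholder input path
ocr = PaddleOCR(use_angle_cls=True, lang="en", page_num=2)
result = ocr.ocr("./sample.pdf", cls=True)

for page_idx, page_res in enumerate(result):
    # each detection is [box, (text, score)], as assembled in PaddleOCR.ocr()
    for box, (text, score) in page_res:
        print(page_idx, text, score)
```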