diff --git a/__init__.py b/__init__.py index 504aeca61b734937b97bab18dec8e49237d873d5..e22e466a8426c437407c491bbae47c3b66defa2e 100644 --- a/__init__.py +++ b/__init__.py @@ -15,4 +15,4 @@ import paddleocr from .paddleocr import * __version__ = paddleocr.VERSION -__all__ = ['PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', 'save_structure_res'] +__all__ = ['PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', 'save_structure_res','download_with_progressbar'] diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md index edacb5b2d804d85fd593f4a6c439fc223a8f5e4b..167ed7b2b8a13706dfe1533265b6d96560265511 100644 --- a/doc/doc_ch/whl.md +++ b/doc/doc_ch/whl.md @@ -5,26 +5,32 @@ ### 1.1 安装whl包 pip安装 + ```bash pip install "paddleocr>=2.0.1" # 推荐使用2.0.1+版本 ``` 本地构建并安装 + ```bash python3 setup.py bdist_wheel pip3 install dist/paddleocr-x.x.x-py3-none-any.whl # x.x.x是paddleocr的版本号 ``` ## 2 使用 + ### 2.1 代码使用 + paddleocr whl包会自动下载ppocr轻量级模型作为默认模型,可以根据第3节**自定义模型**进行自定义更换。 * 检测+方向分类器+识别全流程 + ```python from paddleocr import PaddleOCR, draw_ocr + # Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 # 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 -ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, cls=True) for line in result: @@ -32,6 +38,7 @@ for line in result: # 显示结果 from PIL import Image + image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -40,31 +47,36 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc im_show = Image.fromarray(im_show) im_show.save('result.jpg') ``` + 结果是一个list,每个item包含了文本框,文字和识别置信度 + ```bash [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] [[[24.0, 80.0], 
[172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] [[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] ...... ``` + 结果可视化
- * 检测+识别 + ```python from paddleocr import PaddleOCR, draw_ocr -ocr = PaddleOCR() # need to run only once to download and load model into memory + +ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' -result = ocr.ocr(img_path,cls=False) +result = ocr.ocr(img_path, cls=False) for line in result: print(line) # 显示结果 from PIL import Image + image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -73,38 +85,46 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc im_show = Image.fromarray(im_show) im_show.save('result.jpg') ``` + 结果是一个list,每个item包含了文本框,文字和识别置信度 + ```bash [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] [[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] ...... ``` + 结果可视化
- * 方向分类器+识别 + ```python from paddleocr import PaddleOCR -ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory + +ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' result = ocr.ocr(img_path, det=False, cls=True) for line in result: print(line) ``` + 结果是一个list,每个item只包含识别结果和识别置信度 + ```bash ['韩国小馆', 0.9907421] ``` * 单独执行检测 + ```python from paddleocr import PaddleOCR, draw_ocr -ocr = PaddleOCR() # need to run only once to download and load model into memory + +ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, rec=False) for line in result: @@ -118,13 +138,16 @@ im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/Pa im_show = Image.fromarray(im_show) im_show.save('result.jpg') ``` + 结果是一个list,每个item只包含文本框 + ```bash [[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] [[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] [[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]] ...... 
``` + 结果可视化 @@ -133,29 +156,37 @@ im_show.save('result.jpg') * 单独执行识别 + ```python from paddleocr import PaddleOCR -ocr = PaddleOCR() # need to run only once to download and load model into memory + +ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' result = ocr.ocr(img_path, det=False) for line in result: print(line) ``` + 结果是一个list,每个item只包含识别结果和识别置信度 + ```bash ['韩国小馆', 0.9907421] ``` * 单独执行方向分类器 + ```python from paddleocr import PaddleOCR -ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory + +ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' result = ocr.ocr(img_path, det=False, rec=False, cls=True) for line in result: print(line) ``` + 结果是一个list,每个item只包含分类结果和分类置信度 + ```bash ['0', 0.9999924] ``` @@ -163,15 +194,19 @@ for line in result: ### 2.2 通过命令行使用 查看帮助信息 + ```bash paddleocr -h ``` * 检测+方向分类器+识别全流程 + ```bash paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true ``` + 结果是一个list,每个item包含了文本框,文字和识别置信度 + ```bash [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] @@ -180,10 +215,13 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true ``` * 检测+识别 + ```bash paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg ``` + 结果是一个list,每个item包含了文本框,文字和识别置信度 + ```bash [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] @@ -192,20 +230,25 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg ``` * 方向分类器+识别 + ```bash paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false ``` 结果是一个list,每个item只包含识别结果和识别置信度 + ```bash ['韩国小馆', 0.9907421] ``` * 单独执行检测 + 
```bash paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false ``` + 结果是一个list,每个item只包含文本框 + ```bash [[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] [[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] @@ -214,34 +257,42 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false ``` * 单独执行识别 + ```bash paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false ``` 结果是一个list,每个item只包含识别结果和识别置信度 + ```bash ['韩国小馆', 0.9907421] ``` * 单独执行方向分类器 + ```bash paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false --rec false ``` 结果是一个list,每个item只包含分类结果和分类置信度 + ```bash ['0', 0.9999924] ``` ## 3 自定义模型 -当内置模型无法满足需求时,需要使用到自己训练的模型。 -首先,参照[inference.md](./inference.md) 第一节转换将检测、分类和识别模型转换为inference模型,然后按照如下方式使用 + +当内置模型无法满足需求时,需要使用到自己训练的模型。 首先,参照[inference.md](./inference.md) 第一节转换将检测、分类和识别模型转换为inference模型,然后按照如下方式使用 ### 3.1 代码使用 + ```python from paddleocr import PaddleOCR, draw_ocr + # 模型路径下必须含有model和params文件 -ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True) +ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', + rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', + use_angle_cls=True) img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, cls=True) for line in result: @@ -249,6 +300,7 @@ for line in result: # 显示结果 from PIL import Image + image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -269,11 +321,13 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_ ### 4.1 网络图片 - 代码使用 + ```python -from paddleocr import PaddleOCR, draw_ocr +from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar + # Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 # 参数依次为`ch`, 
`en`, `french`, `german`, `korean`, `japan`。 -ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg' result = ocr.ocr(img_path, cls=True) for line in result: @@ -281,7 +335,9 @@ for line in result: # 显示结果 from PIL import Image -image = Image.open(img_path).convert('RGB') + +download_with_progressbar(img_path, 'tmp.jpg') +image = Image.open('tmp.jpg').convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] scores = [line[1][1] for line in result] @@ -289,19 +345,24 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc im_show = Image.fromarray(im_show) im_show.save('result.jpg') ``` + - 命令行模式 + ```bash paddleocr --image_dir http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg --use_angle_cls=true ``` ### 4.2 numpy数组 + 仅通过代码使用时支持numpy数组作为输入 + ```python import cv2 from paddleocr import PaddleOCR, draw_ocr + # Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 # 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 -ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' img = cv2.imread(img_path) # img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), 如果你自己训练的模型支持灰度图,可以将这句话的注释取消 @@ -311,6 +372,7 @@ for line in result: # 显示结果 from PIL import Image + image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md index 79bc9b0bc5c5f7f33682d5829e5b613b278a4964..c8c8353accdf7f6ce179d3700547bfe9bd70c200 100644 --- 
a/doc/doc_en/whl_en.md +++ b/doc/doc_en/whl_en.md @@ -306,7 +306,7 @@ Support numpy array as input only when used by code ```python import cv2 -from paddleocr import PaddleOCR, draw_ocr +from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' img = cv2.imread(img_path) @@ -317,7 +317,9 @@ for line in result: # show result from PIL import Image -image = Image.open(img_path).convert('RGB') + +download_with_progressbar(img_path, 'tmp.jpg') +image = Image.open('tmp.jpg').convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] scores = [line[1][1] for line in result] diff --git a/paddleocr.py b/paddleocr.py index 5ae812fa8741a9007bacb4fb64a43f97648d8f77..c52737f55b61cd29c08367adb6d7e05c561e933e 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -33,7 +33,7 @@ from tools.infer.utility import draw_ocr, str2bool from ppstructure.utility import init_args, draw_structure_result from ppstructure.predict_system import OCRSystem, save_structure_res -__all__ = ['PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', 'save_structure_res'] +__all__ = ['PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', 'save_structure_res','download_with_progressbar'] model_urls = { 'det': { diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md index 05f89360608a2d931d38afbff6452c5e9f5e85fa..a8d10b79e507ab59ef2481982a33902e4a95e73e 100644 --- a/ppstructure/table/README.md +++ b/ppstructure/table/README.md @@ -15,9 +15,18 @@ The table recognition flow chart is as follows 3. The recognition result of the cell is combined by the coordinates, recognition result of the single line and the coordinates of the cell. 4. The cell recognition result and the table structure together construct the html string of the table. -## 2. How to use +## 2. 
Performance +We evaluated the algorithm on the PubTabNet[1] eval dataset, and the performance is as follows: -### 2.1 quick start + +|Method|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)| +| --- | --- | +| EDD[2] | 88.3 | +| Ours | 93.32 | + +## 3. How to use + +### 3.1 quick start ```python cd PaddleOCR/ppstructure @@ -38,7 +47,7 @@ Note: The above model is trained on the PubLayNet dataset and only supports Engl After running, the excel sheet of each picture will be saved in the directory specified by the output field -### 2.2 Train +### 3.2 Train In this chapter, we only introduce the training of the table structure model, For model training of [text detection](../../doc/doc_en/detection_en.md) and [text recognition](../../doc/doc_en/recognition_en.md), please refer to the corresponding documents @@ -68,9 +77,9 @@ python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./yo **Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrain_weights`, that is, when two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrain_weights` will be loaded. -### 2.3 Eval +### 3.3 Eval -The table uses TEDS (Tree-Edit-Distance-based Similarity) as the evaluation metric of the model. Before the model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. Examples of gt are as follows: +The table uses [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) as the evaluation metric of the model. Before the model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. 
Examples of gt are as follows: ```json {"PMC4289340_004_00.png": [ ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], @@ -89,11 +98,19 @@ cd PaddleOCR/ppstructure python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json ``` +If the PubTabNet eval dataset is used, the output will be +```bash +teds: 93.32 +``` -### 2.4 Inference +### 3.4 Inference ```python cd PaddleOCR/ppstructure python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --output ../output/table ``` After running, the excel sheet of each picture will be saved in the directory specified by the output field + +Reference +1. https://github.com/ibm-aur-nlp/PubTabNet +2. https://arxiv.org/pdf/1911.10683 \ No newline at end of file diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md index 09b73577047cb95343d9e52f89796f14d243bfba..2ded403c371984a447f94268d23ca1c6240cf432 100644 --- a/ppstructure/table/README_ch.md +++ b/ppstructure/table/README_ch.md @@ -17,9 +17,18 @@ 3. 由单行文字的坐标、识别结果和单元格的坐标一起组合出单元格的识别结果。 4. 单元格的识别结果和表格结构一起构造表格的html字符串。 -## 2. 使用 +## 2. 性能 +我们在 PubTabNet[1] 评估数据集上对算法进行了评估,性能如下 -### 2.1 快速开始 + +|算法|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)| +| --- | --- | +| EDD[2] | 88.3 | +| Ours | 93.32 | + +## 3. 
使用 + +### 3.1 快速开始 ```python cd PaddleOCR/ppstructure @@ -40,7 +49,7 @@ python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_ta note: 上述模型是在 PubLayNet 数据集上训练的表格识别模型,仅支持英文扫描场景,如需识别其他场景需要自己训练模型后替换 `det_model_dir`,`rec_model_dir`,`table_model_dir`三个字段即可。 -### 2.2 训练 +### 3.2 训练 在这一章节中,我们仅介绍表格结构模型的训练,[文字检测](../../doc/doc_ch/detection.md)和[文字识别](../../doc/doc_ch/recognition.md)的模型训练请参考对应的文档。 #### 数据准备 @@ -67,9 +76,9 @@ python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./yo **注意**:`Global.checkpoints`的优先级高于`Global.pretrain_weights`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrain_weights`指定的模型。 -### 2.3 评估 +### 3.3 评估 -表格使用 TEDS(Tree-Edit-Distance-based Similarity) 作为模型的评估指标。在进行模型评估之前,需要将pipeline中的三个模型分别导出为inference模型(我们已经提供好),还需要准备评估的gt, gt示例如下: +表格使用 [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) 作为模型的评估指标。在进行模型评估之前,需要将pipeline中的三个模型分别导出为inference模型(我们已经提供好),还需要准备评估的gt, gt示例如下: ```json {"PMC4289340_004_00.png": [ ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], @@ -87,11 +96,18 @@ json 中,key为图片名,value为对应的gt,gt是一个由三个item组 cd PaddleOCR/ppstructure python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json ``` +如使用PubTabNet评估数据集,将会输出 +```bash +teds: 93.32 +``` -### 2.4 预测 +### 3.4 预测 ```python cd PaddleOCR/ppstructure python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --output ../output/table ``` +Reference +1. https://github.com/ibm-aur-nlp/PubTabNet +2. https://arxiv.org/pdf/1911.10683 \ No newline at end of file