提交 6a121aa7 编写于 作者: W WenmuZhou

add ppstructure doc

上级 4030fd1e
...@@ -356,3 +356,4 @@ im_show.save('result.jpg') ...@@ -356,3 +356,4 @@ im_show.save('result.jpg')
| rec | 前向时是否启动识别 | TRUE | | rec | 前向时是否启动识别 | TRUE |
| cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE | | cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE |
| show_log | 是否打印det和rec等信息 | FALSE | | show_log | 是否打印det和rec等信息 | FALSE |
| type | 执行ocr或者表格结构化, 值可选['ocr','structure'] | ocr |
...@@ -362,5 +362,5 @@ im_show.save('result.jpg') ...@@ -362,5 +362,5 @@ im_show.save('result.jpg')
| det | Enable detection when `ppocr.ocr` func exec | TRUE | | det | Enable detection when `ppocr.ocr` func exec | TRUE |
| rec | Enable recognition when `ppocr.ocr` func exec | TRUE | | rec | Enable recognition when `ppocr.ocr` func exec | TRUE |
| cls | Enable classification when `ppocr.ocr` func exec (use use_angle_cls in command line mode to control whether to start classification in the forward direction) | FALSE | | cls | Enable classification when `ppocr.ocr` func exec (use use_angle_cls in command line mode to control whether to start classification in the forward direction) | FALSE |
| show_log | Whether to print log in det and rec | FALSE | | show_log | Whether to print log in det and rec | FALSE |
| type | Perform ocr or table structuring, the value is selected in ['ocr','structure'] | ocr |
\ No newline at end of file \ No newline at end of file
...@@ -33,7 +33,7 @@ from tools.infer.utility import draw_ocr, str2bool ...@@ -33,7 +33,7 @@ from tools.infer.utility import draw_ocr, str2bool
from ppstructure.utility import init_args, draw_structure_result from ppstructure.utility import init_args, draw_structure_result
from ppstructure.predict_system import OCRSystem, save_structure_res from ppstructure.predict_system import OCRSystem, save_structure_res
__all__ = ['PaddleOCR','PPStructure','draw_ocr','draw_structure_result','save_structure_res'] __all__ = ['PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', 'save_structure_res']
model_urls = { model_urls = {
'det': { 'det': {
...@@ -153,6 +153,42 @@ def parse_args(mMain=True): ...@@ -153,6 +153,42 @@ def parse_args(mMain=True):
return argparse.Namespace(**inference_args_dict) return argparse.Namespace(**inference_args_dict)
def parse_lang(lang):
    """Normalize a language code and pick the detection-model language.

    A code that belongs to one of the script families below (latin, arabic,
    cyrillic, devanagari) is collapsed to the family name; any other value
    is kept as given. The normalized value must be a key of
    ``model_urls['rec']``.

    Args:
        lang: Language code requested by the caller.

    Returns:
        A ``(lang, det_lang)`` tuple. ``det_lang`` is ``'ch'`` or
        ``'structure'`` when the normalized ``lang`` equals that value,
        and ``'en'`` for everything else.

    Raises:
        AssertionError: If the normalized ``lang`` is not a key of
            ``model_urls['rec']``.
    """
    # Each entry pairs a script-family name with the codes folded into it.
    script_families = (
        ('latin', (
            'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga',
            'hr', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms',
            'mt', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk',
            'sl', 'sq', 'sv', 'sw', 'tl', 'tr', 'uz', 'vi',
        )),
        ('arabic', ('ar', 'fa', 'ug', 'ur')),
        ('cyrillic', (
            'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd',
            'ava', 'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
        )),
        ('devanagari', (
            'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new',
            'gom', 'sa', 'bgc',
        )),
    )
    for family, codes in script_families:
        if lang in codes:
            lang = family
            break

    supported = model_urls['rec']
    assert lang in supported, 'param lang must in {}, but got {}'.format(
        model_urls['rec'].keys(), lang)

    # Only these two values have a detection model of the same name;
    # every other language falls back to the 'en' detector.
    for dedicated in ('ch', 'structure'):
        if lang == dedicated:
            return lang, dedicated
    return lang, 'en'
class PaddleOCR(predict_system.TextSystem): class PaddleOCR(predict_system.TextSystem):
def __init__(self, **kwargs): def __init__(self, **kwargs):
""" """
...@@ -165,42 +201,7 @@ class PaddleOCR(predict_system.TextSystem): ...@@ -165,42 +201,7 @@ class PaddleOCR(predict_system.TextSystem):
if not params.show_log: if not params.show_log:
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
self.use_angle_cls = params.use_angle_cls self.use_angle_cls = params.use_angle_cls
lang = params.lang lang, det_lang = parse_lang(params.lang)
latin_lang = [
'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga',
'hr', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms',
'mt', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk',
'sl', 'sq', 'sv', 'sw', 'tl', 'tr', 'uz', 'vi'
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd',
'ava', 'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
]
devanagari_lang = [
'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new',
'gom', 'sa', 'bgc'
]
if lang in latin_lang:
lang = "latin"
elif lang in arabic_lang:
lang = "arabic"
elif lang in cyrillic_lang:
lang = "cyrillic"
elif lang in devanagari_lang:
lang = "devanagari"
assert lang in model_urls[
'rec'], 'param lang must in {}, but got {}'.format(
model_urls['rec'].keys(), lang)
if lang == "ch":
det_lang = "ch"
else:
det_lang = "en"
use_inner_dict = False
if params.rec_char_dict_path is None:
use_inner_dict = True
params.rec_char_dict_path = model_urls['rec'][lang][
'dict_path']
# init model dir # init model dir
params.det_model_dir, det_url = confirm_model_dir_url(params.det_model_dir, params.det_model_dir, det_url = confirm_model_dir_url(params.det_model_dir,
...@@ -223,9 +224,9 @@ class PaddleOCR(predict_system.TextSystem): ...@@ -223,9 +224,9 @@ class PaddleOCR(predict_system.TextSystem):
if params.rec_algorithm not in SUPPORT_REC_MODEL: if params.rec_algorithm not in SUPPORT_REC_MODEL:
logger.error('rec_algorithm must in {}'.format(SUPPORT_REC_MODEL)) logger.error('rec_algorithm must in {}'.format(SUPPORT_REC_MODEL))
sys.exit(0) sys.exit(0)
if use_inner_dict:
params.rec_char_dict_path = str( if params.rec_char_dict_path is None:
Path(__file__).parent / params.rec_char_dict_path) params.rec_char_dict_path = str(Path(__file__).parent / model_urls['rec'][lang]['dict_path'])
print(params) print(params)
# init det_model and rec_model # init det_model and rec_model
...@@ -289,16 +290,17 @@ class PPStructure(OCRSystem): ...@@ -289,16 +290,17 @@ class PPStructure(OCRSystem):
params.__dict__.update(**kwargs) params.__dict__.update(**kwargs)
if not params.show_log: if not params.show_log:
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
params.use_angle_cls = False lang, det_lang = parse_lang(params.lang)
# init model dir # init model dir
params.det_model_dir, det_url = confirm_model_dir_url(params.det_model_dir, params.det_model_dir, det_url = confirm_model_dir_url(params.det_model_dir,
os.path.join(BASE_DIR, VERSION, 'structure', 'det'), os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang),
model_urls['det']['structure']) model_urls['det'][det_lang])
params.rec_model_dir, rec_url = confirm_model_dir_url(params.rec_model_dir, params.rec_model_dir, rec_url = confirm_model_dir_url(params.rec_model_dir,
os.path.join(BASE_DIR, VERSION, 'structure', 'rec'), os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang),
model_urls['rec']['structure']['url']) model_urls['rec'][lang]['url'])
params.table_model_dir, table_url = confirm_model_dir_url(params.table_model_dir, params.table_model_dir, table_url = confirm_model_dir_url(params.table_model_dir,
os.path.join(BASE_DIR, VERSION, 'structure', 'table'), os.path.join(BASE_DIR, VERSION, 'ocr', 'table'),
model_urls['table']['url']) model_urls['table']['url'])
# download model # download model
maybe_download(params.det_model_dir, det_url) maybe_download(params.det_model_dir, det_url)
...@@ -306,16 +308,9 @@ class PPStructure(OCRSystem): ...@@ -306,16 +308,9 @@ class PPStructure(OCRSystem):
maybe_download(params.table_model_dir, table_url) maybe_download(params.table_model_dir, table_url)
if params.rec_char_dict_path is None: if params.rec_char_dict_path is None:
params.rec_char_type = 'EN' params.rec_char_dict_path = str(Path(__file__).parent / model_urls['rec'][lang]['dict_path'])
if os.path.exists(str(Path(__file__).parent / model_urls['rec']['structure']['dict_path'])):
params.rec_char_dict_path = str(Path(__file__).parent / model_urls['rec']['structure']['dict_path'])
else:
params.rec_char_dict_path = str(Path(__file__).parent.parent / model_urls['rec']['structure']['dict_path'])
if params.table_char_dict_path is None: if params.table_char_dict_path is None:
if os.path.exists(str(Path(__file__).parent / model_urls['table']['dict_path'])): params.table_char_dict_path = str(Path(__file__).parent / model_urls['table']['dict_path'])
params.table_char_dict_path = str(Path(__file__).parent / model_urls['table']['dict_path'])
else:
params.table_char_dict_path = str(Path(__file__).parent.parent / model_urls['table']['dict_path'])
print(params) print(params)
super().__init__(params) super().__init__(params)
...@@ -354,9 +349,9 @@ def main(): ...@@ -354,9 +349,9 @@ def main():
if len(image_file_list) == 0: if len(image_file_list) == 0:
logger.error('no images find in {}'.format(args.image_dir)) logger.error('no images find in {}'.format(args.image_dir))
return return
if args.type=='ocr': if args.type == 'ocr':
engine = PaddleOCR(**(args.__dict__)) engine = PaddleOCR(**(args.__dict__))
elif args.type=='structure': elif args.type == 'structure':
engine = PPStructure(**(args.__dict__)) engine = PPStructure(**(args.__dict__))
else: else:
raise NotImplementedError raise NotImplementedError
...@@ -366,9 +361,9 @@ def main(): ...@@ -366,9 +361,9 @@ def main():
logger.info('{}{}{}'.format('*' * 10, img_path, '*' * 10)) logger.info('{}{}{}'.format('*' * 10, img_path, '*' * 10))
if args.type == 'ocr': if args.type == 'ocr':
result = engine.ocr(img_path, result = engine.ocr(img_path,
det=args.det, det=args.det,
rec=args.rec, rec=args.rec,
cls=args.use_angle_cls) cls=args.use_angle_cls)
if result is not None: if result is not None:
for line in result: for line in result:
logger.info(line) logger.info(line)
...@@ -376,4 +371,4 @@ def main(): ...@@ -376,4 +371,4 @@ def main():
result = engine(img_path) result = engine(img_path)
for item in result: for item in result:
logger.info(item['res']) logger.info(item['res'])
save_structure_res(result, args.output, img_name) save_structure_res(result, args.output, img_name)
\ No newline at end of file
# PaddleStructure # PPStructure
PaddleStructure is an OCR toolkit for complex layout analysis. It can divide document data in the form of pictures into **text, table, title, picture and list** 5 types of areas, and extract the table area as excel PPStructure is an OCR toolkit for complex layout analysis. It can divide document data in the form of pictures into **text, table, title, picture and list** 5 types of areas, and extract the table area as excel
## 1. Quick start ## 1. Quick start
### install ### install
**install layoutparser** **install paddleocr**
```sh
pip3 install -U premailer paddleocr https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl
```
**install paddlestructure**
install by pypi ref to [paddleocr whl doc](../doc/doc_en/whl_en.md)
```bash **install layoutparser**
pip install paddlestructure ```sh
``` pip3 install -U premailer https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl
build own whl package and install
```bash
python3 setup.py bdist_wheel
pip3 install dist/paddlestructure-x.x.x-py3-none-any.whl # x.x.x is the version of paddlestructure
``` ```
### 1.2 Use ### 1.2 Use
...@@ -28,7 +19,7 @@ pip3 install dist/paddlestructure-x.x.x-py3-none-any.whl # x.x.x is the version ...@@ -28,7 +19,7 @@ pip3 install dist/paddlestructure-x.x.x-py3-none-any.whl # x.x.x is the version
#### 1.2.1 Use by command line #### 1.2.1 Use by command line
```bash ```bash
paddlestructure --image_dir=../doc/table/1.png paddleocr --image_dir=../doc/table/1.png --type=structure
``` ```
#### 1.2.2 Use by code #### 1.2.2 Use by code
...@@ -36,29 +27,29 @@ paddlestructure --image_dir=../doc/table/1.png ...@@ -36,29 +27,29 @@ paddlestructure --image_dir=../doc/table/1.png
```python ```python
import os import os
import cv2 import cv2
from paddlestructure import PaddleStructure,draw_result,save_res from paddleocr import PPStructure,draw_structure_result,save_structure_res
table_engine = PaddleStructure(show_log=True) table_engine = PPStructure(show_log=True)
save_folder = './output/table' save_folder = './output/table'
img_path = '../doc/table/1.png' img_path = '../doc/table/1.png'
img = cv2.imread(img_path) img = cv2.imread(img_path)
result = table_engine(img) result = table_engine(img)
save_res(result, save_folder,os.path.basename(img_path).split('.')[0]) save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0])
for line in result: for line in result:
print(line) print(line)
from PIL import Image from PIL import Image
font_path = '../doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 font_path = '../doc/fonts/simfang.ttf'
image = Image.open(img_path).convert('RGB') image = Image.open(img_path).convert('RGB')
im_show = draw_result(image, result,font_path=font_path) im_show = draw_structure_result(image, result,font_path=font_path)
im_show = Image.fromarray(im_show) im_show = Image.fromarray(im_show)
im_show.save('result.jpg') im_show.save('result.jpg')
``` ```
#### 1.2.3 Description of the returned result #### 1.2.3 Description of the returned result
PaddleStructure returns a list of dicts; an example is as follows PPStructure returns a list of dicts; an example is as follows
```shell ```shell
[ [
...@@ -91,12 +82,12 @@ Most of the parameters are consistent with the paddleocr whl package, see [doc o ...@@ -91,12 +82,12 @@ Most of the parameters are consistent with the paddleocr whl package, see [doc o
After running, each image will have a directory with the same name under the directory specified in the output field. Each table in the picture will be stored as an excel, and the excel file name will be the coordinates of the table in the image. After running, each image will have a directory with the same name under the directory specified in the output field. Each table in the picture will be stored as an excel, and the excel file name will be the coordinates of the table in the image.
## 2. PaddleStructure Pipeline ## 2. PPStructure Pipeline
the process is as follows the process is as follows
![pipeline](../doc/table/pipeline_en.jpg) ![pipeline](../doc/table/pipeline_en.jpg)
In PaddleStructure, the image will be analyzed by layoutparser first. In the layout analysis, the area in the image will be classified, including **text, title, image, list and table** 5 categories. For the first 4 types of areas, directly use the PP-OCR to complete the text detection and recognition. The table area will be converted to an excel file of the same table style via Table OCR. In PPStructure, the image will be analyzed by layoutparser first. In the layout analysis, the area in the image will be classified, including **text, title, image, list and table** 5 categories. For the first 4 types of areas, directly use the PP-OCR to complete the text detection and recognition. The table area will be converted to an excel file of the same table style via Table OCR.
### 2.1 LayoutParser ### 2.1 LayoutParser
......
# PaddleStructure # PPStructure
PaddleStructure是一个用于复杂版面分析的OCR工具包,其能够对图片形式的文档数据划分**文字、表格、标题、图片以及列表**5类区域,并将表格区域提取为excel PaddleStructure是一个用于复杂版面分析的OCR工具包,其能够对图片形式的文档数据划分**文字、表格、标题、图片以及列表**5类区域,并将表格区域提取为excel
...@@ -6,29 +6,21 @@ PaddleStructure是一个用于复杂版面分析的OCR工具包,其能够对 ...@@ -6,29 +6,21 @@ PaddleStructure是一个用于复杂版面分析的OCR工具包,其能够对
### 1.1 安装 ### 1.1 安装
**安装 paddleocr**
参考 [paddleocr whl文档](../doc/doc_ch/whl.md)
**安装 layoutparser** **安装 layoutparser**
```sh ```sh
pip3 install -U premailer paddleocr https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl pip3 install -U premailer paddleocr https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl
``` ```
**安装 paddlestructure**
pip安装
```bash
pip install paddlestructure
```
本地构建并安装
```bash
python3 setup.py bdist_wheel
pip3 install dist/paddlestructure-x.x.x-py3-none-any.whl # x.x.x是 paddlestructure 的版本号
```
### 1.2 PaddleStructure whl包使用 ### 1.2 PPStructure whl包使用
#### 1.2.1 命令行使用 #### 1.2.1 命令行使用
```bash ```bash
paddlestructure --image_dir=../doc/table/1.png paddleocr --image_dir=../doc/table/1.png --type=structure
``` ```
#### 1.2.2 Python脚本使用 #### 1.2.2 Python脚本使用
...@@ -36,15 +28,15 @@ paddlestructure --image_dir=../doc/table/1.png ...@@ -36,15 +28,15 @@ paddlestructure --image_dir=../doc/table/1.png
```python ```python
import os import os
import cv2 import cv2
from paddlestructure import PaddleStructure,draw_result,save_res from paddleocr import PPStructure,draw_structure_result,save_structure_res
table_engine = PaddleStructure(show_log=True) table_engine = PPStructure(show_log=True)
save_folder = './output/table' save_folder = './output/table'
img_path = '../doc/table/1.png' img_path = '../doc/table/1.png'
img = cv2.imread(img_path) img = cv2.imread(img_path)
result = table_engine(img) result = table_engine(img)
save_res(result, save_folder,os.path.basename(img_path).split('.')[0]) save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0])
for line in result: for line in result:
print(line) print(line)
...@@ -53,7 +45,7 @@ from PIL import Image ...@@ -53,7 +45,7 @@ from PIL import Image
font_path = '../doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 font_path = '../doc/fonts/simfang.ttf' # PaddleOCR下提供字体包
image = Image.open(img_path).convert('RGB') image = Image.open(img_path).convert('RGB')
im_show = draw_result(image, result,font_path=font_path) im_show = draw_structure_result(image, result,font_path=font_path)
im_show = Image.fromarray(im_show) im_show = Image.fromarray(im_show)
im_show.save('result.jpg') im_show.save('result.jpg')
``` ```
...@@ -93,12 +85,12 @@ dict 里各个字段说明如下 ...@@ -93,12 +85,12 @@ dict 里各个字段说明如下
运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,excel文件名为表格在图片里的坐标。 运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,excel文件名为表格在图片里的坐标。
## 2. PaddleStructure Pipeline ## 2. PPStructure Pipeline
流程如下 流程如下
![pipeline](../doc/table/pipeline.jpg) ![pipeline](../doc/table/pipeline.jpg)
在PaddleStructure中,图片会先经由layoutparser进行版面分析,在版面分析中,会对图片里的区域进行分类,包括**文字、标题、图片、列表和表格**5类。对于前4类区域,直接使用PP-OCR完成对应区域文字检测与识别。对于表格类区域,经过Table OCR处理后,表格图片转换为相同表格样式的Excel文件。 在PPStructure中,图片会先经由layoutparser进行版面分析,在版面分析中,会对图片里的区域进行分类,包括**文字、标题、图片、列表和表格**5类。对于前4类区域,直接使用PP-OCR完成对应区域文字检测与识别。对于表格类区域,经过Table OCR处理后,表格图片转换为相同表格样式的Excel文件。
### 2.1 版面分析 ### 2.1 版面分析
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册