Commit 3ee146fa authored by z37757

Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into drrg_branch

Global:
  use_gpu: true
  use_xpu: false
  use_mlu: false
  epoch_num: 1200
  log_smooth_window: 20
  print_batch_step: 10
......
Global:
  use_gpu: True
  epoch_num: 6
  log_smooth_window: 20
  print_batch_step: 50
  save_model_dir: ./output/rec/rec_resnet_rfl_att/
  save_epoch_step: 1
  # evaluation is run every 5000 iterations
  eval_batch_step: [0, 5000]
  cal_metric_during_train: True
  pretrained_model: ./pretrain_models/rec_resnet_rfl_visual/best_accuracy.pdparams
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path:
  max_text_length: 25
  infer_mode: False
  use_space_char: False
  save_res_path: ./output/rec/rec_resnet_rfl.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  weight_decay: 0.0
  clip_norm_global: 5.0
  lr:
    name: Piecewise
    decay_epochs: [3, 4, 5]
    values: [0.001, 0.0003, 0.00009, 0.000027]

Architecture:
  model_type: rec
  algorithm: RFL
  in_channels: 1
  Transform:
    name: TPS
    num_fiducial: 20
    loc_lr: 1.0
    model_name: large
  Backbone:
    name: ResNetRFL
    use_cnt: True
    use_seq: True
  Neck:
    name: RFAdaptor
    use_v2s: True
    use_s2v: True
  Head:
    name: RFLHead
    in_channels: 512
    hidden_size: 256
    batch_max_legnth: 25
    out_channels: 38
    use_cnt: True
    use_seq: True

Loss:
  name: RFLLoss
  # ignore_index: 0

PostProcess:
  name: RFLLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/training
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - RFLLabelEncode: # Class handling label
      - RFLRecResizeImg:
          image_shape: [1, 32, 100]
          padding: false
          interpolation: 2
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 64
    drop_last: True
    num_workers: 8

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/validation/
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - RFLLabelEncode: # Class handling label
      - RFLRecResizeImg:
          image_shape: [1, 32, 100]
          padding: false
          interpolation: 2
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 8
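For context, this attention-branch config is used in the second stage of RFL training and is initialized from the trained visual (CNT) branch via `pretrained_model`; a single-GPU launch (as documented for RFL later in this commit) looks like:

```shell
python3 tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_visual/best_accuracy
```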
Global:
  use_gpu: True
  epoch_num: 6
  log_smooth_window: 20
  print_batch_step: 50
  save_model_dir: ./output/rec/rec_resnet_rfl_visual/
  save_epoch_step: 1
  # evaluation is run every 5000 iterations
  eval_batch_step: [0, 5000]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path:
  max_text_length: 25
  infer_mode: False
  use_space_char: False
  save_res_path: ./output/rec/rec_resnet_rfl_visual.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  weight_decay: 0.0
  clip_norm_global: 5.0
  lr:
    name: Piecewise
    decay_epochs: [3, 4, 5]
    values: [0.001, 0.0003, 0.00009, 0.000027]

Architecture:
  model_type: rec
  algorithm: RFL
  in_channels: 1
  Transform:
    name: TPS
    num_fiducial: 20
    loc_lr: 1.0
    model_name: large
  Backbone:
    name: ResNetRFL
    use_cnt: True
    use_seq: False
  Neck:
    name: RFAdaptor
    use_v2s: False
    use_s2v: False
  Head:
    name: RFLHead
    in_channels: 512
    hidden_size: 256
    batch_max_legnth: 25
    out_channels: 38
    use_cnt: True
    use_seq: False

Loss:
  name: RFLLoss

PostProcess:
  name: RFLLabelDecode

Metric:
  name: CNTMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/training
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - RFLLabelEncode: # Class handling label
      - RFLRecResizeImg:
          image_shape: [1, 32, 100]
          padding: false
          interpolation: 2
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 64
    drop_last: True
    num_workers: 8

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/evaluation
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - RFLLabelEncode: # Class handling label
      - RFLRecResizeImg:
          image_shape: [1, 32, 100]
          padding: false
          interpolation: 2
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 8
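This visual (CNT) config is the first RFL training stage; a multi-GPU launch, specifying card ids with `--gpus` (again as documented for RFL later in this commit), looks like:

```shell
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml
```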
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{
    "modules_info": {
        "kie_ser": {
            "init_args": {
                "version": "1.0.0",
                "use_gpu": true
            },
            "predict_args": {}
        }
    },
    "port": 8871,
    "use_multiprocess": false,
    "workers": 2
}
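With the service started on port 8871, a request can be sent from a small Python client. This is a minimal sketch assuming a local test image and the standard PaddleHub serving JSON protocol (base64-encoded images under an `images` key, as consumed by `serving_method` below); the exact response envelope may vary with the PaddleHub version:

```python
import base64
import json

import requests  # assumes the requests package is installed

# read a local test image and base64-encode it, as expected by serving_method
with open("./doc/imgs/11.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

# the URL follows the pattern http://<host>:<port>/predict/<module_name>
url = "http://127.0.0.1:8871/predict/kie_ser"
headers = {"Content-type": "application/json"}
data = {"images": [image_b64]}

response = requests.post(url=url, headers=headers, data=json.dumps(data))
print(response.json()["results"])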
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
sys.path.insert(0, ".")
import copy
import time

import cv2
import numpy as np
import paddlehub as hub
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, runnable, serving

from tools.infer.utility import base64_to_cv2
from ppstructure.kie.predict_kie_token_ser import SerPredictor
from ppstructure.utility import parse_args
from deploy.hubserving.kie_ser.params import read_params


@moduleinfo(
    name="kie_ser",
    version="1.0.0",
    summary="kie ser service",
    author="paddle-dev",
    author_email="paddle-dev@baidu.com",
    type="cv/KIE_SER")
class KIESer(hub.Module):
    def _initialize(self, use_gpu=False, enable_mkldnn=False):
        """
        initialize with the necessary elements
        """
        cfg = self.merge_configs()
        cfg.use_gpu = use_gpu
        if use_gpu:
            try:
                _places = os.environ["CUDA_VISIBLE_DEVICES"]
                int(_places[0])
                print("use gpu: ", use_gpu)
                print("CUDA_VISIBLE_DEVICES: ", _places)
                cfg.gpu_mem = 8000
            except Exception:
                raise RuntimeError(
                    "Environment variable CUDA_VISIBLE_DEVICES is not set correctly. "
                    "If you want to use the GPU, please set it via: export CUDA_VISIBLE_DEVICES=cuda_device_id."
                )
        cfg.ir_optim = True
        cfg.enable_mkldnn = enable_mkldnn

        self.ser_predictor = SerPredictor(cfg)

    def merge_configs(self):
        # default cfg
        backup_argv = copy.deepcopy(sys.argv)
        sys.argv = sys.argv[:1]
        cfg = parse_args()

        update_cfg_map = vars(read_params())
        for key in update_cfg_map:
            cfg.__setattr__(key, update_cfg_map[key])

        sys.argv = copy.deepcopy(backup_argv)
        return cfg

    def read_images(self, paths=[]):
        images = []
        for img_path in paths:
            assert os.path.isfile(
                img_path), "The {} isn't a valid file.".format(img_path)
            img = cv2.imread(img_path)
            if img is None:
                logger.info("error in loading image:{}".format(img_path))
                continue
            images.append(img)
        return images

    def predict(self, images=[], paths=[]):
        """
        Get the SER results of the input images.

        Args:
            images (list(numpy.ndarray)): image data, each with shape [H, W, C]; use either images or paths
            paths (list[str]): image paths; use either images or paths
        Returns:
            res (list): the SER result of each input image
        """
        if images != [] and isinstance(images, list) and paths == []:
            predicted_data = images
        elif images == [] and isinstance(paths, list) and paths != []:
            predicted_data = self.read_images(paths)
        else:
            raise TypeError("The input data is inconsistent with expectations.")

        assert predicted_data != [], "There is not any image to be predicted. Please check the input data."

        all_results = []
        for img in predicted_data:
            if img is None:
                logger.info("error in loading image")
                all_results.append([])
                continue
            starttime = time.time()
            ser_res, _, elapse = self.ser_predictor(img)
            # report end-to-end wall time rather than predictor-internal time
            elapse = time.time() - starttime
            logger.info("Predict time: {}".format(elapse))
            all_results.append(ser_res)
        return all_results

    @serving
    def serving_method(self, images, **kwargs):
        """
        Run as a service.
        """
        images_decode = [base64_to_cv2(image) for image in images]
        results = self.predict(images_decode, **kwargs)
        return results


if __name__ == '__main__':
    ser = KIESer()
    ser._initialize()
    image_path = [
        './doc/imgs/11.jpg',
        './doc/imgs/12.jpg',
    ]
    res = ser.predict(paths=image_path)
    print(res)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from deploy.hubserving.ocr_system.params import read_params as pp_ocr_read_params


class Config(object):
    pass


def read_params():
    cfg = pp_ocr_read_params()

    # SER params
    cfg.kie_algorithm = "LayoutXLM"
    cfg.use_visual_backbone = False

    cfg.ser_model_dir = "./inference/ser_vi_layoutxlm_xfund_infer"
    cfg.ser_dict_path = "train_data/XFUND/class_list_xfun.txt"
    cfg.vis_font_path = "./doc/fonts/simfang.ttf"
    cfg.ocr_order_method = "tb-yx"

    return cfg
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{
    "modules_info": {
        "kie_ser_re": {
            "init_args": {
                "version": "1.0.0",
                "use_gpu": true
            },
            "predict_args": {}
        }
    },
    "port": 8872,
    "use_multiprocess": false,
    "workers": 2
}
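As with the other modules, the service can be started from this configuration file once the module is installed; for example:

```shell
hub serving start -c deploy/hubserving/kie_ser_re/config.json
```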
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
sys.path.insert(0, ".")
import copy
import time

import cv2
import numpy as np
import paddlehub as hub
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, runnable, serving

from tools.infer.utility import base64_to_cv2
from ppstructure.kie.predict_kie_token_ser_re import SerRePredictor
from ppstructure.utility import parse_args
from deploy.hubserving.kie_ser_re.params import read_params


@moduleinfo(
    name="kie_ser_re",
    version="1.0.0",
    summary="kie ser re service",
    author="paddle-dev",
    author_email="paddle-dev@baidu.com",
    type="cv/KIE_SER_RE")
class KIESerRE(hub.Module):
    def _initialize(self, use_gpu=False, enable_mkldnn=False):
        """
        initialize with the necessary elements
        """
        cfg = self.merge_configs()
        cfg.use_gpu = use_gpu
        if use_gpu:
            try:
                _places = os.environ["CUDA_VISIBLE_DEVICES"]
                int(_places[0])
                print("use gpu: ", use_gpu)
                print("CUDA_VISIBLE_DEVICES: ", _places)
                cfg.gpu_mem = 8000
            except Exception:
                raise RuntimeError(
                    "Environment variable CUDA_VISIBLE_DEVICES is not set correctly. "
                    "If you want to use the GPU, please set it via: export CUDA_VISIBLE_DEVICES=cuda_device_id."
                )
        cfg.ir_optim = True
        cfg.enable_mkldnn = enable_mkldnn

        self.ser_re_predictor = SerRePredictor(cfg)

    def merge_configs(self):
        # default cfg
        backup_argv = copy.deepcopy(sys.argv)
        sys.argv = sys.argv[:1]
        cfg = parse_args()

        update_cfg_map = vars(read_params())
        for key in update_cfg_map:
            cfg.__setattr__(key, update_cfg_map[key])

        sys.argv = copy.deepcopy(backup_argv)
        return cfg

    def read_images(self, paths=[]):
        images = []
        for img_path in paths:
            assert os.path.isfile(
                img_path), "The {} isn't a valid file.".format(img_path)
            img = cv2.imread(img_path)
            if img is None:
                logger.info("error in loading image:{}".format(img_path))
                continue
            images.append(img)
        return images

    def predict(self, images=[], paths=[]):
        """
        Get the SER+RE results of the input images.

        Args:
            images (list(numpy.ndarray)): image data, each with shape [H, W, C]; use either images or paths
            paths (list[str]): image paths; use either images or paths
        Returns:
            res (list): the relation extraction result of each input image
        """
        if images != [] and isinstance(images, list) and paths == []:
            predicted_data = images
        elif images == [] and isinstance(paths, list) and paths != []:
            predicted_data = self.read_images(paths)
        else:
            raise TypeError("The input data is inconsistent with expectations.")

        assert predicted_data != [], "There is not any image to be predicted. Please check the input data."

        all_results = []
        for img in predicted_data:
            if img is None:
                logger.info("error in loading image")
                all_results.append([])
                continue
            starttime = time.time()
            re_res, _ = self.ser_re_predictor(img)
            elapse = time.time() - starttime
            logger.info("Predict time: {}".format(elapse))
            all_results.append(re_res)
        return all_results

    @serving
    def serving_method(self, images, **kwargs):
        """
        Run as a service.
        """
        images_decode = [base64_to_cv2(image) for image in images]
        results = self.predict(images_decode, **kwargs)
        return results


if __name__ == '__main__':
    ser_re = KIESerRE()
    ser_re._initialize()
    image_path = [
        './doc/imgs/11.jpg',
        './doc/imgs/12.jpg',
    ]
    res = ser_re.predict(paths=image_path)
    print(res)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from deploy.hubserving.ocr_system.params import read_params as pp_ocr_read_params


class Config(object):
    pass


def read_params():
    cfg = pp_ocr_read_params()

    # SER+RE params
    cfg.kie_algorithm = "LayoutXLM"
    cfg.use_visual_backbone = False

    cfg.ser_model_dir = "./inference/ser_vi_layoutxlm_xfund_infer"
    cfg.re_model_dir = "./inference/re_vi_layoutxlm_xfund_infer"
    cfg.ser_dict_path = "train_data/XFUND/class_list_xfun.txt"
    cfg.vis_font_path = "./doc/fonts/simfang.ttf"
    cfg.ocr_order_method = "tb-yx"

    return cfg
@@ -30,6 +30,8 @@ deploy/hubserving/
  └─  structure_layout   layout analysis service package
  └─  structure_table    table recognition service package
  └─  structure_system   PP-Structure service package
  └─  kie_ser            key information extraction (SER) service package
  └─  kie_ser_re         key information extraction (SER+RE) service package
```

Each service package contains 3 files. Taking the 2-stage cascade service package as an example, the directory is as follows:

@@ -42,6 +44,7 @@ deploy/hubserving/ocr_system/
```

## 1. Recent updates

* 2022.10.09 added the key information extraction services.
* 2022.08.23 added the layout analysis service.
* 2022.05.05 added the PP-OCRv3 detection and recognition models.
* 2022.03.30 added the PP-Structure and table recognition services.

@@ -57,12 +60,15 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://mirror.baidu.com/pypi/simple

### 2.2 Download the inference models

Before installing a service module, prepare the inference models and place them in the correct paths. The PP-OCRv3 models are used by default, with the following default paths:

```
detection model:                    ./inference/ch_PP-OCRv3_det_infer/
recognition model:                  ./inference/ch_PP-OCRv3_rec_infer/
angle classifier:                   ./inference/ch_ppocr_mobile_v2.0_cls_infer/
layout analysis model:              ./inference/picodet_lcnet_x1_0_fgd_layout_infer/
table structure recognition model:  ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/
KIE (SER) model:                    ./inference/ser_vi_layoutxlm_xfund_infer/
KIE (RE) model:                     ./inference/re_vi_layoutxlm_xfund_infer/
```

**The model paths can be viewed and modified in `params.py`.** More models can be downloaded from the PaddleOCR model zoos, [PP-OCR](../../doc/doc_ch/models_list.md) and [PP-Structure](../../ppstructure/docs/models_list.md); you can also replace them with models you have trained and converted yourself.

@@ -92,6 +98,12 @@ hub install deploy/hubserving/structure_system/

# Or, install the layout analysis service module:
hub install deploy/hubserving/structure_layout/

# Or, install the key information extraction (SER) service module:
hub install deploy/hubserving/kie_ser/

# Or, install the key information extraction (SER+RE) service module:
hub install deploy/hubserving/kie_ser_re/
```

* On Windows (where the folder separator is `\`), the installation commands are as follows:

@@ -116,6 +128,12 @@ hub install deploy\hubserving\structure_system\

# Or, install the layout analysis service module:
hub install deploy\hubserving\structure_layout\

# Or, install the key information extraction (SER) service module:
hub install deploy\hubserving\kie_ser\

# Or, install the key information extraction (SER+RE) service module:
hub install deploy\hubserving\kie_ser_re\
```

### 2.4 Start the service

@@ -194,6 +212,8 @@ hub serving start -c deploy/hubserving/ocr_system/config.json

`http://127.0.0.1:8869/predict/structure_table`
`http://127.0.0.1:8870/predict/structure_system`
`http://127.0.0.1:8870/predict/structure_layout`
`http://127.0.0.1:8871/predict/kie_ser`
`http://127.0.0.1:8872/predict/kie_ser_re`
- **image_dir**: the test image path; either a single image path or an image directory path
- **visualize**: whether to visualize the results; default is False
- **output**: the folder for saving visualized results; default is `./hubserving_result`
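For a quick smoke test of the new KIE endpoint, the bundled client script can be pointed at it; a sketch, assuming the kie_ser service is running on its default port 8871:

```shell
python tools/test_hubserving.py --server_url=http://127.0.0.1:8871/predict/kie_ser --image_dir=./doc/imgs/
```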
@@ -216,15 +236,18 @@ hub serving start -c deploy/hubserving/ocr_system/config.json

Different modules return different fields; for example, the results returned by the text recognition service module do not contain the `text_region` field. The details are as follows:

| field name / module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout | kie_ser | kie_re |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| angle | | ✔ | | ✔ | | | | | |
| text | | | ✔ | ✔ | | ✔ | | ✔ | ✔ |
| confidence | | ✔ | ✔ | | | ✔ | | ✔ | ✔ |
| text_region | ✔ | | | ✔ | | ✔ | | ✔ | ✔ |
| html | | | | | ✔ | ✔ | | | |
| regions | | | | | ✔ | ✔ | | | |
| layout | | | | | | | ✔ | | |
| ser_res | | | | | | | | ✔ | |
| re_res | | | | | | | | | ✔ |

**Note:** To add, delete, or modify the returned fields, modify the `module.py` file of the corresponding module; for the complete process, refer to the next section on customizing a service module.
......
@@ -30,6 +30,8 @@ deploy/hubserving/
  └─  structure_layout   layout analysis service package
  └─  structure_table    table recognition service package
  └─  structure_system   PP-Structure service package
  └─  kie_ser            KIE (SER) service package
  └─  kie_ser_re         KIE (SER+RE) service package
```

Each service package contains 3 files. Taking the 2-stage cascade service package as an example, the directory is as follows:

@@ -42,9 +44,10 @@ deploy/hubserving/ocr_system/
```

## 1. Update

* 2022.10.09 add KIE services.
* 2022.08.23 add layout analysis services.
* 2022.05.05 add PP-OCRv3 text detection and recognition services.
* 2022.03.30 add PP-Structure and table recognition services.

## 2. Quick start service

@@ -65,6 +68,8 @@ text recognition model: ./inference/ch_PP-OCRv3_rec_infer/

text angle classifier: ./inference/ch_ppocr_mobile_v2.0_cls_infer/
layout analysis model: ./inference/picodet_lcnet_x1_0_fgd_layout_infer/
table recognition model: ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/
KIE (SER) model: ./inference/ser_vi_layoutxlm_xfund_infer/
KIE (SER+RE) model: ./inference/re_vi_layoutxlm_xfund_infer/
```

**The model paths can be found and modified in `params.py`.** More models provided by PaddleOCR can be obtained from the [model library](../../doc/doc_en/models_list_en.md). You can also use models trained by yourself.

@@ -92,8 +97,11 @@ hub install deploy/hubserving/structure_table/

# Or install the PP-Structure service module
hub install deploy/hubserving/structure_system/

# Or install the KIE (SER) service module
hub install deploy/hubserving/kie_ser/

# Or install the KIE (SER+RE) service module
hub install deploy/hubserving/kie_ser_re/
```

* On Windows, the installation commands are as follows.

@@ -118,6 +126,12 @@ hub install deploy\hubserving\structure_system\

# Or install the layout analysis service module
hub install deploy\hubserving\structure_layout\

# Or install the KIE (SER) service module
hub install deploy\hubserving\kie_ser\

# Or install the KIE (SER+RE) service module
hub install deploy\hubserving\kie_ser_re\
```

### 2.4 Start the service

@@ -201,6 +215,8 @@ For example, if using the configuration file to start the text angle classificat

`http://127.0.0.1:8869/predict/structure_table`
`http://127.0.0.1:8870/predict/structure_system`
`http://127.0.0.1:8870/predict/structure_layout`
`http://127.0.0.1:8871/predict/kie_ser`
`http://127.0.0.1:8872/predict/kie_ser_re`
- **image_dir**: the test image path; either a single image path or an image directory path
- **visualize**: whether to visualize the results; the default value is False
- **output**: the folder for saving visualized results; the default value is `./hubserving_result`
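As in the Chinese documentation, a running KIE service can be smoke-tested with the bundled client script; a sketch, assuming the kie_ser_re service was started from its default config on port 8872:

```shell
python tools/test_hubserving.py --server_url=http://127.0.0.1:8872/predict/kie_ser_re --image_dir=./doc/imgs/
```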
@@ -225,15 +241,17 @@ The returned result is a list. Each item in the list is a dict. The dict may con

The fields returned by different modules are different. For example, the results returned by the text recognition service module do not contain `text_region`. The details are as follows:

| field name / module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout | kie_ser | kie_re |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| angle | | ✔ | | ✔ | | | | | |
| text | | | ✔ | ✔ | | ✔ | | ✔ | ✔ |
| confidence | | ✔ | ✔ | | | ✔ | | ✔ | ✔ |
| text_region | ✔ | | | ✔ | | ✔ | | ✔ | ✔ |
| html | | | | | ✔ | ✔ | | | |
| regions | | | | | ✔ | ✔ | | | |
| layout | | | | | | | ✔ | | |
| ser_res | | | | | | | | ✔ | |
| re_res | | | | | | | | | ✔ |

**Note:** If you need to add, delete or modify the returned fields, you can modify the file `module.py` of the corresponding module. For the complete process, refer to the user-defined modification of the service module in the next section.
......
# Paddle2ONNX model conversion and prediction

This section introduces how to convert a PaddleOCR model into an ONNX model and run prediction with the ONNXRuntime engine.

## 1. Environment preparation

Prepare PaddleOCR, the Paddle2ONNX model conversion environment, and the ONNXRuntime prediction environment.

### PaddleOCR

Clone the PaddleOCR repository, use the release/2.6 branch, and install it.

```
git clone -b release/2.6 https://github.com/PaddlePaddle/PaddleOCR.git
cd PaddleOCR && python3.7 setup.py install
```

### Paddle2ONNX

Paddle2ONNX converts models from the PaddlePaddle format to the ONNX format. Export is currently stable for ONNX Opset 9-11, and some Paddle operators support conversion to lower ONNX Opsets.
For more details, please refer to [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_en.md).

- Install Paddle2ONNX
```
python3.7 -m pip install paddle2onnx
```

- Install ONNXRuntime
```
# version 1.9.0 is recommended; change the version number according to your environment
python3.7 -m pip install onnxruntime==1.9.0
```
## 2. Model conversion

- Download the Paddle models

There are two ways to obtain a Paddle static-graph model: download a prediction model provided by PaddleOCR from the [model_list](../../doc/doc_en/models_list_en.md), or follow the [model export instructions](../../doc/doc_en/inference_en.md#1-convert-training-model-to-inference-model) to convert trained weights into an inference model.

Take the PP-OCRv3 detection, recognition, and classification models as an example:

```
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar
cd ./inference && tar xf en_PP-OCRv3_det_infer.tar && cd ..

wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar
cd ./inference && tar xf en_PP-OCRv3_rec_infer.tar && cd ..

wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar
cd ./inference && tar xf ch_ppocr_mobile_v2.0_cls_infer.tar && cd ..
```

- Convert the models

Use Paddle2ONNX to convert the Paddle inference models to the ONNX format:

```
paddle2onnx --model_dir ./inference/en_PP-OCRv3_det_infer \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
--save_file ./inference/det_onnx/model.onnx \
@@ -65,7 +66,7 @@
--input_shape_dict="{'x':[-1,3,-1,-1]}" \
--enable_onnx_checker True

paddle2onnx --model_dir ./inference/en_PP-OCRv3_rec_infer \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
--save_file ./inference/rec_onnx/model.onnx \
@@ -81,136 +82,89 @@ paddle2onnx --model_dir ./inference/ch_ppocr_mobile_v2.0_cls_infer \
--input_shape_dict="{'x':[-1,3,-1,-1]}" \
--enable_onnx_checker True
```
After execution, the ONNX models are saved under `./inference/det_onnx/`, `./inference/rec_onnx/`, and `./inference/cls_onnx/` respectively.

* Note: For OCR models, the conversion must use dynamic shapes, i.e. add the option --input_shape_dict="{'x': [-1, 3, -1, -1]}"; otherwise the prediction results may differ slightly from those obtained by predicting directly with Paddle.

In addition, the following models do not currently support conversion to ONNX: NRTR, SAR, RARE, SRN.
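After conversion, the exported file can be sanity-checked independently of PaddleOCR; a minimal sketch using the onnxruntime Python API (the path assumes the detection model exported above):

```python
import onnxruntime as ort

# load the exported detection model and inspect its input signature
sess = ort.InferenceSession("./inference/det_onnx/model.onnx")
for inp in sess.get_inputs():
    # dynamic dims show up as None or symbolic names, e.g. [None, 3, None, None]
    print(inp.name, inp.shape, inp.type)
```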
## 3. Prediction

Taking the English OCR model as an example, use **ONNXRuntime** to predict by running the following command:

```
python3.7 tools/infer/predict_system.py --use_gpu=False --use_onnx=True \
--det_model_dir=./inference/det_onnx/model.onnx \
--rec_model_dir=./inference/rec_onnx/model.onnx \
--cls_model_dir=./inference/cls_onnx/model.onnx \
--image_dir=doc/imgs_en/img_12.jpg \
--rec_char_dict_path=ppocr/utils/en_dict.txt
```

Taking the English OCR model as an example, use **Paddle Inference** to predict by running the following command:

```
python3.7 tools/infer/predict_system.py --use_gpu=False \
--cls_model_dir=./inference/ch_ppocr_mobile_v2.0_cls_infer \
--rec_model_dir=./inference/en_PP-OCRv3_rec_infer \
--det_model_dir=./inference/en_PP-OCRv3_det_infer \
--image_dir=doc/imgs_en/img_12.jpg \
--rec_char_dict_path=ppocr/utils/en_dict.txt
```

After the command executes, the predicted recognition results are printed in the terminal, and the visualized results are saved under `./inference_results/`.

ONNXRuntime result:

<div align="center">
<img src="../../doc/imgs_results/multi_lang/img_12.jpg" width="800">
</div>

Paddle Inference result:

<div align="center">
<img src="../../doc/imgs_results/multi_lang/img_12.jpg" width="800">
</div>
Using ONNXRuntime to predict, the terminal output is:

```
[2022/10/10 12:06:28] ppocr DEBUG: dt_boxes num : 11, elapse : 0.3568880558013916
[2022/10/10 12:06:31] ppocr DEBUG: rec_res num  : 11, elapse : 2.6445000171661377
[2022/10/10 12:06:31] ppocr DEBUG: 0  Predict time of doc/imgs_en/img_12.jpg: 3.021s
[2022/10/10 12:06:31] ppocr DEBUG: ACKNOWLEDGEMENTS, 0.997
[2022/10/10 12:06:31] ppocr DEBUG: We would like to thank all the designers and, 0.976
[2022/10/10 12:06:31] ppocr DEBUG: contributors who have been involved in the, 0.979
[2022/10/10 12:06:31] ppocr DEBUG: production of this book; their contributions, 0.989
[2022/10/10 12:06:31] ppocr DEBUG: have been indispensable to its creation. We, 0.956
[2022/10/10 12:06:31] ppocr DEBUG: would also like to express our gratitude to all, 0.991
[2022/10/10 12:06:31] ppocr DEBUG: the producers for their invaluable opinions, 0.978
[2022/10/10 12:06:31] ppocr DEBUG: and assistance throughout this project. And to, 0.988
[2022/10/10 12:06:31] ppocr DEBUG: the many others whose names are not credited, 0.958
[2022/10/10 12:06:31] ppocr DEBUG: but have made specific input in this book, we, 0.970
[2022/10/10 12:06:31] ppocr DEBUG: thank you for your continuous support., 0.998
[2022/10/10 12:06:31] ppocr DEBUG: The visualized image saved in ./inference_results/img_12.jpg
[2022/10/10 12:06:31] ppocr INFO: The predict total time is 3.2482550144195557
```

Using Paddle Inference to predict, the terminal output is:

```
[2022/10/10 12:06:28] ppocr DEBUG: dt_boxes num : 11, elapse : 0.3568880558013916
[2022/10/10 12:06:31] ppocr DEBUG: rec_res num  : 11, elapse : 2.6445000171661377
[2022/10/10 12:06:31] ppocr DEBUG: 0  Predict time of doc/imgs_en/img_12.jpg: 3.021s
[2022/10/10 12:06:31] ppocr DEBUG: ACKNOWLEDGEMENTS, 0.997
[2022/10/10 12:06:31] ppocr DEBUG: We would like to thank all the designers and, 0.976
[2022/10/10 12:06:31] ppocr DEBUG: contributors who have been involved in the, 0.979
[2022/10/10 12:06:31] ppocr DEBUG: production of this book; their contributions, 0.989
[2022/10/10 12:06:31] ppocr DEBUG: have been indispensable to its creation. We, 0.956
[2022/10/10 12:06:31] ppocr DEBUG: would also like to express our gratitude to all, 0.991
[2022/10/10 12:06:31] ppocr DEBUG: the producers for their invaluable opinions, 0.978
[2022/10/10 12:06:31] ppocr DEBUG: and assistance throughout this project. And to, 0.988
[2022/10/10 12:06:31] ppocr DEBUG: the many others whose names are not credited, 0.958
[2022/10/10 12:06:31] ppocr DEBUG: but have made specific input in this book, we, 0.970
[2022/10/10 12:06:31] ppocr DEBUG: thank you for your continuous support., 0.998
[2022/10/10 12:06:31] ppocr DEBUG: The visualized image saved in ./inference_results/img_12.jpg
[2022/10/10 12:06:31] ppocr INFO: The predict total time is 3.2482550144195557
```
@@ -81,6 +81,7 @@

- [x] [VisionLAN](./algorithm_rec_visionlan.md)
- [x] [SPIN](./algorithm_rec_spin.md)
- [x] [RobustScanner](./algorithm_rec_robustscanner.md)
- [x] [RFL](./algorithm_rec_rfl.md)

Following the text recognition training and evaluation pipeline of [DTRB](https://arxiv.org/abs/1904.01906)[3], the models are trained on the MJSynth and SynthText datasets and evaluated on IIIT, SVT, IC03, IC13, IC15, SVTP, and CUTE. The results are as follows:

@@ -104,7 +105,7 @@

|VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [trained model](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar) |
|SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) |
|RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)|
|RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar) |

<a name="2"></a>
......
# Scene Text Recognition Algorithm: RFL

- [1. Introduction](#1)
- [2. Environment](#2)
- [3. Model Training, Evaluation, and Prediction](#3)
    - [3.1 Training](#3-1)
    - [3.2 Evaluation](#3-2)
    - [3.3 Prediction](#3-3)
- [4. Inference and Deployment](#4)
    - [4.1 Python Inference](#4-1)
    - [4.2 C++ Inference](#4-2)
    - [4.3 Serving Deployment](#4-3)
    - [4.4 More Deployment Options](#4-4)
- [5. FAQ](#5)

<a name="1"></a>
## 1. Introduction

Paper:
> [Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition](https://arxiv.org/abs/2105.06229.pdf)
> Hui Jiang, Yunlu Xu, Zhanzhan Cheng, Shiliang Pu, Yi Niu, Wenqi Ren, Fei Wu, and Wenming Tan
> ICDAR, 2021

<a name="model"></a>
`RFL` is trained on the MJSynth and SynthText text recognition datasets and evaluated on the IIIT, SVT, IC03, IC13, IC15, SVTP, and CUTE datasets. The reproduced results are as follows:

|Model|Backbone|Config|Acc|Download|
| --- | --- | --- | --- | --- |
|RFL-CNT|ResNetRFL|[rec_resnet_rfl_visual.yml](../../configs/rec/rec_resnet_rfl_visual.yml)|93.40%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar)|
|RFL-Att|ResNetRFL|[rec_resnet_rfl_att.yml](../../configs/rec/rec_resnet_rfl_att.yml)|88.63%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar)|

<a name="2"></a>
## 2. Environment

Please refer to [Environment Preparation](./environment.md) to configure the PaddleOCR running environment, and to [Project Clone](./clone.md) to clone the project code.

<a name="3"></a>
## 3. Model Training, Evaluation, and Prediction

<a name="3-1"></a>
### 3.1 Training

PaddleOCR is modularized; to train the `RFL` recognition model, you only need to **switch the configuration file** to the `RFL` [config file](../../configs/rec/rec_resnet_rfl_att.yml).

#### Start training

After data preparation is complete, start training with the following commands:

```shell
# Step 1: train the CNT branch
# single-GPU training (long training period, not recommended)
python3 tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml

# multi-GPU training, specify the card ids with the --gpus parameter
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml

# Step 2: jointly train the CNT and Att branches; note that pretrained_model must be set to a local path
# single-GPU training (long training period, not recommended)
python3 tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_visual/best_accuracy

# multi-GPU training, specify the card ids with the --gpus parameter
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_visual/best_accuracy
```

<a name="3-2"></a>
### 3.2 Evaluation

You can download the [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar) and evaluate it with the following command:

```shell
# Note: set pretrained_model to a local path.
python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_att/best_accuracy
```

<a name="3-3"></a>
### 3.3 Prediction

Predict a single image with the following command:

```shell
# Note: set pretrained_model to a local path.
python3 tools/infer_rec.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./output/rec/rec_resnet_rfl_att/best_accuracy

# To predict all images in a folder, set infer_img to a folder path, e.g. Global.infer_img='./doc/imgs_words_en/'.
```

<a name="4"></a>
## 4. Inference and Deployment

<a name="4-1"></a>
### 4.1 Python Inference

First, convert the trained best model into an inference model. Taking the trained model as an example ([download link](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar)), run:

```shell
# Note: set pretrained_model to a local path.
python3 tools/export_model.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_att/best_accuracy Global.save_inference_dir=./inference/rec_resnet_rfl_att/
```

**Note:**
- If you trained the model on your own dataset and adjusted the dictionary file, check that `character_dict_path` in the configuration file points to the dictionary you need.
- If you changed the input size during training, modify the `infer_shape` for RFL in `tools/export_model.py`.

After a successful conversion, the directory contains three files:
```
/inference/rec_resnet_rfl_att/
    ├── inference.pdiparams         # parameter file of the recognition inference model
    ├── inference.pdiparams.info    # parameter information of the recognition inference model, can be ignored
    └── inference.pdmodel           # program file of the recognition inference model
```

Run model inference with the following command:

```shell
python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_resnet_rfl_att/' --rec_algorithm='RFL' --rec_image_shape='1,32,100'

# To predict all images in a folder, set image_dir to a folder path, e.g. --image_dir='./doc/imgs_words_en/'.
```

![](../imgs_words_en/word_10.png)

After the command executes, the prediction result for the image above (the recognized text and its score) is printed to the screen:

```shell
Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999927282333374)
```

**Note:**
- The image resolution used to train the above model is [1,32,100]; set the `rec_image_shape` parameter to the recognition image shape used in your training.
- Set the `rec_char_dict_path` parameter to specify the dictionary at inference time; if you modified the dictionary, point this parameter to your dictionary file.
- If you modified the preprocessing, change the RFL preprocessing in `tools/infer/predict_rec.py` to match.

<a name="4-2"></a>
### 4.2 C++ Inference

Not supported yet, because the C++ pre- and post-processing do not yet support RFL.

<a name="4-3"></a>
### 4.3 Serving Deployment

Not supported yet.

<a name="4-4"></a>
### 4.4 More Deployment Options

Not supported yet.

<a name="5"></a>
## 5. FAQ

## Citation

```bibtex
@inproceedings{2021Reciprocal,
    title     = {Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition},
    author    = {Jiang, H. and Xu, Y. and Cheng, Z. and Pu, S. and Niu, Y. and Ren, W. and Wu, F. and Tan, W.},
    booktitle = {ICDAR},
    year      = {2021},
    url       = {https://arxiv.org/abs/2105.06229}
}
```
@@ -5,6 +5,7 @@

- [Chinese street view text recognition](#中文街景文字识别)
- [Chinese document text recognition](#中文文档文字识别)
- [ICDAR2019-ArT](#ICDAR2019-ArT)
- [Electronic seal dataset](#电子印章数据集)

Besides these open-source datasets, you can also synthesize data yourself; see the [data synthesis tools](../data_synthesis.md).

@@ -59,6 +60,12 @@ https://aistudio.baidu.com/aistudio/datasetdetail/8429

![](../../datasets/ArT.jpg)
- **Download**: https://ai.baidu.com/broad/download?dataset=art

<a name="电子印章数据集"></a>
#### 6. Electronic seal dataset

- **Source**: https://aistudio.baidu.com/aistudio/datasetdetail/154271/0
- **Description**: 10,000 images in total: 8,000 for training and 2,000 for testing. The dataset is synthesized programmatically and involves no privacy concerns; it is mainly intended for training and detecting curved seal text. Contributed by [jingsongliujing](https://github.com/jingsongliujing).
- **Download**: https://aistudio.baidu.com/aistudio/datasetdetail/154271/0

## References

**ICDAR 2019-LSVT Challenge**
```
......
@@ -41,16 +41,30 @@ python3 -m paddle.distributed.launch \

## Performance benchmarks

* Training on 2 machines with 8 P40 GPUs each, the accuracy, training time, and multi-machine speedup of different models are as follows.

| Model | Config | Dataset | 1×8 GPUs: time / accuracy | 2×8 GPUs: time / accuracy | Speedup |
|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k-sample Chinese dataset (LSVT, RCTW, MTWI) | 2.50d / 66.7% | 1.67d / 67.0% | **1.5** |

* Training on 3 machines with 8 V100 GPUs each, the accuracy, training time, and multi-machine speedup of different models are as follows.

| Model | Config | Dataset | 1×8 GPUs: time / accuracy | 3×8 GPUs: time / accuracy | Speedup |
|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h / 76.2% | 19.75h / 74.77% | **2.52** |

> Note: for the 3-machine, 8-GPU runs, the per-GPU batch size is kept the same as in the single-machine, 8-GPU runs, and the learning rate is multiplied by 2 (with the default multiplier of 3, the accuracy is only 73.42%).

* Training on 4 machines with 8 V100 GPUs each, the accuracy, training time, and multi-machine speedup of different models are as follows.

| Model | Config | Dataset | 1×8 GPUs: time / accuracy | 4×8 GPUs: time / accuracy | Speedup |
|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d / - | 2.84d / 74.0% | **3.5** |

* **Note**
    * When training with a large number of GPUs, the accuracy drops slightly (about 1%); adding warmup or appropriately increasing the number of epochs can compensate for the loss.
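For reference, the multi-machine runs above are launched with `paddle.distributed.launch`, passing the machine list via `--ips`; a minimal sketch (the IP addresses and config path are placeholders, and the same command must be run on every machine) looks like:

```shell
python3 -m paddle.distributed.launch \
    --ips="192.168.0.1,192.168.0.2" \
    --gpus="0,1,2,3,4,5,6,7" \
    tools/train.py \
    -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml
```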
@@ -144,7 +144,7 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982]

**Note** The `PP-OCRv3` recognition model uses an input shape of `3,48,320`; if you use another recognition model, set the `--rec_image_shape` parameter according to that model. In addition, the `PP-OCRv3` recognition model uses `SVTR_LCNet` as the default `rec_algorithm`; note the difference from the original `SVTR`.

Taking ultra-lightweight Chinese OCR model inference as an example, when running prediction, specify the path of a single image or an image directory (PDF files are also supported) with the `image_dir` parameter, and specify the detection, angle classification, and recognition inference model paths with the `det_model_dir`, `cls_model_dir`, and `rec_model_dir` parameters respectively. The `use_angle_cls` parameter controls whether the angle classification model is enabled, `use_mp` specifies whether to use multiple processes, and `total_process_num` sets the number of processes. The visualized recognition results are saved to the ./inference_results folder by default.

```shell
# use the angle classifier
@@ -153,8 +153,11 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false
# use multiple processes
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
# for a PDF file, use the `page_num` parameter to control how many leading pages are inferred; the default 0 means all pages are inferred
python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2
```

After the command executes, the recognition result image is as follows:

![](../imgs_results/system_res_00018069_v3.jpg)
......
@@ -75,6 +75,11 @@ cd /path/to/ppocr_img

......

In addition, paddleocr also supports PDF input; use the `page_num` parameter to control how many leading pages are inferred. The default is 0, meaning all pages are inferred.

```bash
paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
```

- Detection only: set `--rec` to `false`

@@ -165,12 +170,14 @@ from paddleocr import PaddleOCR, draw_ocr

```python
ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
img_path = './imgs/11.jpg'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# visualize the results
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
```
@@ -196,6 +203,50 @@ im_show.save('result.jpg')

<a name="3"></a>

If the input is a PDF file, you can refer to the following code for visualization:

```python
from paddleocr import PaddleOCR, draw_ocr

# Switch among the languages supported by PaddleOCR by changing the lang parameter,
# e.g. `ch`, `en`, `fr`, `german`, `korean`, `japan`
ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2)  # need to run only once to download and load model into memory
img_path = './xxx.pdf'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# visualize the results
import fitz
from PIL import Image
import cv2
import numpy as np

imgs = []
with fitz.open(img_path) as pdf:
    for pg in range(0, pdf.pageCount):
        page = pdf[pg]
        mat = fitz.Matrix(2, 2)
        pm = page.getPixmap(matrix=mat, alpha=False)

        # if width or height > 2000 pixels, don't enlarge the image
        if pm.width > 2000 or pm.height > 2000:
            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)

        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)

for idx in range(len(result)):
    res = result[idx]
    image = imgs[idx]
    boxes = [line[0] for line in res]
    txts = [line[1][0] for line in res]
    scores = [line[1][1] for line in res]
    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('result_page_{}.jpg'.format(idx))
```
## 3. Summary

Through this section, you should now be comfortable using the PaddleOCR whl package and have obtained initial results.
......
...@@ -33,12 +33,14 @@ from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# show result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
...@@ -71,12 +73,14 @@ from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR()  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg'
result = ocr.ocr(img_path, cls=False)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# show result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
...@@ -109,8 +113,10 @@ from paddleocr import PaddleOCR
ocr = PaddleOCR(use_angle_cls=True)  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
result = ocr.ocr(img_path, det=False, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)
```

The result is a list; each item contains only the recognized text and its confidence.
...@@ -127,12 +133,14 @@ from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR()  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg'
result = ocr.ocr(img_path, rec=False)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# show result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf')
im_show = Image.fromarray(im_show)
...@@ -163,8 +171,10 @@ from paddleocr import PaddleOCR
ocr = PaddleOCR()  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
result = ocr.ocr(img_path, det=False)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)
```

The result is a list; each item contains only the recognized text and its confidence.
...@@ -181,8 +191,10 @@ from paddleocr import PaddleOCR
ocr = PaddleOCR(use_angle_cls=True)  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
result = ocr.ocr(img_path, det=False, rec=False, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)
```

The result is a list; each item contains only the classification result and its confidence.

...@@ -212,6 +224,11 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true
......
```

In addition, paddleocr also supports PDF input; the `page_num` parameter controls how many leading pages are inferred. The default is 0, which means all pages are inferred.
```bash
paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
```
* Detection + recognition

```bash
...@@ -290,12 +307,14 @@ ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_m
                use_angle_cls=True)
img_path = 'PaddleOCR/doc/imgs/11.jpg'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# show result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
...@@ -325,12 +344,14 @@ from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar
ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# show result
from PIL import Image
result = result[0]
download_with_progressbar(img_path, 'tmp.jpg')
image = Image.open('tmp.jpg').convert('RGB')
boxes = [line[0] for line in result]
...@@ -362,12 +383,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg'
img = cv2.imread(img_path)
# img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # uncomment this line if your own trained model supports grayscale images
result = ocr.ocr(img, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# show result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
...@@ -376,14 +399,65 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc
im_show = Image.fromarray(im_show)
im_show.save('result.jpg')
```
## 5 PDF file as input

- Command line mode

The `page_num` parameter controls how many leading pages are inferred; the default is 0, which means all pages are inferred.
```bash
paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
```
- Use by code
```python
from paddleocr import PaddleOCR, draw_ocr

# Switch among the languages supported by PaddleOCR by changing the lang parameter,
# e.g. `ch`, `en`, `fr`, `german`, `korean`, `japan`
ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2)  # need to run only once to download and load model into memory
img_path = './xxx.pdf'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# show result
import fitz
from PIL import Image
import cv2
import numpy as np
imgs = []
with fitz.open(img_path) as pdf:
    for pg in range(0, pdf.pageCount):
        page = pdf[pg]
        mat = fitz.Matrix(2, 2)
        pm = page.getPixmap(matrix=mat, alpha=False)
        # if width or height > 2000 pixels, don't enlarge the image
        if pm.width > 2000 or pm.height > 2000:
            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)
for idx in range(len(result)):
    res = result[idx]
    image = imgs[idx]
    boxes = [line[0] for line in res]
    txts = [line[1][0] for line in res]
    scores = [line[1][1] for line in res]
    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('result_page_{}.jpg'.format(idx))
```
## 6 Parameter Description

| Parameter | Description | Default value |
|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|
| use_gpu | whether to use GPU | TRUE |
| gpu_mem | GPU memory size used for initialization | 8000M |
| image_dir | image path or folder path for prediction when called from the command line | |
| page_num | valid when the input is a PDF file; only the first page_num pages are predicted, all pages by default | 0 |
| det_algorithm | type of the detection algorithm used | DB |
| det_model_dir | folder of the text detection inference model. Two ways to pass it: 1. None: automatically download the built-in model to `~/.paddleocr/det`; 2. the path of an inference model you converted yourself, which must contain the model and params files | None |
| det_max_side_len | maximum size of the long side of the image for detection; when the long side exceeds this value it is resized to it and the short side is scaled proportionally | 960 |
......
...@@ -78,6 +78,7 @@ Supported text recognition algorithms (Click the link to get the tutorial):
- [x] [VisionLAN](./algorithm_rec_visionlan_en.md)
- [x] [SPIN](./algorithm_rec_spin_en.md)
- [x] [RobustScanner](./algorithm_rec_robustscanner_en.md)
- [x] [RFL](./algorithm_rec_rfl_en.md)

Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow:

...@@ -101,7 +102,7 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r
|VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [trained model](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar) |
|SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) |
|RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)|
|RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar) |

<a name="2"></a>
......
# RFL
- [1. Introduction](#1)
- [2. Environment](#2)
- [3. Model Training / Evaluation / Prediction](#3)
- [3.1 Training](#3-1)
- [3.2 Evaluation](#3-2)
- [3.3 Prediction](#3-3)
- [4. Inference and Deployment](#4)
- [4.1 Python Inference](#4-1)
- [4.2 C++ Inference](#4-2)
- [4.3 Serving](#4-3)
- [4.4 More](#4-4)
- [5. FAQ](#5)
<a name="1"></a>
## 1. Introduction
Paper:
> [Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition](https://arxiv.org/abs/2105.06229.pdf)
> Hui Jiang, Yunlu Xu, Zhanzhan Cheng, Shiliang Pu, Yi Niu, Wenqi Ren, Fei Wu, and Wenming Tan
> ICDAR, 2021
Trained on the MJSynth and SynthText text recognition datasets and evaluated on the IIIT, SVT, IC03, IC13, IC15, SVTP, and CUTE datasets, the algorithm reproduction effect is as follows:
|Model|Backbone|config|Acc|Download link|
| --- | --- | --- | --- | --- |
|RFL-CNT|ResNetRFL|[rec_resnet_rfl_visual.yml](../../configs/rec/rec_resnet_rfl_visual.yml)|93.40%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar)|
|RFL-Att|ResNetRFL|[rec_resnet_rfl_att.yml](../../configs/rec/rec_resnet_rfl_att.yml)|88.63%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar)|
<a name="2"></a>
## 2. Environment
Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
<a name="3"></a>
## 3. Model Training / Evaluation / Prediction
PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training:
Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
```
# step 1: train the CNT branch
# Single GPU training (long training period, not recommended)
python3 tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml

# Multi GPU training, specify the gpu number through the --gpus parameter
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml

# step 2: joint training of the CNT and Att branches
# Single GPU training (long training period, not recommended)
python3 tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy

# Multi GPU training, specify the gpu number through the --gpus parameter
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
```
Evaluation:
```
# GPU evaluation
python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
```
Prediction:
```
# The configuration file used for prediction must match the training
python3 tools/infer_rec.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model={path/to/weights}/best_accuracy
```
<a name="4"></a>
## 4. Inference and Deployment
<a name="4-1"></a>
### 4.1 Python Inference
First, the model saved during the RFL text recognition training process is converted into an inference model ([model download link](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar)). You can use the following command to convert:
```
python3 tools/export_model.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_resnet_rfl_att
```
**Note:**
- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file.
- If you modified the input size during training, please modify the `infer_shape` corresponding to RFL in the `tools/export_model.py` file.
After the conversion is successful, there are three files in the directory:
```
/inference/rec_resnet_rfl_att/
├── inference.pdiparams
├── inference.pdiparams.info
└── inference.pdmodel
```
For RFL text recognition model inference, the following commands can be executed:
```
python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_resnet_rfl_att/' --rec_algorithm='RFL' --rec_image_shape='1,32,100'
```
![](../imgs_words_en/word_10.png)
After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen. An example is as follows:
```shell
Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999927282333374)
```
<a name="4-2"></a>
### 4.2 C++ Inference
Not supported
<a name="4-3"></a>
### 4.3 Serving
Not supported
<a name="4-4"></a>
### 4.4 More
Not supported
<a name="5"></a>
## 5. FAQ
## Citation
```bibtex
@inproceedings{2021Reciprocal,
  title     = {Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition},
  author    = {Jiang, H. and Xu, Y. and Cheng, Z. and Pu, S. and Niu, Y. and Ren, W. and Wu, F. and Tan, W.},
  booktitle = {ICDAR},
  year      = {2021},
  url       = {https://arxiv.org/abs/2105.06229}
}
```
...@@ -40,17 +40,29 @@ python3 -m paddle.distributed.launch \

## Performance comparison

* We conducted model training on 2x8 P40 GPUs. Accuracy, training time, and multi-machine acceleration ratio of different models are shown below.

| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 2x8 GPU training time / Accuracy | Acceleration ratio |
|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k Chinese dataset | 2.50d/66.7% | 1.67d/67.0% | **1.5** |

* We conducted model training on 3x8 V100 GPUs. Accuracy, training time, and multi-machine acceleration ratio of different models are shown below.

| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio |
|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** |

> Note: when training with 3x8 GPUs, the single-card batch size is unchanged compared with the 1x8 training process, and the learning rate is multiplied by 2 (if it is multiplied by 3 by default, the accuracy is only 73.42%); see the sketch after the tables.

* We conducted model training on 4x8 V100 GPUs. Accuracy, training time, and multi-machine acceleration ratio of different models are shown below.

| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 4x8 GPU training time / Accuracy | Acceleration ratio |
|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** |
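
The adjustment described in the note can be written out explicitly. The sketch below is ours for illustration only; the helper name and values are not part of PaddleOCR:

```python
# Multi-machine LR adjustment per the note above: keep the per-card batch
# size unchanged and scale the base learning rate by an empirical factor.
# For SLANet on 3x8 GPUs, a factor of 2 gave 74.77% accuracy, while the
# linear-scaling factor of 3 dropped it to 73.42%.
def multi_machine_lr(base_lr: float, factor: float = 2.0) -> float:
    return base_lr * factor

print(multi_machine_lr(0.001))  # 0.001 on one machine becomes 0.002 on three
```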
...@@ -144,16 +144,17 @@ After executing the command, the prediction results (classification angle and sc

**Note**: The input shape used by the recognition model of `PP-OCRv3` is `3, 48, 320`. If you use other recognition models, you need to set the parameter `--rec_image_shape` according to the model. In addition, the `rec_algorithm` used by the recognition model of `PP-OCRv3` is `SVTR_LCNet` by default. Note the difference from the original `SVTR`.

When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir` (PDF files are also supported). The parameter `det_model_dir` specifies the path of the detection inference model, `cls_model_dir` the path of the angle classification inference model, and `rec_model_dir` the path of the recognition inference model. The parameter `use_angle_cls` controls whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process inference, and `total_process_num` specifies the number of processes when multi-process is used. The visualized recognition results are saved to the `./inference_results` folder by default.

```shell
# use direction classifier
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true
# do not use direction classifier
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false
# use multi-process
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
# use PDF files; you can infer the first few pages with the `page_num` parameter, the default 0 means infer all pages
python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2
```
......
...@@ -86,6 +86,12 @@ If you do not use the provided test image, you can replace the following `--imag
......
```

PDF files are also supported; you can infer the first few pages with the `page_num` parameter. The default is 0, which means infer all pages.
```bash
paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
```
* Only detection: set `--rec` to `false`

```bash
...@@ -176,12 +182,15 @@ from paddleocr import PaddleOCR,draw_ocr
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # need to run only once to download and load model into memory
img_path = './imgs_en/img_12.jpg'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# draw result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
...@@ -206,6 +215,50 @@ Visualization of results
<img src="../imgs_results/whl/12_det_rec.jpg" width="800">
</div>

If the input is a PDF file, you can refer to the following code for visualization:
```python
from paddleocr import PaddleOCR, draw_ocr

# Paddleocr supports Chinese, English, French, German, Korean and Japanese.
# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan`
# to switch the language model in order.
ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2)  # need to run only once to download and load model into memory
img_path = './xxx.pdf'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# draw result
import fitz
from PIL import Image
import cv2
import numpy as np
imgs = []
with fitz.open(img_path) as pdf:
    for pg in range(0, pdf.pageCount):
        page = pdf[pg]
        mat = fitz.Matrix(2, 2)
        pm = page.getPixmap(matrix=mat, alpha=False)
        # if width or height > 2000 pixels, don't enlarge the image
        if pm.width > 2000 or pm.height > 2000:
            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)
for idx in range(len(result)):
    res = result[idx]
    image = imgs[idx]
    boxes = [line[0] for line in res]
    txts = [line[1][0] for line in res]
    scores = [line[1][1] for line in res]
    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('result_page_{}.jpg'.format(idx))
```
<a name="3"></a>
......
...@@ -25,12 +25,14 @@ from paddleocr import PaddleOCR,draw_ocr
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# draw result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
...@@ -60,11 +62,14 @@ from paddleocr import PaddleOCR,draw_ocr
ocr = PaddleOCR(lang='en')  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
result = ocr.ocr(img_path, cls=False)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# draw result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
...@@ -94,8 +99,10 @@ from paddleocr import PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # need to run only once to load model into memory
img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
result = ocr.ocr(img_path, det=False, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)
```

Output will be a list; each item contains the recognized text and its confidence.
...@@ -109,12 +116,14 @@ from paddleocr import PaddleOCR,draw_ocr
ocr = PaddleOCR()  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
result = ocr.ocr(img_path, rec=False)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# draw result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf')
im_show = Image.fromarray(im_show)
...@@ -141,8 +150,10 @@ from paddleocr import PaddleOCR
ocr = PaddleOCR(lang='en')  # need to run only once to load model into memory
img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
result = ocr.ocr(img_path, det=False, cls=False)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)
```

Output will be a list; each item contains the recognized text and its confidence.
...@@ -156,8 +167,10 @@ from paddleocr import PaddleOCR
ocr = PaddleOCR(use_angle_cls=True)  # need to run only once to load model into memory
img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
result = ocr.ocr(img_path, det=False, rec=False, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)
```

Output will be a list; each item contains the classification result and its confidence.

...@@ -185,6 +198,11 @@ Output will be a list, each item contains bounding box, text and recognition con
......
```

PDF files are also supported; you can infer the first few pages with the `page_num` parameter. The default is 0, which means infer all pages.
```bash
paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
```
* detection and recognition

```bash
paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en
...@@ -253,11 +271,14 @@ from paddleocr import PaddleOCR,draw_ocr
ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True)
img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# draw result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
...@@ -283,11 +304,14 @@ from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# show result
from PIL import Image
result = result[0]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
...@@ -312,12 +336,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg'
img = cv2.imread(img_path)
# img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # uncomment this line if your own training model supports grayscale images
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# show result
from PIL import Image
result = result[0]
download_with_progressbar(img_path, 'tmp.jpg')
image = Image.open('tmp.jpg').convert('RGB')
boxes = [line[0] for line in result]
...@@ -327,15 +353,66 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc
im_show = Image.fromarray(im_show)
im_show.save('result.jpg')
```
## 5 PDF file

- Use by command line

You can infer the first few pages with the `page_num` parameter; the default is 0, which means infer all pages.
```bash
paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
```
- Use by code
```python
from paddleocr import PaddleOCR, draw_ocr

# Paddleocr supports Chinese, English, French, German, Korean and Japanese.
# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan`
# to switch the language model in order.
ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2)  # need to run only once to download and load model into memory
img_path = './xxx.pdf'
result = ocr.ocr(img_path, cls=True)
for idx in range(len(result)):
    res = result[idx]
    for line in res:
        print(line)

# draw result
import fitz
from PIL import Image
import cv2
import numpy as np
imgs = []
with fitz.open(img_path) as pdf:
    for pg in range(0, pdf.pageCount):
        page = pdf[pg]
        mat = fitz.Matrix(2, 2)
        pm = page.getPixmap(matrix=mat, alpha=False)
        # if width or height > 2000 pixels, don't enlarge the image
        if pm.width > 2000 or pm.height > 2000:
            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)
for idx in range(len(result)):
    res = result[idx]
    image = imgs[idx]
    boxes = [line[0] for line in res]
    txts = [line[1][0] for line in res]
    scores = [line[1][1] for line in res]
    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('result_page_{}.jpg'.format(idx))
```
## 6 Parameter Description

| Parameter | Description | Default value |
|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|
| use_gpu | use GPU or not | TRUE |
| gpu_mem | GPU memory size used for initialization | 8000M |
| image_dir | the image path or folder path for predicting when used by the command line | |
| page_num | valid when the input type is a pdf file; predict only the first page_num pages, all pages are predicted by default | 0 |
| det_algorithm | type of detection algorithm selected | DB |
| det_model_dir | the text detection inference model folder. There are two ways to transfer parameters: 1. None: automatically download the built-in model to `~/.paddleocr/det`; 2. the path of the inference model converted by yourself; the model and params files must be included in the model path | None |
| det_max_side_len | the maximum size of the long side of the image. When the long side exceeds this value, the long side will be resized to this size, and the short side will be scaled proportionally | 960 |
......
...@@ -47,7 +47,7 @@ __all__ = [
]

SUPPORT_DET_MODEL = ['DB']
VERSION = '2.6.0.2'
SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet']
BASE_DIR = os.path.expanduser("~/.paddleocr/")
...@@ -428,8 +428,8 @@ def check_img(img):
        download_with_progressbar(img, 'tmp.jpg')
        img = 'tmp.jpg'
    image_file = img
    img, flag_gif, flag_pdf = check_and_read(image_file)
    if not flag_gif and not flag_pdf:
        with open(image_file, 'rb') as f:
            img = img_decode(f.read())
        if img is None:
...@@ -500,6 +500,7 @@ class PaddleOCR(predict_system.TextSystem):
        logger.debug(params)
        # init det_model and rec_model
        super().__init__(params)
        self.page_num = params.page_num

    def ocr(self, img, det=True, rec=True, cls=True):
        """
...@@ -520,24 +521,43 @@ class PaddleOCR(predict_system.TextSystem):
        )
        img = check_img(img)
        # for infer pdf file
        if isinstance(img, list):
            if self.page_num > len(img) or self.page_num == 0:
                self.page_num = len(img)
            imgs = img[:self.page_num]
        else:
            imgs = [img]
        if det and rec:
            ocr_res = []
            for idx, img in enumerate(imgs):
                dt_boxes, rec_res, _ = self.__call__(img, cls)
                tmp_res = [[box.tolist(), res]
                           for box, res in zip(dt_boxes, rec_res)]
                ocr_res.append(tmp_res)
            return ocr_res
        elif det and not rec:
            ocr_res = []
            for idx, img in enumerate(imgs):
                dt_boxes, elapse = self.text_detector(img)
                tmp_res = [box.tolist() for box in dt_boxes]
                ocr_res.append(tmp_res)
            return ocr_res
        else:
            ocr_res = []
            cls_res = []
            for idx, img in enumerate(imgs):
                if not isinstance(img, list):
                    img = [img]
                if self.use_angle_cls and cls:
                    img, cls_res_tmp, elapse = self.text_classifier(img)
                    if not rec:
                        cls_res.append(cls_res_tmp)
                rec_res, elapse = self.text_recognizer(img)
                ocr_res.append(rec_res)
            if not rec:
                return cls_res
            return ocr_res
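    # Usage sketch (illustration only, not part of the library source; the
    # PDF path is hypothetical): after the refactor above, ocr() returns one
    # result list per page/image, so callers iterate page by page:
    #
    #   engine = PaddleOCR(use_angle_cls=True, page_num=2)
    #   pages = engine.ocr('./xxx.pdf', cls=True)  # len(pages) <= 2
    #   for page in pages:
    #       for line in page:
    #           print(line)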
class PPStructure(StructureSystem):
...@@ -547,6 +567,7 @@ class PPStructure(StructureSystem):
        assert params.structure_version in SUPPORT_STRUCTURE_MODEL_VERSION, "structure_version must in {}, but get {}".format(
            SUPPORT_STRUCTURE_MODEL_VERSION, params.structure_version)
        params.use_gpu = check_gpu(params.use_gpu)
        params.mode = 'structure'

        if not params.show_log:
            logger.setLevel(logging.INFO)
...@@ -633,8 +654,10 @@ def main():
            rec=args.rec,
            cls=args.use_angle_cls)
        if result is not None:
            for idx in range(len(result)):
                res = result[idx]
                for line in res:
                    logger.info(line)
    elif args.type == 'structure':
        img, flag_gif, flag_pdf = check_and_read(img_path)
        if not flag_gif and not flag_pdf:
...@@ -682,7 +705,7 @@ def main():
                    "error in layout recovery image:{}, err msg: {}".format(
                        img_name, ex))
                continue

            for item in all_res:
                item.pop('img')
                item.pop('res')
......
...@@ -26,7 +26,8 @@ from .make_pse_gt import MakePseGt
from .rec_img_aug import BaseDataAugmentation, RecAug, RecConAug, RecResizeImg, ClsResizeImg, \
    SRNRecResizeImg, GrayRecResizeImg, SARRecResizeImg, PRENResizeImg, \
    ABINetRecResizeImg, SVTRRecResizeImg, ABINetRecAug, VLRecResizeImg, SPINRecResizeImg, RobustScannerRecResizeImg, \
    RFLRecResizeImg
from .ssl_img_aug import SSLRotateResize
from .randaugment import RandAugment
from .copy_paste import CopyPaste
......
...@@ -488,6 +488,62 @@ class AttnLabelEncode(BaseRecLabelEncode):
        return idx

class RFLLabelEncode(BaseRecLabelEncode):
    """ Convert between text-label and text-index """

    def __init__(self,
                 max_text_length,
                 character_dict_path=None,
                 use_space_char=False,
                 **kwargs):
        super(RFLLabelEncode, self).__init__(
            max_text_length, character_dict_path, use_space_char)

    def add_special_char(self, dict_character):
        self.beg_str = "sos"
        self.end_str = "eos"
        dict_character = [self.beg_str] + dict_character + [self.end_str]
        return dict_character

    def encode_cnt(self, text):
        cnt_label = [0.0] * len(self.character)
        for char_ in text:
            cnt_label[char_] += 1
        return np.array(cnt_label)

    def __call__(self, data):
        text = data['label']
        text = self.encode(text)
        if text is None:
            return None
        if len(text) >= self.max_text_len:
            return None
        cnt_label = self.encode_cnt(text)
        data['length'] = np.array(len(text))
        text = [0] + text + [len(self.character) - 1] + [0] * (self.max_text_len
                                                               - len(text) - 2)
        if len(text) != self.max_text_len:
            return None
        data['label'] = np.array(text)
        data['cnt_label'] = cnt_label
        return data

    def get_ignored_tokens(self):
        beg_idx = self.get_beg_end_flag_idx("beg")
        end_idx = self.get_beg_end_flag_idx("end")
        return [beg_idx, end_idx]

    def get_beg_end_flag_idx(self, beg_or_end):
        if beg_or_end == "beg":
            idx = np.array(self.dict[self.beg_str])
        elif beg_or_end == "end":
            idx = np.array(self.dict[self.end_str])
        else:
            assert False, "Unsupport type %s in get_beg_end_flag_idx" \
                % beg_or_end
        return idx
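# Encoding sketch for RFLLabelEncode (illustration only, not part of the
# source): with the default 36-character dictionary plus the sos/eos tokens
# added in add_special_char, a 5-character label yields a padded index
# sequence and a per-character count vector:
#
#   op = RFLLabelEncode(max_text_length=25)
#   out = op({'label': 'hello'})
#   # out['length'] == 5, out['label'].shape == (25,),
#   # out['cnt_label'].shape == (38,)
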
class SEEDLabelEncode(BaseRecLabelEncode):
    """ Convert between text-label and text-index """
...@@ -1089,7 +1145,7 @@ class VQATokenLabelEncode(object):
    def _load_ocr_info(self, data):
        if self.infer_mode:
            ocr_result = self.ocr_engine.ocr(data['image'], cls=False)[0]
            ocr_info = []
            for res in ocr_result:
                ocr_info.append({
......
...@@ -237,6 +237,33 @@ class VLRecResizeImg(object):
        return data

class RFLRecResizeImg(object):
    def __init__(self, image_shape, padding=True, interpolation=1, **kwargs):
        self.image_shape = image_shape
        self.padding = padding
        self.interpolation = interpolation
        if self.interpolation == 0:
            self.interpolation = cv2.INTER_NEAREST
        elif self.interpolation == 1:
            self.interpolation = cv2.INTER_LINEAR
        elif self.interpolation == 2:
            self.interpolation = cv2.INTER_CUBIC
        elif self.interpolation == 3:
            self.interpolation = cv2.INTER_AREA
        else:
            raise Exception("Unsupported interpolation type !!!")

    def __call__(self, data):
        img = data['image']
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        norm_img, valid_ratio = resize_norm_img(
            img, self.image_shape, self.padding, self.interpolation)
        data['image'] = norm_img
        data['valid_ratio'] = valid_ratio
        return data
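# Resize sketch for RFLRecResizeImg (illustration only, not part of the
# source): mirroring the RFL config (image_shape [1, 32, 100], padding
# false, interpolation 2, i.e. cv2.INTER_CUBIC), the op converts the input
# to grayscale and resizes it to a normalized 1x32x100 array:
#
#   op = RFLRecResizeImg(image_shape=[1, 32, 100], padding=False, interpolation=2)
#   out = op({'image': cv2.imread('doc/imgs_words_en/word_10.png')})
#   # out['image'].shape == (1, 32, 100)
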
class SRNRecResizeImg(object):
    def __init__(self, image_shape, num_heads, max_text_length, **kwargs):
        self.image_shape = image_shape
...@@ -414,8 +441,13 @@ class SVTRRecResizeImg(object):
        data['valid_ratio'] = valid_ratio
        return data


class RobustScannerRecResizeImg(object):
    def __init__(self,
                 image_shape,
                 max_text_length,
                 width_downsample_ratio=0.25,
                 **kwargs):
        self.image_shape = image_shape
        self.width_downsample_ratio = width_downsample_ratio
        self.max_text_length = max_text_length
...@@ -432,6 +464,7 @@ class RobustScannerRecResizeImg(object):
        data['word_positons'] = word_positons
        return data

def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
    imgC, imgH, imgW_min, imgW_max = image_shape
    h = img.shape[0]
...@@ -467,13 +500,16 @@ def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
    return padding_im, resize_shape, pad_shape, valid_ratio


def resize_norm_img(img,
                    image_shape,
                    padding=True,
                    interpolation=cv2.INTER_LINEAR):
    imgC, imgH, imgW = image_shape
    h = img.shape[0]
    w = img.shape[1]
    if not padding:
        resized_image = cv2.resize(
            img, (imgW, imgH), interpolation=interpolation)
        resized_w = imgW
    else:
        ratio = w / float(h)
......
...@@ -39,6 +39,7 @@ from .rec_pren_loss import PRENLoss
from .rec_multi_loss import MultiLoss
from .rec_vl_loss import VLLoss
from .rec_spin_att_loss import SPINAttentionLoss
from .rec_rfl_loss import RFLLoss

# cls loss
from .cls_loss import ClsLoss
...@@ -70,7 +71,7 @@ def build_loss(config):
        'CELoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss', 'SDMGRLoss',
        'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss',
        'TableMasterLoss', 'SPINAttentionLoss', 'VLLoss', 'StrokeFocusLoss',
        'SLALoss', 'CTLoss', 'RFLLoss', 'DRRGLoss'
    ]
    config = copy.deepcopy(config)
    module_name = config.pop('name')
......
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_common/models/loss/cross_entropy_loss.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from paddle import nn
from .basic_loss import CELoss, DistanceLoss
class RFLLoss(nn.Layer):
def __init__(self, ignore_index=-100, **kwargs):
super().__init__()
self.cnt_loss = nn.MSELoss(**kwargs)
self.seq_loss = nn.CrossEntropyLoss(ignore_index=ignore_index)
def forward(self, predicts, batch):
self.total_loss = {}
total_loss = 0.0
if isinstance(predicts, tuple) or isinstance(predicts, list):
cnt_outputs, seq_outputs = predicts
else:
cnt_outputs, seq_outputs = predicts, None
# batch [image, label, length, cnt_label]
if cnt_outputs is not None:
cnt_loss = self.cnt_loss(cnt_outputs,
paddle.cast(batch[3], paddle.float32))
self.total_loss['cnt_loss'] = cnt_loss
total_loss += cnt_loss
if seq_outputs is not None:
targets = batch[1].astype("int64")
label_lengths = batch[2].astype('int64')
batch_size, num_steps, num_classes = seq_outputs.shape[
0], seq_outputs.shape[1], seq_outputs.shape[2]
assert len(targets.shape) == len(list(seq_outputs.shape)) - 1, \
"The target's shape and inputs's shape is [N, d] and [N, num_steps]"
inputs = seq_outputs[:, :-1, :]
targets = targets[:, 1:]
inputs = paddle.reshape(inputs, [-1, inputs.shape[-1]])
targets = paddle.reshape(targets, [-1])
seq_loss = self.seq_loss(inputs, targets)
self.total_loss['seq_loss'] = seq_loss
total_loss += seq_loss
self.total_loss['loss'] = total_loss
return self.total_loss
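RFLLoss sums an MSE counting loss with a shifted-sequence cross-entropy. A minimal, hypothetical smoke test (shapes assumed from the RFL configs: 38 classes and 25 + 1 decoding steps; the random `cnt_label` stands in for the real per-class count vector built by `RFLLabelEncode`):

```python
import paddle

batch_size, num_steps, num_classes = 4, 26, 38
cnt_preds = paddle.rand([batch_size, num_classes])             # counting head output
seq_preds = paddle.rand([batch_size, num_steps, num_classes])  # attention head logits
batch = [
    None,                                                      # image, unused by the loss
    paddle.randint(0, num_classes, [batch_size, num_steps]),   # label indices
    paddle.full([batch_size], num_steps, dtype='int64'),       # label lengths
    paddle.rand([batch_size, num_classes]),                    # cnt_label
]
losses = RFLLoss()((cnt_preds, seq_preds), batch)
print(losses['cnt_loss'], losses['seq_loss'], losses['loss'])
```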
@@ -22,7 +22,7 @@ import copy
__all__ = ["build_metric"]

from .det_metric import DetMetric, DetFCEMetric
from .rec_metric import RecMetric, CNTMetric
from .cls_metric import ClsMetric
from .e2e_metric import E2EMetric
from .distillation_metric import DistillationMetric
@@ -38,7 +38,7 @@ def build_metric(config):
    support_dict = [
        "DetMetric", "DetFCEMetric", "RecMetric", "ClsMetric", "E2EMetric",
        "DistillationMetric", "TableMetric", 'KIEMetric', 'VQASerTokenMetric',
        'VQAReTokenMetric', 'SRMetric', 'CTMetric', 'CNTMetric'
    ]
    config = copy.deepcopy(config)
......
@@ -16,7 +16,6 @@ from rapidfuzz.distance import Levenshtein
import string


class RecMetric(object):
    def __init__(self,
                 main_indicator='acc',
@@ -74,3 +73,36 @@ class RecMetric(object):
        self.correct_num = 0
        self.all_num = 0
        self.norm_edit_dis = 0
class CNTMetric(object):
def __init__(self, main_indicator='acc', **kwargs):
self.main_indicator = main_indicator
self.eps = 1e-5
self.reset()
def __call__(self, pred_label, *args, **kwargs):
preds, labels = pred_label
correct_num = 0
all_num = 0
for pred, target in zip(preds, labels):
if pred == target:
correct_num += 1
all_num += 1
self.correct_num += correct_num
self.all_num += all_num
return {'acc': correct_num / (all_num + self.eps), }
def get_metric(self):
"""
return metrics {
'acc': 0,
}
"""
acc = 1.0 * self.correct_num / (self.all_num + self.eps)
self.reset()
return {'acc': acc}
def reset(self):
self.correct_num = 0
self.all_num = 0
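CNTMetric simply counts exact matches between predicted and ground-truth text lengths. A hypothetical usage, with lengths as RFLLabelDecode emits them in counting-only mode:

```python
metric = CNTMetric(main_indicator='acc')
pred_lengths = [5, 7, 3]   # e.g. rounded sums of the counting head output
gt_lengths = [5, 6, 3]
print(metric((pred_lengths, gt_lengths)))  # per-batch accuracy, here 2/3
print(metric.get_metric())                 # accumulated accuracy, then reset
```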
@@ -42,10 +42,11 @@ def build_backbone(config, model_type):
        from .rec_efficientb3_pren import EfficientNetb3_PREN
        from .rec_svtrnet import SVTRNet
        from .rec_vitstr import ViTSTR
        from .rec_resnet_rfl import ResNetRFL
        support_dict = [
            'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB',
            'ResNet31', 'ResNet45', 'ResNet_ASTER', 'MicroNet',
            'EfficientNetb3_PREN', 'SVTRNet', 'ViTSTR', 'ResNet32', 'ResNetRFL'
        ]
    elif model_type == 'e2e':
        from .e2e_resnet_vd_pg import ResNet
......
@@ -21,124 +21,165 @@ from __future__ import division
from __future__ import print_function

import math
import re
import collections

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

__all__ = ['EfficientNetb3']

GlobalParams = collections.namedtuple('GlobalParams', [
    'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 'num_classes',
    'width_coefficient', 'depth_coefficient', 'depth_divisor', 'min_depth',
    'drop_connect_rate', 'image_size'
])

BlockArgs = collections.namedtuple('BlockArgs', [
    'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
    'expand_ratio', 'id_skip', 'stride', 'se_ratio'
])


class BlockDecoder:
    @staticmethod
    def _decode_block_string(block_string):
        assert isinstance(block_string, str)

        ops = block_string.split('_')
        options = {}
        for op in ops:
            splits = re.split(r'(\d.*)', op)
            if len(splits) >= 2:
                key, value = splits[:2]
                options[key] = value

        assert (('s' in options and len(options['s']) == 1) or
                (len(options['s']) == 2 and options['s'][0] == options['s'][1]))

        return BlockArgs(
            kernel_size=int(options['k']),
            num_repeat=int(options['r']),
            input_filters=int(options['i']),
            output_filters=int(options['o']),
            expand_ratio=int(options['e']),
            id_skip=('noskip' not in block_string),
            se_ratio=float(options['se']) if 'se' in options else None,
            stride=[int(options['s'][0])])

    @staticmethod
    def decode(string_list):
        assert isinstance(string_list, list)
        blocks_args = []
        for block_string in string_list:
            blocks_args.append(BlockDecoder._decode_block_string(block_string))
        return blocks_args


def efficientnet(width_coefficient=None,
                 depth_coefficient=None,
                 dropout_rate=0.2,
                 drop_connect_rate=0.2,
                 image_size=None,
                 num_classes=1000):
    blocks_args = [
        'r1_k3_s11_e1_i32_o16_se0.25',
        'r2_k3_s22_e6_i16_o24_se0.25',
        'r2_k5_s22_e6_i24_o40_se0.25',
        'r3_k3_s22_e6_i40_o80_se0.25',
        'r3_k5_s11_e6_i80_o112_se0.25',
        'r4_k5_s22_e6_i112_o192_se0.25',
        'r1_k3_s11_e6_i192_o320_se0.25',
    ]
    blocks_args = BlockDecoder.decode(blocks_args)

    global_params = GlobalParams(
        batch_norm_momentum=0.99,
        batch_norm_epsilon=1e-3,
        dropout_rate=dropout_rate,
        drop_connect_rate=drop_connect_rate,
        num_classes=num_classes,
        width_coefficient=width_coefficient,
        depth_coefficient=depth_coefficient,
        depth_divisor=8,
        min_depth=None,
        image_size=image_size, )
    return blocks_args, global_params
class EffUtils:
    @staticmethod
    def round_filters(filters, global_params):
        """ Calculate and round number of filters based on depth multiplier. """
        multiplier = global_params.width_coefficient
        if not multiplier:
            return filters
        divisor = global_params.depth_divisor
        min_depth = global_params.min_depth
        filters *= multiplier
        min_depth = min_depth or divisor
        new_filters = max(min_depth,
                          int(filters + divisor / 2) // divisor * divisor)
        if new_filters < 0.9 * filters:
            new_filters += divisor
        return int(new_filters)

    @staticmethod
    def round_repeats(repeats, global_params):
        """ Round number of repeats based on depth multiplier. """
        multiplier = global_params.depth_coefficient
        if not multiplier:
            return repeats
        return int(math.ceil(multiplier * repeats))
class MbConvBlock(nn.Layer):
    def __init__(self, block_args):
        super(MbConvBlock, self).__init__()
        self._block_args = block_args
        self.has_se = (self._block_args.se_ratio is not None) and \
            (0 < self._block_args.se_ratio <= 1)
        self.id_skip = block_args.id_skip

        # expansion phase
        self.inp = self._block_args.input_filters
        oup = self._block_args.input_filters * self._block_args.expand_ratio
        if self._block_args.expand_ratio != 1:
            self._expand_conv = nn.Conv2D(self.inp, oup, 1, bias_attr=False)
            self._bn0 = nn.BatchNorm(oup)

        # depthwise conv phase
        k = self._block_args.kernel_size
        s = self._block_args.stride
        if isinstance(s, list):
            s = s[0]
        self._depthwise_conv = nn.Conv2D(
            oup,
            oup,
            groups=oup,
            kernel_size=k,
            stride=s,
            padding='same',
            bias_attr=False)
        self._bn1 = nn.BatchNorm(oup)

        # squeeze and excitation layer, if desired
        if self.has_se:
            num_squeezed_channels = max(1,
                                        int(self._block_args.input_filters *
                                            self._block_args.se_ratio))
            self._se_reduce = nn.Conv2D(oup, num_squeezed_channels, 1)
            self._se_expand = nn.Conv2D(num_squeezed_channels, oup, 1)

        # output phase and some util class
        self.final_oup = self._block_args.output_filters
        self._project_conv = nn.Conv2D(oup, self.final_oup, 1, bias_attr=False)
        self._bn2 = nn.BatchNorm(self.final_oup)
        self._swish = nn.Swish()

    def _drop_connect(self, inputs, p, training):
        if not training:
            return inputs
        batch_size = inputs.shape[0]
        keep_prob = 1 - p
        random_tensor = keep_prob
@@ -151,22 +192,23 @@ class ConvBlock(nn.Layer):
    def forward(self, inputs, drop_connect_rate=None):
        # expansion and depthwise conv
        x = inputs
        if self._block_args.expand_ratio != 1:
            x = self._swish(self._bn0(self._expand_conv(inputs)))
        x = self._swish(self._bn1(self._depthwise_conv(x)))

        # squeeze and excitation
        if self.has_se:
            x_squeezed = F.adaptive_avg_pool2d(x, 1)
            x_squeezed = self._se_expand(
                self._swish(self._se_reduce(x_squeezed)))
            x = F.sigmoid(x_squeezed) * x
        x = self._bn2(self._project_conv(x))

        # skip connection and drop connect
        if self.id_skip and self._block_args.stride == 1 and \
                self.inp == self.final_oup:
            if drop_connect_rate:
                x = self._drop_connect(
                    x, p=drop_connect_rate, training=self.training)
            x = x + inputs
        return x
@@ -175,54 +217,63 @@ class ConvBlock(nn.Layer):
class EfficientNetb3_PREN(nn.Layer):
    def __init__(self, in_channels):
        super(EfficientNetb3_PREN, self).__init__()
        """
        The following are EfficientNet-B3's hyperparameters; they set the
        network's width, depth, resolution and dropout respectively. To fit
        the scene text recognition task, the resolution (image_size) here
        is changed from 300 to 64.
        """
        w, d, s, p = 1.2, 1.4, 64, 0.3
        self._blocks_args, self._global_params = efficientnet(
            width_coefficient=w,
            depth_coefficient=d,
            dropout_rate=p,
            image_size=s)
        self.out_channels = []
        # stem
        out_channels = EffUtils.round_filters(32, self._global_params)
        self._conv_stem = nn.Conv2D(
            in_channels, out_channels, 3, 2, padding='same', bias_attr=False)
        self._bn0 = nn.BatchNorm(out_channels)

        # build blocks
        self._blocks = []
        # to extract three feature maps for fpn based on efficientnetb3 backbone
        self._concerned_block_idxes = [7, 17, 25]
        _concerned_idx = 0
        for i, block_args in enumerate(self._blocks_args):
            block_args = block_args._replace(
                input_filters=EffUtils.round_filters(block_args.input_filters,
                                                     self._global_params),
                output_filters=EffUtils.round_filters(block_args.output_filters,
                                                      self._global_params),
                num_repeat=EffUtils.round_repeats(block_args.num_repeat,
                                                  self._global_params))
            self._blocks.append(
                self.add_sublayer(f"{i}-0", MbConvBlock(block_args)))
            _concerned_idx += 1
            if _concerned_idx in self._concerned_block_idxes:
                self.out_channels.append(block_args.output_filters)
            if block_args.num_repeat > 1:
                block_args = block_args._replace(
                    input_filters=block_args.output_filters, stride=1)
            for j in range(block_args.num_repeat - 1):
                self._blocks.append(
                    self.add_sublayer(f'{i}-{j+1}', MbConvBlock(block_args)))
                _concerned_idx += 1
                if _concerned_idx in self._concerned_block_idxes:
                    self.out_channels.append(block_args.output_filters)

        self._swish = nn.Swish()

    def forward(self, inputs):
        outs = []
        x = self._swish(self._bn0(self._conv_stem(inputs)))
        for idx, block in enumerate(self._blocks):
            drop_connect_rate = self._global_params.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self._blocks)
            x = block(x, drop_connect_rate=drop_connect_rate)
            if idx in self._concerned_block_idxes:
                outs.append(x)
        return outs
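As a sanity check on the refactor, the backbone should still expose three feature maps for the PREN FPN neck; a hypothetical shape probe (the input size is assumed, not taken from the diff):

```python
import paddle

net = EfficientNetb3_PREN(in_channels=3)
feats = net(paddle.rand([1, 3, 64, 256]))
print(len(feats), net.out_channels)  # three taps and their channel counts
for f in feats:
    print(f.shape)
```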
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/backbones/ResNetRFL.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal
kaiming_init_ = KaimingNormal()
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
class BasicBlock(nn.Layer):
"""Res-net Basic Block"""
expansion = 1
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
norm_type='BN',
**kwargs):
"""
Args:
inplanes (int): input channel
planes (int): channels of the middle feature
stride (int): stride of the convolution
downsample (int): type of the down_sample
norm_type (str): type of the normalization
**kwargs (None): backup parameter
"""
super(BasicBlock, self).__init__()
self.conv1 = self._conv3x3(inplanes, planes)
self.bn1 = nn.BatchNorm(planes)
self.conv2 = self._conv3x3(planes, planes)
self.bn2 = nn.BatchNorm(planes)
self.relu = nn.ReLU()
self.downsample = downsample
self.stride = stride
def _conv3x3(self, in_planes, out_planes, stride=1):
return nn.Conv2D(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias_attr=False)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNetRFL(nn.Layer):
def __init__(self,
in_channels,
out_channels=512,
use_cnt=True,
use_seq=True):
"""
Args:
in_channels (int): input channel
out_channels (int): output channel
"""
super(ResNetRFL, self).__init__()
assert use_cnt or use_seq
self.use_cnt, self.use_seq = use_cnt, use_seq
self.backbone = RFLBase(in_channels)
self.out_channels = out_channels
self.out_channels_block = [
int(self.out_channels / 4), int(self.out_channels / 2),
self.out_channels, self.out_channels
]
block = BasicBlock
layers = [1, 2, 5, 3]
self.inplanes = int(self.out_channels // 2)
self.relu = nn.ReLU()
if self.use_seq:
self.maxpool3 = nn.MaxPool2D(
kernel_size=2, stride=(2, 1), padding=(0, 1))
self.layer3 = self._make_layer(
block, self.out_channels_block[2], layers[2], stride=1)
self.conv3 = nn.Conv2D(
self.out_channels_block[2],
self.out_channels_block[2],
kernel_size=3,
stride=1,
padding=1,
bias_attr=False)
self.bn3 = nn.BatchNorm(self.out_channels_block[2])
self.layer4 = self._make_layer(
block, self.out_channels_block[3], layers[3], stride=1)
self.conv4_1 = nn.Conv2D(
self.out_channels_block[3],
self.out_channels_block[3],
kernel_size=2,
stride=(2, 1),
padding=(0, 1),
bias_attr=False)
self.bn4_1 = nn.BatchNorm(self.out_channels_block[3])
self.conv4_2 = nn.Conv2D(
self.out_channels_block[3],
self.out_channels_block[3],
kernel_size=2,
stride=1,
padding=0,
bias_attr=False)
self.bn4_2 = nn.BatchNorm(self.out_channels_block[3])
if self.use_cnt:
self.inplanes = int(self.out_channels // 2)
self.v_maxpool3 = nn.MaxPool2D(
kernel_size=2, stride=(2, 1), padding=(0, 1))
self.v_layer3 = self._make_layer(
block, self.out_channels_block[2], layers[2], stride=1)
self.v_conv3 = nn.Conv2D(
self.out_channels_block[2],
self.out_channels_block[2],
kernel_size=3,
stride=1,
padding=1,
bias_attr=False)
self.v_bn3 = nn.BatchNorm(self.out_channels_block[2])
self.v_layer4 = self._make_layer(
block, self.out_channels_block[3], layers[3], stride=1)
self.v_conv4_1 = nn.Conv2D(
self.out_channels_block[3],
self.out_channels_block[3],
kernel_size=2,
stride=(2, 1),
padding=(0, 1),
bias_attr=False)
self.v_bn4_1 = nn.BatchNorm(self.out_channels_block[3])
self.v_conv4_2 = nn.Conv2D(
self.out_channels_block[3],
self.out_channels_block[3],
kernel_size=2,
stride=1,
padding=0,
bias_attr=False)
self.v_bn4_2 = nn.BatchNorm(self.out_channels_block[3])
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2D(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias_attr=False),
nn.BatchNorm(planes * block.expansion), )
layers = list()
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, inputs):
x_1 = self.backbone(inputs)
if self.use_cnt:
v_x = self.v_maxpool3(x_1)
v_x = self.v_layer3(v_x)
v_x = self.v_conv3(v_x)
v_x = self.v_bn3(v_x)
visual_feature_2 = self.relu(v_x)
v_x = self.v_layer4(visual_feature_2)
v_x = self.v_conv4_1(v_x)
v_x = self.v_bn4_1(v_x)
v_x = self.relu(v_x)
v_x = self.v_conv4_2(v_x)
v_x = self.v_bn4_2(v_x)
visual_feature_3 = self.relu(v_x)
else:
visual_feature_3 = None
if self.use_seq:
x = self.maxpool3(x_1)
x = self.layer3(x)
x = self.conv3(x)
x = self.bn3(x)
x_2 = self.relu(x)
x = self.layer4(x_2)
x = self.conv4_1(x)
x = self.bn4_1(x)
x = self.relu(x)
x = self.conv4_2(x)
x = self.bn4_2(x)
x_3 = self.relu(x)
else:
x_3 = None
return [visual_feature_3, x_3]
class ResNetBase(nn.Layer):
def __init__(self, in_channels, out_channels, block, layers):
super(ResNetBase, self).__init__()
self.out_channels_block = [
int(out_channels / 4), int(out_channels / 2), out_channels,
out_channels
]
self.inplanes = int(out_channels / 8)
self.conv0_1 = nn.Conv2D(
in_channels,
int(out_channels / 16),
kernel_size=3,
stride=1,
padding=1,
bias_attr=False)
self.bn0_1 = nn.BatchNorm(int(out_channels / 16))
self.conv0_2 = nn.Conv2D(
int(out_channels / 16),
self.inplanes,
kernel_size=3,
stride=1,
padding=1,
bias_attr=False)
self.bn0_2 = nn.BatchNorm(self.inplanes)
self.relu = nn.ReLU()
self.maxpool1 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
self.layer1 = self._make_layer(block, self.out_channels_block[0],
layers[0])
self.conv1 = nn.Conv2D(
self.out_channels_block[0],
self.out_channels_block[0],
kernel_size=3,
stride=1,
padding=1,
bias_attr=False)
self.bn1 = nn.BatchNorm(self.out_channels_block[0])
self.maxpool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
self.layer2 = self._make_layer(
block, self.out_channels_block[1], layers[1], stride=1)
self.conv2 = nn.Conv2D(
self.out_channels_block[1],
self.out_channels_block[1],
kernel_size=3,
stride=1,
padding=1,
bias_attr=False)
self.bn2 = nn.BatchNorm(self.out_channels_block[1])
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2D(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias_attr=False),
nn.BatchNorm(planes * block.expansion), )
layers = list()
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv0_1(x)
x = self.bn0_1(x)
x = self.relu(x)
x = self.conv0_2(x)
x = self.bn0_2(x)
x = self.relu(x)
x = self.maxpool1(x)
x = self.layer1(x)
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool2(x)
x = self.layer2(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
return x
class RFLBase(nn.Layer):
""" Reciprocal feature learning share backbone network"""
def __init__(self, in_channels, out_channels=512):
super(RFLBase, self).__init__()
self.ConvNet = ResNetBase(in_channels, out_channels, BasicBlock,
[1, 2, 5, 3])
def forward(self, inputs):
return self.ConvNet(inputs)
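A hypothetical shape probe for ResNetRFL, using the [1, 32, 100] image_shape from the RFL configs: both branches should reduce a 32x100 input to a 1x26 map, matching the head's encode_length of max_text_length + 1.

```python
import paddle

backbone = ResNetRFL(in_channels=1, out_channels=512, use_cnt=True, use_seq=True)
visual_feat, seq_feat = backbone(paddle.rand([2, 1, 32, 100]))
print(visual_feat.shape, seq_feat.shape)  # both expected as [2, 512, 1, 26]
```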
@@ -39,6 +39,7 @@ def build_head(config):
    from .rec_abinet_head import ABINetHead
    from .rec_robustscanner_head import RobustScannerHead
    from .rec_visionlan_head import VLHead
    from .rec_rfl_head import RFLHead

    # cls head
    from .cls_head import ClsHead
@@ -54,7 +55,8 @@ def build_head(config):
        'ClsHead', 'AttentionHead', 'SRNHead', 'PGHead', 'Transformer',
        'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead',
        'MultiHead', 'ABINetHead', 'TableMasterHead', 'SPINAttentionHead',
        'VLHead', 'SLAHead', 'RobustScannerHead', 'CT_Head', 'RFLHead',
        'DRRGHead'
    ]

    #table head
......
@@ -149,6 +149,8 @@ class AttentionLSTM(nn.Layer):
        else:
            targets = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None
            char_onehots = None
            alpha = None

            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
@@ -167,7 +169,8 @@ class AttentionLSTM(nn.Layer):
                next_input = probs_step.argmax(axis=1)
                targets = next_input
        if not self.training:
            probs = paddle.nn.functional.softmax(probs, axis=2)
        return probs
......
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/sequence_heads/counting_head.py
"""
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal
from .rec_att_head import AttentionLSTM
kaiming_init_ = KaimingNormal()
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
class CNTHead(nn.Layer):
def __init__(self,
embed_size=512,
encode_length=26,
out_channels=38,
**kwargs):
super(CNTHead, self).__init__()
self.out_channels = out_channels
self.Wv_fusion = nn.Linear(embed_size, embed_size, bias_attr=False)
self.Prediction_visual = nn.Linear(encode_length * embed_size,
self.out_channels)
def forward(self, visual_feature):
b, c, h, w = visual_feature.shape
visual_feature = visual_feature.reshape([b, c, h * w]).transpose(
[0, 2, 1])
visual_feature_num = self.Wv_fusion(visual_feature) # batch * 26 * 512
b, n, c = visual_feature_num.shape
# using visual feature directly calculate the text length
visual_feature_num = visual_feature_num.reshape([b, n * c])
prediction_visual = self.Prediction_visual(visual_feature_num)
return prediction_visual
class RFLHead(nn.Layer):
def __init__(self,
in_channels=512,
hidden_size=256,
batch_max_legnth=25,
out_channels=38,
use_cnt=True,
use_seq=True,
**kwargs):
super(RFLHead, self).__init__()
assert use_cnt or use_seq
self.use_cnt = use_cnt
self.use_seq = use_seq
if self.use_cnt:
self.cnt_head = CNTHead(
embed_size=in_channels,
encode_length=batch_max_legnth + 1,
out_channels=out_channels,
**kwargs)
if self.use_seq:
self.seq_head = AttentionLSTM(
in_channels=in_channels,
out_channels=out_channels,
hidden_size=hidden_size,
**kwargs)
self.batch_max_legnth = batch_max_legnth
self.num_class = out_channels
self.apply(self.init_weights)
def init_weights(self, m):
if isinstance(m, nn.Linear):
kaiming_init_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
def forward(self, x, targets=None):
cnt_inputs, seq_inputs = x
if self.use_cnt:
cnt_outputs = self.cnt_head(cnt_inputs)
else:
cnt_outputs = None
if self.use_seq:
if self.training:
seq_outputs = self.seq_head(seq_inputs, targets[0],
self.batch_max_legnth)
else:
seq_outputs = self.seq_head(seq_inputs, None,
self.batch_max_legnth)
return cnt_outputs, seq_outputs
else:
return cnt_outputs
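A hedged sketch of how RFLHead consumes the two RFAdaptor outputs: during training the attention branch is teacher-forced with targets[0], while at inference it decodes greedily and returns softmax probabilities. Shapes below are assumptions taken from the configs in this commit.

```python
import paddle

head = RFLHead(in_channels=512, hidden_size=256, batch_max_legnth=25,
               out_channels=38, use_cnt=True, use_seq=True)
cnt_feat = paddle.rand([2, 512, 1, 26])  # visual (counting) feature
seq_feat = paddle.rand([2, 26, 512])     # sequential feature from RFAdaptor
head.eval()
cnt_out, seq_out = head((cnt_feat, seq_feat))
print(cnt_out.shape, seq_out.shape)      # [2, 38] and [2, 26, 38]
```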
@@ -28,10 +28,11 @@ def build_neck(config):
    from .csp_pan import CSPPAN
    from .ct_fpn import CTFPN
    from .fpn_unet import FPN_UNet
    from .rf_adaptor import RFAdaptor
    support_dict = [
        'FPN', 'FCEFPN', 'LKPAN', 'DBFPN', 'RSEFPN', 'EASTFPN', 'SASTFPN',
        'SequenceEncoder', 'PGFPN', 'TableFPN', 'PRENFPN', 'CSPPAN', 'CTFPN',
        'RFAdaptor', 'FPN_UNet'
    ]

    module_name = config.pop('name')
......
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/connects/single_block/RFAdaptor.py
"""
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal
kaiming_init_ = KaimingNormal()
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
class S2VAdaptor(nn.Layer):
""" Semantic to Visual adaptation module"""
def __init__(self, in_channels=512):
super(S2VAdaptor, self).__init__()
self.in_channels = in_channels # 512
# feature strengthen module, channel attention
self.channel_inter = nn.Linear(
self.in_channels, self.in_channels, bias_attr=False)
self.channel_bn = nn.BatchNorm1D(self.in_channels)
self.channel_act = nn.ReLU()
self.apply(self.init_weights)
def init_weights(self, m):
if isinstance(m, nn.Conv2D):
kaiming_init_(m.weight)
if isinstance(m, nn.Conv2D) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, (nn.BatchNorm, nn.BatchNorm2D, nn.BatchNorm1D)):
zeros_(m.bias)
ones_(m.weight)
def forward(self, semantic):
semantic_source = semantic # batch, channel, height, width
# feature transformation
semantic = semantic.squeeze(2).transpose(
[0, 2, 1]) # batch, width, channel
channel_att = self.channel_inter(semantic) # batch, width, channel
channel_att = channel_att.transpose([0, 2, 1]) # batch, channel, width
channel_bn = self.channel_bn(channel_att) # batch, channel, width
channel_att = self.channel_act(channel_bn) # batch, channel, width
# Feature enhancement
channel_output = semantic_source * channel_att.unsqueeze(
-2) # batch, channel, 1, width
return channel_output
class V2SAdaptor(nn.Layer):
""" Visual to Semantic adaptation module"""
def __init__(self, in_channels=512, return_mask=False):
super(V2SAdaptor, self).__init__()
# parameter initialization
self.in_channels = in_channels
self.return_mask = return_mask
# output transformation
self.channel_inter = nn.Linear(
self.in_channels, self.in_channels, bias_attr=False)
self.channel_bn = nn.BatchNorm1D(self.in_channels)
self.channel_act = nn.ReLU()
def forward(self, visual):
# Feature enhancement
visual = visual.squeeze(2).transpose([0, 2, 1]) # batch, width, channel
channel_att = self.channel_inter(visual) # batch, width, channel
channel_att = channel_att.transpose([0, 2, 1]) # batch, channel, width
channel_bn = self.channel_bn(channel_att) # batch, channel, width
channel_att = self.channel_act(channel_bn) # batch, channel, width
# size alignment
        channel_output = channel_att.unsqueeze(-2)  # batch, channel, 1, width
if self.return_mask:
return channel_output, channel_att
return channel_output
class RFAdaptor(nn.Layer):
def __init__(self, in_channels=512, use_v2s=True, use_s2v=True, **kwargs):
super(RFAdaptor, self).__init__()
if use_v2s is True:
self.neck_v2s = V2SAdaptor(in_channels=in_channels, **kwargs)
else:
self.neck_v2s = None
if use_s2v is True:
self.neck_s2v = S2VAdaptor(in_channels=in_channels, **kwargs)
else:
self.neck_s2v = None
self.out_channels = in_channels
def forward(self, x):
visual_feature, rcg_feature = x
if visual_feature is not None:
batch, source_channels, v_source_height, v_source_width = visual_feature.shape
visual_feature = visual_feature.reshape(
[batch, source_channels, 1, v_source_height * v_source_width])
if self.neck_v2s is not None:
v_rcg_feature = rcg_feature * self.neck_v2s(visual_feature)
else:
v_rcg_feature = rcg_feature
if self.neck_s2v is not None:
v_visual_feature = visual_feature + self.neck_s2v(rcg_feature)
else:
v_visual_feature = visual_feature
if v_rcg_feature is not None:
batch, source_channels, source_height, source_width = v_rcg_feature.shape
v_rcg_feature = v_rcg_feature.reshape(
[batch, source_channels, 1, source_height * source_width])
v_rcg_feature = v_rcg_feature.squeeze(2).transpose([0, 2, 1])
return v_visual_feature, v_rcg_feature
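A hypothetical check of the adaptor fusion: v2s strengthens the recognition feature with the visual one, while s2v feeds semantics back to the visual branch. Input shapes are assumed to match the ResNetRFL outputs above.

```python
import paddle

adaptor = RFAdaptor(in_channels=512, use_v2s=True, use_s2v=True)
visual = paddle.rand([2, 512, 1, 26])
rcg = paddle.rand([2, 512, 1, 26])
new_visual, new_rcg = adaptor((visual, rcg))
print(new_visual.shape, new_rcg.shape)  # [2, 512, 1, 26] and [2, 26, 512]
```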
@@ -53,6 +53,9 @@ def build_optimizer(config, epochs, step_each_epoch, model):
    if 'clip_norm' in config:
        clip_norm = config.pop('clip_norm')
        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
    elif 'clip_norm_global' in config:
        clip_norm = config.pop('clip_norm_global')
        grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=clip_norm)
    else:
        grad_clip = None
    optim = getattr(optimizer, optim_name)(learning_rate=lr,
......
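The new branch maps a `clip_norm_global` key (used by the RFL configs in this commit) to global-norm gradient clipping across all parameters, instead of the per-tensor `ClipGradByNorm`. A rough, self-contained equivalent of what the builder ends up constructing:

```python
import paddle

grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
model = paddle.nn.Linear(4, 4)  # stand-in for the real network
opt = paddle.optimizer.AdamW(learning_rate=0.001, beta1=0.9, beta2=0.999,
                             weight_decay=0.0, grad_clip=grad_clip,
                             parameters=model.parameters())
```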
@@ -28,7 +28,7 @@ from .fce_postprocess import FCEPostProcess
from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \
    DistillationCTCLabelDecode, NRTRLabelDecode, SARLabelDecode, \
    SEEDLabelDecode, PRENLabelDecode, ViTSTRLabelDecode, ABINetLabelDecode, \
    SPINLabelDecode, VLLabelDecode, RFLLabelDecode
from .cls_postprocess import ClsPostProcess
from .pg_postprocess import PGPostProcess
from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess, DistillationSerPostProcess
@@ -51,7 +51,7 @@ def build_post_process(config, global_config=None):
        'TableMasterLabelDecode', 'SPINLabelDecode',
        'DistillationSerPostProcess', 'DistillationRePostProcess',
        'VLLabelDecode', 'PicoDetPostProcess', 'CTPostProcess',
        'RFLLabelDecode', 'DRRGPostprocess'
    ]

    if config['name'] == 'PSEPostProcess':
......
@@ -242,6 +242,95 @@ class AttnLabelDecode(BaseRecLabelDecode):
        return idx
class RFLLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
def __init__(self, character_dict_path=None, use_space_char=False,
**kwargs):
super(RFLLabelDecode, self).__init__(character_dict_path,
use_space_char)
def add_special_char(self, dict_character):
self.beg_str = "sos"
self.end_str = "eos"
        dict_character = [self.beg_str] + dict_character + [self.end_str]
return dict_character
def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
""" convert text-index into text-label. """
result_list = []
        ignored_tokens = self.get_ignored_tokens()
        [beg_idx, end_idx] = ignored_tokens
batch_size = len(text_index)
for batch_idx in range(batch_size):
char_list = []
conf_list = []
for idx in range(len(text_index[batch_idx])):
if text_index[batch_idx][idx] in ignored_tokens:
continue
if int(text_index[batch_idx][idx]) == int(end_idx):
break
if is_remove_duplicate:
# only for predict
if idx > 0 and text_index[batch_idx][idx - 1] == text_index[
batch_idx][idx]:
continue
char_list.append(self.character[int(text_index[batch_idx][
idx])])
if text_prob is not None:
conf_list.append(text_prob[batch_idx][idx])
else:
conf_list.append(1)
text = ''.join(char_list)
result_list.append((text, np.mean(conf_list).tolist()))
return result_list
def __call__(self, preds, label=None, *args, **kwargs):
# if seq_outputs is not None:
if isinstance(preds, tuple) or isinstance(preds, list):
cnt_outputs, seq_outputs = preds
if isinstance(seq_outputs, paddle.Tensor):
seq_outputs = seq_outputs.numpy()
preds_idx = seq_outputs.argmax(axis=2)
preds_prob = seq_outputs.max(axis=2)
text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
if label is None:
return text
label = self.decode(label, is_remove_duplicate=False)
return text, label
else:
cnt_outputs = preds
if isinstance(cnt_outputs, paddle.Tensor):
cnt_outputs = cnt_outputs.numpy()
cnt_length = []
for lens in cnt_outputs:
length = round(np.sum(lens))
cnt_length.append(length)
if label is None:
return cnt_length
label = self.decode(label, is_remove_duplicate=False)
length = [len(res[0]) for res in label]
return cnt_length, length
def get_ignored_tokens(self):
beg_idx = self.get_beg_end_flag_idx("beg")
end_idx = self.get_beg_end_flag_idx("end")
return [beg_idx, end_idx]
def get_beg_end_flag_idx(self, beg_or_end):
if beg_or_end == "beg":
idx = np.array(self.dict[self.beg_str])
elif beg_or_end == "end":
idx = np.array(self.dict[self.end_str])
else:
assert False, "unsupport type %s in get_beg_end_flag_idx" \
% beg_or_end
return idx
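A hedged example of RFLLabelDecode in the counting-only case: when `preds` is a single tensor it is treated as the counting head output and summed into a per-image length estimate (the rounded channel sum). The array values below are made up.

```python
import numpy as np

decoder = RFLLabelDecode(character_dict_path=None, use_space_char=False)
cnt_out = np.array([[0.9, 2.1, 0.1], [1.0, 1.0, 1.2]], dtype=np.float32)
print(decoder(cnt_out))  # [3, 3]: rounded sums per image
```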
class SEEDLabelDecode(BaseRecLabelDecode):
    """ Convert between text-label and text-index """
@@ -562,7 +651,8 @@ class PRENLabelDecode(BaseRecLabelDecode):
        return result_list

    def __call__(self, preds, label=None, *args, **kwargs):
        if isinstance(preds, paddle.Tensor):
            preds = preds.numpy()
        preds_idx = preds.argmax(axis=2)
        preds_prob = preds.max(axis=2)
        text = self.decode(preds_idx, preds_prob)
......
# Inference Based on the Python Prediction Engine

- [1. Layout Information Extraction](#1-layout-information-extraction)
  - [1.1 Layout Analysis + Table Recognition](#11-layout-analysis--table-recognition)
  - [1.2 Layout Analysis](#12-layout-analysis)
  - [1.3 Table Recognition](#13-table-recognition)
- [2. Key Information Extraction](#2-key-information-extraction)
  - [2.1 SER](#21-ser)
  - [2.2 RE+SER](#22-reser)

<a name="1"></a>
## 1. Layout Information Extraction

@@ -70,6 +72,8 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \

<a name="2"></a>
## 2. Key Information Extraction

### 2.1 SER

```bash
cd ppstructure
@@ -77,13 +81,38 @@ mkdir inference && cd inference
# download the SER XFUND model and untar it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar
cd ..
python3 predict_system.py \
  --kie_algorithm=LayoutXLM \
  --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \
  --image_dir=./docs/kie/input/zh_val_42.jpg \
  --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \
  --vis_font_path=../doc/fonts/simfang.ttf \
  --ocr_order_method="tb-yx" \
  --mode=kie
```
After the run completes, the visualized result of each image is saved in the `kie` directory under the directory specified by the `output` field, with the same file name as the input image.
### 2.2 RE+SER
```bash
cd ppstructure
mkdir inference && cd inference
# download the RE and SER XFUND models and untar them
wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar
wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar
cd ..
python3 predict_system.py \
--kie_algorithm=LayoutXLM \
--re_model_dir=./inference/re_vi_layoutxlm_xfund_infer \
--ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \
--image_dir=./docs/kie/input/zh_val_42.jpg \
--ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \
--vis_font_path=../doc/fonts/simfang.ttf \
--ocr_order_method="tb-yx" \
--mode=kie
```
After the run completes, each image gets a same-named directory inside the `kie` directory under the directory specified by the `output` field, which stores the visualized image and the prediction results.
# Python Inference

- [1. Layout Structured Analysis](#1-layout-structured-analysis)
  - [1.1 layout analysis + table recognition](#11-layout-analysis--table-recognition)
  - [1.2 layout analysis](#12-layout-analysis)
  - [1.3 table recognition](#13-table-recognition)
- [2. Key Information Extraction](#2-key-information-extraction)
  - [2.1 SER](#21-ser)
  - [2.2 RE+SER](#22-reser)

<a name="1"></a>
## 1. Layout Structured Analysis

@@ -72,6 +74,7 @@ After the operation is completed, each image will have a directory with the same

<a name="2"></a>
## 2. Key Information Extraction

### 2.1 SER

```bash
cd ppstructure
@@ -79,13 +82,39 @@ mkdir inference && cd inference
# download model
wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar
cd ..
python3 predict_system.py \
  --kie_algorithm=LayoutXLM \
  --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \
  --image_dir=./docs/kie/input/zh_val_42.jpg \
  --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \
  --vis_font_path=../doc/fonts/simfang.ttf \
  --ocr_order_method="tb-yx" \
  --mode=kie
```
After the operation is completed, each image will store the visualized image in the `kie` directory under the directory specified by the `output` field, and the image name is the same as the input image name.
### 2.2 RE+SER
```bash
cd ppstructure
mkdir inference && cd inference
# download model
wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar
wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar
cd ..
python3 predict_system.py \
--kie_algorithm=LayoutXLM \
--re_model_dir=./inference/re_vi_layoutxlm_xfund_infer \
--ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \
--image_dir=./docs/kie/input/zh_val_42.jpg \
--ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \
--vis_font_path=../doc/fonts/simfang.ttf \
--ocr_order_method="tb-yx" \
--mode=kie
```
After the operation is completed, each image will have a directory with the same name in the `kie` directory under the directory specified by the `output` field, where the visual images and prediction results are stored.
@@ -29,13 +29,11 @@ import tools.infer.utility as utility
from tools.infer_kie_token_ser_re import make_input
from ppocr.postprocess import build_post_process
from ppocr.utils.logging import get_logger
from ppocr.utils.visual import draw_ser_results, draw_re_results
from ppocr.utils.utility import get_image_file_list, check_and_read
from ppstructure.utility import parse_args
from ppstructure.kie.predict_kie_token_ser import SerPredictor

logger = get_logger()
@@ -43,16 +41,20 @@ class SerRePredictor(object):
    def __init__(self, args):
        self.use_visual_backbone = args.use_visual_backbone
        self.ser_engine = SerPredictor(args)
        if args.re_model_dir is not None:
            postprocess_params = {'name': 'VQAReTokenLayoutLMPostProcess'}
            self.postprocess_op = build_post_process(postprocess_params)
            self.predictor, self.input_tensor, self.output_tensors, self.config = \
                utility.create_predictor(args, 're', logger)
        else:
            self.predictor = None

    def __call__(self, img):
        starttime = time.time()
        ser_results, ser_inputs, ser_elapse = self.ser_engine(img)
        if self.predictor is None:
            return ser_results, ser_elapse

        re_input, entity_idx_dict_batch = make_input(ser_inputs, ser_results)
        if self.use_visual_backbone == False:
            re_input.pop(4)
@@ -80,7 +82,7 @@ class SerRePredictor(object):

def main(args):
    image_file_list = get_image_file_list(args.image_dir)
    ser_re_predictor = SerRePredictor(args)

    count = 0
    total_time = 0
@@ -96,7 +98,7 @@ def main(args):
        if img is None:
            logger.info("error in loading image:{}".format(image_file))
            continue
        re_res, elapse = ser_re_predictor(img)
        re_res = re_res[0]

        res_str = '{}\t{}\n'.format(
@@ -106,14 +108,20 @@ def main(args):
                    "ocr_info": re_res,
                }, ensure_ascii=False))
        f_w.write(res_str)
        if ser_re_predictor.predictor is not None:
            img_res = draw_re_results(
                image_file, re_res, font_path=args.vis_font_path)
            img_save_path = os.path.join(
                args.output,
                os.path.splitext(os.path.basename(image_file))[0] +
                "_ser_re.jpg")
        else:
            img_res = draw_ser_results(
                image_file, re_res, font_path=args.vis_font_path)
            img_save_path = os.path.join(
                args.output,
                os.path.splitext(os.path.basename(image_file))[0] +
                "_ser.jpg")

        cv2.imwrite(img_save_path, img_res)
        logger.info("save vis result to {}".format(img_save_path))
......
@@ -30,6 +30,7 @@ from copy import deepcopy

from ppocr.utils.utility import get_image_file_list, check_and_read
from ppocr.utils.logging import get_logger
from ppocr.utils.visual import draw_ser_results, draw_re_results
from tools.infer.predict_system import TextSystem
from ppstructure.layout.predict_layout import LayoutPredictor
from ppstructure.table.predict_table import TableSystem, to_excel
@@ -75,7 +76,8 @@ class StructureSystem(object):
                self.table_system = TableSystem(args)
        elif self.mode == 'kie':
            from ppstructure.kie.predict_kie_token_ser_re import SerRePredictor
            self.kie_predictor = SerRePredictor(args)

    def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
        time_dict = {
@@ -176,7 +178,10 @@ class StructureSystem(object):
            time_dict['all'] = end - start
            return res_list, time_dict
        elif self.mode == 'kie':
            re_res, elapse = self.kie_predictor(img)
            time_dict['kie'] = elapse
            time_dict['all'] = elapse
            return re_res[0], time_dict
        return None, None
@@ -235,15 +240,32 @@ def main(args):
        all_res = []
        for index, img in enumerate(imgs):
            res, time_dict = structure_sys(img, img_idx=index)
            img_save_path = os.path.join(save_folder, img_name,
                                         'show_{}.jpg'.format(index))
            os.makedirs(os.path.join(save_folder, img_name), exist_ok=True)
            if structure_sys.mode == 'structure' and res != []:
                draw_img = draw_structure_result(img, res, args.vis_font_path)
                save_structure_res(res, save_folder, img_name, index)
            elif structure_sys.mode == 'kie':
                if structure_sys.kie_predictor.predictor is not None:
                    draw_img = draw_re_results(
                        img, res, font_path=args.vis_font_path)
                else:
                    draw_img = draw_ser_results(
                        img, res, font_path=args.vis_font_path)

                with open(
                        os.path.join(save_folder, img_name,
                                     'res_{}_kie.txt'.format(index)),
                        'w',
                        encoding='utf8') as f:
                    res_str = '{}\t{}\n'.format(
                        image_file,
                        json.dumps(
                            {
                                "ocr_info": res
                            }, ensure_ascii=False))
                    f.write(res_str)
            if res != []:
                cv2.imwrite(img_save_path, draw_img)
                logger.info('result save to {}'.format(img_save_path))
......
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ast
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from tools.infer.utility import draw_ocr_box_txt, str2bool, init_args as infer_args
@@ -64,6 +64,7 @@ def init_args():
    parser.add_argument(
        "--mode",
        type=str,
        choices=['structure', 'kie'],
        default='structure',
        help='structure and kie are supported')
    parser.add_argument(
......
Global:
use_gpu: True
epoch_num: 6
log_smooth_window: 20
print_batch_step: 50
save_model_dir: ./output/rec/rec_resnet_rfl/
save_epoch_step: 1
  # evaluation is run every 5000 iterations, starting from iteration 0
eval_batch_step: [0, 5000]
cal_metric_during_train: False
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path:
max_text_length: 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/rec_resnet_rfl.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
weight_decay: 0.0
clip_norm_global: 5.0
lr:
name: Piecewise
decay_epochs : [3, 4, 5]
values : [0.001, 0.0003, 0.00009, 0.000027]
Architecture:
model_type: rec
algorithm: RFL
in_channels: 1
Transform:
name: TPS
num_fiducial: 20
loc_lr: 1.0
model_name: large
Backbone:
name: ResNetRFL
use_cnt: True
use_seq: True
Neck:
name: RFAdaptor
use_v2s: True
use_s2v: True
Head:
name: RFLHead
in_channels: 512
hidden_size: 256
batch_max_legnth: 25
out_channels: 38
use_cnt: True
use_seq: True
Loss:
name: RFLLoss
PostProcess:
name: RFLLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/ic15_data/
label_file_list: ["./train_data/ic15_data/rec_gt_train.txt"]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- RFLLabelEncode: # Class handling label
- RFLRecResizeImg:
image_shape: [1, 32, 100]
interpolation: 2
- KeepKeys:
keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 64
drop_last: True
num_workers: 8
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/ic15_data
label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- RFLLabelEncode: # Class handling label
- RFLRecResizeImg:
image_shape: [1, 32, 100]
interpolation: 2
- KeepKeys:
keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 8
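This config trains RFL on the small IC15 sample set used by TIPC; the training entry point matches the `norm_train` command in the params file below. The `-o` overrides shown are illustrative, and any `Global.*` or `Train.*` key from the YAML above can be overridden the same way:

    python3 tools/train.py \
        -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml \
        -o Global.epoch_num=2 Train.loader.batch_size_per_card=16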
===========================train_params===========================
model_name:rec_resnet_rfl
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=64
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
##
trainer:norm_train
norm_train:tools/train.py -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:tools/eval.py -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml -o
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
train_model:./inference/rec_resnet_rfl_train/best_accuracy
infer_export:tools/export_model.py -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml -o
infer_quant:False
inference:tools/infer/predict_rec.py --rec_image_shape="1,32,100" --rec_algorithm="RFL" --min_subgraph_size=5
--use_gpu:True|False
--enable_mkldnn:False
--cpu_threads:6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32
--rec_model_dir:
--image_dir:./inference/rec_inference
--save_log_path:./test/output/
--benchmark:True
null:null
===========================infer_benchmark_params==========================
random_infer_input:[{float32,[1,32,100]}]
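A hedged sketch of driving this params file through the TIPC scripts (the `train_infer_python.txt` filename is an assumption based on the usual TIPC layout; adjust to the actual filename in the repo). The `lite_train_lite_infer` mode corresponds to the `Global.epoch_num:lite_train_lite_infer=2` style keys above:

    bash test_tipc/prepare.sh test_tipc/configs/rec_resnet_rfl/train_infer_python.txt lite_train_lite_infer
    bash test_tipc/test_train_inference_python.sh test_tipc/configs/rec_resnet_rfl/train_infer_python.txt lite_train_lite_infer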
@@ -77,7 +77,7 @@ def export_single_model(model,
     elif arch_config["algorithm"] == "PREN":
         other_shape = [
             paddle.static.InputSpec(
-                shape=[None, 3, 64, 512], dtype="float32"),
+                shape=[None, 3, 64, 256], dtype="float32"),
         ]
         model = to_static(model, input_spec=other_shape)
     elif arch_config["model_type"] == "sr":
@@ -99,7 +99,7 @@ def export_single_model(model,
         ]
         # print([None, 3, 32, 128])
         model = to_static(model, input_spec=other_shape)
-    elif arch_config["algorithm"] in ["NRTR", "SPIN"]:
+    elif arch_config["algorithm"] in ["NRTR", "SPIN", 'RFL']:
         other_shape = [
             paddle.static.InputSpec(
                 shape=[None, 1, 32, 100], dtype="float32"),
......
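With RFL added to the NRTR/SPIN branch, export uses the same `[None, 1, 32, 100]` input spec. The corresponding command follows the `train_model`/`infer_export` entries in the TIPC params above (substitute your own checkpoint and output paths):

    python3 tools/export_model.py \
        -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml \
        -o Global.pretrained_model=./inference/rec_resnet_rfl_train/best_accuracy \
           Global.save_inference_dir=./inference/rec_resnet_rfl/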
@@ -100,6 +100,14 @@ class TextRecognizer(object):
                 "use_space_char": args.use_space_char,
                 "rm_symbol": True
             }
+        elif self.rec_algorithm == 'RFL':
+            postprocess_params = {
+                'name': 'RFLLabelDecode',
+                "character_dict_path": None,
+                "use_space_char": args.use_space_char
+            }
+        elif self.rec_algorithm == "PREN":
+            postprocess_params = {'name': 'PRENLabelDecode'}
         self.postprocess_op = build_post_process(postprocess_params)
         self.predictor, self.input_tensor, self.output_tensors, self.config = \
             utility.create_predictor(args, 'rec', logger)
@@ -143,6 +151,16 @@ class TextRecognizer(object):
             else:
                 norm_img = norm_img.astype(np.float32) / 128. - 1.
             return norm_img
+        elif self.rec_algorithm == 'RFL':
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+            resized_image = cv2.resize(
+                img, (imgW, imgH), interpolation=cv2.INTER_CUBIC)
+            resized_image = resized_image.astype('float32')
+            resized_image = resized_image / 255
+            resized_image = resized_image[np.newaxis, :]
+            resized_image -= 0.5
+            resized_image /= 0.5
+            return resized_image
         assert imgC == img.shape[2]
         imgW = int((imgH * max_wh_ratio))
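The RFL branch above converts the crop to grayscale, resizes it to 32x100, and maps pixel values to [-1, 1]. A self-contained check of that arithmetic on a synthetic input (not PaddleOCR code):

    import cv2
    import numpy as np

    img = np.random.randint(0, 256, (48, 160, 3), dtype=np.uint8)  # fake BGR crop
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (100, 32), interpolation=cv2.INTER_CUBIC)
    x = resized.astype('float32') / 255
    x = x[np.newaxis, :]          # add the channel axis -> (1, 32, 100)
    x = (x - 0.5) / 0.5           # same as the in-place -= / /= above
    assert x.shape == (1, 32, 100) and x.min() >= -1.0 and x.max() <= 1.0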
@@ -384,7 +402,7 @@ class TextRecognizer(object):
                     self.rec_image_shape)
                 norm_img = norm_img[np.newaxis, :]
                 norm_img_batch.append(norm_img)
-            elif self.rec_algorithm == "VisionLAN":
+            elif self.rec_algorithm in ["VisionLAN", "PREN"]:
                 norm_img = self.resize_norm_img_vl(img_list[indices[ino]],
                                                    self.rec_image_shape)
                 norm_img = norm_img[np.newaxis, :]
......
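Standalone inference with the exported RFL model then follows the `inference` line from the TIPC params (model and image paths are the TIPC defaults; point them at your own export):

    python3 tools/infer/predict_rec.py \
        --rec_image_shape="1,32,100" \
        --rec_algorithm="RFL" \
        --min_subgraph_size=5 \
        --rec_model_dir=./inference/rec_resnet_rfl/ \
        --image_dir=./inference/rec_inference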
@@ -97,7 +97,8 @@ def main():
             elif config['Architecture']['algorithm'] == "SAR":
                 op[op_name]['keep_keys'] = ['image', 'valid_ratio']
             elif config['Architecture']['algorithm'] == "RobustScanner":
-                op[op_name]['keep_keys'] = ['image', 'valid_ratio', 'word_positons']
+                op[op_name][
+                    'keep_keys'] = ['image', 'valid_ratio', 'word_positons']
             else:
                 op[op_name]['keep_keys'] = ['image']
             transforms.append(op)
@@ -136,9 +137,10 @@ def main():
         if config['Architecture']['algorithm'] == "RobustScanner":
             valid_ratio = np.expand_dims(batch[1], axis=0)
             word_positons = np.expand_dims(batch[2], axis=0)
-            img_metas = [paddle.to_tensor(valid_ratio),
-                         paddle.to_tensor(word_positons),
-                         ]
+            img_metas = [
+                paddle.to_tensor(valid_ratio),
+                paddle.to_tensor(word_positons),
+            ]
         images = np.expand_dims(batch[0], axis=0)
         images = paddle.to_tensor(images)
         if config['Architecture']['algorithm'] == "SRN":
@@ -160,6 +162,10 @@ def main():
                 "score": float(post_result[key][0][1]),
             }
             info = json.dumps(rec_info, ensure_ascii=False)
+        elif isinstance(post_result, list) and isinstance(post_result[0],
+                                                          int):
+            # for RFLearning CNT branch
+            info = str(post_result[0])
         else:
             if len(post_result[0]) >= 2:
                 info = post_result[0][0] + "\t" + str(post_result[0][1])
......
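The new `isinstance` branch covers RFL's counting (CNT) head, whose post-processed output is a plain character count rather than a (text, score) tuple. A sketch of the two shapes the code now distinguishes (values are illustrative only):

    # Illustrative values: the two RFL outputs infer_rec.py distinguishes.
    post_result = [('JOINT', 0.98)]      # attention (SEQ) branch
    if isinstance(post_result, list) and isinstance(post_result[0], int):
        info = str(post_result[0])       # CNT branch: character count only
    else:
        info = post_result[0][0] + "\t" + str(post_result[0][1])
    print(info)                          # -> JOINT  0.98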
@@ -114,7 +114,7 @@ def merge_config(config, opts):
     return config


-def check_device(use_gpu, use_xpu=False, use_npu=False):
+def check_device(use_gpu, use_xpu=False, use_npu=False, use_mlu=False):
     """
     Log error and exit when set use_gpu=true in paddlepaddle
     cpu version.
@@ -137,6 +137,9 @@ def check_device(use_gpu, use_xpu=False, use_npu=False):
         if use_npu and not paddle.device.is_compiled_with_npu():
             print(err.format("use_npu", "npu", "npu", "use_npu"))
             sys.exit(1)
+        if use_mlu and not paddle.device.is_compiled_with_mlu():
+            print(err.format("use_mlu", "mlu", "mlu", "use_mlu"))
+            sys.exit(1)
     except Exception as e:
         pass
@@ -217,7 +220,7 @@ def train(config,
     use_srn = config['Architecture']['algorithm'] == "SRN"
     extra_input_models = [
         "SRN", "NRTR", "SAR", "SEED", "SVTR", "SPIN", "VisionLAN",
-        "RobustScanner", 'DRRG'
+        "RobustScanner", "RFL", 'DRRG'
     ]
     extra_input = False
     if config['Architecture']['algorithm'] == 'Distillation':
@@ -618,6 +621,7 @@ def preprocess(is_train=False):
     use_gpu = config['Global'].get('use_gpu', False)
     use_xpu = config['Global'].get('use_xpu', False)
     use_npu = config['Global'].get('use_npu', False)
+    use_mlu = config['Global'].get('use_mlu', False)
     alg = config['Architecture']['algorithm']
     assert alg in [
@@ -625,17 +629,19 @@ def preprocess(is_train=False):
         'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
         'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'LayoutLMv2', 'PREN', 'FCE',
         'SVTR', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', 'VisionLAN',
-        'Gestalt', 'SLANet', 'RobustScanner', 'CT', 'DRRG'
+        'Gestalt', 'SLANet', 'RobustScanner', 'CT', 'RFL', 'DRRG'
     ]
     if use_xpu:
         device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
     elif use_npu:
         device = 'npu:{0}'.format(os.getenv('FLAGS_selected_npus', 0))
+    elif use_mlu:
+        device = 'mlu:{0}'.format(os.getenv('FLAGS_selected_mlus', 0))
     else:
         device = 'gpu:{}'.format(dist.ParallelEnv()
                                  .dev_id) if use_gpu else 'cpu'
-    check_device(use_gpu, use_xpu, use_npu)
+    check_device(use_gpu, use_xpu, use_npu, use_mlu)
     device = paddle.set_device(device)
......
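MLU selection mirrors the existing XPU/NPU handling: set `use_mlu` under `Global` in the config (or via `-o`), `preprocess` maps it to an `mlu:{id}` device, and `check_device` exits early if the installed paddle build lacks MLU support. The `FLAGS_selected_mlus` environment variable picks the card index, defaulting to 0, just like `FLAGS_selected_xpus`/`FLAGS_selected_npus`. Illustrative override:

    python3 tools/train.py -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml -o Global.use_mlu=True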
@@ -149,10 +149,11 @@ def main(config, device, logger, vdl_writer):
     amp_level = config["Global"].get("amp_level", 'O2')
     amp_custom_black_list = config['Global'].get('amp_custom_black_list', [])
     if use_amp:
-        AMP_RELATED_FLAGS_SETTING = {
-            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
-            'FLAGS_max_inplace_grad_add': 8,
-        }
+        AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, }
+        if paddle.is_compiled_with_cuda():
+            AMP_RELATED_FLAGS_SETTING.update({
+                'FLAGS_cudnn_batchnorm_spatial_persistent': 1
+            })
         paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
         scale_loss = config["Global"].get("scale_loss", 1.0)
         use_dynamic_loss_scaling = config["Global"].get(
......
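Guarding the cuDNN flag keeps AMP usable on non-CUDA builds (for example, the MLU/NPU paths above), where `FLAGS_cudnn_batchnorm_spatial_persistent` would be meaningless. The same logic with the non-deprecated API, as a sketch:

    import paddle

    flags = {'FLAGS_max_inplace_grad_add': 8}
    if paddle.is_compiled_with_cuda():
        # cuDNN-only flag; skip it on CPU/XPU/NPU/MLU builds
        flags['FLAGS_cudnn_batchnorm_spatial_persistent'] = 1
    paddle.set_flags(flags)  # newer alias for paddle.fluid.set_flags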