提交 e1430c3b 编写于 作者: L LDOUBLEV

fix hubserving

......@@ -147,6 +147,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.itemsToShapesbox = {}
self.shapesToItemsbox = {}
self.prevLabelText = getStr('tempLabel')
self.noLabelText = getStr('nullLabel')
self.model = 'paddle'
self.PPreader = None
self.autoSaveNum = 5
......@@ -1020,7 +1021,7 @@ class MainWindow(QMainWindow, WindowMixin):
item.setText(str([(int(p.x()), int(p.y())) for p in shape.points]))
self.updateComboBox()
def updateComboBox(self): # TODO:貌似没用
def updateComboBox(self):
# Get the unique labels and add them to the Combobox.
itemsTextList = [str(self.labelList.item(i).text()) for i in range(self.labelList.count())]
......@@ -1040,7 +1041,7 @@ class MainWindow(QMainWindow, WindowMixin):
return dict(label=s.label, # str
line_color=s.line_color.getRgb(),
fill_color=s.fill_color.getRgb(),
points=[(p.x(), p.y()) for p in s.points], # QPonitF
points=[(int(p.x()), int(p.y())) for p in s.points], # QPonitF
# add chris
difficult=s.difficult) # bool
......@@ -1069,7 +1070,7 @@ class MainWindow(QMainWindow, WindowMixin):
# print('Image:{0} -> Annotation:{1}'.format(self.filePath, annotationFilePath))
return True
except:
self.errorMessage(u'Error saving label data')
self.errorMessage(u'Error saving label data', u'Error saving label data')
return False
def copySelectedShape(self):
......@@ -1802,7 +1803,11 @@ class MainWindow(QMainWindow, WindowMixin):
result.insert(0, box)
print('result in reRec is ', result)
self.result_dic.append(result)
if result[1][0] == shape.label:
else:
print('Can not recognise the box')
self.result_dic.append([box,(self.noLabelText,0)])
if self.noLabelText == shape.label or result[1][0] == shape.label:
print('label no change')
else:
rec_flag += 1
......@@ -1836,9 +1841,14 @@ class MainWindow(QMainWindow, WindowMixin):
print('label no change')
else:
shape.label = result[1][0]
else:
print('Can not recognise the box')
if self.noLabelText == shape.label:
print('label no change')
else:
shape.label = self.noLabelText
self.singleLabel(shape)
self.setDirty()
print(box)
def autolcm(self):
vbox = QVBoxLayout()
......
......@@ -45,7 +45,7 @@ class Canvas(QWidget):
CREATE, EDIT = list(range(2))
_fill_drawing = False # draw shadows
epsilon = 11.0
epsilon = 5.0
def __init__(self, *args, **kwargs):
super(Canvas, self).__init__(*args, **kwargs)
......
此差异已折叠。
......@@ -87,6 +87,7 @@ creatPolygon=四点标注
drawSquares=正方形标注
saveRec=保存识别结果
tempLabel=待识别
nullLabel=无法识别
steps=操作步骤
choseModelLg=选择模型语言
cancel=取消
......
......@@ -77,7 +77,7 @@ IR=Image Resize
autoRecognition=Auto Recognition
reRecognition=Re-recognition
mfile=File
medit=Eidt
medit=Edit
mview=View
mhelp=Help
iconList=Icon List
......@@ -87,6 +87,7 @@ creatPolygon=Create Quadrilateral
drawSquares=Draw Squares
saveRec=Save Recognition Result
tempLabel=TEMPORARY
nullLabel=NULL
steps=Steps
choseModelLg=Choose Model Language
cancel=Cancel
......
......@@ -9,7 +9,6 @@ PaddleOCR supports both dynamic graph and static graph programming paradigm
- Static graph: develop branch
**Recent updates**
- **Notice: PaddleOCR R & D team will conduct in-depth technical interpretation of the latest version at 19:00 P.M. on April 13. ([live address](https://live.bilibili.com/21689802))**
- 2021.4.8 Release PaddleOCRv2.1(branch release/2.1). Newly released AAAI 2021 end-to-end algorithm [PGNet](./doc/doc_en/pgnet_en.md) and [Multi language recognition model](./doc/doc_en/multi_languages_en.md), support more than 80 languages recognition.
- 2021.2.8 Release PaddleOCRv2.0 (branch release/2.0). Refer to [release note](https://github.com/PaddlePaddle/PaddleOCR/releases/tag/v2.0.0) for more details.
- 2021.1.21 update more than 25+ multilingual recognition models [models list](./doc/doc_en/models_list_en.md), including:English, Chinese, German, French, Japanese,Spanish,Portuguese Russia Arabic and so on. Models for more languages will continue to be updated [Develop Plan](https://github.com/PaddlePaddle/PaddleOCR/issues/1048).
......@@ -22,7 +21,7 @@ PaddleOCR supports both dynamic graph and static graph programming paradigm
- Ultra lightweight ppocr_mobile series models: detection (3.0M) + direction classifier (1.4M) + recognition (5.0M) = 9.4M
- General ppocr_server series models: detection (47.1M) + direction classifier (1.4M) + recognition (94.9M) = 143.4M
- Support Chinese, English, and digit recognition, vertical text recognition, and long text recognition
- Support more than 80 kinds of multi-language recognition models: [For details](./doc/doc_ch/multi_languages.md)
- Support more than 80 kinds of multi-language recognition models: [For details](./doc/doc_en/multi_languages_en.md)
- Rich toolkits related to the OCR areas
- Semi-automatic data annotation tool, i.e., PPOCRLabel: support fast and efficient data annotation
- Data synthesis tool, i.e., Style-Text: easy to synthesize a large number of images which are similar to the target scene image
......@@ -98,7 +97,7 @@ For a new language request, please refer to [Guideline for new language_requests
- [Quick Inference Based on PIP](./doc/doc_en/whl_en.md)
- [Python Inference](./doc/doc_en/inference_en.md)
- [C++ Inference](./deploy/cpp_infer/readme_en.md)
- [Serving](./deploy/pdserving/README.md)
- [Serving](./deploy/hubserving/readme_en.md)
- [Mobile](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/lite/readme_en.md)
- [Benchmark](./doc/doc_en/benchmark_en.md)
- Data Annotation and Synthesis
......
......@@ -8,8 +8,8 @@ PaddleOCR同时支持动态图与静态图两种编程范式
- 静态图版本:develop分支
**近期更新**
- **【预告】 PaddleOCR研发团队对最新发版内容技术深入解读,4月13日晚上19:00,[直播地址](https://live.bilibili.com/21689802)**
- 2021.4.12 [FAQ](./doc/doc_ch/FAQ.md)新增5个高频问题,总数203个,每周一都会更新,欢迎大家持续关注
- 2021.4.20 [FAQ](./doc/doc_ch/FAQ.md)新增5个高频问题,总数208个,每周一都会更新,欢迎大家持续关注。
- PaddleOCR研发团队对最新发版内容技术深入解读,4月13日晚上19:00,[直播地址](https://live.bilibili.com/21689802)
- 2021.4.8 release 2.1版本,新增AAAI 2021论文[端到端识别算法PGNet](./doc/doc_ch/pgnet.md)开源,[多语言模型](./doc/doc_ch/multi_languages.md)支持种类增加到80+。
- 2021.2.8 正式发布PaddleOCRv2.0(branch release/2.0)并设置为推荐用户使用的默认分支. 发布的详细内容,请参考: https://github.com/PaddlePaddle/PaddleOCR/releases/tag/v2.0.0
- 2021.1.26,28,29 PaddleOCR官方研发团队带来技术深入解读三日直播课,1月26日、28日、29日晚上19:30,[直播地址](https://live.bilibili.com/21689802)
......@@ -89,7 +89,7 @@ PaddleOCR同时支持动态图与静态图两种编程范式
- [基于pip安装whl包快速推理](./doc/doc_ch/whl.md)
- [基于Python脚本预测引擎推理](./doc/doc_ch/inference.md)
- [基于C++预测引擎推理](./deploy/cpp_infer/readme.md)
- [服务化部署](./deploy/pdserving/README_CN.md)
- [服务化部署](./deploy/hubserving/readme.md)
- [端侧部署](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/lite/readme.md)
- [Benchmark](./doc/doc_ch/benchmark.md)
- 数据集
......
......@@ -60,22 +60,25 @@ PostProcess:
name: PGPostProcess
score_thresh: 0.5
mode: fast # fast or slow two ways
Metric:
name: E2EMetric
gt_mat_dir: # the dir of gt_mat
mode: A # two ways for eval, A: label from txt, B: label from gt_mat
gt_mat_dir: ./train_data/total_text/gt # the dir of gt_mat
character_dict_path: ppocr/utils/ic15_dict.txt
main_indicator: f_score_e2e
Train:
dataset:
name: PGDataSet
label_file_list: [.././train_data/total_text/train/]
data_dir: ./train_data/total_text/train
label_file_list: [./train_data/total_text/train/train.txt]
ratio_list: [1.0]
data_format: icdar #two data format: icdar/textnet
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- E2ELabelEncodeTrain:
- PGProcessTrain:
batch_size: 14 # same as loader: batch_size_per_card
min_crop_size: 24
......@@ -92,13 +95,13 @@ Train:
Eval:
dataset:
name: PGDataSet
data_dir: ./train_data/
label_file_list: [./train_data/total_text/test/]
data_dir: ./train_data/total_text/test
label_file_list: [./train_data/total_text/test/test.txt]
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- E2ELabelEncode:
- E2ELabelEncodeTest:
- E2EResizeForTest:
max_side_len: 768
- NormalizeImage:
......@@ -108,7 +111,7 @@ Eval:
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: [ 'image', 'shape', 'polys', 'strs', 'tags', 'img_id']
keep_keys: [ 'image', 'shape', 'polys', 'texts', 'ignore_tags', 'img_id']
loader:
shuffle: False
drop_last: False
......
......@@ -37,7 +37,7 @@ Architecture:
name: TPS
num_fiducial: 20
loc_lr: 0.1
model_name: small
model_name: large
Backbone:
name: ResNet
layers: 34
......
......@@ -40,6 +40,7 @@ endif()
if (WIN32)
include_directories("${PADDLE_LIB}/paddle/fluid/inference")
include_directories("${PADDLE_LIB}/paddle/include")
link_directories("${PADDLE_LIB}/paddle/lib")
link_directories("${PADDLE_LIB}/paddle/fluid/inference")
find_package(OpenCV REQUIRED PATHS ${OPENCV_DIR}/build/ NO_DEFAULT_PATH)
......@@ -140,22 +141,22 @@ else()
endif ()
endif()
# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
# Note: libpaddle_inference_api.so/a must put before libpaddle_inference.so/a
if(WITH_STATIC_LIB)
if(WIN32)
set(DEPS
${PADDLE_LIB}/paddle/lib/paddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
else()
set(DEPS
${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
endif()
else()
if(WIN32)
set(DEPS
${PADDLE_LIB}/paddle/lib/paddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
else()
set(DEPS
${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
endif(WITH_STATIC_LIB)
......
......@@ -51,6 +51,7 @@ public:
float &ssid);
float BoxScoreFast(std::vector<std::vector<float>> box_array, cv::Mat pred);
float PolygonScoreAcc(std::vector<cv::Point> contour, cv::Mat pred);
std::vector<std::vector<std::vector<int>>>
BoxesFromBitmap(const cv::Mat pred, const cv::Mat bitmap,
......
......@@ -76,7 +76,7 @@ opencv3/
#### 1.2.1 直接下载安装
* [Paddle预测库官网](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/05_inference_deployment/inference/build_and_install_lib_cn.html)上提供了不同cuda版本的Linux预测库,可以在官网查看并选择合适的预测库版本
* [Paddle预测库官网](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html)上提供了不同cuda版本的Linux预测库,可以在官网查看并选择合适的预测库版本(*建议选择paddle版本>=2.0.1版本的预测库*
* 下载之后使用下面的方法解压。
......@@ -131,7 +131,6 @@ build/paddle_inference_install_dir/
## 2 开始运行
### 2.1 将模型导出为inference model
......
......@@ -78,6 +78,7 @@ opencv3/
#### 1.2.1 Direct download and installation
* Different cuda versions of the Linux inference library (based on GCC 4.8.2) are provided on the
[Paddle inference library official website](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/guides/05_inference_deployment/inference/build_and_install_lib_en.html). You can view and select the appropriate version of the inference library on the official website.
......@@ -90,9 +91,10 @@ tar -xf paddle_inference.tgz
Finally you can see the following files in the folder of `paddle_inference/`.
#### 1.2.2 Compile from the source code
* If you want to get the latest Paddle inference library features, you can download the latest code from Paddle github repository and compile the inference library from the source code.
* You can refer to [Paddle inference library] (https://www.paddlepaddle.org.cn/documentation/docs/en/develop/guides/05_inference_deployment/inference/build_and_install_lib_en.html) to get the Paddle source code from github, and then compile To generate the latest inference library. The method of using git to access the code is as follows.
* If you want to get the latest Paddle inference library features, you can download the latest code from Paddle github repository and compile the inference library from the source code. It is recommended to download the inference library with paddle version greater than or equal to 2.0.1.
* You can refer to [Paddle inference library] (https://www.paddlepaddle.org.cn/documentation/docs/en/advanced_guide/inference_deployment/inference/build_and_install_lib_en.html) to get the Paddle source code from github, and then compile To generate the latest inference library. The method of using git to access the code is as follows.
```shell
......
......@@ -159,6 +159,39 @@ std::vector<std::vector<float>> PostProcessor::GetMiniBoxes(cv::RotatedRect box,
return array;
}
float PostProcessor::PolygonScoreAcc(std::vector<cv::Point> contour,
cv::Mat pred){
int width = pred.cols;
int height = pred.rows;
std::vector<float> box_x;
std::vector<float> box_y;
for(int i=0; i<contour.size(); ++i){
box_x.push_back(contour[i].x);
box_y.push_back(contour[i].y);
}
int xmin = clamp(int(std::floor(*(std::min_element(box_x.begin(), box_x.end())))), 0, width - 1);
int xmax = clamp(int(std::ceil(*(std::max_element(box_x.begin(), box_x.end())))), 0, width - 1);
int ymin = clamp(int(std::floor(*(std::min_element(box_y.begin(), box_y.end())))), 0, height - 1);
int ymax = clamp(int(std::ceil(*(std::max_element(box_y.begin(), box_y.end())))), 0, height - 1);
cv::Mat mask;
mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
cv::Point rook_point[contour.size()];
for(int i=0; i<contour.size(); ++i){
rook_point[i] = cv::Point(int(box_x[i]) - xmin, int(box_y[i]) - ymin);
}
const cv::Point *ppt[1] = {rook_point};
int npt[] = {int(contour.size())};
cv::fillPoly(mask, ppt, npt, 1, cv::Scalar(1));
cv::Mat croppedImg;
pred(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1)).copyTo(croppedImg);
float score = cv::mean(croppedImg, mask)[0];
return score;
}
float PostProcessor::BoxScoreFast(std::vector<std::vector<float>> box_array,
cv::Mat pred) {
auto array = box_array;
......@@ -235,6 +268,8 @@ PostProcessor::BoxesFromBitmap(const cv::Mat pred, const cv::Mat bitmap,
float score;
score = BoxScoreFast(array, pred);
/* compute using polygon*/
// score = PolygonScoreAcc(contours[_i], pred);
if (score < box_thresh)
continue;
......
......@@ -6,6 +6,7 @@ from __future__ import print_function
import os
import sys
sys.path.insert(0, ".")
import copy
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, runnable, serving
......@@ -14,6 +15,7 @@ import paddlehub as hub
from tools.infer.utility import base64_to_cv2
from tools.infer.predict_cls import TextClassifier
from tools.infer.utility import parse_args
@moduleinfo(
......@@ -28,8 +30,7 @@ class OCRCls(hub.Module):
"""
initialize with the necessary elements
"""
from ocr_cls.params import read_params
cfg = read_params()
cfg = self.merge_configs()
cfg.use_gpu = use_gpu
if use_gpu:
......@@ -48,6 +49,21 @@ class OCRCls(hub.Module):
self.text_classifier = TextClassifier(cfg)
def merge_configs(self, ):
# deafult cfg
backup_argv = copy.deepcopy(sys.argv)
sys.argv = sys.argv[:1]
cfg = parse_args()
from ocr_det.params import read_params
update_cfg_map = vars(read_params())
for key in update_cfg_map:
cfg.__setattr__(key, update_cfg_map[key])
sys.argv = copy.deepcopy(backup_argv)
return cfg
def read_images(self, paths=[]):
images = []
for img_path in paths:
......
......@@ -7,6 +7,8 @@ import os
import sys
sys.path.insert(0, ".")
import copy
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, runnable, serving
import cv2
......@@ -15,7 +17,9 @@ import paddlehub as hub
from tools.infer.utility import base64_to_cv2
from tools.infer.predict_det import TextDetector
from deploy.hubserving.ocr_det.params import read_params
from tools.infer.utility import parse_args
@moduleinfo(
......@@ -30,7 +34,8 @@ class OCRDet(hub.Module):
"""
initialize with the necessary elements
"""
cfg = read_params()
cfg = self.merge_configs()
cfg.use_gpu = use_gpu
if use_gpu:
......@@ -49,6 +54,20 @@ class OCRDet(hub.Module):
self.text_detector = TextDetector(cfg)
def merge_configs(self, ):
# deafult cfg
backup_argv = copy.deepcopy(sys.argv)
sys.argv = sys.argv[:1]
cfg = parse_args()
update_cfg_map = vars(read_params())
for key in update_cfg_map:
cfg.__setattr__(key, update_cfg_map[key])
sys.argv = copy.deepcopy(backup_argv)
return cfg
def read_images(self, paths=[]):
images = []
for img_path in paths:
......
......@@ -22,6 +22,7 @@ def read_params():
cfg.det_db_box_thresh = 0.5
cfg.det_db_unclip_ratio = 1.6
cfg.use_dilation = False
cfg.det_db_score_mode = "fast"
# #EAST parmas
# cfg.det_east_score_thresh = 0.8
......
......@@ -6,6 +6,7 @@ from __future__ import print_function
import os
import sys
sys.path.insert(0, ".")
import copy
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, runnable, serving
......@@ -14,7 +15,9 @@ import paddlehub as hub
from tools.infer.utility import base64_to_cv2
from tools.infer.predict_rec import TextRecognizer
from deploy.hubserving.ocr_rec.params import read_params
from tools.infer.utility import parse_args
@moduleinfo(
......@@ -29,7 +32,8 @@ class OCRRec(hub.Module):
"""
initialize with the necessary elements
"""
cfg = read_params()
cfg = self.merge_configs()
cfg.use_gpu = use_gpu
if use_gpu:
......@@ -48,6 +52,20 @@ class OCRRec(hub.Module):
self.text_recognizer = TextRecognizer(cfg)
def merge_configs(self, ):
# deafult cfg
backup_argv = copy.deepcopy(sys.argv)
sys.argv = sys.argv[:1]
cfg = parse_args()
update_cfg_map = vars(read_params())
for key in update_cfg_map:
cfg.__setattr__(key, update_cfg_map[key])
sys.argv = copy.deepcopy(backup_argv)
return cfg
def read_images(self, paths=[]):
images = []
for img_path in paths:
......
......@@ -6,6 +6,7 @@ from __future__ import print_function
import os
import sys
sys.path.insert(0, ".")
import copy
import time
......@@ -18,6 +19,7 @@ import paddlehub as hub
from tools.infer.utility import base64_to_cv2
from tools.infer.predict_system import TextSystem
from deploy.hubserving.ocr_system.params import read_params
from tools.infer.utility import parse_args
@moduleinfo(
......@@ -32,7 +34,7 @@ class OCRSystem(hub.Module):
"""
initialize with the necessary elements
"""
cfg = read_params()
cfg = self.merge_configs()
cfg.use_gpu = use_gpu
if use_gpu:
......@@ -51,6 +53,20 @@ class OCRSystem(hub.Module):
self.text_sys = TextSystem(cfg)
def merge_configs(self, ):
# deafult cfg
backup_argv = copy.deepcopy(sys.argv)
sys.argv = sys.argv[:1]
cfg = parse_args()
update_cfg_map = vars(read_params())
for key in update_cfg_map:
cfg.__setattr__(key, update_cfg_map[key])
sys.argv = copy.deepcopy(backup_argv)
return cfg
def read_images(self, paths=[]):
images = []
for img_path in paths:
......
......@@ -22,6 +22,7 @@ def read_params():
cfg.det_db_box_thresh = 0.5
cfg.det_db_unclip_ratio = 1.6
cfg.use_dilation = False
cfg.det_db_score_mode = "fast"
#EAST parmas
cfg.det_east_score_thresh = 0.8
......
......@@ -2,7 +2,7 @@ English | [简体中文](readme.md)
PaddleOCR provides 2 service deployment methods:
- Based on **PaddleHub Serving**: Code path is "`./deploy/hubserving`". Please follow this tutorial.
- Based on **PaddleServing**: Code path is "`./deploy/pdserving`". Please refer to the [tutorial](../../deploy/pdserving/README.md) for usage.
- Based on **PaddleServing**: Code path is "`./deploy/pdserving`". Please refer to the [tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/pdserving/README.md) for usage.
# Service deployment based on PaddleHub Serving
......
......@@ -9,34 +9,41 @@
## PaddleOCR常见问题汇总(持续更新)
* [近期更新(2021.4.12](#近期更新)
* [近期更新(2021.4.20](#近期更新)
* [【精选】OCR精选10个问题](#OCR精选10个问题)
* [【理论篇】OCR通用43个问题](#OCR通用问题)
* [基础知识13题](#基础知识)
* [数据集9题](#数据集2)
* [模型训练调优21题](#模型训练调优2)
* [【实战篇】PaddleOCR实战150个问题](#PaddleOCR实战问题)
* [使用咨询57](#使用咨询)
* [使用咨询61](#使用咨询)
* [数据集18题](#数据集3)
* [模型训练调优34题](#模型训练调优3)
* [预测部署41](#预测部署3)
* [预测部署42](#预测部署3)
<a name="近期更新"></a>
## 近期更新(2021.4.12)
#### Q2.2.9: 端到端算法PGNet使用的是什么类型的数据集呢?
**A**: PGNet目前可以使用四点标注数据集,也可以使用多点标注数据集(十四点),多点标注训练的效果要比四点的好,一种可以尝试的策略是先在四点数据集上训练,之后用多点数据集在此基础上继续训练。
## 近期更新(2021.4.20)
#### Q2.3.21: 端到端算法PGNet是否支持中文识别,速度会很慢嘛?
**A**:目前开源的PGNet算法模型主要是用于检测英文数字,对于中文的识别需要自己训练,大家可以使用开源的端到端中文数据集,而对于复杂文本(弯曲文本)的识别,也可以自己构造一批数据集针对进行训练,对于推理速度,可以先将模型转换为inference再进行预测,速度应该会相当可观。
#### Q3.1.58: 使用PGNet进行eval报错?
**A**: 需要注意,我们目前在release/2.1更新了评测代码,目前支持A,B两种评测模式:
* A模式:该模式主要为了方便用户使用,与训练集一样的标注文件就可以正常进行eval操作, 代码中默认是A模式。
* B模式:该模式主要为了保证我们的评测代码可以和Total Text官方的评测方式对齐,该模式下直接加载官方提供的mat文件进行eval。
#### Q3.1.57: 端到端算法PGNet提供了两种后处理方式,两者之间有什么区别呢
**A**: 两种后处理的区别主要在于速度的推理,config中PostProcess有fast/slow两种模式,slow模式的后处理速度慢,精度相对较高,fast模式的后处理速度快,精度也在可接受的范围之内。建议使用速度快的后处理方式。
#### Q3.1.59: 使用预训练模型进行预测,对于特定字符识别识别效果较差,怎么解决
**A**: 由于我们所提供的识别模型是基于通用大规模数据集进行训练的,部分字符可能在训练集中包含较少,因此您可以构建特定场景的数据集,基于我们提供的预训练模型进行微调。建议用于微调的数据集中,每个字符出现的样本数量不低于300,但同时需要注意不同字符的数量均衡。具体可以参考:[微调](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/recognition.md#2-%E5%90%AF%E5%8A%A8%E8%AE%AD%E7%BB%83)
#### Q3.3.34: 表格识别中,如何提高单字的识别结果?
**A**: 首先需要确认一下检测模型有没有有效的检测出单个字符,如果没有的话,需要在训练集当中添加相应的单字数据集。
#### Q3.1.60: PGNet有中文预训练模型吗?
**A**: 目前我们尚未提供针对中文的预训练模型,如有需要,可以尝试自己训练。具体需要修改的地方有:
1. [config文件中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/configs/e2e/e2e_r50_vd_pg.yml#L23-L24),字典文件路径及语种设置;
1. [网络结构中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/modeling/heads/e2e_pg_head.py#L181)`out_channels`修改为字典中的字符数目+1(考虑到空格);
1. [loss中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/losses/e2e_pg_loss.py#L93),修改`37`为字典中的字符数目+1(考虑到空格);
#### Q3.4.41: PaddleOCR持tensorrt推理吗?
**A**: 支持的,需要在编译的时候将CMakeLists.txt文件当中,将相关代码`option(WITH_TENSORRT "Compile demo with TensorRT." OFF)`的OFF改成ON。关于服务器端部署的更多设置,可以参考[飞桨官网](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/native_infer.html)
#### Q3.1.61: 用于PGNet的训练集,文本框的标注有要求吗?
**A**: PGNet支持多点标注,比如4点、8点、14点等。但需要注意的是,标注点尽可能分布均匀(相邻标注点间隔距离均匀一致),且label文件中的标注点需要从标注框的左上角开始,按标注点顺时针顺序依次编写,以上问题都可能对训练精度造成影响。
我们提供的,基于Total Text数据集的PGNet预训练模型使用了14点标注方式。
#### Q3.4.42: 在使用PaddleLite进行预测部署时,启动预测后卡死/手机死机?
**A**: 请检查模型转换时所用PaddleLite的版本,和预测库的版本是否对齐。即PaddleLite版本为2.8,则预测库版本也要为2.8。
<a name="OCR精选10个问题"></a>
## 【精选】OCR精选10个问题
......@@ -120,7 +127,7 @@
#### Q1.1.10:PaddleOCR中,对于模型预测加速,CPU加速的途径有哪些?基于TenorRT加速GPU对输入有什么要求?
**A**:(1)CPU可以使用mkldnn进行加速;对于python inference的话,可以把enable_mkldnn改为true,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/tools/infer/utility.py#L84),对于cpp inference的话,在配置文件里面配置use_mkldnn 1即可,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/cpp_infer/tools/config.txt#L6)
**A**:(1)CPU可以使用mkldnn进行加速;对于python inference的话,可以把enable_mkldnn改为true,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/tools/infer/utility.py#L99),对于cpp inference的话,在配置文件里面配置use_mkldnn 1即可,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/cpp_infer/tools/config.txt#L6)
(2)GPU需要注意变长输入问题等,TRT6 之后才支持变长输入
......@@ -613,6 +620,24 @@ repo中config.yml文件的前后处理参数和inference预测默认的超参数
#### Q3.1.57: 端到端算法PGNet提供了两种后处理方式,两者之间有什么区别呢?
**A**: 两种后处理的区别主要在于速度的推理,config中PostProcess有fast/slow两种模式,slow模式的后处理速度慢,精度相对较高,fast模式的后处理速度快,精度也在可接受的范围之内。建议使用速度快的后处理方式。
#### Q3.1.58: 使用PGNet进行eval报错?
**A**: 需要注意,我们目前在release/2.1更新了评测代码,目前支持A,B两种评测模式:
* A模式:该模式主要为了方便用户使用,与训练集一样的标注文件就可以正常进行eval操作, 代码中默认是A模式。
* B模式:该模式主要为了保证我们的评测代码可以和Total Text官方的评测方式对齐,该模式下直接加载官方提供的mat文件进行eval。
#### Q3.1.59: 使用预训练模型进行预测,对于特定字符识别识别效果较差,怎么解决?
**A**: 由于我们所提供的识别模型是基于通用大规模数据集进行训练的,部分字符可能在训练集中包含较少,因此您可以构建特定场景的数据集,基于我们提供的预训练模型进行微调。建议用于微调的数据集中,每个字符出现的样本数量不低于300,但同时需要注意不同字符的数量均衡。具体可以参考:[微调](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/recognition.md#2-%E5%90%AF%E5%8A%A8%E8%AE%AD%E7%BB%83)
#### Q3.1.60: PGNet有中文预训练模型吗?
**A**: 目前我们尚未提供针对中文的预训练模型,如有需要,可以尝试自己训练。具体需要修改的地方有:
1. [config文件中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/configs/e2e/e2e_r50_vd_pg.yml#L23-L24),字典文件路径及语种设置;
1. [网络结构中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/modeling/heads/e2e_pg_head.py#L181)`out_channels`修改为字典中的字符数目+1(考虑到空格);
1. [loss中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/losses/e2e_pg_loss.py#L93),修改`37`为字典中的字符数目+1(考虑到空格);
#### Q3.1.61: 用于PGNet的训练集,文本框的标注有要求吗?
**A**: PGNet支持多点标注,比如4点、8点、14点等。但需要注意的是,标注点尽可能分布均匀(相邻标注点间隔距离均匀一致),且label文件中的标注点需要从标注框的左上角开始,按标注点顺时针顺序依次编写,以上问题都可能对训练精度造成影响。
我们提供的,基于Total Text数据集的PGNet预训练模型使用了14点标注方式。
<a name="数据集3"></a>
......@@ -1086,3 +1111,6 @@ nvidia-smi --lock-gpu-clocks=1590 -i 0
#### Q3.4.41: PaddleOCR支持tensorrt推理吗?
**A**: 支持的,需要在编译的时候将CMakeLists.txt文件当中,将相关代码`option(WITH_TENSORRT "Compile demo with TensorRT." OFF)`的OFF改成ON。关于服务器端部署的更多设置,可以参考[飞桨官网](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/native_infer.html)
#### Q3.4.42: 在使用PaddleLite进行预测部署时,启动预测后卡死/手机死机?
**A**: 请检查模型转换时所用PaddleLite的版本,和预测库的版本是否对齐。即PaddleLite版本为2.8,则预测库版本也要为2.8。
......@@ -362,17 +362,18 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982]
<a name="超轻量中文OCR模型推理"></a>
### 1. 超轻量中文OCR模型推理
在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir``rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。可视化识别结果默认保存到 ./inference_results 文件夹里面。
在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir``rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。
```
```shell
# 使用方向分类器
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --cls_model_dir="./inference/cls/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=true
# 不使用方向分类器
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=false
```
# 使用多进程
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=false --use_mp=True --total_process_num=6
```
......
......@@ -11,7 +11,7 @@ PaddleOCR 旨在打造一套丰富、领先、且实用的OCR工具库,不仅
其中英文模型支持,大小写字母和常见标点的检测识别,并优化了空格字符的识别:
<div align="center">
<img src="../imgs_results/multi_lang/en_1.jpg" width="400" height="600">
<img src="../imgs_results/multi_lang/img_12.jpg" width="900" height="300">
</div>
小语种模型覆盖了拉丁语系、阿拉伯语系、中文繁体、韩语、日语等等:
......@@ -19,6 +19,8 @@ PaddleOCR 旨在打造一套丰富、领先、且实用的OCR工具库,不仅
<div align="center">
<img src="../imgs_results/multi_lang/japan_2.jpg" width="600" height="300">
<img src="../imgs_results/multi_lang/french_0.jpg" width="300" height="300">
<img src="../imgs_results/multi_lang/korean_0.jpg" width="500" height="300">
<img src="../imgs_results/multi_lang/arabic_0.jpg" width="300" height="300">
</div>
......@@ -30,14 +32,9 @@ PaddleOCR 旨在打造一套丰富、领先、且实用的OCR工具库,不仅
- [2 快速使用](#快速使用)
- [2.1 命令行运行](#命令行运行)
- [2.1.1 整图预测](#bash_检测+识别)
- [2.1.2 识别预测](#bash_识别)
- [2.1.3 检测预测](#bash_检测)
- [2.2 python 脚本运行](#python_脚本运行)
- [2.2.1 整图预测](#python_检测+识别)
- [2.2.2 识别预测](#python_识别)
- [2.2.3 检测预测](#python_检测)
- [3 自定义训练](#自定义训练)
- [4 预测部署](#预测部署)
- [4 支持语种及缩写](#语种缩写)
<a name="安装"></a>
......@@ -108,8 +105,6 @@ paddleocr --image_dir doc/imgs/japan_2.jpg --lang=japan
paddleocr --image_dir doc/imgs_words/japan/1.jpg --det false --lang=japan
```
![](https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.0/doc/imgs_words/japan/1.jpg)
结果是一个tuple,返回识别结果和识别置信度
```text
......@@ -145,6 +140,9 @@ from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR(lang="korean") # 首次执行会自动下载模型文件
img_path = 'doc/imgs/korean_1.jpg '
result = ocr.ocr(img_path)
# 可通过参数控制单独执行识别、检测
# result = ocr.ocr(img_path, det=False) 只执行识别
# result = ocr.ocr(img_path, rec=False) 只执行检测
# 打印检测框和识别结果
for line in result:
print(line)
......@@ -166,59 +164,7 @@ im_show.save('result.jpg')
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.1/doc/imgs_results/korean.jpg" width="800">
</div>
* 识别预测
```
from paddleocr import PaddleOCR
ocr = PaddleOCR(lang="german")
img_path = 'PaddleOCR/doc/imgs_words/german/1.jpg'
result = ocr.ocr(img_path, det=False, cls=True)
for line in result:
print(line)
```
![](../imgs_words/german/1.jpg)
结果是一个tuple,只包含识别结果和识别置信度
```
('leider auch jetzt', 0.97538936)
```
* 检测预测
```python
from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR() # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
result = ocr.ocr(img_path, rec=False)
for line in result:
print(line)
# 显示结果
from PIL import Image
image = Image.open(img_path).convert('RGB')
im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf')
im_show = Image.fromarray(im_show)
im_show.save('result.jpg')
```
结果是一个list,每个item只包含文本框
```bash
[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]]
[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]]
[[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]]
......
```
结果可视化 :
<div align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.1/doc/imgs_results/whl/12_det.jpg" width="800">
</div>
ppocr 还支持方向分类, 更多使用方式请参考:[whl包使用说明](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.0/doc/doc_ch/whl.md)
ppocr 还支持方向分类, 更多使用方式请参考:[whl包使用说明](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.0/doc/doc_ch/whl.md)
<a name="自定义训练"></a>
## 3 自定义训练
......@@ -229,84 +175,58 @@ ppocr 支持使用自己的数据进行自定义训练或finetune, 其中识别
具体数据准备、训练过程可参考:[文本检测](../doc_ch/detection.md)[文本识别](../doc_ch/recognition.md),更多功能如预测部署、
数据标注等功能可以阅读完整的[文档教程](../../README_ch.md)
<a name="预测部署"></a>
## 4 预测部署
除了安装whl包进行快速预测,ppocr 也提供了多种预测部署方式,如有需求可阅读相关文档:
- [基于Python脚本预测引擎推理](./doc/doc_ch/inference.md)
- [基于C++预测引擎推理](./deploy/cpp_infer/readme.md)
- [服务化部署](./deploy/hubserving/readme.md)
- [端侧部署](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/lite/readme.md)
- [Benchmark](./doc/doc_ch/benchmark.md)
<a name="语种缩写"></a>
## 4 支持语种及缩写
| 语种 | 描述 | 缩写 |
| --- | --- | --- |
|中文|chinese and english|ch|
|英文|english|en|
|法文|french|fr|
|德文|german|german|
|日文|japan|japan|
|韩文|korean|korean|
|中文繁体|chinese traditional |chinese_cht|
|意大利文| Italian |it|
|西班牙文|Spanish |es|
|葡萄牙文| Portuguese|pt|
|俄罗斯文|Russia|ru|
|阿拉伯文|Arabic|ar|
|印地文|Hindi|hi|
|维吾尔|Uyghur|ug|
|波斯文|Persian|fa|
|乌尔都文|Urdu|ur|
|塞尔维亚文(latin)| Serbian(latin) |rs_latin|
|欧西坦文|Occitan |oc|
|马拉地文|Marathi|mr|
|尼泊尔文|Nepali|ne|
|塞尔维亚文(cyrillic)|Serbian(cyrillic)|rs_cyrillic|
|保加利亚文|Bulgarian |bg|
|乌克兰文|Ukranian|uk|
|白俄罗斯文|Belarusian|be|
|泰卢固文|Telugu |te|
|泰米尔文|Tamil |ta|
|南非荷兰文 |Afrikaans |af|
|阿塞拜疆文 |Azerbaijani |az|
|波斯尼亚文|Bosnian|bs|
|捷克文|Czech|cs|
|威尔士文 |Welsh |cy|
|丹麦文 |Danish|da|
|爱沙尼亚文 |Estonian |et|
|爱尔兰文 |Irish |ga|
|克罗地亚文|Croatian |hr|
|匈牙利文|Hungarian |hu|
|印尼文|Indonesian|id|
|冰岛文 |Icelandic|is|
|库尔德文 |Kurdish|ku|
|立陶宛文|Lithuanian |lt|
|拉脱维亚文 |Latvian |lv|
|毛利文|Maori|mi|
|马来文 |Malay|ms|
|马耳他文 |Maltese |mt|
|荷兰文 |Dutch |nl|
|挪威文 |Norwegian |no|
|波兰文|Polish |pl|
| 罗马尼亚文|Romanian |ro|
| 斯洛伐克文|Slovak |sk|
| 斯洛文尼亚文|Slovenian |sl|
| 阿尔巴尼亚文|Albanian |sq|
| 瑞典文|Swedish |sv|
| 西瓦希里文|Swahili |sw|
| 塔加洛文|Tagalog |tl|
| 土耳其文|Turkish |tr|
| 乌兹别克文|Uzbek |uz|
| 越南文|Vietnamese |vi|
| 蒙古文|Mongolian |mn|
| 阿巴扎文|Abaza |abq|
| 阿迪赫文|Adyghe |ady|
| 卡巴丹文|Kabardian |kbd|
| 阿瓦尔文|Avar |ava|
| 达尔瓦文|Dargwa |dar|
| 因古什文|Ingush |inh|
| 拉克文|Lak |lbe|
| 莱兹甘文|Lezghian |lez|
|塔巴萨兰文 |Tabassaran |tab|
| 比尔哈文|Bihari |bh|
| 迈蒂利文|Maithili |mai|
| 昂加文|Angika |ang|
| 孟加拉文|Bhojpuri |bho|
| 摩揭陀文 |Magahi |mah|
| 那格浦尔文|Nagpur |sck|
| 尼瓦尔文|Newari |new|
| 保加利亚文 |Goan Konkani|gom|
| 沙特阿拉伯文|Saudi Arabia|sa|
## 5 支持语种及缩写
| 语种 | 描述 | 缩写 | | 语种 | 描述 | 缩写 |
| --- | --- | --- | ---|--- | --- | --- |
|中文|chinese and english|ch| |保加利亚文|Bulgarian |bg|
|英文|english|en| |乌克兰文|Ukranian|uk|
|法文|french|fr| |白俄罗斯文|Belarusian|be|
|德文|german|german| |泰卢固文|Telugu |te|
|日文|japan|japan| | |阿巴扎文|Abaza |abq|
|韩文|korean|korean| |泰米尔文|Tamil |ta|
|中文繁体|chinese traditional |ch_tra| |南非荷兰文 |Afrikaans |af|
|意大利文| Italian |it| |阿塞拜疆文 |Azerbaijani |az|
|西班牙文|Spanish |es| |波斯尼亚文|Bosnian|bs|
|葡萄牙文| Portuguese|pt| |捷克文|Czech|cs|
|俄罗斯文|Russia|ru| |威尔士文 |Welsh |cy|
|阿拉伯文|Arabic|ar| |丹麦文 |Danish|da|
|印地文|Hindi|hi| |爱沙尼亚文 |Estonian |et|
|维吾尔|Uyghur|ug| |爱尔兰文 |Irish |ga|
|波斯文|Persian|fa| |克罗地亚文|Croatian |hr|
|乌尔都文|Urdu|ur| |匈牙利文|Hungarian |hu|
|塞尔维亚文(latin)| Serbian(latin) |rs_latin| |印尼文|Indonesian|id|
|欧西坦文|Occitan |oc| |冰岛文 |Icelandic|is|
|马拉地文|Marathi|mr| |库尔德文 |Kurdish|ku|
|尼泊尔文|Nepali|ne| |立陶宛文|Lithuanian |lt|
|塞尔维亚文(cyrillic)|Serbian(cyrillic)|rs_cyrillic| |拉脱维亚文 |Latvian |lv|
|毛利文|Maori|mi| | 达尔瓦文|Dargwa |dar|
|马来文 |Malay|ms| | 因古什文|Ingush |inh|
|马耳他文 |Maltese |mt| | 拉克文|Lak |lbe|
|荷兰文 |Dutch |nl| | 莱兹甘文|Lezghian |lez|
|挪威文 |Norwegian |no| |塔巴萨兰文 |Tabassaran |tab|
|波兰文|Polish |pl| | 比尔哈文|Bihari |bh|
| 罗马尼亚文|Romanian |ro| | 迈蒂利文|Maithili |mai|
| 斯洛伐克文|Slovak |sk| | 昂加文|Angika |ang|
| 斯洛文尼亚文|Slovenian |sl| | 孟加拉文|Bhojpuri |bho|
| 阿尔巴尼亚文|Albanian |sq| | 摩揭陀文 |Magahi |mah|
| 瑞典文|Swedish |sv| | 那格浦尔文|Nagpur |sck|
| 西瓦希里文|Swahili |sw| | 尼瓦尔文|Newari |new|
| 塔加洛文|Tagalog |tl| | 保加利亚文 |Goan Konkani|gom|
| 土耳其文|Turkish |tr| | 沙特阿拉伯文|Saudi Arabia|sa|
| 乌兹别克文|Uzbek |uz| | 阿瓦尔文|Avar |ava|
| 越南文|Vietnamese |vi| | 阿瓦尔文|Avar |ava|
| 蒙古文|Mongolian |mn| | 阿迪赫文|Adyghe |ady|
......@@ -30,6 +30,7 @@ PGNet算法细节详见[论文](https://www.aaai.org/AAAI21Papers/AAAI-2885.Wang
测试集:Total-Text
测试环境: NVIDIA Tesla V100-SXM2-16GB
|PGNetA|det_precision|det_recall|det_f_score|e2e_precision|e2e_recall|e2e_f_score|FPS|下载|
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
|Paper|85.30|86.80|86.1|-|-|61.7|38.20 (size=640)|-|
......@@ -79,24 +80,26 @@ python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/im
本节以totaltext数据集为例,介绍PaddleOCR中端到端模型的训练、评估与测试。
### 准备数据
下载解压[totaltext](https://github.com/cs-chan/Total-Text-Dataset/blob/master/Dataset/README.md) 数据集到PaddleOCR/train_data/目录,数据集组织结构:
下载解压[totaltext](https://paddleocr.bj.bcebos.com/dataset/total_text.tar) 数据集到PaddleOCR/train_data/目录,数据集组织结构:
```
/PaddleOCR/train_data/total_text/train/
|- rgb/ # total_text数据集的训练数据
|- gt_0.png
|- img11.jpg
| ...
|- total_text.txt # total_text数据集的训练标注
|- train.txt # total_text数据集的训练标注
```
total_text.txt标注文件格式如下,文件名和标注信息中间用"\t"分隔:
```
" 图像文件名 json.dumps编码的图像标注信息"
rgb/gt_0.png [{"transcription": "EST", "points": [[1004.0,689.0],[1019.0,698.0],[1034.0,708.0],[1049.0,718.0],[1064.0,728.0],[1079.0,738.0],[1095.0,748.0],[1094.0,774.0],[1079.0,765.0],[1065.0,756.0],[1050.0,747.0],[1036.0,738.0],[1021.0,729.0],[1007.0,721.0]]}, {...}]
rgb/img11.jpg [{"transcription": "ASRAMA", "points": [[214.0, 325.0], [235.0, 308.0], [259.0, 296.0], [286.0, 291.0], [313.0, 295.0], [338.0, 305.0], [362.0, 320.0], [349.0, 347.0], [330.0, 337.0], [310.0, 329.0], [290.0, 324.0], [269.0, 328.0], [249.0, 336.0], [231.0, 346.0]]}, {...}]
```
json.dumps编码前的图像标注信息是包含多个字典的list,字典中的 `points` 表示文本框的四个点的坐标(x, y),从左上角的点开始顺时针排列。
json.dumps编码前的图像标注信息是包含多个字典的list,字典中的 `points` 表示文本框的四个点的坐标(x, y),从左上角的点开始顺时针排列。
`transcription` 表示当前文本框的文字,**当其内容为“###”时,表示该文本框无效,在训练时会跳过。**
如果您想在其他数据集上训练,可以按照上述形式构建标注文件。
**注意:PGNet支持任意点的数据输入,如4点、8点等,但是需要保证均匀标注(上下对称,左右距离一致)。在Total-Text数据集上,我们使用了14点标注进行训练。**
### 启动训练
PGNet训练分为两个步骤:step1: 在合成数据上训练,得到预训练模型,此时模型精度依然较低;step2: 加载预训练模型,在totaltext数据集上训练;为快速训练,我们直接提供了step1的预训练模型。
......
......@@ -379,14 +379,18 @@ After executing the command, the prediction results (classification angle and sc
<a name="LIGHTWEIGHT_CHINESE_MODEL"></a>
### 1. LIGHTWEIGHT CHINESE MODEL
When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model.The visualized recognition results are saved to the `./inference_results` folder by default.
When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default.
```
```shell
# use direction classifier
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --cls_model_dir="./inference/cls/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=true
# not use use direction classifier
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/"
# use multi-process
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=false --use_mp=True --total_process_num=6
```
```
After executing the command, the recognition result image is as follows:
......
......@@ -102,14 +102,14 @@ python3 generate_multi_language_configs.py -l it \
| german_mobile_v2.0_rec |Lightweight model for German recognition|[rec_german_lite_train.yml](../../configs/rec/multi_language/rec_german_lite_train.yml)|2.65M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_train.tar) |
| korean_mobile_v2.0_rec |Lightweight model for Korean recognition|[rec_korean_lite_train.yml](../../configs/rec/multi_language/rec_korean_lite_train.yml)|3.9M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_train.tar) |
| japan_mobile_v2.0_rec |Lightweight model for Japanese recognition|[rec_japan_lite_train.yml](../../configs/rec/multi_language/rec_japan_lite_train.yml)|4.23M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_train.tar) |
| chinese_cht_mobile_v2.0_rec |Lightweight model for chinese cht recognition|rec_chinese_cht_lite_train.yml|5.63M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_train.tar) |
| chinese_cht_mobile_v2.0_rec |Lightweight model for chinese cht recognition|rec_chinese_cht_lite_train.yml|5.63M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_train.tar) |
| te_mobile_v2.0_rec |Lightweight model for Telugu recognition|rec_te_lite_train.yml|2.63M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_train.tar) |
| ka_mobile_v2.0_rec |Lightweight model for Kannada recognition|rec_ka_lite_train.yml|2.63M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_train.tar) |
| ta_mobile_v2.0_rec |Lightweight model for Tamil recognition|rec_ta_lite_train.yml|2.63M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_train.tar) |
| latin_mobile_v2.0_rec | Lightweight model for latin recognition | [rec_latin_lite_train.yml](../../configs/rec/multi_language/rec_latin_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_train.tar) |
| arabic_mobile_v2.0_rec | Lightweight model for arabic recognition | [rec_arabic_lite_train.yml](../../configs/rec/multi_language/rec_arabic_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_train.tar) |
| cyrillic_mobile_v2.0_rec | Lightweight model for cyrillic recognition | [rec_cyrillic_lite_train.yml](../../configs/rec/multi_language/rec_cyrillic_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_train.tar) |
| devanagari_mobile_v2.0_rec | Lightweight model for devanagari recognition | [rec_devanagari_lite_train.yml](../../configs/rec/multi_language/rec_devanagari_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_train.tar) |
| latin_mobile_v2.0_rec | Lightweight model for latin recognition | [rec_latin_lite_train.yml](../../configs/rec/multi_language/rec_latin_lite_train.yml) |2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_train.tar) |
| arabic_mobile_v2.0_rec | Lightweight model for arabic recognition | [rec_arabic_lite_train.yml](../../configs/rec/multi_language/rec_arabic_lite_train.yml) |2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_train.tar) |
| cyrillic_mobile_v2.0_rec | Lightweight model for cyrillic recognition | [rec_cyrillic_lite_train.yml](../../configs/rec/multi_language/rec_cyrillic_lite_train.yml) |2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_train.tar) |
| devanagari_mobile_v2.0_rec | Lightweight model for devanagari recognition | [rec_devanagari_lite_train.yml](../../configs/rec/multi_language/rec_devanagari_lite_train.yml) |2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_train.tar) |
For more supported languages, please refer to : [Multi-language model](./multi_languages_en.md)
......
......@@ -13,7 +13,7 @@ Among them, the English model supports the detection and recognition of uppercas
letters and common punctuation, and the recognition of space characters is optimized:
<div align="center">
<img src="../imgs_results/multi_lang/en_1.jpg" width="400" height="600">
<img src="../imgs_results/multi_lang/img_12.jpg" width="900" height="300">
</div>
The multilingual models cover Latin, Arabic, Traditional Chinese, Korean, Japanese, etc.:
......@@ -21,6 +21,8 @@ The multilingual models cover Latin, Arabic, Traditional Chinese, Korean, Japane
<div align="center">
<img src="../imgs_results/multi_lang/japan_2.jpg" width="600" height="300">
<img src="../imgs_results/multi_lang/french_0.jpg" width="300" height="300">
<img src="../imgs_results/multi_lang/korean_0.jpg" width="500" height="300">
<img src="../imgs_results/multi_lang/arabic_0.jpg" width="300" height="300">
</div>
This document will briefly introduce how to use the multilingual model.
......@@ -31,14 +33,9 @@ This document will briefly introduce how to use the multilingual model.
- [2 Quick Use](#Quick_Use)
- [2.1 Command line operation](#Command_line_operation)
- [2.1.1 Prediction of the whole image](#bash_detection+recognition)
- [2.1.2 Recognition](#bash_Recognition)
- [2.1.3 Detection](#bash_detection)
- [2.2 python script running](#python_Script_running)
- [2.2.1 Whole image prediction](#python_detection+recognition)
- [2.2.2 Recognition](#python_Recognition)
- [2.2.3 Detection](#python_detection)
- [3 Custom Training](#Custom_Training)
- [4 Inference and Deployment](#inference)
- [4 Supported languages and abbreviations](#language_abbreviations)
<a name="Install"></a>
......@@ -143,6 +140,9 @@ from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR(lang="korean") # The model file will be downloaded automatically when executed for the first time
img_path ='doc/imgs/korean_1.jpg'
result = ocr.ocr(img_path)
# Recognition and detection can be performed separately through parameter control
# result = ocr.ocr(img_path, det=False) Only perform recognition
# result = ocr.ocr(img_path, rec=False) Only perform detection
# Print detection frame and recognition result
for line in result:
print(line)
......@@ -162,54 +162,6 @@ Visualization of results:
![](https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.1/doc/imgs_results/korean.jpg)
* Recognition
```
from paddleocr import PaddleOCR
ocr = PaddleOCR(lang="german")
img_path ='PaddleOCR/doc/imgs_words/german/1.jpg'
result = ocr.ocr(img_path, det=False, cls=True)
for line in result:
print(line)
```
![](https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.1/doc/imgs_words/german/1.jpg)
The result is a tuple, which only contains the recognition result and recognition confidence
```
('leider auch jetzt', 0.97538936)
```
* Detection
```python
from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR() # need to run only once to download and load model into memory
img_path ='PaddleOCR/doc/imgs_en/img_12.jpg'
result = ocr.ocr(img_path, rec=False)
for line in result:
print(line)
# show result
from PIL import Image
image = Image.open(img_path).convert('RGB')
im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf')
im_show = Image.fromarray(im_show)
im_show.save('result.jpg')
```
The result is a list, each item contains only text boxes
```bash
[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]]
[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]]
[[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]]
......
```
Visualization of results:
![](https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.1/doc/imgs_results/whl/12_det.jpg)
ppocr also supports direction classification. For more usage methods, please refer to: [whl package instructions](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.0/doc/doc_ch/whl.md).
<a name="Custom_training"></a>
......@@ -221,84 +173,61 @@ Modify the training data path, dictionary and other parameters.
For specific data preparation and training process, please refer to: [Text Detection](../doc_en/detection_en.md), [Text Recognition](../doc_en/recognition_en.md), more functions such as predictive deployment,
For functions such as data annotation, you can read the complete [Document Tutorial](../../README.md).
<a name="language_abbreviation"></a>
## 4 Support languages and abbreviations
| Language | Abbreviation |
| --- | --- |
|chinese and english|ch|
|english|en|
|french|fr|
|german|german|
|japan|japan|
|korean|korean|
|chinese traditional |chinese_cht|
| Italian |it|
|Spanish |es|
| Portuguese|pt|
|Russia|ru|
|Arabic|ar|
|Hindi|hi|
|Uyghur|ug|
|Persian|fa|
|Urdu|ur|
| Serbian(latin) |rs_latin|
|Occitan |oc|
|Marathi|mr|
|Nepali|ne|
|Serbian(cyrillic)|rs_cyrillic|
|Bulgarian |bg|
|Ukranian|uk|
|Belarusian|be|
|Telugu |te|
|Tamil |ta|
|Afrikaans |af|
|Azerbaijani |az|
|Bosnian|bs|
|Czech|cs|
|Welsh |cy|
|Danish|da|
|Estonian |et|
|Irish |ga|
|Croatian |hr|
|Hungarian |hu|
|Indonesian|id|
|Icelandic|is|
|Kurdish|ku|
|Lithuanian |lt|
|Latvian |lv|
|Maori|mi|
|Malay|ms|
|Maltese |mt|
|Dutch |nl|
|Norwegian |no|
|Polish |pl|
|Romanian |ro|
|Slovak |sk|
|Slovenian |sl|
|Albanian |sq|
|Swedish |sv|
|Swahili |sw|
|Tagalog |tl|
|Turkish |tr|
|Uzbek |uz|
|Vietnamese |vi|
|Mongolian |mn|
|Abaza |abq|
|Adyghe |ady|
|Kabardian |kbd|
|Avar |ava|
|Dargwa |dar|
|Ingush |inh|
|Lak |lbe|
|Lezghian |lez|
|Tabassaran |tab|
|Bihari |bh|
|Maithili |mai|
|Angika |ang|
|Bhojpuri |bho|
|Magahi |mah|
|Nagpur |sck|
|Newari |new|
|Goan Konkani|gom|
|Saudi Arabia|sa|
<a name="inference"></a>
## 4 Inference and Deployment
In addition to installing the whl package for quick forecasting,
ppocr also provides a variety of forecasting deployment methods.
If necessary, you can read related documents:
- [Python Inference](./doc/doc_en/inference_en.md)
- [C++ Inference](./deploy/cpp_infer/readme_en.md)
- [Serving](./deploy/hubserving/readme_en.md)
- [Mobile](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/lite/readme_en.md)
- [Benchmark](./doc/doc_en/benchmark_en.md)
<a name="language_abbreviations"></a>
## 5 Support languages and abbreviations
| Language | Abbreviation | | Language | Abbreviation |
| --- | --- | --- | --- | --- |
|chinese and english|ch| |Arabic|ar|
|english|en| |Hindi|hi|
|french|fr| |Uyghur|ug|
|german|german| |Persian|fa|
|japan|japan| |Urdu|ur|
|korean|korean| | Serbian(latin) |rs_latin|
|chinese traditional |ch_tra| |Occitan |oc|
| Italian |it| |Marathi|mr|
|Spanish |es| |Nepali|ne|
| Portuguese|pt| |Serbian(cyrillic)|rs_cyrillic|
|Russia|ru||Bulgarian |bg|
|Ukranian|uk| |Estonian |et|
|Belarusian|be| |Irish |ga|
|Telugu |te| |Croatian |hr|
|Saudi Arabia|sa| |Hungarian |hu|
|Tamil |ta| |Indonesian|id|
|Afrikaans |af| |Icelandic|is|
|Azerbaijani |az||Kurdish|ku|
|Bosnian|bs| |Lithuanian |lt|
|Czech|cs| |Latvian |lv|
|Welsh |cy| |Maori|mi|
|Danish|da| |Malay|ms|
|Maltese |mt| |Adyghe |ady|
|Dutch |nl| |Kabardian |kbd|
|Norwegian |no| |Avar |ava|
|Polish |pl| |Dargwa |dar|
|Romanian |ro| |Ingush |inh|
|Slovak |sk| |Lak |lbe|
|Slovenian |sl| |Lezghian |lez|
|Albanian |sq| |Tabassaran |tab|
|Swedish |sv| |Bihari |bh|
|Swahili |sw| |Maithili |mai|
|Tagalog |tl| |Angika |ang|
|Turkish |tr| |Bhojpuri |bho|
|Uzbek |uz| |Magahi |mah|
|Vietnamese |vi| |Nagpur |sck|
|Mongolian |mn| |Newari |new|
|Abaza |abq| |Goan Konkani|gom|
......@@ -77,28 +77,30 @@ The visualized end-to-end results are saved to the `./inference_results` folder
This section takes the totaltext dataset as an example to introduce the training, evaluation and testing of the end-to-end model in PaddleOCR.
### Data Preparation
Download and unzip [totaltext](https://github.com/cs-chan/Total-Text-Dataset/blob/master/Dataset/README.md) dataset to PaddleOCR/train_data/, dataset organization structure is as follow:
Download and unzip [totaltext](https://paddleocr.bj.bcebos.com/dataset/total_text.tar) dataset to PaddleOCR/train_data/, dataset organization structure is as follow:
```
/PaddleOCR/train_data/total_text/train/
|- rgb/ # total_text training data of dataset
|- gt_0.png
|- img11.png
| ...
|- total_text.txt # total_text training annotation of dataset
|- train.txt # total_text training annotation of dataset
```
total_text.txt: the format of dimension file is as follows,the file name and annotation information are separated by "\t":
```
" Image file name Image annotation information encoded by json.dumps"
rgb/gt_0.png [{"transcription": "EST", "points": [[1004.0,689.0],[1019.0,698.0],[1034.0,708.0],[1049.0,718.0],[1064.0,728.0],[1079.0,738.0],[1095.0,748.0],[1094.0,774.0],[1079.0,765.0],[1065.0,756.0],[1050.0,747.0],[1036.0,738.0],[1021.0,729.0],[1007.0,721.0]]}, {...}]
rgb/img11.jpg [{"transcription": "ASRAMA", "points": [[214.0, 325.0], [235.0, 308.0], [259.0, 296.0], [286.0, 291.0], [313.0, 295.0], [338.0, 305.0], [362.0, 320.0], [349.0, 347.0], [330.0, 337.0], [310.0, 329.0], [290.0, 324.0], [269.0, 328.0], [249.0, 336.0], [231.0, 346.0]]}, {...}]
```
The image annotation after **json.dumps()** encoding is a list containing multiple dictionaries.
The `points` in the dictionary represent the coordinates (x, y) of the four points of the text box, arranged clockwise from the point at the upper left corner.
The `points` in the dictionary represent the coordinates (x, y) of the fourteen points of the text box, arranged clockwise from the point at the upper left corner.
`transcription` represents the text of the current text box. **When its content is "###" it means that the text box is invalid and will be skipped during training.**
If you want to train PaddleOCR on other datasets, please build the annotation file according to the above format.
*PGNet supports data input of any point, but it needs to ensure uniform labeling (upper and lower symmetry, left and right distance is consistent). In our experiment, the training effect of fourteen points tagging is better than that of four points tagging. We can try to do two-stage training on four points tagging and fourteen points tagging.*
### Start Training
......
......@@ -193,6 +193,7 @@ def parse_args(mMain=True, add_help=True):
parser.add_argument("--det_db_box_thresh", type=float, default=0.5)
parser.add_argument("--det_db_unclip_ratio", type=float, default=1.6)
parser.add_argument("--use_dilation", type=bool, default=False)
parser.add_argument("--det_db_score_mode", type=str, default="fast")
# EAST parmas
parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
......@@ -241,6 +242,7 @@ def parse_args(mMain=True, add_help=True):
det_db_box_thresh=0.5,
det_db_unclip_ratio=1.6,
use_dilation=False,
det_db_score_mode="fast",
det_east_score_thresh=0.8,
det_east_cover_thresh=0.1,
det_east_nms_thresh=0.2,
......
......@@ -187,29 +187,75 @@ class CTCLabelEncode(BaseRecLabelEncode):
return dict_character
class E2ELabelEncode(BaseRecLabelEncode):
class E2ELabelEncodeTest(BaseRecLabelEncode):
def __init__(self,
max_text_length,
character_dict_path=None,
character_type='EN',
use_space_char=False,
**kwargs):
super(E2ELabelEncode,
super(E2ELabelEncodeTest,
self).__init__(max_text_length, character_dict_path,
character_type, use_space_char)
self.pad_num = len(self.dict) # the length to pad
def __call__(self, data):
texts = data['strs']
import json
padnum = len(self.dict)
label = data['label']
label = json.loads(label)
nBox = len(label)
boxes, txts, txt_tags = [], [], []
for bno in range(0, nBox):
box = label[bno]['points']
txt = label[bno]['transcription']
boxes.append(box)
txts.append(txt)
if txt in ['*', '###']:
txt_tags.append(True)
else:
txt_tags.append(False)
boxes = np.array(boxes, dtype=np.float32)
txt_tags = np.array(txt_tags, dtype=np.bool)
data['polys'] = boxes
data['ignore_tags'] = txt_tags
temp_texts = []
for text in texts:
for text in txts:
text = text.lower()
text = self.encode(text)
if text is None:
return None
text = text + [self.pad_num] * (self.max_text_len - len(text))
text = text + [padnum] * (self.max_text_len - len(text)
) # use 36 to pad
temp_texts.append(text)
data['strs'] = np.array(temp_texts)
data['texts'] = np.array(temp_texts)
return data
class E2ELabelEncodeTrain(object):
def __init__(self, **kwargs):
pass
def __call__(self, data):
import json
label = data['label']
label = json.loads(label)
nBox = len(label)
boxes, txts, txt_tags = [], [], []
for bno in range(0, nBox):
box = label[bno]['points']
txt = label[bno]['transcription']
boxes.append(box)
txts.append(txt)
if txt in ['*', '###']:
txt_tags.append(True)
else:
txt_tags.append(False)
boxes = np.array(boxes, dtype=np.float32)
txt_tags = np.array(txt_tags, dtype=np.bool)
data['polys'] = boxes
data['texts'] = txts
data['ignore_tags'] = txt_tags
return data
......
......@@ -88,7 +88,7 @@ class PGProcessTrain(object):
return min_area_quad
def check_and_validate_polys(self, polys, tags, xxx_todo_changeme):
def check_and_validate_polys(self, polys, tags, im_size):
"""
check so that the text poly is in the same direction,
and also filter some invalid polygons
......@@ -96,7 +96,7 @@ class PGProcessTrain(object):
:param tags:
:return:
"""
(h, w) = xxx_todo_changeme
(h, w) = im_size
if polys.shape[0] == 0:
return polys, np.array([]), np.array([])
polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
......@@ -750,8 +750,8 @@ class PGProcessTrain(object):
input_size = 512
im = data['image']
text_polys = data['polys']
text_tags = data['tags']
text_strs = data['strs']
text_tags = data['ignore_tags']
text_strs = data['texts']
h, w, _ = im.shape
text_polys, text_tags, hv_tags = self.check_and_validate_polys(
text_polys, text_tags, (h, w))
......
......@@ -29,20 +29,20 @@ class PGDataSet(Dataset):
dataset_config = config[mode]['dataset']
loader_config = config[mode]['loader']
self.delimiter = dataset_config.get('delimiter', '\t')
label_file_list = dataset_config.pop('label_file_list')
data_source_num = len(label_file_list)
ratio_list = dataset_config.get("ratio_list", [1.0])
if isinstance(ratio_list, (float, int)):
ratio_list = [float(ratio_list)] * int(data_source_num)
self.data_format = dataset_config.get('data_format', 'icdar')
assert len(
ratio_list
) == data_source_num, "The length of ratio_list should be the same as the file_list."
self.data_dir = dataset_config['data_dir']
self.do_shuffle = loader_config['shuffle']
logger.info("Initialize indexs of datasets:%s" % label_file_list)
self.data_lines = self.get_image_info_list(label_file_list, ratio_list,
self.data_format)
self.data_lines = self.get_image_info_list(label_file_list, ratio_list)
self.data_idx_order_list = list(range(len(self.data_lines)))
if mode.lower() == "train":
self.shuffle_data_random()
......@@ -55,108 +55,42 @@ class PGDataSet(Dataset):
random.shuffle(self.data_lines)
return
def extract_polys(self, poly_txt_path):
"""
Read text_polys, txt_tags, txts from give txt file.
"""
text_polys, txt_tags, txts = [], [], []
with open(poly_txt_path) as f:
for line in f.readlines():
poly_str, txt = line.strip().split('\t')
poly = list(map(float, poly_str.split(',')))
text_polys.append(
np.array(
poly, dtype=np.float32).reshape(-1, 2))
txts.append(txt)
txt_tags.append(txt == '###')
return np.array(list(map(np.array, text_polys))), \
np.array(txt_tags, dtype=np.bool), txts
def extract_info_textnet(self, im_fn, img_dir=''):
"""
Extract information from line in textnet format.
"""
info_list = im_fn.split('\t')
img_path = ''
for ext in [
'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'JPG'
]:
if os.path.exists(os.path.join(img_dir, info_list[0] + "." + ext)):
img_path = os.path.join(img_dir, info_list[0] + "." + ext)
break
if img_path == '':
print('Image {0} NOT found in {1}, and it will be ignored.'.format(
info_list[0], img_dir))
nBox = (len(info_list) - 1) // 9
wordBBs, txts, txt_tags = [], [], []
for n in range(0, nBox):
wordBB = list(map(float, info_list[n * 9 + 1:(n + 1) * 9]))
txt = info_list[(n + 1) * 9]
wordBBs.append([[wordBB[0], wordBB[1]], [wordBB[2], wordBB[3]],
[wordBB[4], wordBB[5]], [wordBB[6], wordBB[7]]])
txts.append(txt)
if txt == '###':
txt_tags.append(True)
else:
txt_tags.append(False)
return img_path, np.array(wordBBs, dtype=np.float32), txt_tags, txts
def get_image_info_list(self, file_list, ratio_list, data_format='textnet'):
def get_image_info_list(self, file_list, ratio_list):
if isinstance(file_list, str):
file_list = [file_list]
data_lines = []
for idx, data_source in enumerate(file_list):
image_files = []
if data_format == 'icdar':
image_files = [(data_source, x) for x in
os.listdir(os.path.join(data_source, 'rgb'))
if x.split('.')[-1] in [
'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif',
'tiff', 'gif', 'JPG'
]]
elif data_format == 'textnet':
with open(data_source) as f:
image_files = [(data_source, x.strip())
for x in f.readlines()]
else:
print("Unrecognized data format...")
exit(-1)
for idx, file in enumerate(file_list):
with open(file, "rb") as f:
lines = f.readlines()
if self.mode == "train" or ratio_list[idx] < 1.0:
random.seed(self.seed)
image_files = random.sample(
image_files, round(len(image_files) * ratio_list[idx]))
data_lines.extend(image_files)
lines = random.sample(lines,
round(len(lines) * ratio_list[idx]))
data_lines.extend(lines)
return data_lines
def __getitem__(self, idx):
file_idx = self.data_idx_order_list[idx]
data_path, data_line = self.data_lines[file_idx]
data_line = self.data_lines[file_idx]
img_id = 0
try:
if self.data_format == 'icdar':
im_path = os.path.join(data_path, 'rgb', data_line)
poly_path = os.path.join(data_path, 'poly',
data_line.split('.')[0] + '.txt')
text_polys, text_tags, text_strs = self.extract_polys(poly_path)
else:
image_dir = os.path.join(os.path.dirname(data_path), 'image')
im_path, text_polys, text_tags, text_strs = self.extract_info_textnet(
data_line, image_dir)
img_id = int(data_line.split(".")[0][3:])
data = {
'img_path': im_path,
'polys': text_polys,
'tags': text_tags,
'strs': text_strs,
'img_id': img_id
}
data_line = data_line.decode('utf-8')
substr = data_line.strip("\n").split(self.delimiter)
file_name = substr[0]
label = substr[1]
img_path = os.path.join(self.data_dir, file_name)
if self.mode.lower() == 'eval':
try:
img_id = int(data_line.split(".")[0][7:])
except:
img_id = 0
data = {'img_path': img_path, 'label': label, 'img_id': img_id}
if not os.path.exists(img_path):
raise Exception("{} does not exist!".format(img_path))
with open(data['img_path'], 'rb') as f:
img = f.read()
data['image'] = img
outs = transform(data, self.ops)
except Exception as e:
self.logger.error(
"When parsing line {}, error happened with msg: {}".format(
......
......@@ -102,13 +102,11 @@ class PGLoss(nn.Layer):
f_tcl_char_ld = paddle.transpose(f_tcl_char_mask, (1, 0, 2))
N, B, _ = f_tcl_char_ld.shape
input_lengths = paddle.to_tensor([N] * B, dtype='int64')
cost = paddle.nn.functional.ctc_loss(
log_probs=f_tcl_char_ld,
labels=tcl_label,
input_lengths=input_lengths,
label_lengths=label_t,
blank=self.pad_num,
reduction='none')
loss_out = paddle.fluid.layers.warpctc(f_tcl_char_ld, tcl_label,
self.pad_num, True,
input_lengths, label_t)
cost = paddle.fluid.layers.squeeze(loss_out, [-1])
cost = cost.mean()
return cost
......
......@@ -18,16 +18,18 @@ from __future__ import print_function
__all__ = ['E2EMetric']
from ppocr.utils.e2e_metric.Deteval import get_socre, combine_results
from ppocr.utils.e2e_metric.Deteval import get_socre_A, get_socre_B, combine_results
from ppocr.utils.e2e_utils.extract_textpoint_slow import get_dict
class E2EMetric(object):
def __init__(self,
mode,
gt_mat_dir,
character_dict_path,
main_indicator='f_score_e2e',
**kwargs):
self.mode = mode
self.gt_mat_dir = gt_mat_dir
self.label_list = get_dict(character_dict_path)
self.max_index = len(self.label_list)
......@@ -35,12 +37,44 @@ class E2EMetric(object):
self.reset()
def __call__(self, preds, batch, **kwargs):
if self.mode == 'A':
gt_polyons_batch = batch[2]
temp_gt_strs_batch = batch[3][0]
ignore_tags_batch = batch[4]
gt_strs_batch = []
for temp_list in temp_gt_strs_batch:
t = ""
for index in temp_list:
if index < self.max_index:
t += self.label_list[index]
gt_strs_batch.append(t)
for pred, gt_polyons, gt_strs, ignore_tags in zip(
[preds], gt_polyons_batch, [gt_strs_batch], ignore_tags_batch):
# prepare gt
gt_info_list = [{
'points': gt_polyon,
'text': gt_str,
'ignore': ignore_tag
} for gt_polyon, gt_str, ignore_tag in
zip(gt_polyons, gt_strs, ignore_tags)]
# prepare det
e2e_info_list = [{
'points': det_polyon,
'texts': pred_str
} for det_polyon, pred_str in
zip(pred['points'], pred['texts'])]
result = get_socre_A(gt_info_list, e2e_info_list)
self.results.append(result)
else:
img_id = batch[5][0]
e2e_info_list = [{
'points': det_polyon,
'text': pred_str
} for det_polyon, pred_str in zip(preds['points'], preds['strs'])]
result = get_socre(self.gt_mat_dir, img_id, e2e_info_list)
'texts': pred_str
} for det_polyon, pred_str in zip(preds['points'], preds['texts'])]
result = get_socre_B(self.gt_mat_dir, img_id, e2e_info_list)
self.results.append(result)
def get_metric(self):
......
......@@ -34,12 +34,18 @@ class DBPostProcess(object):
max_candidates=1000,
unclip_ratio=2.0,
use_dilation=False,
score_mode="fast",
**kwargs):
self.thresh = thresh
self.box_thresh = box_thresh
self.max_candidates = max_candidates
self.unclip_ratio = unclip_ratio
self.min_size = 3
self.score_mode = score_mode
assert score_mode in [
"slow", "fast"
], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
self.dilation_kernel = None if not use_dilation else np.array(
[[1, 1], [1, 1]])
......@@ -69,7 +75,10 @@ class DBPostProcess(object):
if sside < self.min_size:
continue
points = np.array(points)
if self.score_mode == "fast":
score = self.box_score_fast(pred, points.reshape(-1, 2))
else:
score = self.box_score_slow(pred, contour)
if self.box_thresh > score:
continue
......@@ -120,6 +129,9 @@ class DBPostProcess(object):
return box, min(bounding_box[1])
def box_score_fast(self, bitmap, _box):
'''
box_score_fast: use bbox mean score as the mean score
'''
h, w = bitmap.shape[:2]
box = _box.copy()
xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1)
......@@ -133,6 +145,27 @@ class DBPostProcess(object):
cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
def box_score_slow(self, bitmap, contour):
'''
box_score_slow: use polyon mean score as the mean score
'''
h, w = bitmap.shape[:2]
contour = contour.copy()
contour = np.reshape(contour, (-1, 2))
xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
contour[:, 0] = contour[:, 0] - xmin
contour[:, 1] = contour[:, 1] - ymin
cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
def __call__(self, outs_dict, shape_list):
pred = outs_dict['maps']
if isinstance(pred, paddle.Tensor):
......
......@@ -17,7 +17,7 @@ import scipy.io as io
from ppocr.utils.e2e_metric.polygon_fast import iod, area_of_intersection, area
def get_socre(gt_dir, img_id, pred_dict):
def get_socre_A(gt_dir, pred_dict):
allInputs = 1
def input_reading_mod(pred_dict):
......@@ -26,7 +26,144 @@ def get_socre(gt_dir, img_id, pred_dict):
n = len(pred_dict)
for i in range(n):
points = pred_dict[i]['points']
text = pred_dict[i]['text']
text = pred_dict[i]['texts']
point = ",".join(map(str, points.reshape(-1, )))
det.append([point, text])
return det
def gt_reading_mod(gt_dict):
"""This helper reads groundtruths from mat files"""
gt = []
n = len(gt_dict)
for i in range(n):
points = gt_dict[i]['points'].tolist()
h = len(points)
text = gt_dict[i]['text']
xx = [
np.array(
['x:'], dtype='<U2'), 0, np.array(
['y:'], dtype='<U2'), 0, np.array(
['#'], dtype='<U1'), np.array(
['#'], dtype='<U1')
]
t_x, t_y = [], []
for j in range(h):
t_x.append(points[j][0])
t_y.append(points[j][1])
xx[1] = np.array([t_x], dtype='int16')
xx[3] = np.array([t_y], dtype='int16')
if text != "":
xx[4] = np.array([text], dtype='U{}'.format(len(text)))
xx[5] = np.array(['c'], dtype='<U1')
gt.append(xx)
return gt
def detection_filtering(detections, groundtruths, threshold=0.5):
for gt_id, gt in enumerate(groundtruths):
if (gt[5] == '#') and (gt[1].shape[1] > 1):
gt_x = list(map(int, np.squeeze(gt[1])))
gt_y = list(map(int, np.squeeze(gt[3])))
for det_id, detection in enumerate(detections):
detection_orig = detection
detection = [float(x) for x in detection[0].split(',')]
detection = list(map(int, detection))
det_x = detection[0::2]
det_y = detection[1::2]
det_gt_iou = iod(det_x, det_y, gt_x, gt_y)
if det_gt_iou > threshold:
detections[det_id] = []
detections[:] = [item for item in detections if item != []]
return detections
def sigma_calculation(det_x, det_y, gt_x, gt_y):
"""
sigma = inter_area / gt_area
"""
return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) /
area(gt_x, gt_y)), 2)
def tau_calculation(det_x, det_y, gt_x, gt_y):
if area(det_x, det_y) == 0.0:
return 0
return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) /
area(det_x, det_y)), 2)
##############################Initialization###################################
# global_sigma = []
# global_tau = []
# global_pred_str = []
# global_gt_str = []
###############################################################################
for input_id in range(allInputs):
if (input_id != '.DS_Store') and (input_id != 'Pascal_result.txt') and (
input_id != 'Pascal_result_curved.txt') and (input_id != 'Pascal_result_non_curved.txt') and (
input_id != 'Deteval_result.txt') and (input_id != 'Deteval_result_curved.txt') \
and (input_id != 'Deteval_result_non_curved.txt'):
detections = input_reading_mod(pred_dict)
groundtruths = gt_reading_mod(gt_dir)
detections = detection_filtering(
detections,
groundtruths) # filters detections overlapping with DC area
dc_id = []
for i in range(len(groundtruths)):
if groundtruths[i][5] == '#':
dc_id.append(i)
cnt = 0
for a in dc_id:
num = a - cnt
del groundtruths[num]
cnt += 1
local_sigma_table = np.zeros((len(groundtruths), len(detections)))
local_tau_table = np.zeros((len(groundtruths), len(detections)))
local_pred_str = {}
local_gt_str = {}
for gt_id, gt in enumerate(groundtruths):
if len(detections) > 0:
for det_id, detection in enumerate(detections):
detection_orig = detection
detection = [float(x) for x in detection[0].split(',')]
detection = list(map(int, detection))
pred_seq_str = detection_orig[1].strip()
det_x = detection[0::2]
det_y = detection[1::2]
gt_x = list(map(int, np.squeeze(gt[1])))
gt_y = list(map(int, np.squeeze(gt[3])))
gt_seq_str = str(gt[4].tolist()[0])
local_sigma_table[gt_id, det_id] = sigma_calculation(
det_x, det_y, gt_x, gt_y)
local_tau_table[gt_id, det_id] = tau_calculation(
det_x, det_y, gt_x, gt_y)
local_pred_str[det_id] = pred_seq_str
local_gt_str[gt_id] = gt_seq_str
global_sigma = local_sigma_table
global_tau = local_tau_table
global_pred_str = local_pred_str
global_gt_str = local_gt_str
single_data = {}
single_data['sigma'] = global_sigma
single_data['global_tau'] = global_tau
single_data['global_pred_str'] = global_pred_str
single_data['global_gt_str'] = global_gt_str
return single_data
def get_socre_B(gt_dir, img_id, pred_dict):
allInputs = 1
def input_reading_mod(pred_dict):
"""This helper reads input from txt files"""
det = []
n = len(pred_dict)
for i in range(n):
points = pred_dict[i]['points']
text = pred_dict[i]['texts']
point = ",".join(map(str, points.reshape(-1, )))
det.append([point, text])
return det
......
......@@ -342,6 +342,7 @@ def generate_pivot_list_curved(p_score,
center_pos_yxs = []
end_points_yxs = []
instance_center_pos_yxs = []
pred_strs = []
if instance_count > 0:
for instance_id in range(1, instance_count):
pos_list = []
......@@ -367,12 +368,13 @@ def generate_pivot_list_curved(p_score,
if is_backbone:
keep_yxs_list_with_id = add_id(keep_yxs_list, image_id=image_id)
instance_center_pos_yxs.append(keep_yxs_list_with_id)
pred_strs.append(decoded_str)
else:
end_points_yxs.extend((keep_yxs_list[0], keep_yxs_list[-1]))
center_pos_yxs.extend(keep_yxs_list)
if is_backbone:
return instance_center_pos_yxs
return pred_strs, instance_center_pos_yxs
else:
return center_pos_yxs, end_points_yxs
......
......@@ -64,7 +64,7 @@ class PGNet_PostProcess(object):
src_w, src_h, self.valid_set)
data = {
'points': poly_list,
'strs': keep_str_list,
'texts': keep_str_list,
}
return data
......@@ -85,32 +85,13 @@ class PGNet_PostProcess(object):
p_char = p_char[0]
src_h, src_w, ratio_h, ratio_w = self.shape_list[0]
is_curved = self.valid_set == "totaltext"
instance_yxs_list = generate_pivot_list_slow(
char_seq_idx_set, instance_yxs_list = generate_pivot_list_slow(
p_score,
p_char,
p_direction,
score_thresh=self.score_thresh,
is_backbone=True,
is_curved=is_curved)
p_char = paddle.to_tensor(np.expand_dims(p_char, axis=0))
char_seq_idx_set = []
for i in range(len(instance_yxs_list)):
gather_info_lod = paddle.to_tensor(instance_yxs_list[i])
f_char_map = paddle.transpose(p_char, [0, 2, 3, 1])
feature_seq = paddle.gather_nd(f_char_map, gather_info_lod)
feature_seq = np.expand_dims(feature_seq.numpy(), axis=0)
feature_len = [len(feature_seq[0])]
featyre_seq = paddle.to_tensor(feature_seq)
feature_len = np.array([feature_len]).astype(np.int64)
length = paddle.to_tensor(feature_len)
seq_pred = paddle.fluid.layers.ctc_greedy_decoder(
input=featyre_seq, blank=36, input_length=length)
seq_pred_str = seq_pred[0].numpy().tolist()[0]
seq_len = seq_pred[1].numpy()[0][0]
temp_t = []
for c in seq_pred_str[:seq_len]:
temp_t.append(c)
char_seq_idx_set.append(temp_t)
seq_strs = []
for char_idx_set in char_seq_idx_set:
pr_str = ''.join([self.Lexicon_Table[pos] for pos in char_idx_set])
......@@ -176,6 +157,6 @@ class PGNet_PostProcess(object):
exit(-1)
data = {
'points': poly_list,
'strs': keep_str_list,
'texts': keep_str_list,
}
return data
......@@ -3,9 +3,8 @@ scikit-image==0.17.2
imgaug==0.4.0
pyclipper
lmdb
opencv-python==4.2.0.32
tqdm
numpy
visualdl
python-Levenshtein
opencv-contrib-python
\ No newline at end of file
opencv-contrib-python==4.2.0.32
\ No newline at end of file
......@@ -62,6 +62,7 @@ class TextDetector(object):
postprocess_params["max_candidates"] = 1000
postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
postprocess_params["use_dilation"] = args.use_dilation
postprocess_params["score_mode"] = args.det_db_score_mode
elif self.det_algorithm == "EAST":
postprocess_params['name'] = 'EASTPostProcess'
postprocess_params["score_thresh"] = args.det_east_score_thresh
......
......@@ -122,7 +122,7 @@ class TextE2E(object):
else:
raise NotImplementedError
post_result = self.postprocess_op(preds, shape_list)
points, strs = post_result['points'], post_result['strs']
points, strs = post_result['points'], post_result['texts']
dt_boxes = self.filter_tag_det_res_only_clip(points, ori_im.shape)
elapse = time.time() - starttime
return dt_boxes, strs, elapse
......
......@@ -48,6 +48,7 @@ def parse_args():
parser.add_argument("--det_db_unclip_ratio", type=float, default=1.6)
parser.add_argument("--max_batch_size", type=int, default=10)
parser.add_argument("--use_dilation", type=bool, default=False)
parser.add_argument("--det_db_score_mode", type=str, default="fast")
# EAST parmas
parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
......
......@@ -103,7 +103,7 @@ def main():
images = paddle.to_tensor(images)
preds = model(images)
post_result = post_process_class(preds, shape_list)
points, strs = post_result['points'], post_result['strs']
points, strs = post_result['points'], post_result['texts']
# write resule
dt_boxes_json = []
for poly, str in zip(points, strs):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册