diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py
index c17db91a5b5cd9d3cbb4b5bf6c87afd745d0870d..d0d2bb721be41fe2c4042fbea1b55e4e76bdd664 100644
--- a/PPOCRLabel/PPOCRLabel.py
+++ b/PPOCRLabel/PPOCRLabel.py
@@ -2531,7 +2531,7 @@ class MainWindow(QMainWindow):
split = 'test'
# save dict
- html = {'structure': {'tokens': token_list}, 'cell': cells}
+ html = {'structure': {'tokens': token_list}, 'cells': cells}
json_results.append({'filename': os.path.basename(image_path), 'split': split, 'imgid': imgid, 'html': html})
imgid += 1
diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md
index 3bdc336827adb87f52e9baa2c012304595b2c656..089a63fd55bb8c127104e7c404852ba52c3ac88c 100644
--- a/PPOCRLabel/README.md
+++ b/PPOCRLabel/README.md
@@ -1,10 +1,14 @@
English | [简体中文](README_ch.md)
-# PPOCRLabel
+# PPOCRLabelv2
-PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PP-OCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box, table and multi-point annotation modes. Annotations can be directly used for the training of PP-OCR detection and recognition models.
+PPOCRLabelv2 is a semi-automatic graphic annotation tool for the OCR field, with a built-in PP-OCR model that automatically detects and re-recognizes data. It is written in Python 3 and PyQt5 and supports rectangular box, table, irregular text and key information annotation modes. Annotations can be used directly to train PP-OCR detection and recognition models.
-
+| regular text annotation | table annotation |
+| :-------------------------------------------------: | :--------------------------------------------: |
+|
|
|
+| **irregular text annotation** | **key information annotation** |
+|
|
|
### Recent Update
diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md
index 107f902a68bd68b30d286e8dd88b29752f0c6ad0..3ea684a3f09a6084403fa0b91e2511b7fd790f4b 100644
--- a/PPOCRLabel/README_ch.md
+++ b/PPOCRLabel/README_ch.md
@@ -1,10 +1,14 @@
[English](README.md) | 简体中文
-# PPOCRLabel
+# PPOCRLabelv2
PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写,支持矩形框标注和四点标注模式,导出格式可直接用于PaddleOCR检测和识别模型的训练。
-
+| 常规标注 | 表格标注 |
+| :-------------------------------------------------: | :--------------------------------------------: |
+|
|
|
+| **不规则文本标注** | **关键信息标注** |
+|
|
|
#### 近期更新
- 2022.05:**新增表格标注**,使用方法见下方`2.2 表格标注`(by [whjdark](https://github.com/peterh0323); [Evezerest](https://github.com/Evezerest))
diff --git a/PPOCRLabel/setup.py b/PPOCRLabel/setup.py
index 1ec54df11a75b8a7ad8f023ca4a5b24ef5343d71..1750f84b8259a237fb6bb1b5eb9dc33e29441bc1 100644
--- a/PPOCRLabel/setup.py
+++ b/PPOCRLabel/setup.py
@@ -33,7 +33,7 @@ setup(
package_dir={'PPOCRLabel': ''},
include_package_data=True,
entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]},
- version='1.0.2',
+ version='2.1.1',
install_requires=requirements,
license='Apache License 2.0',
description='PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models',
diff --git a/configs/table/SLANet_ch.yml b/configs/table/SLANet_ch.yml
index 997ff0a77b5ea824957abc1d32a7ba7f70abc12c..a3fc1c68ddd2287e6bbaa6c53c5e20961950d23f 100644
--- a/configs/table/SLANet_ch.yml
+++ b/configs/table/SLANet_ch.yml
@@ -107,7 +107,7 @@ Train:
Eval:
dataset:
name: PubTabDataSet
- data_dir: train_data/table/val/
+ data_dir: train_data/table/val/
label_file_list: [train_data/table/val.txt]
transforms:
- DecodeImage:
diff --git a/deploy/cpp_infer/src/utility.cpp b/deploy/cpp_infer/src/utility.cpp
index 4bfc1d091d6124b10c79032beb702ba8727210fc..251184b91bb8efed4b58bbf2bc3d11ea6a1bf916 100644
--- a/deploy/cpp_infer/src/utility.cpp
+++ b/deploy/cpp_infer/src/utility.cpp
@@ -268,11 +268,12 @@ cv::Mat Utility::crop_image(cv::Mat &img, std::vector &area) {
void Utility::sorted_boxes(std::vector &ocr_result) {
std::sort(ocr_result.begin(), ocr_result.end(), Utility::comparison_box);
-
- for (int i = 0; i < ocr_result.size() - 1; i++) {
- if (abs(ocr_result[i + 1].box[0][1] - ocr_result[i].box[0][1]) < 10 &&
- (ocr_result[i + 1].box[0][0] < ocr_result[i].box[0][0])) {
- std::swap(ocr_result[i], ocr_result[i + 1]);
+ if (ocr_result.size() > 0) { // guard: size() is unsigned, so size() - 1 below would wrap around on an empty vector
+ for (int i = 0; i < ocr_result.size() - 1; i++) { // single pass over adjacent pairs after the sort above
+ if (abs(ocr_result[i + 1].box[0][1] - ocr_result[i].box[0][1]) < 10 && // first-point [1] values within 10 of each other (presumably the same text line - confirm)
+ (ocr_result[i + 1].box[0][0] < ocr_result[i].box[0][0])) { // and the later box has the smaller first-point [0] value
+ std::swap(ocr_result[i], ocr_result[i + 1]); // exchange the two neighbours
+ }
}
}
}
diff --git a/paddleocr.py b/paddleocr.py
index 0b7aed36279081f50208f75272fc54c5081929a7..fa732fc110dc7873f8d89b2ca2a21817a1e6d20d 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -414,6 +414,33 @@ def get_model_config(type, version, model_type, lang):
return model_urls[version][model_type][lang]
+def img_decode(content: bytes):  # Decode an encoded image held in memory into an OpenCV image array.
+ np_arr = np.frombuffer(content, dtype=np.uint8)  # view the raw bytes as a 1-D uint8 array for cv2.imdecode
+ return cv2.imdecode(np_arr, cv2.IMREAD_COLOR)  # decode as a colour image; check_img tests the result for None
+
+
+def check_img(img):  # Normalise bytes / path-or-URL string / ndarray input into an image; returns None when a file cannot be decoded.
+ if isinstance(img, bytes):  # encoded image bytes: decode in memory
+ img = img_decode(img)  # NOTE(review): if decoding fails this appears to fall through and return None without an error log - confirm
+ if isinstance(img, str):  # a string is treated as a URL or a local file path
+ # download net image
+ if is_link(img):
+ download_with_progressbar(img, 'tmp.jpg')  # NOTE(review): fixed file name in the working directory; concurrent calls would overwrite it
+ img = 'tmp.jpg'  # continue with the downloaded copy as a local path
+ image_file = img  # keep the path for the error message below
+ img, flag, _ = check_and_read(image_file)  # presumably flag is falsy when check_and_read did not produce the image - confirm against its definition
+ if not flag:  # fall back to reading the file and decoding it here
+ with open(image_file, 'rb') as f:
+ img = img_decode(f.read())
+ if img is None:  # nothing could be decoded from the file
+ logger.error("error in loading image:{}".format(image_file))
+ return None
+ if isinstance(img, np.ndarray) and len(img.shape) == 2:  # 2-D array, i.e. no channel axis
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)  # expand grayscale to BGR
+
+ return img  # ndarray inputs that already have a channel axis are returned unchanged
+
+
class PaddleOCR(predict_system.TextSystem):
def __init__(self, **kwargs):
"""
@@ -482,7 +509,7 @@ class PaddleOCR(predict_system.TextSystem):
rec: use text recognition or not. If false, only det will be exec. Default is True
cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
"""
- assert isinstance(img, (np.ndarray, list, str))
+ assert isinstance(img, (np.ndarray, list, str, bytes))
if isinstance(img, list) and det == True:
logger.error('When input a list of images, det must be false')
exit(0)
@@ -491,22 +518,8 @@ class PaddleOCR(predict_system.TextSystem):
'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process'
)
- if isinstance(img, str):
- # download net image
- if img.startswith('http'):
- download_with_progressbar(img, 'tmp.jpg')
- img = 'tmp.jpg'
- image_file = img
- img, flag, _ = check_and_read(image_file)
- if not flag:
- with open(image_file, 'rb') as f:
- np_arr = np.frombuffer(f.read(), dtype=np.uint8)
- img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
- if img is None:
- logger.error("error in loading image:{}".format(image_file))
- return None
- if isinstance(img, np.ndarray) and len(img.shape) == 2:
- img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+ img = check_img(img)
+
if det and rec:
dt_boxes, rec_res, _ = self.__call__(img, cls)
return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
@@ -585,23 +598,7 @@ class PPStructure(StructureSystem):
super().__init__(params)
def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
- if isinstance(img, str):
- # download net image
- if img.startswith('http'):
- download_with_progressbar(img, 'tmp.jpg')
- img = 'tmp.jpg'
- image_file = img
- img, flag, _ = check_and_read(image_file)
- if not flag:
- with open(image_file, 'rb') as f:
- np_arr = np.frombuffer(f.read(), dtype=np.uint8)
- img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
- if img is None:
- logger.error("error in loading image:{}".format(image_file))
- return None
- if isinstance(img, np.ndarray) and len(img.shape) == 2:
- img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
-
+ img = check_img(img)
res, _ = super().__call__(
img, return_ocr_result_in_table, img_idx=img_idx)
return res
@@ -644,7 +641,7 @@ def main():
if not flag_pdf:
if img is None:
- logger.error("error in loading image:{}".format(image_file))
+ logger.error("error in loading image:{}".format(img_path))
continue
img_paths = [[img_path, img]]
else:
diff --git a/ppstructure/pdf2word/pdf2word.md b/ppstructure/pdf2word/README.md
similarity index 100%
rename from ppstructure/pdf2word/pdf2word.md
rename to ppstructure/pdf2word/README.md
diff --git a/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml b/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d2be152f0bae7d87129904d87c56c6d777a1f338
--- /dev/null
+++ b/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml
@@ -0,0 +1,122 @@
+Global:
+ use_gpu: True
+ epoch_num: &epoch_num 200
+ log_smooth_window: 10
+ print_batch_step: 10
+ save_model_dir: ./output/ser_layoutxlm_xfund_zh
+ save_epoch_step: 2000
+ # evaluation is run every 187 iterations after the 0th iteration
+ eval_batch_step: [ 0, 187 ]
+ cal_metric_during_train: False
+ save_inference_dir:
+ use_visualdl: False
+ seed: 2022
+ infer_img: ppstructure/docs/kie/input/zh_val_42.jpg
+ save_res_path: ./output/ser_layoutxlm_xfund_zh/res
+
+Architecture:
+ model_type: kie
+ algorithm: &algorithm "LayoutXLM"
+ Transform:
+ Backbone:
+ name: LayoutXLMForSer
+ pretrained: True
+ checkpoints:
+ num_classes: &num_classes 7
+
+Loss:
+ name: VQASerTokenLayoutLMLoss
+ num_classes: *num_classes
+ key: "backbone_out"
+
+Optimizer:
+ name: AdamW
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Linear
+ learning_rate: 0.00005
+ epochs: *epoch_num
+ warmup_epoch: 2
+ regularizer:
+ name: L2
+ factor: 0.00000
+
+PostProcess:
+ name: VQASerTokenLayoutLMPostProcess
+ class_path: &class_path train_data/XFUND/class_list_xfun.txt
+
+Metric:
+ name: VQASerTokenMetric
+ main_indicator: hmean
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: train_data/XFUND/zh_train/image
+ label_file_list:
+ - train_data/XFUND/zh_train/train.json
+ ratio_list: [ 1.0 ]
+ transforms:
+ - DecodeImage: # load image
+ img_mode: RGB
+ channel_first: False
+ - VQATokenLabelEncode: # Class handling label
+ contains_re: False
+ algorithm: *algorithm
+ class_path: *class_path
+ - VQATokenPad:
+ max_seq_len: &max_seq_len 512
+ return_attention_mask: True
+ - VQASerTokenChunk:
+ max_seq_len: *max_seq_len
+ - Resize:
+ size: [224,224]
+ - NormalizeImage:
+ scale: 1
+ mean: [ 123.675, 116.28, 103.53 ]
+ std: [ 58.395, 57.12, 57.375 ]
+ order: 'hwc'
+ - ToCHWImage:
+ - KeepKeys:
+ keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
+ loader:
+ shuffle: True
+ drop_last: False
+ batch_size_per_card: 8
+ num_workers: 4
+
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: train_data/XFUND/zh_val/image
+ label_file_list:
+ - train_data/XFUND/zh_val/val.json
+ transforms:
+ - DecodeImage: # load image
+ img_mode: RGB
+ channel_first: False
+ - VQATokenLabelEncode: # Class handling label
+ contains_re: False
+ algorithm: *algorithm
+ class_path: *class_path
+ - VQATokenPad:
+ max_seq_len: *max_seq_len
+ return_attention_mask: True
+ - VQASerTokenChunk:
+ max_seq_len: *max_seq_len
+ - Resize:
+ size: [224,224]
+ - NormalizeImage:
+ scale: 1
+ mean: [ 123.675, 116.28, 103.53 ]
+ std: [ 58.395, 57.12, 57.375 ]
+ order: 'hwc'
+ - ToCHWImage:
+ - KeepKeys:
+ keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
+ loader:
+ shuffle: False
+ drop_last: False
+ batch_size_per_card: 8
+ num_workers: 4
diff --git a/test_tipc/configs/layoutxlm_ser/train_infer_python.txt b/test_tipc/configs/layoutxlm_ser/train_infer_python.txt
index 549a31e69e367237ec0396778162a5f91c8b7412..d07daa9a1429ec5cd1955ec64ded122a9d1a723d 100644
--- a/test_tipc/configs/layoutxlm_ser/train_infer_python.txt
+++ b/test_tipc/configs/layoutxlm_ser/train_infer_python.txt
@@ -13,7 +13,7 @@ train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg
null:null
##
trainer:norm_train
-norm_train:tools/train.py -c configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false
+norm_train:tools/train.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false
pact_train:null
fpgm_train:null
distill_train:null
@@ -27,7 +27,7 @@ null:null
===========================infer_params===========================
Global.save_inference_dir:./output/
Architecture.Backbone.checkpoints:
-norm_export:tools/export_model.py -c configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml -o
+norm_export:tools/export_model.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o
quant_export:
fpgm_export:
distill_export:null
diff --git a/test_tipc/configs/table_master/train_infer_python.txt b/test_tipc/configs/table_master/train_infer_python.txt
index 56b8e636026939ae8cd700308690010e1300d8f6..c3a871731a36fb5434db111cfd68b6eab7ba3f99 100644
--- a/test_tipc/configs/table_master/train_infer_python.txt
+++ b/test_tipc/configs/table_master/train_infer_python.txt
@@ -37,8 +37,8 @@ export2:null
infer_model:null
infer_export:null
infer_quant:False
-inference:ppstructure/table/predict_structure.py --table_char_dict_path=./ppocr/utils/dict/table_master_structure_dict.txt --image_dir=./ppstructure/docs/table/table.jpg --output ./output/table --table_algorithm=TableMaster --table_max_len=480
---use_gpu:True|False
+inference:ppstructure/table/predict_structure.py --table_char_dict_path=./ppocr/utils/dict/table_master_structure_dict.txt --output ./output/table --table_algorithm=TableMaster --table_max_len=480
+--use_gpu:True
--enable_mkldnn:False
--cpu_threads:6
--rec_batch_num:1
diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh
index bb4b58b4cac900166eeda4d9479fa6bd3fe69e02..1c0e16044c4264d14a93e2e0470ea95e7d5d4ba6 100644
--- a/test_tipc/prepare.sh
+++ b/test_tipc/prepare.sh
@@ -21,7 +21,11 @@ model_name=$(func_parser_value "${lines[1]}")
trainer_list=$(func_parser_value "${lines[14]}")
if [ ${MODE} = "benchmark_train" ];then
- pip install -r requirements.txt
+ python_name_list=$(func_parser_value "${lines[2]}")
+ array=(${python_name_list})
+ python_name=${array[0]}
+ ${python_name} -m pip install -r requirements.txt
+ ${python_name} -m pip install git+https://github.com/LDOUBLEV/AutoLog
if [[ ${model_name} =~ "ch_ppocr_mobile_v2_0_det" || ${model_name} =~ "det_mv3_db_v2_0" ]];then
wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate
rm -rf ./train_data/icdar2015
@@ -107,8 +111,8 @@ if [ ${MODE} = "benchmark_train" ];then
cd ../
fi
if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then
- pip install -r ppstructure/kie/requirements.txt
- pip install opencv-python -U
+ ${python_name} -m pip install -r ppstructure/kie/requirements.txt
+ ${python_name} -m pip install opencv-python -U
wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate
cd ./train_data/ && tar xf XFUND.tar
# expand gt.txt 10 times
@@ -122,6 +126,11 @@ if [ ${MODE} = "benchmark_train" ];then
fi
if [ ${MODE} = "lite_train_lite_infer" ];then
+ python_name_list=$(func_parser_value "${lines[2]}")
+ array=(${python_name_list})
+ python_name=${array[0]}
+ ${python_name} -m pip install -r requirements.txt
+ ${python_name} -m pip install git+https://github.com/LDOUBLEV/AutoLog
# pretrain lite train data
wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate
wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar --no-check-certificate
@@ -212,6 +221,10 @@ if [ ${MODE} = "lite_train_lite_infer" ];then
if [ ${model_name} == "ch_ppocr_mobile_v2_0_rec_FPGM" ]; then
wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar --no-check-certificate
cd ./pretrain_models/ && tar xf ch_ppocr_mobile_v2.0_rec_train.tar && cd ../
+ ${python_name} -m pip install paddleslim
+ fi
+ if [ ${model_name} == "ch_ppocr_mobile_v2_0_det_FPGM" ]; then
+ ${python_name} -m pip install paddleslim
fi
if [ ${model_name} == "det_mv3_east_v2_0" ]; then
wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar --no-check-certificate
@@ -230,8 +243,8 @@ if [ ${MODE} = "lite_train_lite_infer" ];then
cd ./pretrain_models/ && tar xf rec_r32_gaspin_bilstm_att_train.tar && cd ../
fi
if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then
- pip install -r ppstructure/kie/requirements.txt
- pip install opencv-python -U
+ ${python_name} -m pip install -r ppstructure/kie/requirements.txt
+ ${python_name} -m pip install opencv-python -U
wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate
cd ./train_data/ && tar xf XFUND.tar
cd ../
@@ -639,6 +652,7 @@ if [ ${MODE} = "serving_infer" ];then
${python_name} -m pip install paddle-serving-server-gpu
${python_name} -m pip install paddle_serving_client
${python_name} -m pip install paddle-serving-app
+ ${python_name} -m pip install git+https://github.com/LDOUBLEV/AutoLog
# wget model
if [ ${model_name} == "ch_ppocr_mobile_v2_0_det_KL" ] || [ ${model_name} == "ch_ppocr_mobile_v2.0_rec_KL" ] ; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/tipc_fake_model/ch_ppocr_mobile_v2.0_det_klquant_infer.tar --no-check-certificate