diff --git a/deploy/android_demo/app/src/main/cpp/native.cpp b/deploy/android_demo/app/src/main/cpp/native.cpp
index ced932556f09244d1e9e962e7b75461203a7cc3a..4961e5ecf141bb50701ecf9c3654a54f062937ce 100644
--- a/deploy/android_demo/app/src/main/cpp/native.cpp
+++ b/deploy/android_demo/app/src/main/cpp/native.cpp
@@ -47,7 +47,7 @@ str_to_cpu_mode(const std::string &cpu_mode) {
   std::string upper_key;
   std::transform(cpu_mode.cbegin(), cpu_mode.cend(), upper_key.begin(),
                  ::toupper);
-  auto index = cpu_mode_map.find(upper_key);
+  auto index = cpu_mode_map.find(upper_key.c_str());
   if (index == cpu_mode_map.end()) {
     LOGE("cpu_mode not found %s", upper_key.c_str());
     return paddle::lite_api::LITE_POWER_HIGH;
@@ -116,4 +116,4 @@ Java_com_baidu_paddle_lite_demo_ocr_OCRPredictorNative_release(
   ppredictor::OCR_PPredictor *ppredictor =
       (ppredictor::OCR_PPredictor *)java_pointer;
   delete ppredictor;
-}
\ No newline at end of file
+}
diff --git a/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java b/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java
index 622da2a3f9a1233167e777e62b687c1f246df01f..41fa183dea1d968582dbedf4e831c55b043ae00f 100644
--- a/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java
+++ b/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java
@@ -54,7 +54,7 @@ public class OCRPredictorNative {
     }

     public void destory() {
-        if (nativePointer > 0) {
+        if (nativePointer != 0) {
             release(nativePointer);
             nativePointer = 0;
         }
diff --git a/deploy/cpp_infer/docs/windows_vs2019_build.md b/deploy/cpp_infer/docs/windows_vs2019_build.md
index 4f391d925008b4bffcbd123e937eb608f502c646..bcaefa46f83a30a4c232add78dc2e9f521b9f84f 100644
--- a/deploy/cpp_infer/docs/windows_vs2019_build.md
+++ b/deploy/cpp_infer/docs/windows_vs2019_build.md
@@ -109,8 +109,10 @@ CUDA_LIB、CUDNN_LIB、TENSORRT_DIR、WITH_GPU、WITH_TENSORRT

 运行之前,将下面文件拷贝到`build/Release/`文件夹下
 1. `paddle_inference/paddle/lib/paddle_inference.dll`
-2. `opencv/build/x64/vc15/bin/opencv_world455.dll`
-3. 如果使用openblas版本的预测库还需要拷贝 `paddle_inference/third_party/install/openblas/lib/openblas.dll`
+2. `paddle_inference/third_party/install/onnxruntime/lib/onnxruntime.dll`
+3. `paddle_inference/third_party/install/paddle2onnx/lib/paddle2onnx.dll`
+4. `opencv/build/x64/vc15/bin/opencv_world455.dll`
+5. 如果使用openblas版本的预测库还需要拷贝 `paddle_inference/third_party/install/openblas/lib/openblas.dll`

 ### Step4: 预测
diff --git a/deploy/slim/quantization/README_en.md b/deploy/slim/quantization/README_en.md
index 33b2c4784afa4be68c8b9db1a02d83013c886655..c6796ae9dc256496308e432023c45ef1026c3d92 100644
--- a/deploy/slim/quantization/README_en.md
+++ b/deploy/slim/quantization/README_en.md
@@ -73,4 +73,4 @@ python deploy/slim/quantization/export_model.py -c configs/det/ch_ppocr_v2.0/ch_

 The numerical range of the quantized model parameters derived from the above steps is still FP32, but the numerical range of the parameters is int8. The derived model can be converted through the `opt tool` of PaddleLite.

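> The paragraph above mentions converting the exported quantized model with PaddleLite's `opt` tool. As a hedged illustration only — assuming PaddleLite's Python `Opt` API is available via `pip install paddlelite`, and with placeholder file names not taken from this patch — the conversion looks roughly like this:

```python
# Sketch only: convert a quantized inference model for PaddleLite deployment.
# The input/output paths below are illustrative placeholders.
from paddlelite.lite import Opt

opt = Opt()
opt.set_model_file("output/quant_inference_model/inference.pdmodel")    # hypothetical path
opt.set_param_file("output/quant_inference_model/inference.pdiparams")  # hypothetical path
opt.set_valid_places("arm")            # target the ARM mobile runtime
opt.set_optimize_out("ocr_quant_opt")  # prefix of the generated .nb model
opt.run()
```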
-For quantitative model deployment, please refer to [Mobile terminal model deployment](../../lite/readme_en.md)
+For quantized model deployment, please refer to [Mobile terminal model deployment](../../lite/readme.md)
diff --git a/paddleocr.py b/paddleocr.py
index d78046802eb8b8af42ae2718697a5cfc1e7186de..f6fb095af34a58cc91b9fd0f22b2e95bf833e010 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -636,4 +636,6 @@ def main():
             for item in result:
                 item.pop('img')
+                item.pop('res')
                 logger.info(item)
+            logger.info('result saved to {}'.format(args.output))
diff --git a/ppocr/data/imaug/copy_paste.py b/ppocr/data/imaug/copy_paste.py
index 0b3386c896792bd670cd2bfc757eb3b80f22bac4..79343da60fd40f8dc0ffe8927398b70cb751b532 100644
--- a/ppocr/data/imaug/copy_paste.py
+++ b/ppocr/data/imaug/copy_paste.py
@@ -35,10 +35,12 @@ class CopyPaste(object):
         point_num = data['polys'].shape[1]
         src_img = data['image']
         src_polys = data['polys'].tolist()
+        src_texts = data['texts']
         src_ignores = data['ignore_tags'].tolist()
         ext_data = data['ext_data'][0]
         ext_image = ext_data['image']
         ext_polys = ext_data['polys']
+        ext_texts = ext_data['texts']
         ext_ignores = ext_data['ignore_tags']
         indexs = [i for i in range(len(ext_ignores)) if not ext_ignores[i]]
@@ -53,7 +55,7 @@ class CopyPaste(object):
         src_img = cv2.cvtColor(src_img, cv2.COLOR_BGR2RGB)
         ext_image = cv2.cvtColor(ext_image, cv2.COLOR_BGR2RGB)
         src_img = Image.fromarray(src_img).convert('RGBA')
-        for poly, tag in zip(select_polys, select_ignores):
+        for idx, poly, tag in zip(select_idxs, select_polys, select_ignores):
             box_img = get_rotate_crop_image(ext_image, poly)

             src_img, box = self.paste_img(src_img, box_img, src_polys)
@@ -62,6 +64,7 @@ class CopyPaste(object):
                 for _ in range(len(box), point_num):
                     box.append(box[-1])
                 src_polys.append(box)
+                src_texts.append(ext_texts[idx])
                 src_ignores.append(tag)
         src_img = cv2.cvtColor(np.array(src_img), cv2.COLOR_RGB2BGR)
         h, w = src_img.shape[:2]
@@ -70,6 +73,7 @@ class CopyPaste(object):
         src_polys[:, :, 1] = np.clip(src_polys[:, :, 1], 0, h)
         data['image'] = src_img
         data['polys'] = src_polys
+        data['texts'] = src_texts
         data['ignore_tags'] = np.array(src_ignores)
         return data
diff --git a/ppocr/metrics/rec_metric.py b/ppocr/metrics/rec_metric.py
index d858ae28e999d546847727243b35f2ac902e1026..9863978116b1340fa809e8919a6a37d598d6bbdf 100644
--- a/ppocr/metrics/rec_metric.py
+++ b/ppocr/metrics/rec_metric.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import Levenshtein
+from rapidfuzz.distance import Levenshtein
 import string

@@ -46,8 +46,7 @@ class RecMetric(object):
             if self.is_filter:
                 pred = self._normalize_text(pred)
                 target = self._normalize_text(target)
-            norm_edit_dis += Levenshtein.distance(pred, target) / max(
-                len(pred), len(target), 1)
+            norm_edit_dis += Levenshtein.normalized_distance(pred, target)
             if pred == target:
                 correct_num += 1
             all_num += 1
diff --git a/ppocr/utils/dict/kie_dict/xfund_class_list.txt b/ppocr/utils/dict/kie_dict/xfund_class_list.txt
new file mode 100644
index 0000000000000000000000000000000000000000..faded9f9b8f56bd258909bec9b8f1755aa688367
--- /dev/null
+++ b/ppocr/utils/dict/kie_dict/xfund_class_list.txt
@@ -0,0 +1,4 @@
+OTHER
+QUESTION
+ANSWER
+HEADER
diff --git a/ppocr/utils/save_load.py b/ppocr/utils/save_load.py
index 0c652c8fdc88bd066d7202bb57c046aefbc20cc4..f86125521d19342f63a9fcb3bdcaed02cc4c6463 100644
--- a/ppocr/utils/save_load.py
+++ b/ppocr/utils/save_load.py
@@ -194,6 +194,9 @@ def save_model(model,
     _mkdir_if_not_exist(model_path, logger)
     model_prefix = os.path.join(model_path, prefix)
     paddle.save(optimizer.state_dict(), model_prefix + '.pdopt')
+
+    is_nlp_model = config['Architecture']["model_type"] == 'kie' and config[
+        "Architecture"]["algorithm"] not in ["SDMGR"]
     if is_nlp_model is not True:
         paddle.save(model.state_dict(), model_prefix + '.pdparams')
         metric_prefix = model_prefix
diff --git a/ppstructure/README.md b/ppstructure/README.md
index cff057e81909e620eaa86ffe464433cc3a5d6f21..66df10b2ec4d52fb743c40893d5fc5aa7d6ab5be 100644
--- a/ppstructure/README.md
+++ b/ppstructure/README.md
@@ -106,9 +106,9 @@ PP-Structure Series Model List (Updating)

 |model name|description|model size|download|
 | --- | --- | --- | --- |
-|ch_PP-OCRv3_det_slim|[New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar)|
-|ch_PP-OCRv3_rec_slim |[New] Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) |
-|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model trained on PubTabNet dataset based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
+|ch_PP-OCRv3_det| [New] Lightweight model, supporting Chinese, English, multilingual text detection | 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)|
+|ch_PP-OCRv3_rec| [New] Lightweight model, supporting Chinese, English, multilingual text recognition | 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
+|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |

 ### 7.3 KIE model
diff --git a/ppstructure/README_ch.md b/ppstructure/README_ch.md
index efd25eb2cbda585c3fc2e192cd8184ccc7e10c0d..597cceafdf4fa94433da31a87b5cf4fa663c30fb 100644
--- a/ppstructure/README_ch.md
+++ b/ppstructure/README_ch.md
@@ -120,9 +120,9 @@ PP-Structure系列模型列表(更新中)

 |模型名称|模型简介|模型大小|下载地址|
 | --- | --- | --- | --- |
-|ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar)|
-|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) |
-|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
+|ch_PP-OCRv3_det| 【最新】超轻量模型,支持中英文、多语种文本检测 | 3.8M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)|
+|ch_PP-OCRv3_rec|【最新】超轻量模型,支持中英文、数字识别|12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
+|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
diff --git a/ppstructure/docs/inference.md b/ppstructure/docs/inference.md
index b050900760067402b2b738ed8d0e94d6788aca4f..cf11960c1ccde00f102db1a33f2b0d0e5dc9c985 100644
--- a/ppstructure/docs/inference.md
+++ b/ppstructure/docs/inference.md
@@ -4,7 +4,7 @@
 - [1.1 版面分析+表格识别](#1.1)
 - [1.2 版面分析](#1.2)
 - [1.3 表格识别](#1.3)
-- [2. DocVQA](#2)
+- [2. 关键信息抽取](#2)

 ## 1. Structure
@@ -16,23 +16,26 @@
 cd ppstructure

 下载模型
 ```bash
 mkdir inference && cd inference
-# 下载PP-OCRv2文本检测模型并解压
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
-# 下载PP-OCRv2文本识别模型并解压
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
-# 下载超轻量级英文表格预测模型并解压
-wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
+# 下载PP-Structurev2版面分析模型并解压
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar
+# 下载PP-OCRv3文本检测模型并解压
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
+# 下载PP-OCRv3文本识别模型并解压
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
+# 下载PP-Structurev2表格识别模型并解压
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
 cd ..
 ```

 ### 1.1 版面分析+表格识别

 ```bash
-python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
-                          --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
-                          --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
+python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \
+                          --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
+                          --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
+                          --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
                           --image_dir=./docs/table/1.png \
                           --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
-                          --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+                          --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
                           --output=../output \
                           --vis_font_path=../doc/fonts/simfang.ttf
 ```
@@ -41,19 +44,23 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i

 ### 1.2 版面分析

 ```bash
-python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/
+python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
+                          --image_dir=./docs/table/1.png \
+                          --output=../output \
+                          --table=false \
+                          --ocr=false
 ```

 运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片区域会被裁剪之后保存下来,图片名为表格在图片里的坐标。版面分析结果会存储在`res.txt`文件中。

 ### 1.3 表格识别

 ```bash
-python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
-                          --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
-                          --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
+python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \
+                          --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
+                          --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
                           --image_dir=./docs/table/table.jpg \
                           --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
-                          --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+                          --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
                           --output=../output \
                           --vis_font_path=../doc/fonts/simfang.ttf \
                           --layout=false
@@ -61,20 +68,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i

 运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,表格会存储为一个excel,excel文件名为`[0,0,img_h,img_w]`。

-## 2. DocVQA
+## 2. 关键信息抽取

 ```bash
 cd ppstructure
-# 下载模型
 mkdir inference && cd inference
-# 下载SER xfun 模型并解压
-wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar
+# 下载SER XFUND 模型并解压
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar
 cd ..
-
-python3 predict_system.py --model_name_or_path=kie/PP-Layout_v1.0_ser_pretrained/ \
-                          --mode=kie \
-                          --image_dir=kie/images/input/zh_val_0.jpg \
-                          --vis_font_path=../doc/fonts/simfang.ttf
+python3 kie/predict_kie_token_ser.py \
+  --kie_algorithm=LayoutXLM \
+  --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \
+  --image_dir=./docs/kie/input/zh_val_42.jpg \
+  --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \
+  --vis_font_path=../doc/fonts/simfang.ttf \
+  --ocr_order_method="tb-yx"
 ```
+
 运行完成后,每张图片会在`output`字段指定的目录下的`kie`目录下存放可视化之后的图片,图片名和输入图片名一致。
diff --git a/ppstructure/docs/inference_en.md b/ppstructure/docs/inference_en.md
index ad16f048e3b08a45d6e6d76e630ba48483f263d4..357e26a11f7e86a342bb3dbf24ea3c721705ae98 100644
--- a/ppstructure/docs/inference_en.md
+++ b/ppstructure/docs/inference_en.md
@@ -4,7 +4,7 @@
 - [1.1 layout analysis + table recognition](#1.1)
 - [1.2 layout analysis](#1.2)
 - [1.3 table recognition](#1.3)
-- [2. DocVQA](#2)
+- [2. KIE](#2)

 ## 1. Structure
@@ -18,23 +18,26 @@
 download model
 ```bash
 mkdir inference && cd inference
-# Download the PP-OCRv2 text detection model and unzip it
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
-# Download the PP-OCRv2 text recognition model and unzip it
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
-# Download the ultra-lightweight English table structure model and unzip it
-wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
+# Download the PP-Structurev2 layout analysis model and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar
+# Download the PP-OCRv3 text detection model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
+# Download the PP-OCRv3 text recognition model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
+# Download the PP-Structurev2 table recognition model and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
 cd ..
 ```

 ### 1.1 layout analysis + table recognition

 ```bash
-python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
-                          --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
-                          --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
+python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \
+                          --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
+                          --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
+                          --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
                           --image_dir=./docs/table/1.png \
                           --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
-                          --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+                          --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
                           --output=../output \
                           --vis_font_path=../doc/fonts/simfang.ttf
 ```
@@ -43,19 +46,23 @@ After the operation is completed, each image will have a directory with the same

 ### 1.2 layout analysis

 ```bash
-python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/
+python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
+                          --image_dir=./docs/table/1.png \
+                          --output=../output \
+                          --table=false \
+                          --ocr=false
 ```

 After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each picture in image will be cropped and saved. The filename of picture area is their coordinates in the image. Layout analysis results will be stored in the `res.txt` file

 ### 1.3 table recognition

 ```bash
-python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
-                          --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
-                          --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
+python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \
+                          --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
+                          --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
                           --image_dir=./docs/table/table.jpg \
                           --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
-                          --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+                          --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
                           --output=../output \
                           --vis_font_path=../doc/fonts/simfang.ttf \
                           --layout=false
@@ -63,19 +70,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i

 After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an excel. The filename of excel is their coordinates in the image.

-## 2. DocVQA
+## 2. KIE

 ```bash
 cd ppstructure
-# download model
 mkdir inference && cd inference
+# download model
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar
 cd ..
-
-python3 predict_system.py --model_name_or_path=kie/PP-Layout_v1.0_ser_pretrained/ \
-                          --mode=kie \
-                          --image_dir=kie/images/input/zh_val_0.jpg \
-                          --vis_font_path=../doc/fonts/simfang.ttf
+python3 kie/predict_kie_token_ser.py \
+  --kie_algorithm=LayoutXLM \
+  --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \
+  --image_dir=./docs/kie/input/zh_val_42.jpg \
+  --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \
+  --vis_font_path=../doc/fonts/simfang.ttf \
+  --ocr_order_method="tb-yx"
 ```
+
 After the operation is completed, each image will store the visualized image in the `kie` directory under the directory specified by the `output` field, and the image name is the same as the input image name.
diff --git a/ppstructure/docs/installation.md b/ppstructure/docs/installation.md
index 3649e729d04ec83ba2d97571af993d75358eec73..0635580234abe2716769441d845df6386fbf5b86 100644
--- a/ppstructure/docs/installation.md
+++ b/ppstructure/docs/installation.md
@@ -1,7 +1,7 @@
 - [快速安装](#快速安装)
   - [1. PaddlePaddle 和 PaddleOCR](#1-paddlepaddle-和-paddleocr)
   - [2. 安装其他依赖](#2-安装其他依赖)
-  - [2.1 VQA所需依赖](#21--kie所需依赖)
+  - [2.1 KIE所需依赖](#21-kie所需依赖)

 # 快速安装

@@ -11,16 +11,11 @@

 ## 2. 安装其他依赖

-### 2.1 VQA所需依赖
-* paddleocr
+### 2.1 KIE所需依赖

-```bash
-pip3 install paddleocr
-```
+* paddleocr

-* PaddleNLP
 ```bash
-git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
-cd PaddleNLP
-pip3 install -e .
+pip install paddleocr -U
+pip install -r ./kie/requirements.txt
 ```
diff --git a/ppstructure/docs/installation_en.md b/ppstructure/docs/installation_en.md
index 02b02db0c58f60a5296734b93563510732a7286d..de8bb5f6fc06fbd4f21cb0ca00ec80cce109ebf7 100644
--- a/ppstructure/docs/installation_en.md
+++ b/ppstructure/docs/installation_en.md
@@ -2,7 +2,7 @@

 - [1. PaddlePaddle 和 PaddleOCR](#1)
 - [2. Install other dependencies](#2)
-  - [2.1 VQA](#21)
+  - [2.1 KIE](#21)

@@ -14,17 +14,11 @@ Please refer to [PaddleOCR installation documentation](../../doc/doc_en/installa

 ## 2. Install other dependencies

-### 2.1 VQA
+### 2.1 KIE

 * paddleocr

 ```bash
-pip3 install paddleocr
-```
-
-* PaddleNLP
-```bash
-git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
-cd PaddleNLP
-pip3 install -e .
+pip install paddleocr -U
+pip install -r ./kie/requirements.txt
 ```
diff --git a/ppstructure/docs/models_list.md b/ppstructure/docs/models_list.md
index f4c63659bdb558ba92f9e974f0ffd3ef4dbc28e4..935d12d756eec467574f9ae32d48c70a3ea054c3 100644
--- a/ppstructure/docs/models_list.md
+++ b/ppstructure/docs/models_list.md
@@ -28,8 +28,8 @@

 |模型名称|模型简介|推理模型大小|下载地址|
 | --- | --- | --- | --- |
-|en_ppocr_mobile_v2.0_table_det|PubLayNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) |
-|en_ppocr_mobile_v2.0_table_rec|PubLayNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) |
+|en_ppocr_mobile_v2.0_table_det|PubTabNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) |
+|en_ppocr_mobile_v2.0_table_rec|PubTabNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) |

 如需要使用其他OCR模型,可以在 [PP-OCR model_list](../../doc/doc_ch/models_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。

@@ -40,7 +40,7 @@
 | --- | --- | --- | --- |
 |en_ppocr_mobile_v2.0_table_structure|基于TableRec-RARE在PubTabNet数据集上训练的英文表格识别模型|6.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) |
 |en_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的英文表格识别模型|9.2M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) |
-|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
+|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
diff --git a/ppstructure/docs/models_list_en.md b/ppstructure/docs/models_list_en.md
index 7d840b9d412703fb0a9e8f32889414e4489133b8..85531fb753c4e32f0cdc9296ab97a9faebbb0ebd 100644
--- a/ppstructure/docs/models_list_en.md
+++ b/ppstructure/docs/models_list_en.md
@@ -39,7 +39,7 @@ If you need to use other OCR models, you can download the model in [PP-OCR model
 | --- |-----------------------------------------------------------------------------| --- | --- |
 |en_ppocr_mobile_v2.0_table_structure| English table recognition model trained on PubTabNet dataset based on TableRec-RARE |6.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) |
 |en_ppstructure_mobile_v2.0_SLANet|English table recognition model trained on PubTabNet dataset based on SLANet|9.2M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) |
-|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model trained on PubTabNet dataset based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
+|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |

 ## 3. KIE
diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md
index 51cd1015685549906decd329960823bc52f9c28f..38a37bebcb561228ddfb2b2970b9ddbaebbcb19a 100644
--- a/ppstructure/docs/quickstart.md
+++ b/ppstructure/docs/quickstart.md
@@ -7,18 +7,22 @@
   - [2.1.2 版面分析+表格识别](#212-版面分析表格识别)
   - [2.1.3 版面分析](#213-版面分析)
   - [2.1.4 表格识别](#214-表格识别)
-  - [2.1.5 DocVQA](#215-dockie)
+  - [2.1.5 关键信息抽取](#215-关键信息抽取)
   - [2.1.6 版面恢复](#216-版面恢复)
 - [2.2 代码使用](#22-代码使用)
-  - [2.2.1 图像方向分类版面分析表格识别](#221-图像方向分类版面分析表格识别)
+
+  - [2.2.1 图像方向分类+版面分析+表格识别](#221-图像方向分类版面分析表格识别)
   - [2.2.2 版面分析+表格识别](#222-版面分析表格识别)
   - [2.2.3 版面分析](#223-版面分析)
   - [2.2.4 表格识别](#224-表格识别)
-  - [2.2.5 DocVQA](#225-dockie)
+
+  - [2.2.5 关键信息抽取](#225-关键信息抽取)
   - [2.2.6 版面恢复](#226-版面恢复)
+
 - [2.3 返回结果说明](#23-返回结果说明)
-  - [2.3.1 版面分+表格识别](#231-版面分析表格识别)
-  - [2.3.2 DocVQA](#232-dockie)
+  - [2.3.1 版面分析+表格识别](#231-版面分析表格识别)
+  - [2.3.2 关键信息抽取](#232-关键信息抽取)
+
 - [2.4 参数说明](#24-参数说明)

@@ -28,8 +32,8 @@
 ```bash
 # 安装 paddleocr,推荐使用2.6版本
 pip3 install "paddleocr>=2.6"
-# 安装 DocVQA依赖包paddlenlp(如不需要DocVQA功能,可跳过)
-pip3 install paddlenlp
+# 安装 关键信息抽取 依赖包(如不需要KIE功能,可跳过)
+pip install -r kie/requirements.txt
 # 安装 图像方向分类依赖包paddleclas(如不需要图像方向分类功能,可跳过)
 pip3 install paddleclas
 ```
@@ -66,9 +70,8 @@ paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structur

-#### 2.1.5 DocVQA
-
-请参考:[文档视觉问答](../kie/README.md)。
+#### 2.1.5 关键信息抽取
+请参考:[关键信息抽取教程](../kie/README_ch.md)。

@@ -184,9 +187,9 @@ for line in result:
 ```

-#### 2.2.5 DocVQA
+#### 2.2.5 关键信息抽取

-请参考:[文档视觉问答](../kie/README.md)。
+请参考:[关键信息抽取教程](../kie/README_ch.md)。

@@ -249,9 +252,9 @@ dict 里各个字段说明如下
 ```

-#### 2.3.2 DocVQA
+#### 2.3.2 关键信息抽取

-请参考:[文档视觉问答](../kie/README.md)。
+请参考:[关键信息抽取教程](../kie/README_ch.md)。

 ### 2.4 参数说明
diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md
index cccb30f839303948db751f2cb58d42b0897881be..dbfbf43b01c94bd6f9c729f2f6edcd1dd6aee056 100644
--- a/ppstructure/docs/quickstart_en.md
+++ b/ppstructure/docs/quickstart_en.md
@@ -7,7 +7,7 @@
   - [2.1.2 layout analysis + table recognition](#212-layout-analysis--table-recognition)
   - [2.1.3 layout analysis](#213-layout-analysis)
   - [2.1.4 table recognition](#214-table-recognition)
-  - [2.1.5 DocVQA](#215-dockie)
+  - [2.1.5 Key Information Extraction](#215-Key-Information-Extraction)
   - [2.1.6 layout recovery](#216-layout-recovery)
 - [2.2 Use by code](#22-use-by-code)
   - [2.2.1 image orientation + layout analysis + table recognition](#221-image-orientation--layout-analysis--table-recognition)
@@ -15,10 +15,11 @@
   - [2.2.3 layout analysis](#223-layout-analysis)
   - [2.2.4 table recognition](#224-table-recognition)
   - [2.2.5 DocVQA](#225-dockie)
+  - [2.2.5 Key Information Extraction](#225-Key-Information-Extraction)
   - [2.2.6 layout recovery](#226-layout-recovery)
 - [2.3 Result description](#23-result-description)
   - [2.3.1 layout analysis + table recognition](#231-layout-analysis--table-recognition)
-  - [2.3.2 DocVQA](#232-dockie)
+  - [2.3.2 Key Information Extraction](#232-Key-Information-Extraction)
 - [2.4 Parameter Description](#24-parameter-description)

@@ -28,14 +29,14 @@
 ```bash
 # Install paddleocr, version 2.6 is recommended
 pip3 install "paddleocr>=2.6"
-# Install the DocVQA dependency package paddlenlp (if you do not use the DocVQA, you can skip it)
-pip3 install paddlenlp
+# Install the KIE dependency packages (if you do not use KIE, you can skip it)
+pip install -r kie/requirements.txt
 # Install the image direction classification dependency package paddleclas (if you do not use the image direction classification, you can skip it)
 pip3 install paddleclas
-
 ```
+

 ## 2. Use

@@ -66,9 +67,9 @@ paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structur
 ```

-#### 2.1.5 DocVQA
+#### 2.1.5 Key Information Extraction

-Please refer to: [Documentation Visual Q&A](../kie/README.md) .
+Please refer to: [Key Information Extraction](../kie/README.md).

 #### 2.1.6 layout recovery
@@ -130,7 +131,7 @@ for line in result:

 from PIL import Image

-font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包
+font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # font provided in PaddleOCR
 image = Image.open(img_path).convert('RGB')
 im_show = draw_structure_result(image, result,font_path=font_path)
 im_show = Image.fromarray(im_show)
@@ -180,9 +181,9 @@ for line in result:
 ```

-#### 2.2.5 DocVQA
+#### 2.2.5 Key Information Extraction

-Please refer to: [Documentation Visual Q&A](../kie/README.md) .
+Please refer to: [Key Information Extraction](../kie/README.md).

 #### 2.2.6 layout recovery
@@ -244,9 +245,9 @@ After the recognition is completed, each image will have a directory with the sa
 ```

-#### 2.3.2 DocVQA
+#### 2.3.2 Key Information Extraction

-Please refer to: [Documentation Visual Q&A](../kie/README.md) .
+Please refer to: [Key Information Extraction](../kie/README.md).

 ### 2.4 Parameter Description
diff --git a/ppstructure/kie/README.md b/ppstructure/kie/README.md
index 9e1b72e772f03a9dadd202268c39cba11f8f121e..adb19a3ca729821ab16bf8f0f8ec14c2376de1de 100644
--- a/ppstructure/kie/README.md
+++ b/ppstructure/kie/README.md
@@ -246,7 +246,7 @@ For training, evaluation and inference tutorial for text recognition models, ple

 If you want to finish the KIE tasks in your scene, and don't know what to prepare, please refer to [End cdoc](../../doc/doc_en/recognition.md).

-关于怎样在自己的场景中完成关键信息抽取任务,请参考:[Guide to End-to-end KIE](./how_to_do_kie_en.md)。
+To complete the key information extraction task in your own scenario, from data preparation to model selection, please refer to: [Guide to End-to-end KIE](./how_to_do_kie_en.md).

 ## 5. Reference
diff --git a/ppstructure/kie/tools/eval_with_label_end2end.py b/ppstructure/kie/tools/eval_with_label_end2end.py
index b13ffb568fd9610fee5d5a246c501ed5b90de91a..b0fd84363f450dfb7e4ef18e53adc17ef088cf18 100644
--- a/ppstructure/kie/tools/eval_with_label_end2end.py
+++ b/ppstructure/kie/tools/eval_with_label_end2end.py
@@ -20,7 +20,7 @@ from shapely.geometry import Polygon
 import numpy as np
 from collections import defaultdict
 import operator
-import Levenshtein
+from rapidfuzz.distance import Levenshtein
 import argparse
 import json
 import copy
diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md
index a5d0da3ccd7b1893d826f026609ec39b804218da..e5c85eb9619ea92cd8b31041907d518eeceaf6a5 100644
--- a/ppstructure/table/README.md
+++ b/ppstructure/table/README.md
@@ -59,16 +59,16 @@ cd PaddleOCR/ppstructure
 # download model
 mkdir inference && cd inference
 # Download the PP-OCRv3 text detection model and unzip it
-wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
 # Download the PP-OCRv3 text recognition model and unzip it
-wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv3_rec_slim_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
 # Download the PP-Structurev2 form recognition model and unzip it
 wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
 cd ..
 # run
 python3.7 table/predict_table.py \
-    --det_model_dir=inference/ch_PP-OCRv3_det_slim_infer \
-    --rec_model_dir=inference/ch_PP-OCRv3_rec_slim_infer \
+    --det_model_dir=inference/ch_PP-OCRv3_det_infer \
+    --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
     --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
     --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
    --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md
index e83c81befbea95ab1e0a1f532901e39a4d80bd9d..086e39348e96abe4320debef1cc11487694ccd49 100644
--- a/ppstructure/table/README_ch.md
+++ b/ppstructure/table/README_ch.md
@@ -64,16 +64,16 @@ cd PaddleOCR/ppstructure
 # 下载模型
 mkdir inference && cd inference
 # 下载PP-OCRv3文本检测模型并解压
-wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
 # 下载PP-OCRv3文本识别模型并解压
-wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv3_rec_slim_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
 # 下载PP-Structurev2表格识别模型并解压
 wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
 cd ..
 # 执行表格识别
 python table/predict_table.py \
-    --det_model_dir=inference/ch_PP-OCRv3_det_slim_infer \
-    --rec_model_dir=inference/ch_PP-OCRv3_rec_slim_infer \
+    --det_model_dir=inference/ch_PP-OCRv3_det_infer \
+    --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
     --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
     --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
     --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
diff --git a/ppstructure/table/table_metric/table_metric.py b/ppstructure/table/table_metric/table_metric.py
index 9aca98ad785d4614a803fa5a277a6e4a27b3b078..923a9c0071d083de72a2a896d6f62037373d4e73 100755
--- a/ppstructure/table/table_metric/table_metric.py
+++ b/ppstructure/table/table_metric/table_metric.py
@@ -9,7 +9,7 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # Apache 2.0 License for more details.

-import distance
+from rapidfuzz.distance import Levenshtein
 from apted import APTED, Config
 from apted.helpers import Tree
 from lxml import etree, html
@@ -39,17 +39,6 @@ class TableTree(Tree):


 class CustomConfig(Config):
-    @staticmethod
-    def maximum(*sequences):
-        """Get maximum possible value
-        """
-        return max(map(len, sequences))
-
-    def normalized_distance(self, *sequences):
-        """Get distance from 0 to 1
-        """
-        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
-
     def rename(self, node1, node2):
         """Compares attributes of trees"""
         #print(node1.tag)
@@ -58,23 +47,12 @@ class CustomConfig(Config):
         if node1.tag == 'td':
             if node1.content or node2.content:
                 #print(node1.content, )
-                return self.normalized_distance(node1.content, node2.content)
+                return Levenshtein.normalized_distance(node1.content, node2.content)
         return 0.


 class CustomConfig_del_short(Config):
-    @staticmethod
-    def maximum(*sequences):
-        """Get maximum possible value
-        """
-        return max(map(len, sequences))
-
-    def normalized_distance(self, *sequences):
-        """Get distance from 0 to 1
-        """
-        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
-
     def rename(self, node1, node2):
         """Compares attributes of trees"""
         if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
@@ -90,21 +68,10 @@ class CustomConfig_del_short(Config):
                 node1_content = ['####']
             if len(node2_content) < 3:
                 node2_content = ['####']
-            return self.normalized_distance(node1_content, node2_content)
+            return Levenshtein.normalized_distance(node1_content, node2_content)
         return 0.


 class CustomConfig_del_block(Config):
-    @staticmethod
-    def maximum(*sequences):
-        """Get maximum possible value
-        """
-        return max(map(len, sequences))
-
-    def normalized_distance(self, *sequences):
-        """Get distance from 0 to 1
-        """
-        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
-
     def rename(self, node1, node2):
         """Compares attributes of trees"""
         if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
@@ -120,7 +87,7 @@ class CustomConfig_del_block(Config):
             while ' ' in node2_content:
                 print(node2_content.index(' '))
                 node2_content.pop(node2_content.index(' '))
-            return self.normalized_distance(node1_content, node2_content)
+            return Levenshtein.normalized_distance(node1_content, node2_content)
         return 0.

 class TEDS(object):
diff --git a/ppstructure/utility.py b/ppstructure/utility.py
index 4df726118c0cca8ffc58a423357eda11e5221919..bdea0af69e37e15d1f191b2a86c036ae1c2b1e45 100644
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -38,7 +38,7 @@ def init_args():
     parser.add_argument(
         "--layout_dict_path",
         type=str,
-        default="../ppocr/utils/dict/layout_dict/layout_pubalynet_dict.txt")
+        default="../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt")
     parser.add_argument(
         "--layout_score_threshold",
         type=float,
diff --git a/requirements.txt b/requirements.txt
index b15176db3eb42c381c1612f404fd15c6b020b3dc..976d29192abbbf89b8ee6064c0b4ec48d43ad268 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ lmdb
 tqdm
 numpy
 visualdl
-python-Levenshtein
+rapidfuzz
 opencv-contrib-python==4.4.0.46
 cython
 lxml
diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py
index 252ed1aaf2eb441415dff3bb823814936ab5f24f..e0f2c41fa2aba23491efee920afbd76db1ec84e0 100755
--- a/tools/infer/predict_system.py
+++ b/tools/infer/predict_system.py
@@ -120,11 +120,14 @@ def sorted_boxes(dt_boxes):
     _boxes = list(sorted_boxes)

     for i in range(num_boxes - 1):
-        if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \
-                (_boxes[i + 1][0][0] < _boxes[i][0][0]):
-            tmp = _boxes[i]
-            _boxes[i] = _boxes[i + 1]
-            _boxes[i + 1] = tmp
+        for j in range(i, -1, -1):
+            if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
+                    (_boxes[j + 1][0][0] < _boxes[j][0][0]):
+                tmp = _boxes[j]
+                _boxes[j] = _boxes[j + 1]
+                _boxes[j + 1] = tmp
+            else:
+                break
     return _boxes
diff --git a/tools/infer/utility.py b/tools/infer/utility.py
index 1355ca62e5c6b50177d1c7c2f6d7a016e168f545..8d3e93992d9d8cbd19fdd2c071565c940d011883 100644
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -225,23 +225,24 @@ def create_predictor(args, mode, logger):
                 min_subgraph_size,  # skip the minmum trt subgraph
                 use_calib_mode=False)
-            # collect shape
-            if args.shape_info_filename is not None:
-                if not os.path.exists(args.shape_info_filename):
-                    config.collect_shape_range_info(args.shape_info_filename)
-                    logger.info(
-                        f"collect dynamic shape info into : {args.shape_info_filename}"
-                    )
+        # collect shape
+        if args.shape_info_filename is not None:
+            if not os.path.exists(args.shape_info_filename):
+                config.collect_shape_range_info(
+                    args.shape_info_filename)
+                logger.info(
+                    f"collect dynamic shape info into : {args.shape_info_filename}"
+                )
+            else:
+                logger.info(
+                    f"dynamic shape info file( {args.shape_info_filename} ) already exists, no need to generate again."
+                )
+            config.enable_tuned_tensorrt_dynamic_shape(
+                args.shape_info_filename, True)
+        else:
             logger.info(
-                f"dynamic shape info file( {args.shape_info_filename} ) already exists, not need to generate again."
+                f"when using tensorrt, dynamic shape is a suggested option, you can use '--shape_info_filename=shape.txt' for offline dynamic shape tuning"
             )
-            config.enable_tuned_tensorrt_dynamic_shape(
-                args.shape_info_filename, True)
-        else:
-            logger.info(
-                f"when using tensorrt, dynamic shape is a suggested option, you can use '--shape_info_filename=shape.txt' for offline dygnamic shape tuning"
-            )

         elif args.use_xpu:
             config.enable_xpu(10 * 1024 * 1024)
@@ -549,7 +550,7 @@ def text_visual(texts,
 def base64_to_cv2(b64str):
     import base64
     data = base64.b64decode(b64str.encode('utf8'))
-    data = np.fromstring(data, np.uint8)
+    data = np.frombuffer(data, np.uint8)
     data = cv2.imdecode(data, cv2.IMREAD_COLOR)
     return data
diff --git a/tools/infer_kie.py b/tools/infer_kie.py
index 346e2e0aeeee695ab49577b6b13dcc058150df1a..9375434cc887b08dfa746420a6c73c58c6e04797 100755
--- a/tools/infer_kie.py
+++ b/tools/infer_kie.py
@@ -88,6 +88,29 @@ def draw_kie_result(batch, node, idx_to_cls, count):
     cv2.imwrite(save_path, vis_img)
     logger.info("The Kie Image saved in {}".format(save_path))

+
+def write_kie_result(fout, node, data):
+    """
+    Write infer result to output file, sorted by the predict label of each line.
+    The format keeps the same as the input with additional score attribute.
+    """
+    import json
+    label = data['label']
+    annotations = json.loads(label)
+    max_value, max_idx = paddle.max(node, -1), paddle.argmax(node, -1)
+    node_pred_label = max_idx.numpy().tolist()
+    node_pred_score = max_value.numpy().tolist()
+    res = []
+    for i, label in enumerate(node_pred_label):
+        pred_score = '{:.2f}'.format(node_pred_score[i])
+        pred_res = {
+            'label': label,
+            'transcription': annotations[i]['transcription'],
+            'score': pred_score,
+            'points': annotations[i]['points'],
+        }
+        res.append(pred_res)
+    res.sort(key=lambda x: x['label'])
+    fout.writelines([json.dumps(res, ensure_ascii=False) + '\n'])

 def main():
     global_config = config['Global']
@@ -114,7 +137,7 @@ def main():
         warmup_times = 0
         count_t = []

-    with open(save_res_path, "wb") as fout:
+    with open(save_res_path, "w") as fout:
         with open(config['Global']['infer_img'], "rb") as f:
             lines = f.readlines()
             for index, data_line in enumerate(lines):
@@ -139,6 +162,8 @@ def main():
                 node = F.softmax(node, -1)
                 count_t.append(time.time() - st)
                 draw_kie_result(batch, node, idx_to_cls, index)
+                write_kie_result(fout, node, data)
+    fout.close()
     logger.info("success!")
     logger.info("It took {} s for predict {} images.".format(
         np.sum(count_t), len(count_t)))
diff --git a/tools/infer_kie_token_ser_re.py b/tools/infer_kie_token_ser_re.py
index 40784e39be4784621c56ae84c1819720231c032f..3ee696f28470a16205be628b3aeb586ef7a9c6a6 100755
--- a/tools/infer_kie_token_ser_re.py
+++ b/tools/infer_kie_token_ser_re.py
@@ -39,7 +39,7 @@ from ppocr.utils.visual import draw_re_results
 from ppocr.utils.logging import get_logger
 from ppocr.utils.utility import get_image_file_list, load_vqa_bio_label_maps, print_dict
 from tools.program import ArgsParser, load_config, merge_config
-from tools.infer_vqa_token_ser import SerPredictor
+from tools.infer_kie_token_ser import SerPredictor


 class ReArgsParser(ArgsParser):
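
> Several hunks above replace `python-Levenshtein` and `distance` with `rapidfuzz` (`rec_metric.py`, `table_metric.py`, `eval_with_label_end2end.py`, requirements.txt). As a minimal sketch — assuming `rapidfuzz>=2.0` per the updated requirements, and with illustrative sample strings — the built-in `normalized_distance` matches the manual `distance / max(len, len, 1)` formula the old code computed:

```python
# Sketch: rapidfuzz's built-in normalization vs. the manual formula it replaces.
from rapidfuzz.distance import Levenshtein

pred, target = "hel1o", "hello"  # illustrative strings, not from the patch

# Old style (python-Levenshtein): normalize the raw edit distance by hand.
manual = Levenshtein.distance(pred, target) / max(len(pred), len(target), 1)

# New style: rapidfuzz divides by max(len(pred), len(target)) internally and
# returns 0.0 when both inputs are empty, so the max(..., 1) guard is unneeded.
builtin = Levenshtein.normalized_distance(pred, target)

assert abs(manual - builtin) < 1e-9
print(manual, builtin)  # 0.2 0.2
```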