diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py
index c17db91a5b5cd9d3cbb4b5bf6c87afd745d0870d..d0d2bb721be41fe2c4042fbea1b55e4e76bdd664 100644
--- a/PPOCRLabel/PPOCRLabel.py
+++ b/PPOCRLabel/PPOCRLabel.py
@@ -2531,7 +2531,7 @@ class MainWindow(QMainWindow):
                 split = 'test'
 
             # save dict
-            html = {'structure': {'tokens': token_list}, 'cell': cells}
+            html = {'structure': {'tokens': token_list}, 'cells': cells}
             json_results.append({'filename': os.path.basename(image_path), 'split': split, 'imgid': imgid, 'html': html})
             imgid += 1
diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md
index 3bdc336827adb87f52e9baa2c012304595b2c656..089a63fd55bb8c127104e7c404852ba52c3ac88c 100644
--- a/PPOCRLabel/README.md
+++ b/PPOCRLabel/README.md
@@ -1,10 +1,14 @@
 English | [简体中文](README_ch.md)
 
-# PPOCRLabel
+# PPOCRLabelv2
 
-PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PP-OCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box, table and multi-point annotation modes. Annotations can be directly used for the training of PP-OCR detection and recognition models.
+PPOCRLabelv2 is a semi-automatic graphic annotation tool suitable for the OCR field, with a built-in PP-OCR model to automatically detect and re-recognize data. It is written in Python3 and PyQT5, supporting rectangular box, table, irregular text and key information annotation modes. Annotations can be directly used for the training of PP-OCR detection and recognition models.
 
-
+| regular text annotation | table annotation |
+| :-------------------------------------------------: | :--------------------------------------------: |
+|  |  |
+| **irregular text annotation** | **key information annotation** |
+|  |  |
 
 ### Recent Update
diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md
index 107f902a68bd68b30d286e8dd88b29752f0c6ad0..3ea684a3f09a6084403fa0b91e2511b7fd790f4b 100644
--- a/PPOCRLabel/README_ch.md
+++ b/PPOCRLabel/README_ch.md
@@ -1,10 +1,14 @@
 [English](README.md) | 简体中文
 
-# PPOCRLabel
+# PPOCRLabelv2
 
 PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写,支持矩形框标注和四点标注模式,导出格式可直接用于PaddleOCR检测和识别模型的训练。
 
-
+| 常规标注 | 表格标注 |
+| :-------------------------------------------------: | :--------------------------------------------: |
+|  |  |
+| **不规则文本标注** | **关键信息标注** |
+|  |  |
 
 #### 近期更新
 
 - 2022.05:**新增表格标注**,使用方法见下方`2.2 表格标注`(by [whjdark](https://github.com/peterh0323); [Evezerest](https://github.com/Evezerest))
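Note on the PPOCRLabel.py hunk above: the table export now writes the cell list under `'cells'`, matching the PubTabNet-style records that the table-recognition training pipeline consumes. A minimal sketch of the record being assembled, with hypothetical stand-in values for the tool's internal state:

```python
import os

# Hypothetical stand-ins for PPOCRLabel's internal state at export time.
image_path = "train_data/table/val/0001.jpg"
token_list = ["<tr>", "<td>", "</td>", "</tr>"]        # structure tokens
cells = [{"tokens": ["1"], "bbox": [10, 12, 58, 30]}]  # per-cell content
imgid, split = 0, "val"

# The patched key is 'cells' (plural), the name the downstream
# PubTabNet-style loader expects.
html = {"structure": {"tokens": token_list}, "cells": cells}
record = {
    "filename": os.path.basename(image_path),
    "split": split,
    "imgid": imgid,
    "html": html,
}
print(record)
```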
diff --git a/PPOCRLabel/setup.py b/PPOCRLabel/setup.py
index 1ec54df11a75b8a7ad8f023ca4a5b24ef5343d71..1750f84b8259a237fb6bb1b5eb9dc33e29441bc1 100644
--- a/PPOCRLabel/setup.py
+++ b/PPOCRLabel/setup.py
@@ -33,7 +33,7 @@ setup(
     package_dir={'PPOCRLabel': ''},
     include_package_data=True,
     entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]},
-    version='1.0.2',
+    version='2.1.1',
     install_requires=requirements,
     license='Apache License 2.0',
     description='PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models',
diff --git a/configs/table/SLANet_ch.yml b/configs/table/SLANet_ch.yml
index 997ff0a77b5ea824957abc1d32a7ba7f70abc12c..a3fc1c68ddd2287e6bbaa6c53c5e20961950d23f 100644
--- a/configs/table/SLANet_ch.yml
+++ b/configs/table/SLANet_ch.yml
@@ -107,7 +107,7 @@ Train:
 Eval:
   dataset:
     name: PubTabDataSet
-    data_dir: train_data/table/val/ 
+    data_dir: train_data/table/val/
     label_file_list: [train_data/table/val.txt]
     transforms:
       - DecodeImage:
diff --git a/deploy/cpp_infer/src/utility.cpp b/deploy/cpp_infer/src/utility.cpp
index 4bfc1d091d6124b10c79032beb702ba8727210fc..251184b91bb8efed4b58bbf2bc3d11ea6a1bf916 100644
--- a/deploy/cpp_infer/src/utility.cpp
+++ b/deploy/cpp_infer/src/utility.cpp
@@ -268,11 +268,12 @@ cv::Mat Utility::crop_image(cv::Mat &img, std::vector<int> &area) {
 
 void Utility::sorted_boxes(std::vector<OCRPredictResult> &ocr_result) {
   std::sort(ocr_result.begin(), ocr_result.end(), Utility::comparison_box);
-
-  for (int i = 0; i < ocr_result.size() - 1; i++) {
-    if (abs(ocr_result[i + 1].box[0][1] - ocr_result[i].box[0][1]) < 10 &&
-        (ocr_result[i + 1].box[0][0] < ocr_result[i].box[0][0])) {
-      std::swap(ocr_result[i], ocr_result[i + 1]);
+  if (ocr_result.size() > 0) {
+    for (int i = 0; i < ocr_result.size() - 1; i++) {
+      if (abs(ocr_result[i + 1].box[0][1] - ocr_result[i].box[0][1]) < 10 &&
+          (ocr_result[i + 1].box[0][0] < ocr_result[i].box[0][0])) {
+        std::swap(ocr_result[i], ocr_result[i + 1]);
+      }
     }
   }
 }
diff --git a/paddleocr.py b/paddleocr.py
index 0b7aed36279081f50208f75272fc54c5081929a7..fa732fc110dc7873f8d89b2ca2a21817a1e6d20d 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -414,6 +414,33 @@ def get_model_config(type, version, model_type, lang):
     return model_urls[version][model_type][lang]
 
 
+def img_decode(content: bytes):
+    np_arr = np.frombuffer(content, dtype=np.uint8)
+    return cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
+
+
+def check_img(img):
+    if isinstance(img, bytes):
+        img = img_decode(img)
+    if isinstance(img, str):
+        # download net image
+        if is_link(img):
+            download_with_progressbar(img, 'tmp.jpg')
+            img = 'tmp.jpg'
+        image_file = img
+        img, flag, _ = check_and_read(image_file)
+        if not flag:
+            with open(image_file, 'rb') as f:
+                img = img_decode(f.read())
+        if img is None:
+            logger.error("error in loading image:{}".format(image_file))
+            return None
+    if isinstance(img, np.ndarray) and len(img.shape) == 2:
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+
+    return img
+
+
 class PaddleOCR(predict_system.TextSystem):
     def __init__(self, **kwargs):
         """
@@ -482,7 +509,7 @@ class PaddleOCR(predict_system.TextSystem):
             rec: use text recognition or not. If false, only det will be exec. Default is True
             cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
         """
-        assert isinstance(img, (np.ndarray, list, str))
+        assert isinstance(img, (np.ndarray, list, str, bytes))
         if isinstance(img, list) and det == True:
             logger.error('When input a list of images, det must be false')
             exit(0)
@@ -491,22 +518,8 @@ class PaddleOCR(predict_system.TextSystem):
                 'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process'
             )
 
-        if isinstance(img, str):
-            # download net image
-            if img.startswith('http'):
-                download_with_progressbar(img, 'tmp.jpg')
-                img = 'tmp.jpg'
-            image_file = img
-            img, flag, _ = check_and_read(image_file)
-            if not flag:
-                with open(image_file, 'rb') as f:
-                    np_arr = np.frombuffer(f.read(), dtype=np.uint8)
-                    img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
-            if img is None:
-                logger.error("error in loading image:{}".format(image_file))
-                return None
-        if isinstance(img, np.ndarray) and len(img.shape) == 2:
-            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+        img = check_img(img)
+
         if det and rec:
             dt_boxes, rec_res, _ = self.__call__(img, cls)
             return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
@@ -585,23 +598,7 @@ class PPStructure(StructureSystem):
         super().__init__(params)
 
     def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
-        if isinstance(img, str):
-            # download net image
-            if img.startswith('http'):
-                download_with_progressbar(img, 'tmp.jpg')
-                img = 'tmp.jpg'
-            image_file = img
-            img, flag, _ = check_and_read(image_file)
-            if not flag:
-                with open(image_file, 'rb') as f:
-                    np_arr = np.frombuffer(f.read(), dtype=np.uint8)
-                    img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
-            if img is None:
-                logger.error("error in loading image:{}".format(image_file))
-                return None
-        if isinstance(img, np.ndarray) and len(img.shape) == 2:
-            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
-
+        img = check_img(img)
         res, _ = super().__call__(
             img, return_ocr_result_in_table, img_idx=img_idx)
         return res
@@ -644,7 +641,7 @@ def main():
 
         if not flag_pdf:
             if img is None:
-                logger.error("error in loading image:{}".format(image_file))
+                logger.error("error in loading image:{}".format(img_path))
                 continue
             img_paths = [[img_path, img]]
         else:
diff --git a/ppstructure/pdf2word/pdf2word.md b/ppstructure/pdf2word/README.md
similarity index 100%
rename from ppstructure/pdf2word/pdf2word.md
rename to ppstructure/pdf2word/README.md
diff --git a/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml b/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d2be152f0bae7d87129904d87c56c6d777a1f338
--- /dev/null
+++ b/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml
@@ -0,0 +1,122 @@
+Global:
+  use_gpu: True
+  epoch_num: &epoch_num 200
+  log_smooth_window: 10
+  print_batch_step: 10
+  save_model_dir: ./output/ser_layoutxlm_xfund_zh
+  save_epoch_step: 2000
+  # evaluation is run every 10 iterations after the 0th iteration
+  eval_batch_step: [ 0, 187 ]
+  cal_metric_during_train: False
+  save_inference_dir:
+  use_visualdl: False
+  seed: 2022
+  infer_img: ppstructure/docs/kie/input/zh_val_42.jpg
+  save_res_path: ./output/ser_layoutxlm_xfund_zh/res
+
+Architecture:
+  model_type: kie
+  algorithm: &algorithm "LayoutXLM"
+  Transform:
+  Backbone:
+    name: LayoutXLMForSer
+    pretrained: True
+    checkpoints:
+    num_classes: &num_classes 7
+
+Loss:
+  name: VQASerTokenLayoutLMLoss
+  num_classes: *num_classes
+  key: "backbone_out"
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  lr:
+    name: Linear
+    learning_rate: 0.00005
+    epochs: *epoch_num
+    warmup_epoch: 2
+  regularizer:
+    name: L2
+    factor: 0.00000
+
+PostProcess:
+  name: VQASerTokenLayoutLMPostProcess
+  class_path: &class_path train_data/XFUND/class_list_xfun.txt
+
+Metric:
+  name: VQASerTokenMetric
+  main_indicator: hmean
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: train_data/XFUND/zh_train/image
+    label_file_list:
+      - train_data/XFUND/zh_train/train.json
+    ratio_list: [ 1.0 ]
+    transforms:
+      - DecodeImage: # load image
+          img_mode: RGB
+          channel_first: False
+      - VQATokenLabelEncode: # Class handling label
+          contains_re: False
+          algorithm: *algorithm
+          class_path: *class_path
+      - VQATokenPad:
+          max_seq_len: &max_seq_len 512
+          return_attention_mask: True
+      - VQASerTokenChunk:
+          max_seq_len: *max_seq_len
+      - Resize:
+          size: [224,224]
+      - NormalizeImage:
+          scale: 1
+          mean: [ 123.675, 116.28, 103.53 ]
+          std: [ 58.395, 57.12, 57.375 ]
+          order: 'hwc'
+      - ToCHWImage:
+      - KeepKeys:
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
+  loader:
+    shuffle: True
+    drop_last: False
+    batch_size_per_card: 8
+    num_workers: 4
+
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: train_data/XFUND/zh_val/image
+    label_file_list:
+      - train_data/XFUND/zh_val/val.json
+    transforms:
+      - DecodeImage: # load image
+          img_mode: RGB
+          channel_first: False
+      - VQATokenLabelEncode: # Class handling label
+          contains_re: False
+          algorithm: *algorithm
+          class_path: *class_path
+      - VQATokenPad:
+          max_seq_len: *max_seq_len
+          return_attention_mask: True
+      - VQASerTokenChunk:
+          max_seq_len: *max_seq_len
+      - Resize:
+          size: [224,224]
+      - NormalizeImage:
+          scale: 1
+          mean: [ 123.675, 116.28, 103.53 ]
+          std: [ 58.395, 57.12, 57.375 ]
+          order: 'hwc'
+      - ToCHWImage:
+      - KeepKeys:
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
+  loader:
+    shuffle: False
+    drop_last: False
+    batch_size_per_card: 8
+    num_workers: 4
diff --git a/test_tipc/configs/layoutxlm_ser/train_infer_python.txt b/test_tipc/configs/layoutxlm_ser/train_infer_python.txt
index 549a31e69e367237ec0396778162a5f91c8b7412..d07daa9a1429ec5cd1955ec64ded122a9d1a723d 100644
--- a/test_tipc/configs/layoutxlm_ser/train_infer_python.txt
+++ b/test_tipc/configs/layoutxlm_ser/train_infer_python.txt
@@ -13,7 +13,7 @@ train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg
 null:null
 ##
 trainer:norm_train
-norm_train:tools/train.py -c configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false
+norm_train:tools/train.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false
 pact_train:null
 fpgm_train:null
 distill_train:null
@@ -27,7 +27,7 @@ null:null
 ===========================infer_params===========================
 Global.save_inference_dir:./output/
 Architecture.Backbone.checkpoints:
-norm_export:tools/export_model.py -c configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml -o
+norm_export:tools/export_model.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o
 quant_export:
 fpgm_export:
 distill_export:null
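The new TIPC config leans on YAML anchors (`&epoch_num`, `&num_classes`, `&max_seq_len`, `&algorithm`, `&class_path`) so each value is declared once and reused through `*` aliases. PyYAML resolves aliases at load time; a quick sanity check, assuming the file exists at the path added above:

```python
import yaml

with open("test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml") as f:
    cfg = yaml.safe_load(f)

# &epoch_num / *epoch_num: one literal, two places.
assert cfg["Optimizer"]["lr"]["epochs"] == cfg["Global"]["epoch_num"] == 200
# &num_classes keeps the loss head and the backbone head in sync.
assert (cfg["Loss"]["num_classes"]
        == cfg["Architecture"]["Backbone"]["num_classes"] == 7)
print("anchors resolved consistently")
```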
diff --git a/test_tipc/configs/table_master/train_infer_python.txt b/test_tipc/configs/table_master/train_infer_python.txt
index 56b8e636026939ae8cd700308690010e1300d8f6..c3a871731a36fb5434db111cfd68b6eab7ba3f99 100644
--- a/test_tipc/configs/table_master/train_infer_python.txt
+++ b/test_tipc/configs/table_master/train_infer_python.txt
@@ -37,8 +37,8 @@ export2:null
 infer_model:null
 infer_export:null
 infer_quant:False
-inference:ppstructure/table/predict_structure.py --table_char_dict_path=./ppocr/utils/dict/table_master_structure_dict.txt --image_dir=./ppstructure/docs/table/table.jpg --output ./output/table --table_algorithm=TableMaster --table_max_len=480
---use_gpu:True|False
+inference:ppstructure/table/predict_structure.py --table_char_dict_path=./ppocr/utils/dict/table_master_structure_dict.txt --output ./output/table --table_algorithm=TableMaster --table_max_len=480
+--use_gpu:True
 --enable_mkldnn:False
 --cpu_threads:6
 --rec_batch_num:1
diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh
index bb4b58b4cac900166eeda4d9479fa6bd3fe69e02..1c0e16044c4264d14a93e2e0470ea95e7d5d4ba6 100644
--- a/test_tipc/prepare.sh
+++ b/test_tipc/prepare.sh
@@ -21,7 +21,11 @@ model_name=$(func_parser_value "${lines[1]}")
 trainer_list=$(func_parser_value "${lines[14]}")
 
 if [ ${MODE} = "benchmark_train" ];then
-    pip install -r requirements.txt
+    python_name_list=$(func_parser_value "${lines[2]}")
+    array=(${python_name_list})
+    python_name=${array[0]}
+    ${python_name} -m pip install -r requirements.txt
+    ${python_name} -m pip install git+https://github.com/LDOUBLEV/AutoLog
     if [[ ${model_name} =~ "ch_ppocr_mobile_v2_0_det" || ${model_name} =~ "det_mv3_db_v2_0" ]];then
         wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate
         rm -rf ./train_data/icdar2015
@@ -107,8 +111,8 @@ if [ ${MODE} = "benchmark_train" ];then
         cd ../
     fi
     if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then
-        pip install -r ppstructure/kie/requirements.txt
-        pip install opencv-python -U
+        ${python_name} -m pip install -r ppstructure/kie/requirements.txt
+        ${python_name} -m pip install opencv-python -U
        wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate
         cd ./train_data/ && tar xf XFUND.tar
         # expand gt.txt 10 times
@@ -122,6 +126,11 @@ if [ ${MODE} = "benchmark_train" ];then
 fi
 
 if [ ${MODE} = "lite_train_lite_infer" ];then
+    python_name_list=$(func_parser_value "${lines[2]}")
+    array=(${python_name_list})
+    python_name=${array[0]}
+    ${python_name} -m pip install -r requirements.txt
+    ${python_name} -m pip install git+https://github.com/LDOUBLEV/AutoLog
     # pretrain lite train data
     wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate
     wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar --no-check-certificate
@@ -212,6 +221,10 @@ if [ ${MODE} = "lite_train_lite_infer" ];then
     if [ ${model_name} == "ch_ppocr_mobile_v2_0_rec_FPGM" ]; then
         wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar --no-check-certificate
         cd ./pretrain_models/ && tar xf ch_ppocr_mobile_v2.0_rec_train.tar && cd ../
+        ${python_name} -m pip install paddleslim
+    fi
+    if [ ${model_name} == "ch_ppocr_mobile_v2_0_det_FPGM" ]; then
+        ${python_name} -m pip install paddleslim
     fi
     if [ ${model_name} == "det_mv3_east_v2_0" ]; then
         wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar --no-check-certificate
@@ -230,8 +243,8 @@ if [ ${MODE} = "lite_train_lite_infer" ];then
         cd ./pretrain_models/ && tar xf rec_r32_gaspin_bilstm_att_train.tar && cd ../
     fi
     if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then
-        pip install -r ppstructure/kie/requirements.txt
-        pip install opencv-python -U
+        ${python_name} -m pip install -r ppstructure/kie/requirements.txt
+        ${python_name} -m pip install opencv-python -U
         wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate
         cd ./train_data/ && tar xf XFUND.tar
         cd ../
@@ -639,6 +652,7 @@ if [ ${MODE} = "serving_infer" ];then
     ${python_name} -m pip install paddle-serving-server-gpu
     ${python_name} -m pip install paddle_serving_client
     ${python_name} -m pip install paddle-serving-app
+    ${python_name} -m pip install git+https://github.com/LDOUBLEV/AutoLog
     # wget model
     if [ ${model_name} == "ch_ppocr_mobile_v2_0_det_KL" ] || [ ${model_name} == "ch_ppocr_mobile_v2.0_rec_KL" ] ; then
        wget -nc -P ./inference https://paddleocr.bj.bcebos.com/tipc_fake_model/ch_ppocr_mobile_v2.0_det_klquant_infer.tar --no-check-certificate