Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into dygraph

4ee160ae · LDOUBLEV · 2189e54a · aa38b358 · 4ee160ae · 4ee160ae
12 changed files
--- a/PPOCRLabel/PPOCRLabel.py
+++ b/PPOCRLabel/PPOCRLabel.py
@@ -2531,7 +2531,7 @@ class MainWindow(QMainWindow):
                split = 'test'

            #  save dict
-            html = {'structure': {'tokens': token_list}, 'cell': cells}
+            html = {'structure': {'tokens': token_list}, 'cells': cells}
            json_results.append({'filename': os.path.basename(image_path), 'split': split, 'imgid': imgid, 'html': html})
            imgid += 1


--- a/PPOCRLabel/README.md
+++ b/PPOCRLabel/README.md
 English | [简体中文](README_ch.md)

-# PPOCRLabel
+# PPOCRLabelv2

-PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PP-OCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box, table and multi-point annotation modes. Annotations can be directly used for the training of PP-OCR detection and recognition models.
+PPOCRLabelv2 is a semi-automatic graphic annotation tool suitable for the OCR field, with a built-in PP-OCR model to automatically detect and re-recognize data. It is written in Python3 and PyQt5, supporting rectangular box, table, irregular text and key information annotation modes. Annotations can be directly used for the training of PP-OCR detection and recognition models.

-<img src="./data/gif/steps_en.gif" width="100%"/>
+|               regular text annotation               |                table annotation                |
+| :-------------------------------------------------: | :--------------------------------------------: |
+|  <img src="./data/gif/steps_en.gif" width="80%"/>   | <img src="./data/gif/table.gif" width="100%"/> |
+|            **irregular text annotation**            |         **key information annotation**         |
+| <img src="./data/gif/multi-point.gif" width="80%"/> |  <img src="./data/gif/kie.gif" width="100%"/>  |

 ### Recent Update


--- a/PPOCRLabel/README_ch.md
+++ b/PPOCRLabel/README_ch.md
 [English](README.md) | 简体中文

-# PPOCRLabel
+# PPOCRLabelv2

 PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具，内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写，支持矩形框标注和四点标注模式，导出格式可直接用于PaddleOCR检测和识别模型的训练。

-<img src="./data/gif/steps.gif" width="100%"/>
+|                      常规标注                       |                    表格标注                    |
+| :-------------------------------------------------: | :--------------------------------------------: |
+|  <img src="./data/gif/steps_en.gif" width="80%"/>   | <img src="./data/gif/table.gif" width="100%"/> |
+|                 **不规则文本标注**                  |                **关键信息标注**                |
+| <img src="./data/gif/multi-point.gif" width="80%"/> |  <img src="./data/gif/kie.gif" width="100%"/>  |

 #### 近期更新
 - 2022.05：**新增表格标注**，使用方法见下方`2.2 表格标注`（by [whjdark](https://github.com/peterh0323); [Evezerest](https://github.com/Evezerest))

--- a/PPOCRLabel/setup.py
+++ b/PPOCRLabel/setup.py
@@ -33,7 +33,7 @@ setup(
    package_dir={'PPOCRLabel': ''},
    include_package_data=True,
    entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]},
-    version='1.0.2',
+    version='2.1.1',
    install_requires=requirements,
    license='Apache License 2.0',
    description='PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models',

--- a/configs/table/SLANet_ch.yml
+++ b/configs/table/SLANet_ch.yml
--- a/deploy/cpp_infer/src/utility.cpp
+++ b/deploy/cpp_infer/src/utility.cpp
@@ -268,13 +268,14 @@ cv::Mat Utility::crop_image(cv::Mat &img, std::vector<int> &area) {

 void Utility::sorted_boxes(std::vector<OCRPredictResult> &ocr_result) {
  std::sort(ocr_result.begin(), ocr_result.end(), Utility::comparison_box);
-
+  if (ocr_result.size() > 0) {
    for (int i = 0; i < ocr_result.size() - 1; i++) {
      if (abs(ocr_result[i + 1].box[0][1] - ocr_result[i].box[0][1]) < 10 &&
          (ocr_result[i + 1].box[0][0] < ocr_result[i].box[0][0])) {
        std::swap(ocr_result[i], ocr_result[i + 1]);
      }
    }
+  }
 }

 } // namespace PaddleOCR
\ No newline at end of file
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -414,6 +414,33 @@ def get_model_config(type, version, model_type, lang):
    return model_urls[version][model_type][lang]


+def img_decode(content: bytes):
+    np_arr = np.frombuffer(content, dtype=np.uint8)
+    return cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
+
+
+def check_img(img):
+    if isinstance(img, bytes):
+        img = img_decode(img)
+    if isinstance(img, str):
+        # download net image
+        if is_link(img):
+            download_with_progressbar(img, 'tmp.jpg')
+            img = 'tmp.jpg'
+        image_file = img
+        img, flag, _ = check_and_read(image_file)
+        if not flag:
+            with open(image_file, 'rb') as f:
+                img = img_decode(f.read())
+        if img is None:
+            logger.error("error in loading image:{}".format(image_file))
+            return None
+    if isinstance(img, np.ndarray) and len(img.shape) == 2:
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+
+    return img
+
+
 class PaddleOCR(predict_system.TextSystem):
    def __init__(self, **kwargs):
        """
@@ -482,7 +509,7 @@ class PaddleOCR(predict_system.TextSystem):
            rec: use text recognition or not. If false, only det will be exec. Default is True
            cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
        """
-        assert isinstance(img, (np.ndarray, list, str))
+        assert isinstance(img, (np.ndarray, list, str, bytes))
        if isinstance(img, list) and det == True:
            logger.error('When input a list of images, det must be false')
            exit(0)
@@ -491,22 +518,8 @@ class PaddleOCR(predict_system.TextSystem):
                'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process'
            )

-        if isinstance(img, str):
-            # download net image
-            if img.startswith('http'):
-                download_with_progressbar(img, 'tmp.jpg')
-                img = 'tmp.jpg'
-            image_file = img
-            img, flag, _ = check_and_read(image_file)
-            if not flag:
-                with open(image_file, 'rb') as f:
-                    np_arr = np.frombuffer(f.read(), dtype=np.uint8)
-                    img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
-            if img is None:
-                logger.error("error in loading image:{}".format(image_file))
-                return None
-        if isinstance(img, np.ndarray) and len(img.shape) == 2:
-            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+        img = check_img(img)
+
        if det and rec:
            dt_boxes, rec_res, _ = self.__call__(img, cls)
            return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
@@ -585,23 +598,7 @@ class PPStructure(StructureSystem):
        super().__init__(params)

    def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
-        if isinstance(img, str):
-            # download net image
-            if img.startswith('http'):
-                download_with_progressbar(img, 'tmp.jpg')
-                img = 'tmp.jpg'
-            image_file = img
-            img, flag, _ = check_and_read(image_file)
-            if not flag:
-                with open(image_file, 'rb') as f:
-                    np_arr = np.frombuffer(f.read(), dtype=np.uint8)
-                    img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
-            if img is None:
-                logger.error("error in loading image:{}".format(image_file))
-                return None
-        if isinstance(img, np.ndarray) and len(img.shape) == 2:
-            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
-
+        img = check_img(img)
        res, _ = super().__call__(
            img, return_ocr_result_in_table, img_idx=img_idx)
        return res
@@ -644,7 +641,7 @@ def main():

            if not flag_pdf:
                if img is None:
-                    logger.error("error in loading image:{}".format(image_file))
+                    logger.error("error in loading image:{}".format(img_path))
                    continue
                img_paths = [[img_path, img]]
            else:

--- a/ppstructure/pdf2word/pdf2word.md
+++ b/ppstructure/pdf2word/pdf2word.md
--- a/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml
+++ b/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml
+Global:
+  use_gpu: True
+  epoch_num: &epoch_num 200
+  log_smooth_window: 10
+  print_batch_step: 10
+  save_model_dir: ./output/ser_layoutxlm_xfund_zh
+  save_epoch_step: 2000
+  # evaluation is run every 187 iterations after the 0th iteration
+  eval_batch_step: [ 0, 187 ]
+  cal_metric_during_train: False
+  save_inference_dir:
+  use_visualdl: False
+  seed: 2022
+  infer_img: ppstructure/docs/kie/input/zh_val_42.jpg
+  save_res_path: ./output/ser_layoutxlm_xfund_zh/res
+
+Architecture:
+  model_type: kie
+  algorithm: &algorithm "LayoutXLM"
+  Transform:
+  Backbone:
+    name: LayoutXLMForSer
+    pretrained: True
+    checkpoints: 
+    num_classes: &num_classes 7
+
+Loss:
+  name: VQASerTokenLayoutLMLoss
+  num_classes: *num_classes
+  key: "backbone_out"
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  lr:
+    name: Linear
+    learning_rate: 0.00005
+    epochs: *epoch_num
+    warmup_epoch: 2
+  regularizer:
+    name: L2
+    factor: 0.00000
+    
+PostProcess:
+  name: VQASerTokenLayoutLMPostProcess
+  class_path: &class_path train_data/XFUND/class_list_xfun.txt
+
+Metric:
+  name: VQASerTokenMetric
+  main_indicator: hmean
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: train_data/XFUND/zh_train/image
+    label_file_list: 
+      - train_data/XFUND/zh_train/train.json
+    ratio_list: [ 1.0 ]
+    transforms:
+      - DecodeImage: # load image
+          img_mode: RGB
+          channel_first: False
+      - VQATokenLabelEncode: # Class handling label
+          contains_re: False
+          algorithm: *algorithm
+          class_path: *class_path
+      - VQATokenPad:
+          max_seq_len: &max_seq_len 512
+          return_attention_mask: True
+      - VQASerTokenChunk:
+          max_seq_len: *max_seq_len
+      - Resize:
+          size: [224,224]
+      - NormalizeImage:
+          scale: 1
+          mean: [ 123.675, 116.28, 103.53 ]
+          std: [ 58.395, 57.12, 57.375 ]
+          order: 'hwc'
+      - ToCHWImage:
+      - KeepKeys:
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
+  loader:
+    shuffle: True
+    drop_last: False
+    batch_size_per_card: 8
+    num_workers: 4
+
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: train_data/XFUND/zh_val/image
+    label_file_list:
+      - train_data/XFUND/zh_val/val.json
+    transforms:
+      - DecodeImage: # load image
+          img_mode: RGB
+          channel_first: False
+      - VQATokenLabelEncode: # Class handling label
+          contains_re: False
+          algorithm: *algorithm
+          class_path: *class_path
+      - VQATokenPad:
+          max_seq_len: *max_seq_len
+          return_attention_mask: True
+      - VQASerTokenChunk:
+          max_seq_len: *max_seq_len
+      - Resize:
+          size: [224,224]
+      - NormalizeImage:
+          scale: 1
+          mean: [ 123.675, 116.28, 103.53 ]
+          std: [ 58.395, 57.12, 57.375 ]
+          order: 'hwc'
+      - ToCHWImage:
+      - KeepKeys:
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
+  loader:
+    shuffle: False
+    drop_last: False
+    batch_size_per_card: 8
+    num_workers: 4
--- a/test_tipc/configs/layoutxlm_ser/train_infer_python.txt
+++ b/test_tipc/configs/layoutxlm_ser/train_infer_python.txt
@@ -13,7 +13,7 @@ train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg
 null:null
 ##
 trainer:norm_train
-norm_train:tools/train.py -c configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false
+norm_train:tools/train.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false
 pact_train:null
 fpgm_train:null
 distill_train:null
@@ -27,7 +27,7 @@ null:null
 ===========================infer_params===========================
 Global.save_inference_dir:./output/
 Architecture.Backbone.checkpoints:
-norm_export:tools/export_model.py -c configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml -o 
+norm_export:tools/export_model.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o 
 quant_export:
 fpgm_export: 
 distill_export:null

--- a/test_tipc/configs/table_master/train_infer_python.txt
+++ b/test_tipc/configs/table_master/train_infer_python.txt
@@ -37,8 +37,8 @@ export2:null
 infer_model:null
 infer_export:null
 infer_quant:False
-inference:ppstructure/table/predict_structure.py  --table_char_dict_path=./ppocr/utils/dict/table_master_structure_dict.txt --image_dir=./ppstructure/docs/table/table.jpg --output ./output/table --table_algorithm=TableMaster --table_max_len=480 
--use_gpu:True|False
+inference:ppstructure/table/predict_structure.py  --table_char_dict_path=./ppocr/utils/dict/table_master_structure_dict.txt --output ./output/table --table_algorithm=TableMaster --table_max_len=480 
+--use_gpu:True
 --enable_mkldnn:False
 --cpu_threads:6
 --rec_batch_num:1

--- a/test_tipc/prepare.sh
+++ b/test_tipc/prepare.sh
@@ -21,7 +21,11 @@ model_name=$(func_parser_value "${lines[1]}")
 trainer_list=$(func_parser_value "${lines[14]}")

 if [ ${MODE} = "benchmark_train" ];then
-    pip install -r requirements.txt
+    python_name_list=$(func_parser_value "${lines[2]}")
+    array=(${python_name_list}) 
+    python_name=${array[0]}
+    ${python_name} -m pip install -r requirements.txt
+    ${python_name} -m pip install git+https://github.com/LDOUBLEV/AutoLog
    if [[ ${model_name} =~ "ch_ppocr_mobile_v2_0_det" || ${model_name} =~ "det_mv3_db_v2_0" ]];then
        wget -nc -P  ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams  --no-check-certificate
        rm -rf ./train_data/icdar2015
@@ -107,8 +111,8 @@ if [ ${MODE} = "benchmark_train" ];then
        cd ../
    fi
    if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then
-        pip install -r ppstructure/kie/requirements.txt
-        pip install opencv-python -U
+        ${python_name} -m pip install -r ppstructure/kie/requirements.txt
+        ${python_name} -m pip install opencv-python -U
        wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate
        cd ./train_data/ && tar xf XFUND.tar
        # expand gt.txt 10 times
@@ -122,6 +126,11 @@ if [ ${MODE} = "benchmark_train" ];then
 fi

 if [ ${MODE} = "lite_train_lite_infer" ];then
+    python_name_list=$(func_parser_value "${lines[2]}")
+    array=(${python_name_list}) 
+    python_name=${array[0]}
+    ${python_name} -m pip install -r requirements.txt
+    ${python_name} -m pip install git+https://github.com/LDOUBLEV/AutoLog
    # pretrain lite train data
    wget -nc -P  ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams  --no-check-certificate
    wget -nc -P ./pretrain_models/  https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar  --no-check-certificate
@@ -212,6 +221,10 @@ if [ ${MODE} = "lite_train_lite_infer" ];then
    if [ ${model_name} == "ch_ppocr_mobile_v2_0_rec_FPGM" ]; then
        wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar --no-check-certificate
        cd ./pretrain_models/ && tar xf ch_ppocr_mobile_v2.0_rec_train.tar && cd ../
+        ${python_name} -m pip install paddleslim
+    fi
+    if [ ${model_name} == "ch_ppocr_mobile_v2_0_det_FPGM" ]; then
+        ${python_name} -m pip install paddleslim
    fi
    if [ ${model_name} == "det_mv3_east_v2_0" ]; then
        wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar --no-check-certificate
@@ -230,8 +243,8 @@ if [ ${MODE} = "lite_train_lite_infer" ];then
        cd ./pretrain_models/ && tar xf rec_r32_gaspin_bilstm_att_train.tar && cd ../
    fi
    if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then
-        pip install -r ppstructure/kie/requirements.txt
-        pip install opencv-python -U
+        ${python_name} -m pip install -r ppstructure/kie/requirements.txt
+        ${python_name} -m pip install opencv-python -U
        wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate
        cd ./train_data/ && tar xf XFUND.tar
        cd ../
@@ -639,6 +652,7 @@ if [ ${MODE} = "serving_infer" ];then
    ${python_name} -m pip install paddle-serving-server-gpu
    ${python_name} -m pip install paddle_serving_client
    ${python_name} -m pip install paddle-serving-app
+    ${python_name} -m pip install git+https://github.com/LDOUBLEV/AutoLog
    # wget model
    if [ ${model_name} == "ch_ppocr_mobile_v2_0_det_KL" ] || [ ${model_name} == "ch_ppocr_mobile_v2.0_rec_KL" ] ; then
        wget -nc  -P ./inference https://paddleocr.bj.bcebos.com/tipc_fake_model/ch_ppocr_mobile_v2.0_det_klquant_infer.tar --no-check-certificate