Cherrypicking GH-10217 and GH-10216 to PaddlePaddle:dygraph (#10654)

* Don't break overall processing on a bad image
* Add preprocessing common to OCR tasks
  Add preprocessing to options

Cherrypicking GH-10217 and GH-10216 to PaddlePaddle:dygraph (#10654)
* Don't break overall processing on a bad image
* Add preprocessing common to OCR tasks
  Add preprocessing to options
b3912fcf · UserUnknownFactor · GitHub · 2bd552c8 · b3912fcf · b3912fcf
5 changed files
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -46,7 +46,7 @@ ppocr = importlib.import_module('ppocr', 'paddleocr')
 ppstructure = importlib.import_module('ppstructure', 'paddleocr')
 from ppocr.utils.logging import get_logger
 from tools.infer import predict_system
-from ppocr.utils.utility import check_and_read, get_image_file_list
+from ppocr.utils.utility import check_and_read, get_image_file_list, alpha_to_color, binarize_img
 from ppocr.utils.network import maybe_download, download_with_progressbar, is_link, confirm_model_dir_url
 from tools.infer.utility import draw_ocr, str2bool, check_gpu
 from ppstructure.utility import init_args, draw_structure_result
@@ -513,7 +513,7 @@ def get_model_config(type, version, model_type, lang):
 def img_decode(content: bytes):
    np_arr = np.frombuffer(content, dtype=np.uint8)
-    return cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
+    return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
 def check_img(img):
@@ -617,14 +617,17 @@ class PaddleOCR(predict_system.TextSystem):
        super().__init__(params)
        self.page_num = params.page_num
-    def ocr(self, img, det=True, rec=True, cls=True):
+    def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, alpha_color=(255, 255, 255)):
        """
-        ocr with paddleocr
+        OCR with PaddleOCR
        args:
-            img: img for ocr, support ndarray, img_path and list or ndarray
+            img: img for OCR, support ndarray, img_path and list or ndarray
-            det: use text detection or not. If false, only rec will be exec. Default is True
+            det: use text detection or not. If False, only rec will be exec. Default is True
-            rec: use text recognition or not. If false, only det will be exec. Default is True
+            rec: use text recognition or not. If False, only det will be exec. Default is True
-            cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
+            cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
+            bin: binarize image to black and white. Default is False.
+            inv: invert image colors. Default is False.
+            alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
        """
        assert isinstance(img, (np.ndarray, list, str, bytes))
        if isinstance(img, list) and det == True:
@@ -632,7 +635,7 @@ class PaddleOCR(predict_system.TextSystem):
            exit(0)
        if cls == True and self.use_angle_cls == False:
            logger.warning(
-                'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process'
+                'Since the angle classifier is not initialized, it will not be used during the forward process'
            )
        img = check_img(img)
@@ -643,10 +646,23 @@ class PaddleOCR(predict_system.TextSystem):
            imgs = img[:self.page_num]
        else:
            imgs = [img]
+        def preprocess_image(_image):
+            _image = alpha_to_color(_image, alpha_color)
+            if inv:
+                _image = cv2.bitwise_not(_image)
+            if bin:
+                _image = binarize_img(_image)
+            return _image
        if det and rec:
            ocr_res = []
            for idx, img in enumerate(imgs):
+                img = preprocess_image(img)
                dt_boxes, rec_res, _ = self.__call__(img, cls)
+                if not dt_boxes and not rec_res:
+                    ocr_res.append(None)
+                    continue
                tmp_res = [[box.tolist(), res]
                           for box, res in zip(dt_boxes, rec_res)]
                ocr_res.append(tmp_res)
@@ -654,7 +670,11 @@ class PaddleOCR(predict_system.TextSystem):
        elif det and not rec:
            ocr_res = []
            for idx, img in enumerate(imgs):
+                img = preprocess_image(img)
                dt_boxes, elapse = self.text_detector(img)
+                if not dt_boxes:
+                    ocr_res.append(None)
+                    continue
                tmp_res = [box.tolist() for box in dt_boxes]
                ocr_res.append(tmp_res)
            return ocr_res
@@ -663,6 +683,7 @@ class PaddleOCR(predict_system.TextSystem):
            cls_res = []
            for idx, img in enumerate(imgs):
                if not isinstance(img, list):
+                    img = preprocess_image(img)
                    img = [img]
                if self.use_angle_cls and cls:
                    img, cls_res_tmp, elapse = self.text_classifier(img)
@@ -764,10 +785,15 @@ def main():
        img_name = os.path.basename(img_path).split('.')[0]
        logger.info('{}{}{}'.format('*' * 10, img_path, '*' * 10))
        if args.type == 'ocr':
-            result = engine.ocr(img_path,
+            result = engine.ocr(
-                                det=args.det,
+                img_path,
-                                rec=args.rec,
+                det=args.det,
-                                cls=args.use_angle_cls)
+                rec=args.rec,
+                cls=args.use_angle_cls,
+                bin=args.binarize,
+                inv=args.invert,
+                alpha_color=args.alphacolor
+            )
            if result is not None:
                lines = []
                for idx in range(len(result)):

--- a/ppocr/utils/utility.py
+++ b/ppocr/utils/utility.py
@@ -75,6 +75,25 @@ def get_image_file_list(img_file):
    imgs_lists = sorted(imgs_lists)
    return imgs_lists
+def binarize_img(img):
+    if len(img.shape) == 3 and img.shape[2] == 3:
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # conversion to grayscale image
+        # use cv2 threshold binarization
+        _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        img = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
+    return img
+def alpha_to_color(img, alpha_color=(255, 255, 255)):
+    if len(img.shape) == 3 and img.shape[2] == 4:
+        B, G, R, A = cv2.split(img)
+        alpha = A / 255
+        R = (alpha_color[0] * (1 - alpha) + R * alpha).astype(np.uint8)
+        G = (alpha_color[1] * (1 - alpha) + G * alpha).astype(np.uint8)
+        B = (alpha_color[2] * (1 - alpha) + B * alpha).astype(np.uint8)
+        img = cv2.merge((B, G, R))
+    return img
 def check_and_read(img_path):
    if os.path.basename(img_path)[-3:] in ['gif', 'GIF']:

--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -16,7 +16,7 @@ import ast
 import PIL
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
-from tools.infer.utility import draw_ocr_box_txt, str2bool, init_args as infer_args
+from tools.infer.utility import draw_ocr_box_txt, str2bool, str2int_tuple, init_args as infer_args
 import math
@@ -100,6 +100,21 @@ def init_args():
        type=str2bool,
        default=False,
        help='Whether to use pdf2docx api')
+    parser.add_argument(
+        "--invert",
+        type=str2bool,
+        default=False,
+        help='Whether to invert image before processing')
+    parser.add_argument(
+        "--binarize",
+        type=str2bool,
+        default=False,
+        help='Whether to threshold binarize image before processing')
+    parser.add_argument(
+        "--alphacolor",
+        type=str2int_tuple,
+        default=(255, 255, 255),
+        help='Replacement color for the alpha channel, if the latter is present; R,G,B integers')
    return parser

--- a/tools/infer/predict_system.py
+++ b/tools/infer/predict_system.py
@@ -65,15 +65,25 @@ class TextSystem(object):
        self.crop_image_res_index += bbox_num
    def __call__(self, img, cls=True):
-        time_dict = {'det': 0, 'rec': 0, 'csl': 0, 'all': 0}
+        time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
+        if img is None:
+            logger.debug("no valid image provided")
+            return None, None, time_dict
        start = time.time()
        ori_im = img.copy()
        dt_boxes, elapse = self.text_detector(img)
        time_dict['det'] = elapse
-        logger.debug("dt_boxes num : {}, elapse : {}".format(
-            len(dt_boxes), elapse))
        if dt_boxes is None:
-            return None, None
+            logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
+            end = time.time()
+            time_dict['all'] = end - start
+            return None, None, time_dict
+        else:
+            logger.debug("dt_boxes num : {}, elapsed : {}".format(
+                len(dt_boxes), elapse))
        img_crop_list = []
        dt_boxes = sorted_boxes(dt_boxes)
@@ -89,12 +99,12 @@ class TextSystem(object):
            img_crop_list, angle_list, elapse = self.text_classifier(
                img_crop_list)
            time_dict['cls'] = elapse
-            logger.debug("cls num  : {}, elapse : {}".format(
+            logger.debug("cls num  : {}, elapsed : {}".format(
                len(img_crop_list), elapse))
        rec_res, elapse = self.text_recognizer(img_crop_list)
        time_dict['rec'] = elapse
-        logger.debug("rec_res num  : {}, elapse : {}".format(
+        logger.debug("rec_res num  : {}, elapsed : {}".format(
            len(rec_res), elapse))
        if self.args.save_crop_res:
            self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,

--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -29,8 +29,10 @@ from ppocr.utils.logging import get_logger
 def str2bool(v):
-    return v.lower() in ("true", "t", "1")
+    return v.lower() in ("true", "yes", "t", "y", "1")
+def str2int_tuple(v):
+    return tuple([int(i.strip()) for i in v.split(",")])
 def init_args():
    parser = argparse.ArgumentParser()