From b3912fcf7a567109a605f6be25a286e78da8952f Mon Sep 17 00:00:00 2001
From: UserUnknownFactor <63057995+UserUnknownFactor@users.noreply.github.com>
Date: Mon, 21 Aug 2023 11:33:03 +0300
Subject: [PATCH] Cherrypicking GH-10217 and GH-10216 to PaddlePaddle:dygraph (#10654)

* Don't break overall processing on a bad image

* Add preprocessing common to OCR tasks

Add preprocessing to options
---
 paddleocr.py                  | 52 ++++++++++++++++++++++++++----------
 ppocr/utils/utility.py        | 19 +++++++++++++
 ppstructure/utility.py        | 17 +++++++++++-
 tools/infer/predict_system.py | 22 +++++++++++----
 tools/infer/utility.py        |  4 ++-
 5 files changed, 93 insertions(+), 21 deletions(-)

diff --git a/paddleocr.py b/paddleocr.py
index c76f09a4..dc92cbf6 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -46,7 +46,7 @@ ppocr = importlib.import_module('ppocr', 'paddleocr')
 ppstructure = importlib.import_module('ppstructure', 'paddleocr')
 from ppocr.utils.logging import get_logger
 from tools.infer import predict_system
-from ppocr.utils.utility import check_and_read, get_image_file_list
+from ppocr.utils.utility import check_and_read, get_image_file_list, alpha_to_color, binarize_img
 from ppocr.utils.network import maybe_download, download_with_progressbar, is_link, confirm_model_dir_url
 from tools.infer.utility import draw_ocr, str2bool, check_gpu
 from ppstructure.utility import init_args, draw_structure_result
@@ -513,7 +513,7 @@ def get_model_config(type, version, model_type, lang):
 
 def img_decode(content: bytes):
     np_arr = np.frombuffer(content, dtype=np.uint8)
-    return cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
+    return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
 
 
 def check_img(img):
@@ -617,14 +617,17 @@ class PaddleOCR(predict_system.TextSystem):
         super().__init__(params)
         self.page_num = params.page_num
 
-    def ocr(self, img, det=True, rec=True, cls=True):
+    def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, alpha_color=(255, 255, 255)):
         """
-        ocr with paddleocr
+        OCR with PaddleOCR
         args:
-            img: img for ocr, support ndarray, img_path and list or ndarray
-            det: use text detection or not. If false, only rec will be exec. Default is True
-            rec: use text recognition or not. If false, only det will be exec. Default is True
-            cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
+            img: img for OCR, support ndarray, img_path and list or ndarray
+            det: use text detection or not. If False, only rec will be exec. Default is True
+            rec: use text recognition or not. If False, only det will be exec. Default is True
+            cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
+            bin: binarize image to black and white. Default is False.
+            inv: invert image colors. Default is False.
+            alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
         """
         assert isinstance(img, (np.ndarray, list, str, bytes))
         if isinstance(img, list) and det == True:
@@ -632,7 +635,7 @@ class PaddleOCR(predict_system.TextSystem):
             exit(0)
         if cls == True and self.use_angle_cls == False:
             logger.warning(
-                'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process'
+                'Since the angle classifier is not initialized, it will not be used during the forward process'
             )
 
         img = check_img(img)
@@ -643,10 +646,23 @@ class PaddleOCR(predict_system.TextSystem):
             imgs = img[:self.page_num]
         else:
             imgs = [img]
+
+        def preprocess_image(_image):
+            _image = alpha_to_color(_image, alpha_color)
+            if inv:
+                _image = cv2.bitwise_not(_image)
+            if bin:
+                _image = binarize_img(_image)
+            return _image
+
         if det and rec:
             ocr_res = []
             for idx, img in enumerate(imgs):
+                img = preprocess_image(img)
                 dt_boxes, rec_res, _ = self.__call__(img, cls)
+                if not dt_boxes and not rec_res:
+                    ocr_res.append(None)
+                    continue
                 tmp_res = [[box.tolist(), res]
                            for box, res in zip(dt_boxes, rec_res)]
                 ocr_res.append(tmp_res)
@@ -654,7 +670,11 @@ class PaddleOCR(predict_system.TextSystem):
         elif det and not rec:
             ocr_res = []
             for idx, img in enumerate(imgs):
+                img = preprocess_image(img)
                 dt_boxes, elapse = self.text_detector(img)
+                if not dt_boxes:
+                    ocr_res.append(None)
+                    continue
                 tmp_res = [box.tolist() for box in dt_boxes]
                 ocr_res.append(tmp_res)
             return ocr_res
@@ -663,6 +683,7 @@ class PaddleOCR(predict_system.TextSystem):
             cls_res = []
             for idx, img in enumerate(imgs):
                 if not isinstance(img, list):
+                    img = preprocess_image(img)
                     img = [img]
                 if self.use_angle_cls and cls:
                     img, cls_res_tmp, elapse = self.text_classifier(img)
@@ -764,10 +785,15 @@ def main():
         img_name = os.path.basename(img_path).split('.')[0]
         logger.info('{}{}{}'.format('*' * 10, img_path, '*' * 10))
         if args.type == 'ocr':
-            result = engine.ocr(img_path,
-                                det=args.det,
-                                rec=args.rec,
-                                cls=args.use_angle_cls)
+            result = engine.ocr(
+                img_path,
+                det=args.det,
+                rec=args.rec,
+                cls=args.use_angle_cls,
+                bin=args.binarize,
+                inv=args.invert,
+                alpha_color=args.alphacolor
+            )
             if result is not None:
                 lines = []
                 for idx in range(len(result)):
diff --git a/ppocr/utils/utility.py b/ppocr/utils/utility.py
index 0f8660ce..f788e79c 100755
--- a/ppocr/utils/utility.py
+++ b/ppocr/utils/utility.py
@@ -75,6 +75,25 @@ def get_image_file_list(img_file):
     imgs_lists = sorted(imgs_lists)
     return imgs_lists
 
+def binarize_img(img):
+    if len(img.shape) == 3 and img.shape[2] == 3:
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # conversion to grayscale image
+        # use cv2 threshold binarization
+        _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        img = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
+    return img
+
+def alpha_to_color(img, alpha_color=(255, 255, 255)):
+    if len(img.shape) == 3 and img.shape[2] == 4:
+        B, G, R, A = cv2.split(img)
+        alpha = A / 255
+
+        R = (alpha_color[0] * (1 - alpha) + R * alpha).astype(np.uint8)
+        G = (alpha_color[1] * (1 - alpha) + G * alpha).astype(np.uint8)
+        B = (alpha_color[2] * (1 - alpha) + B * alpha).astype(np.uint8)
+
+        img = cv2.merge((B, G, R))
+    return img
 
 def check_and_read(img_path):
     if os.path.basename(img_path)[-3:] in ['gif', 'GIF']:
diff --git a/ppstructure/utility.py b/ppstructure/utility.py
index 28ef3d9f..4ab4b88b 100644
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -16,7 +16,7 @@ import ast
 import PIL
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
-from tools.infer.utility import draw_ocr_box_txt, str2bool, init_args as infer_args
+from tools.infer.utility import draw_ocr_box_txt, str2bool, str2int_tuple, init_args as infer_args
 import math
 
 
@@ -100,6 +100,21 @@ def init_args():
         type=str2bool,
         default=False,
         help='Whether to use pdf2docx api')
+    parser.add_argument(
+        "--invert",
+        type=str2bool,
+        default=False,
+        help='Whether to invert image before processing')
+    parser.add_argument(
+        "--binarize",
+        type=str2bool,
+        default=False,
+        help='Whether to threshold binarize image before processing')
+    parser.add_argument(
+        "--alphacolor",
+        type=str2int_tuple,
+        default=(255, 255, 255),
+        help='Replacement color for the alpha channel, if the latter is present; R,G,B integers')
     return parser
 
 
diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py
index 3ddcfda6..8af45b4c 100755
--- a/tools/infer/predict_system.py
+++ b/tools/infer/predict_system.py
@@ -65,15 +65,25 @@ class TextSystem(object):
         self.crop_image_res_index += bbox_num
 
     def __call__(self, img, cls=True):
-        time_dict = {'det': 0, 'rec': 0, 'csl': 0, 'all': 0}
+        time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
+
+        if img is None:
+            logger.debug("no valid image provided")
+            return None, None, time_dict
+
         start = time.time()
         ori_im = img.copy()
         dt_boxes, elapse = self.text_detector(img)
         time_dict['det'] = elapse
-        logger.debug("dt_boxes num : {}, elapse : {}".format(
-            len(dt_boxes), elapse))
+
         if dt_boxes is None:
-            return None, None
+            logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
+            end = time.time()
+            time_dict['all'] = end - start
+            return None, None, time_dict
+        else:
+            logger.debug("dt_boxes num : {}, elapsed : {}".format(
+                len(dt_boxes), elapse))
         img_crop_list = []
 
         dt_boxes = sorted_boxes(dt_boxes)
@@ -89,12 +99,12 @@ class TextSystem(object):
             img_crop_list, angle_list, elapse = self.text_classifier(
                 img_crop_list)
             time_dict['cls'] = elapse
-            logger.debug("cls num : {}, elapse : {}".format(
+            logger.debug("cls num : {}, elapsed : {}".format(
                 len(img_crop_list), elapse))
 
         rec_res, elapse = self.text_recognizer(img_crop_list)
         time_dict['rec'] = elapse
-        logger.debug("rec_res num : {}, elapse : {}".format(
+        logger.debug("rec_res num : {}, elapsed : {}".format(
             len(rec_res), elapse))
         if self.args.save_crop_res:
             self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
diff --git a/tools/infer/utility.py b/tools/infer/utility.py
index fcd8ba7f..b064cbf1 100644
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -29,8 +29,10 @@ from ppocr.utils.logging import get_logger
 
 
 def str2bool(v):
-    return v.lower() in ("true", "t", "1")
+    return v.lower() in ("true", "yes", "t", "y", "1")
 
+def str2int_tuple(v):
+    return tuple([int(i.strip()) for i in v.split(",")])
 
 def init_args():
     parser = argparse.ArgumentParser()
-- 
GitLab
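
Usage note (not part of the patch): a minimal sketch of the new preprocessing options from the Python API, assuming the standard PaddleOCR entry point. The image path and engine settings below are placeholders; only the bin/inv/alpha_color keywords and the per-image None results come from the changes above.

    # Hypothetical example; "document.png" is a placeholder input path.
    from paddleocr import PaddleOCR

    engine = PaddleOCR(use_angle_cls=True, lang='en')
    result = engine.ocr(
        'document.png',
        cls=True,
        bin=True,                      # Otsu-binarize before detection/recognition
        inv=False,                     # set True for light-on-dark text
        alpha_color=(255, 255, 255))   # blend any alpha channel onto white

    # With this patch a per-image entry is None when nothing was detected,
    # so guard before iterating:
    for page in result:
        if page is None:
            continue
        for box, (text, score) in page:
            print(text, score)

The same switches are exposed on the command line as --binarize, --invert and --alphacolor (the latter parsed by str2int_tuple, e.g. --alphacolor="255,255,255").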