add paddle2onnx doc

3f5ffc32 · tink2123 · 1c5d1aa4 · 1c5d1aa4 · 1c5d1aa4 · 1c5d1aa4
12 changed file
--- a/deploy/paddle2onnx/predict_cls.py
+++ b/deploy/paddle2onnx/predict_cls.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(__dir__)
-sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
-
-os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
-
-import cv2
-import copy
-import numpy as np
-import math
-import time
-import traceback
-
-import utility
-from ppocr.postprocess import build_post_process
-from ppocr.utils.logging import get_logger
-from ppocr.utils.utility import get_image_file_list, check_and_read_gif
-
-logger = get_logger()
-
-
-class TextClassifier(object):
-    def __init__(self, args):
-        self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")]
-        self.cls_batch_num = args.cls_batch_num
-        self.cls_thresh = args.cls_thresh
-        postprocess_params = {
-            'name': 'ClsPostProcess',
-            "label_list": args.label_list,
-        }
-        self.postprocess_op = build_post_process(postprocess_params)
-        self.predictor, self.input_tensor, self.output_tensors = \
-            utility.create_predictor(args, 'cls', logger)
-
-    def resize_norm_img(self, img):
-        imgC, imgH, imgW = self.cls_image_shape
-        h = img.shape[0]
-        w = img.shape[1]
-        ratio = w / float(h)
-        if math.ceil(imgH * ratio) > imgW:
-            resized_w = imgW
-        else:
-            resized_w = int(math.ceil(imgH * ratio))
-        resized_image = cv2.resize(img, (resized_w, imgH))
-        resized_image = resized_image.astype('float32')
-        if self.cls_image_shape[0] == 1:
-            resized_image = resized_image / 255
-            resized_image = resized_image[np.newaxis, :]
-        else:
-            resized_image = resized_image.transpose((2, 0, 1)) / 255
-        resized_image -= 0.5
-        resized_image /= 0.5
-        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
-        padding_im[:, :, 0:resized_w] = resized_image
-        return padding_im
-
-    def __call__(self, img_list):
-        img_list = copy.deepcopy(img_list)
-        img_num = len(img_list)
-        # Calculate the aspect ratio of all text bars
-        width_list = []
-        for img in img_list:
-            width_list.append(img.shape[1] / float(img.shape[0]))
-        # Sorting can speed up the cls process
-        indices = np.argsort(np.array(width_list))
-
-        cls_res = [['', 0.0]] * img_num
-        batch_num = self.cls_batch_num
-        elapse = 0
-        for beg_img_no in range(0, img_num, batch_num):
-            end_img_no = min(img_num, beg_img_no + batch_num)
-            norm_img_batch = []
-            max_wh_ratio = 0
-            for ino in range(beg_img_no, end_img_no):
-                h, w = img_list[indices[ino]].shape[0:2]
-                wh_ratio = w * 1.0 / h
-                max_wh_ratio = max(max_wh_ratio, wh_ratio)
-            for ino in range(beg_img_no, end_img_no):
-                norm_img = self.resize_norm_img(img_list[indices[ino]])
-                norm_img = norm_img[np.newaxis, :]
-                norm_img_batch.append(norm_img)
-            norm_img_batch = np.concatenate(norm_img_batch)
-            norm_img_batch = norm_img_batch.copy()
-            starttime = time.time()
-            input_dict = {}
-            input_dict[self.input_tensor.name] = norm_img_batch
-            outputs = self.predictor.run(self.output_tensors, input_dict)
-            prob_out = outputs[0]
-            cls_result = self.postprocess_op(prob_out)
-            elapse += time.time() - starttime
-            for rno in range(len(cls_result)):
-                label, score = cls_result[rno]
-                cls_res[indices[beg_img_no + rno]] = [label, score]
-                if '180' in label and score > self.cls_thresh:
-                    img_list[indices[beg_img_no + rno]] = cv2.rotate(
-                        img_list[indices[beg_img_no + rno]], 1)
-        return img_list, cls_res, elapse
-
-
-def main(args):
-    image_file_list = get_image_file_list(args.image_dir)
-    text_classifier = TextClassifier(args)
-    valid_image_file_list = []
-    img_list = []
-    for image_file in image_file_list:
-        img, flag = check_and_read_gif(image_file)
-        if not flag:
-            img = cv2.imread(image_file)
-        if img is None:
-            logger.info("error in loading image:{}".format(image_file))
-            continue
-        valid_image_file_list.append(image_file)
-        img_list.append(img)
-    try:
-        img_list, cls_res, predict_time = text_classifier(img_list)
-    except:
-        logger.info(traceback.format_exc())
-        logger.info(
-            "ERROR!!!! \n"
-            "Please read the FAQ：https://github.com/PaddlePaddle/PaddleOCR#faq \n"
-            "If your model has tps module:  "
-            "TPS does not support variable shape.\n"
-            "Please set --rec_image_shape='3,32,100' and --rec_char_type='en' ")
-        exit()
-    for ino in range(len(img_list)):
-        logger.info("Predicts of {}:{}".format(valid_image_file_list[ino],
-                                               cls_res[ino]))
-    logger.info("Total predict time for {} images, cost: {:.3f}".format(
-        len(img_list), predict_time))
-
-
-if __name__ == "__main__":
-    main(utility.parse_args())
--- a/deploy/paddle2onnx/predict_det.py
+++ b/deploy/paddle2onnx/predict_det.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(__dir__)
-sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
-
-os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
-
-import cv2
-import numpy as np
-import time
-import sys
-
-import utility as utility
-from ppocr.utils.logging import get_logger
-from ppocr.utils.utility import get_image_file_list, check_and_read_gif
-from ppocr.data import create_operators, transform
-from ppocr.postprocess import build_post_process
-
-logger = get_logger()
-
-
-class TextDetector(object):
-    def __init__(self, args):
-        self.args = args
-        self.det_algorithm = args.det_algorithm
-        pre_process_list = [{
-            'DetResizeForTest': {
-                'image_shape': [640, 640]
-            }
-        }, {
-            'NormalizeImage': {
-                'std': [0.229, 0.224, 0.225],
-                'mean': [0.485, 0.456, 0.406],
-                'scale': '1./255.',
-                'order': 'hwc'
-            }
-        }, {
-            'ToCHWImage': None
-        }, {
-            'KeepKeys': {
-                'keep_keys': ['image', 'shape']
-            }
-        }]
-        postprocess_params = {}
-        if self.det_algorithm == "DB":
-            postprocess_params['name'] = 'DBPostProcess'
-            postprocess_params["thresh"] = args.det_db_thresh
-            postprocess_params["box_thresh"] = args.det_db_box_thresh
-            postprocess_params["max_candidates"] = 1000
-            postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
-            postprocess_params["use_dilation"] = True
-        elif self.det_algorithm == "EAST":
-            postprocess_params['name'] = 'EASTPostProcess'
-            postprocess_params["score_thresh"] = args.det_east_score_thresh
-            postprocess_params["cover_thresh"] = args.det_east_cover_thresh
-            postprocess_params["nms_thresh"] = args.det_east_nms_thresh
-        elif self.det_algorithm == "SAST":
-            pre_process_list[0] = {
-                'DetResizeForTest': {
-                    'resize_long': args.det_limit_side_len
-                }
-            }
-            postprocess_params['name'] = 'SASTPostProcess'
-            postprocess_params["score_thresh"] = args.det_sast_score_thresh
-            postprocess_params["nms_thresh"] = args.det_sast_nms_thresh
-            self.det_sast_polygon = args.det_sast_polygon
-            if self.det_sast_polygon:
-                postprocess_params["sample_pts_num"] = 6
-                postprocess_params["expand_scale"] = 1.2
-                postprocess_params["shrink_ratio_of_width"] = 0.2
-            else:
-                postprocess_params["sample_pts_num"] = 2
-                postprocess_params["expand_scale"] = 1.0
-                postprocess_params["shrink_ratio_of_width"] = 0.3
-        else:
-            logger.info("unknown det_algorithm:{}".format(self.det_algorithm))
-            sys.exit(0)
-
-        self.preprocess_op = create_operators(pre_process_list)
-        self.postprocess_op = build_post_process(postprocess_params)
-        print()
-        self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
-            args, 'det', logger)  # paddle.jit.load(args.det_model_dir)
-        # self.predictor.eval()
-
-    def order_points_clockwise(self, pts):
-        """
-        reference from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py
-        # sort the points based on their x-coordinates
-        """
-        xSorted = pts[np.argsort(pts[:, 0]), :]
-
-        # grab the left-most and right-most points from the sorted
-        # x-roodinate points
-        leftMost = xSorted[:2, :]
-        rightMost = xSorted[2:, :]
-
-        # now, sort the left-most coordinates according to their
-        # y-coordinates so we can grab the top-left and bottom-left
-        # points, respectively
-        leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
-        (tl, bl) = leftMost
-
-        rightMost = rightMost[np.argsort(rightMost[:, 1]), :]
-        (tr, br) = rightMost
-
-        rect = np.array([tl, tr, br, bl], dtype="float32")
-        return rect
-
-    def clip_det_res(self, points, img_height, img_width):
-        for pno in range(points.shape[0]):
-            points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
-            points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
-        return points
-
-    def filter_tag_det_res(self, dt_boxes, image_shape):
-        img_height, img_width = image_shape[0:2]
-        dt_boxes_new = []
-        for box in dt_boxes:
-            box = self.order_points_clockwise(box)
-            box = self.clip_det_res(box, img_height, img_width)
-            rect_width = int(np.linalg.norm(box[0] - box[1]))
-            rect_height = int(np.linalg.norm(box[0] - box[3]))
-            if rect_width <= 3 or rect_height <= 3:
-                continue
-            dt_boxes_new.append(box)
-        dt_boxes = np.array(dt_boxes_new)
-        return dt_boxes
-
-    def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
-        img_height, img_width = image_shape[0:2]
-        dt_boxes_new = []
-        for box in dt_boxes:
-            box = self.clip_det_res(box, img_height, img_width)
-            dt_boxes_new.append(box)
-        dt_boxes = np.array(dt_boxes_new)
-        return dt_boxes
-
-    def __call__(self, img):
-        ori_im = img.copy()
-        data = {'image': img}
-        data = transform(data, self.preprocess_op)
-        img, shape_list = data
-        if img is None:
-            return None, 0
-        img = np.expand_dims(img, axis=0)
-        shape_list = np.expand_dims(shape_list, axis=0)
-        img = img.copy()
-        starttime = time.time()
-
-        input_dict = {}
-        input_dict[self.input_tensor.name] = img
-
-        outputs = self.predictor.run(self.output_tensors, input_dict)
-
-        preds = {}
-        if self.det_algorithm == "EAST":
-            preds['f_geo'] = outputs[0]
-            preds['f_score'] = outputs[1]
-        elif self.det_algorithm == 'SAST':
-            preds['f_border'] = outputs[0]
-            preds['f_score'] = outputs[1]
-            preds['f_tco'] = outputs[2]
-            preds['f_tvo'] = outputs[3]
-        elif self.det_algorithm == 'DB':
-            preds['maps'] = outputs[0]
-        else:
-            raise NotImplementedError
-
-        post_result = self.postprocess_op(preds, shape_list)
-        dt_boxes = post_result[0]['points']
-        if self.det_algorithm == "SAST" and self.det_sast_polygon:
-            dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape)
-        else:
-            dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
-        elapse = time.time() - starttime
-        return dt_boxes, elapse
-
-
-if __name__ == "__main__":
-    args = utility.parse_args()
-    image_file_list = get_image_file_list(args.image_dir)
-    text_detector = TextDetector(args)
-    count = 0
-    total_time = 0
-    draw_img_save = "./inference_results"
-    if not os.path.exists(draw_img_save):
-        os.makedirs(draw_img_save)
-    for image_file in image_file_list:
-        img, flag = check_and_read_gif(image_file)
-        if not flag:
-            img = cv2.imread(image_file)
-        if img is None:
-            logger.info("error in loading image:{}".format(image_file))
-            continue
-        dt_boxes, elapse = text_detector(img)
-        if count > 0:
-            total_time += elapse
-        count += 1
-        logger.info("Predict time of {}: {}".format(image_file, elapse))
-        src_im = utility.draw_text_det_res(dt_boxes, image_file)
-        img_name_pure = os.path.split(image_file)[-1]
-        img_path = os.path.join(draw_img_save,
-                                "det_res_{}".format(img_name_pure))
-        cv2.imwrite(img_path, src_im)
-        logger.info("The visualized image saved in {}".format(img_path))
-    if count > 1:
-        logger.info("Avg Time: {}".format(total_time / (count - 1)))
--- a/deploy/paddle2onnx/predict_rec.py
+++ b/deploy/paddle2onnx/predict_rec.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(__dir__)
-sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
-
-os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
-
-import cv2
-import numpy as np
-import math
-import time
-import traceback
-
-import utility
-from ppocr.postprocess import build_post_process
-from ppocr.utils.logging import get_logger
-from ppocr.utils.utility import get_image_file_list, check_and_read_gif
-
-logger = get_logger()
-
-
-class TextRecognizer(object):
-    def __init__(self, args):
-        self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
-        self.character_type = args.rec_char_type
-        self.rec_batch_num = args.rec_batch_num
-        self.rec_algorithm = args.rec_algorithm
-        postprocess_params = {
-            'name': 'CTCLabelDecode',
-            "character_type": args.rec_char_type,
-            "character_dict_path": args.rec_char_dict_path,
-            "use_space_char": args.use_space_char
-        }
-        self.postprocess_op = build_post_process(postprocess_params)
-        self.predictor, self.input_tensor, self.output_tensors = \
-            utility.create_predictor(args, 'rec', logger)
-
-    def resize_norm_img(self, img, max_wh_ratio):
-        imgC, imgH, imgW = self.rec_image_shape
-        assert imgC == img.shape[2]
-        # if self.character_type == "ch":
-        #     imgW = int((32 * max_wh_ratio))
-        h, w = img.shape[:2]
-        ratio = w / float(h)
-        if math.ceil(imgH * ratio) > imgW:
-            resized_w = imgW
-        else:
-            resized_w = int(math.ceil(imgH * ratio))
-        resized_image = cv2.resize(img, (resized_w, imgH))
-        resized_image = resized_image.astype('float32')
-        resized_image = resized_image.transpose((2, 0, 1)) / 255
-        resized_image -= 0.5
-        resized_image /= 0.5
-        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
-        padding_im[:, :, 0:resized_w] = resized_image
-        return padding_im
-
-    def __call__(self, img_list):
-        img_num = len(img_list)
-        # Calculate the aspect ratio of all text bars
-        width_list = []
-        for img in img_list:
-            width_list.append(img.shape[1] / float(img.shape[0]))
-        # Sorting can speed up the recognition process
-        indices = np.argsort(np.array(width_list))
-
-        # rec_res = []
-        rec_res = [['', 0.0]] * img_num
-        batch_num = self.rec_batch_num
-        elapse = 0
-        for beg_img_no in range(0, img_num, batch_num):
-            end_img_no = min(img_num, beg_img_no + batch_num)
-            norm_img_batch = []
-            max_wh_ratio = 0
-            for ino in range(beg_img_no, end_img_no):
-                # h, w = img_list[ino].shape[0:2]
-                h, w = img_list[indices[ino]].shape[0:2]
-                wh_ratio = w * 1.0 / h
-                max_wh_ratio = max(max_wh_ratio, wh_ratio)
-            for ino in range(beg_img_no, end_img_no):
-                # norm_img = self.resize_norm_img(img_list[ino], max_wh_ratio)
-                norm_img = self.resize_norm_img(img_list[indices[ino]],
-                                                max_wh_ratio)
-                norm_img = norm_img[np.newaxis, :]
-                norm_img_batch.append(norm_img)
-            norm_img_batch = np.concatenate(norm_img_batch)
-            norm_img_batch = norm_img_batch.copy()
-            starttime = time.time()
-            input_dict = {}
-            input_dict[self.input_tensor.name] = norm_img_batch
-            outputs = self.predictor.run(self.output_tensors, input_dict)
-            preds = outputs[0]
-            rec_result = self.postprocess_op(preds)
-            for rno in range(len(rec_result)):
-                rec_res[indices[beg_img_no + rno]] = rec_result[rno]
-            elapse += time.time() - starttime
-        return rec_res, elapse
-
-
-def main(args):
-    image_file_list = get_image_file_list(args.image_dir)
-    text_recognizer = TextRecognizer(args)
-    valid_image_file_list = []
-    img_list = []
-    for image_file in image_file_list:
-        img, flag = check_and_read_gif(image_file)
-        if not flag:
-            img = cv2.imread(image_file)
-        if img is None:
-            logger.info("error in loading image:{}".format(image_file))
-            continue
-        valid_image_file_list.append(image_file)
-        img_list.append(img)
-    try:
-        rec_res, predict_time = text_recognizer(img_list)
-    except:
-        logger.info(traceback.format_exc())
-        logger.info(
-            "ERROR!!!! \n"
-            "Please read the FAQ：https://github.com/PaddlePaddle/PaddleOCR#faq \n"
-            "If your model has tps module:  "
-            "TPS does not support variable shape.\n"
-            "Please set --rec_image_shape='3,32,100' and --rec_char_type='en' ")
-        exit()
-    for ino in range(len(img_list)):
-        logger.info("Predicts of {}:{}".format(valid_image_file_list[ino],
-                                               rec_res[ino]))
-    logger.info("Total predict time for {} images, cost: {:.3f}".format(
-        len(img_list), predict_time))
-
-
-if __name__ == "__main__":
-    main(utility.parse_args())
--- a/deploy/paddle2onnx/predict_system.py
+++ b/deploy/paddle2onnx/predict_system.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(__dir__)
-sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
-
-os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
-
-import cv2
-import copy
-import numpy as np
-import time
-from PIL import Image
-import utility as utility
-import predict_rec as predict_rec
-import predict_det as predict_det
-import predict_cls as predict_cls
-from ppocr.utils.utility import get_image_file_list, check_and_read_gif
-from ppocr.utils.logging import get_logger
-from utility import draw_ocr_box_txt
-
-logger = get_logger()
-
-
-class TextSystem(object):
-    def __init__(self, args):
-        self.text_detector = predict_det.TextDetector(args)
-        self.text_recognizer = predict_rec.TextRecognizer(args)
-        self.use_angle_cls = args.use_angle_cls
-        self.drop_score = args.drop_score
-        if self.use_angle_cls:
-            self.text_classifier = predict_cls.TextClassifier(args)
-
-    def get_rotate_crop_image(self, img, points):
-        '''
-        img_height, img_width = img.shape[0:2]
-        left = int(np.min(points[:, 0]))
-        right = int(np.max(points[:, 0]))
-        top = int(np.min(points[:, 1]))
-        bottom = int(np.max(points[:, 1]))
-        img_crop = img[top:bottom, left:right, :].copy()
-        points[:, 0] = points[:, 0] - left
-        points[:, 1] = points[:, 1] - top
-        '''
-        img_crop_width = int(
-            max(
-                np.linalg.norm(points[0] - points[1]),
-                np.linalg.norm(points[2] - points[3])))
-        img_crop_height = int(
-            max(
-                np.linalg.norm(points[0] - points[3]),
-                np.linalg.norm(points[1] - points[2])))
-        pts_std = np.float32([[0, 0], [img_crop_width, 0],
-                              [img_crop_width, img_crop_height],
-                              [0, img_crop_height]])
-        M = cv2.getPerspectiveTransform(points, pts_std)
-        dst_img = cv2.warpPerspective(
-            img,
-            M, (img_crop_width, img_crop_height),
-            borderMode=cv2.BORDER_REPLICATE,
-            flags=cv2.INTER_CUBIC)
-        dst_img_height, dst_img_width = dst_img.shape[0:2]
-        if dst_img_height * 1.0 / dst_img_width >= 1.5:
-            dst_img = np.rot90(dst_img)
-        return dst_img
-
-    def print_draw_crop_rec_res(self, img_crop_list, rec_res):
-        bbox_num = len(img_crop_list)
-        for bno in range(bbox_num):
-            cv2.imwrite("./output/img_crop_%d.jpg" % bno, img_crop_list[bno])
-            logger.info(bno, rec_res[bno])
-
-    def __call__(self, img):
-        ori_im = img.copy()
-        dt_boxes, elapse = self.text_detector(img)
-        logger.info("dt_boxes num : {}, elapse : {}".format(
-            len(dt_boxes), elapse))
-        if dt_boxes is None:
-            return None, None
-        img_crop_list = []
-
-        dt_boxes = sorted_boxes(dt_boxes)
-
-        for bno in range(len(dt_boxes)):
-            tmp_box = copy.deepcopy(dt_boxes[bno])
-            img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
-            img_crop_list.append(img_crop)
-        if self.use_angle_cls:
-            img_crop_list, angle_list, elapse = self.text_classifier(
-                img_crop_list)
-            logger.info("cls num  : {}, elapse : {}".format(
-                len(img_crop_list), elapse))
-
-        rec_res, elapse = self.text_recognizer(img_crop_list)
-        logger.info("rec_res num  : {}, elapse : {}".format(
-            len(rec_res), elapse))
-        # self.print_draw_crop_rec_res(img_crop_list, rec_res)
-        filter_boxes, filter_rec_res = [], []
-        for box, rec_reuslt in zip(dt_boxes, rec_res):
-            text, score = rec_reuslt
-            if score >= self.drop_score:
-                filter_boxes.append(box)
-                filter_rec_res.append(rec_reuslt)
-        return filter_boxes, filter_rec_res
-
-
-def sorted_boxes(dt_boxes):
-    """
-    Sort text boxes in order from top to bottom, left to right
-    args:
-        dt_boxes(array):detected text boxes with shape [4, 2]
-    return:
-        sorted boxes(array) with shape [4, 2]
-    """
-    num_boxes = dt_boxes.shape[0]
-    sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
-    _boxes = list(sorted_boxes)
-
-    for i in range(num_boxes - 1):
-        if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \
-                (_boxes[i + 1][0][0] < _boxes[i][0][0]):
-            tmp = _boxes[i]
-            _boxes[i] = _boxes[i + 1]
-            _boxes[i + 1] = tmp
-    return _boxes
-
-
-def main(args):
-    image_file_list = get_image_file_list(args.image_dir)
-    text_sys = TextSystem(args)
-    is_visualize = True
-    font_path = args.vis_font_path
-    drop_score = args.drop_score
-    for image_file in image_file_list:
-        img, flag = check_and_read_gif(image_file)
-        if not flag:
-            img = cv2.imread(image_file)
-        #img = utility.resize_img(img, 640)
-        #img = utility.padding_img(img, [640,640])
-        print(img.shape)
-        if img is None:
-            logger.info("error in loading image:{}".format(image_file))
-            continue
-        starttime = time.time()
-        dt_boxes, rec_res = text_sys(img)
-        elapse = time.time() - starttime
-        logger.info("Predict time of %s: %.3fs" % (image_file, elapse))
-
-        for text, score in rec_res:
-            logger.info("{}, {:.3f}".format(text, score))
-
-        if is_visualize:
-            image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
-            boxes = dt_boxes
-            txts = [rec_res[i][0] for i in range(len(rec_res))]
-            scores = [rec_res[i][1] for i in range(len(rec_res))]
-
-            draw_img = draw_ocr_box_txt(
-                image,
-                boxes,
-                txts,
-                scores,
-                drop_score=drop_score,
-                font_path=font_path)
-            draw_img_save = "./results/"
-            if not os.path.exists(draw_img_save):
-                os.makedirs(draw_img_save)
-            cv2.imwrite(
-                os.path.join(draw_img_save, os.path.basename(image_file)),
-                draw_img[:, :, ::-1])
-            logger.info("The visualized image saved in {}".format(
-                os.path.join(draw_img_save, os.path.basename(image_file))))
-
-
-if __name__ == "__main__":
-    main(utility.parse_args())
\ No newline at end of file
--- a/deploy/paddle2onnx/readme.md
+++ b/deploy/paddle2onnx/readme.md
+# paddle2onnx 模型转化与预测
+
+本章节介绍 PaddleOCR 模型如何转化为 ONNX 模型，并基于 ONNX 引擎预测。
+
+## 1. 环境准备
+
+需要准备 Paddle2ONNX 模型转化环境，和 ONNX 模型预测环境
+
+###  Paddle2ONNX
+paddle2onnx 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式，算子目前稳定支持导出 ONNX Opset 9~11，部分Paddle算子支持更低的ONNX Opset转换。
+更多细节可参考 [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md)
+
+- 安装 Paddle2ONNX
+```
+python3.7 -m pip install paddle2onnx
+```
+
+- 安装 ONNX
+```
+# 建议安装 1.4.0 版本，可根据环境更换版本号
+python3.7 -m pip install onnxruntime==1.4.0
+```
+- Paddle 模型下载
+
+有两种方式获取Paddle静态图模型：在 [model_list](../../doc/doc_ch/models_list.md) 中下载PaddleOCR提供的预测模型；
+参考[模型导出说明](../../doc/doc_ch/inference.md#训练模型转inference模型)把训练好的权重转为 inference_model。
+
+以 ppocr 检测模型为例：
+
+```
+wget -nc  -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar
+cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && cd ..
+```
+
+## 2. 模型转换
+
+使用 Paddle2ONNX 将Paddle静态图模型转换为ONNX模型格式：
+
+```
+paddle2onnx --model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ \
+--model_filename=inference.pdmodel \
+--params_filename=inference.pdiparams \
+--save_file=./inference/det_mobile_onnx/model.onnx \
+--opset_version=10 \
+--enable_onnx_checker=True
+```
+
+执行完毕后，ONNX模型会被保存在 `./inference/det_mobile_onnx/` 路径下
+
+## 3. onnx 预测
+
+以检测模型为例，使用onnx预测可执行如下命令：
+
+```
+python3.7 ../../tools/infer/predict_det.py --use_gpu=False --use_onnx=True \
+--det_model_dir=./inference/det_mobile_onnx/model.onnx \
+--image_dir=../../doc/imgs/1.jpg
+```
+
+执行命令后在终端会打印出预测的检测框坐标，并在 `./inference_results/` 下保存可视化结果。
+
+```
+root INFO: 1.jpg  [[[291, 295], [334, 292], [348, 844], [305, 847]], [[344, 296], [379, 294], [387, 669], [353, 671]]]
+The predict time of ../../doc/imgs/1.jpg: 0.06162881851196289
+The visualized image saved in ./inference_results/det_res_1.jpg
+```
+
+* 注意：ONNX暂时不支持变长预测，因为需要将输入resize到固定输入，预测结果可能与直接使用Paddle预测有细微不同。
--- a/deploy/paddle2onnx/utility.py
+++ b/deploy/paddle2onnx/utility.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import os
-import sys
-import cv2
-import numpy as np
-import json
-from PIL import Image, ImageDraw, ImageFont
-import math
-import onnxruntime as ort
-
-
-def parse_args():
-    def str2bool(v):
-        return v.lower() in ("true", "t", "1")
-
-    parser = argparse.ArgumentParser()
-    # params for prediction engine
-    parser.add_argument("--use_gpu", type=str2bool, default=True)
-    parser.add_argument("--ir_optim", type=str2bool, default=True)
-    parser.add_argument("--use_tensorrt", type=str2bool, default=False)
-    parser.add_argument("--use_fp16", type=str2bool, default=False)
-    parser.add_argument("--gpu_mem", type=int, default=500)
-
-    # params for text detector
-    parser.add_argument("--image_dir", type=str)
-    parser.add_argument("--det_algorithm", type=str, default='DB')
-    parser.add_argument("--det_model_dir", type=str)
-    parser.add_argument("--det_limit_side_len", type=float, default=960)
-    parser.add_argument("--det_limit_type", type=str, default='max')
-
-    # DB parmas
-    parser.add_argument("--det_db_thresh", type=float, default=0.3)
-    parser.add_argument("--det_db_box_thresh", type=float, default=0.5)
-    parser.add_argument("--det_db_unclip_ratio", type=float, default=1.6)
-    parser.add_argument("--max_batch_size", type=int, default=10)
-    # EAST parmas
-    parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
-    parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
-    parser.add_argument("--det_east_nms_thresh", type=float, default=0.2)
-
-    # SAST parmas
-    parser.add_argument("--det_sast_score_thresh", type=float, default=0.5)
-    parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2)
-    parser.add_argument("--det_sast_polygon", type=bool, default=False)
-
-    # params for text recognizer
-    parser.add_argument("--rec_algorithm", type=str, default='CRNN')
-    parser.add_argument("--rec_model_dir", type=str)
-    parser.add_argument("--rec_image_shape", type=str, default="3, 32, 100")
-    parser.add_argument("--rec_char_type", type=str, default='ch')
-    parser.add_argument("--rec_batch_num", type=int, default=6)
-    parser.add_argument("--max_text_length", type=int, default=25)
-    parser.add_argument(
-        "--rec_char_dict_path",
-        type=str,
-        default="./ppocr/utils/ppocr_keys_v1.txt")
-    parser.add_argument("--use_space_char", type=str2bool, default=True)
-    parser.add_argument(
-        "--vis_font_path", type=str, default="./doc/simfang.ttf")
-    parser.add_argument("--drop_score", type=float, default=0.5)
-
-    # params for text classifier
-    parser.add_argument("--use_angle_cls", type=str2bool, default=False)
-    parser.add_argument("--cls_model_dir", type=str)
-    parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
-    parser.add_argument("--label_list", type=list, default=['0', '180'])
-    parser.add_argument("--cls_batch_num", type=int, default=6)
-    parser.add_argument("--cls_thresh", type=float, default=0.9)
-
-    parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
-    parser.add_argument("--use_pdserving", type=str2bool, default=False)
-
-    return parser.parse_args()
-
-
-def create_predictor(args, mode, logger):
-    if mode == "det":
-        model_dir = args.det_model_dir
-    elif mode == 'cls':
-        model_dir = args.cls_model_dir
-    else:
-        model_dir = args.rec_model_dir
-
-    if model_dir is None:
-        logger.info("not find {} model file path {}".format(mode, model_dir))
-        sys.exit(0)
-    model_file_path = model_dir
-
-    if not os.path.exists(model_file_path):
-        logger.info("not find model file path {}".format(model_file_path))
-        sys.exit(0)
-
-    sess = ort.InferenceSession(model_file_path)
-
-    return sess, sess.get_inputs()[0], None
-
-
-def draw_text_det_res(dt_boxes, img_path):
-    src_im = cv2.imread(img_path)
-    for box in dt_boxes:
-        box = np.array(box).astype(np.int32).reshape(-1, 2)
-        cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
-    return src_im
-
-
-def resize_img(img, input_size=600):
-    """
-    resize img and limit the longest side of the image to input_size
-    """
-    img = np.array(img)
-    im_shape = img.shape
-    im_size_max = np.max(im_shape[0:2])
-    im_scale = float(input_size) / float(im_size_max)
-    img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale)
-    return img
-
-
-def padding_img(img, input_size=[640, 640]):
-    padding_im_h = input_size[0]
-    padding_im_w = input_size[1]
-    im_c = img.shape[2]
-    im_h = img.shape[0]
-    im_w = img.shape[1]
-    padding_im = np.zeros(
-        (padding_im_h, padding_im_w, im_c), dtype=np.float32) + 255
-    padding_im[:im_h, :im_w, :] = img
-    return padding_im
-
-
-def draw_ocr(image,
-             boxes,
-             txts=None,
-             scores=None,
-             drop_score=0.5,
-             font_path="./doc/simfang.ttf"):
-    """
-    Visualize the results of OCR detection and recognition
-    args:
-        image(Image|array): RGB image
-        boxes(list): boxes with shape(N, 4, 2)
-        txts(list): the texts
-        scores(list): txxs corresponding scores
-        drop_score(float): only scores greater than drop_threshold will be visualized
-        font_path: the path of font which is used to draw text
-    return(array):
-        the visualized img
-    """
-    if scores is None:
-        scores = [1] * len(boxes)
-    box_num = len(boxes)
-    for i in range(box_num):
-        if scores is not None and (scores[i] < drop_score or
-                                   math.isnan(scores[i])):
-            continue
-        box = np.reshape(np.array(boxes[i]), [-1, 1, 2]).astype(np.int64)
-        image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2)
-    if txts is not None:
-        img = np.array(resize_img(image, input_size=600))
-        txt_img = text_visual(
-            txts,
-            scores,
-            img_h=img.shape[0],
-            img_w=600,
-            threshold=drop_score,
-            font_path=font_path)
-        img = np.concatenate([np.array(img), np.array(txt_img)], axis=1)
-        return img
-    return image
-
-
-def draw_ocr_box_txt(image,
-                     boxes,
-                     txts,
-                     scores=None,
-                     drop_score=0.5,
-                     font_path="./doc/fonts/simfang.ttf"):
-    h, w = image.height, image.width
-    img_left = image.copy()
-    img_right = Image.new('RGB', (w, h), (255, 255, 255))
-
-    import random
-
-    random.seed(0)
-    draw_left = ImageDraw.Draw(img_left)
-    draw_right = ImageDraw.Draw(img_right)
-    for idx, (box, txt) in enumerate(zip(boxes, txts)):
-        if scores is not None and scores[idx] < drop_score:
-            continue
-        color = (random.randint(0, 255), random.randint(0, 255),
-                 random.randint(0, 255))
-        draw_left.polygon(box, fill=color)
-        draw_right.polygon(
-            [
-                box[0][0], box[0][1], box[1][0], box[1][1], box[2][0],
-                box[2][1], box[3][0], box[3][1]
-            ],
-            outline=color)
-        box_height = math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][
-            1])**2)
-        box_width = math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][
-            1])**2)
-        if box_height > 2 * box_width:
-            font_size = max(int(box_width * 0.9), 10)
-            font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
-            cur_y = box[0][1]
-            for c in txt:
-                char_size = font.getsize(c)
-                draw_right.text(
-                    (box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font)
-                cur_y += char_size[1]
-        else:
-            font_size = max(int(box_height * 0.8), 10)
-            font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
-            draw_right.text(
-                [box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font)
-    img_left = Image.blend(image, img_left, 0.5)
-    img_show = Image.new('RGB', (w * 2, h), (255, 255, 255))
-    img_show.paste(img_left, (0, 0, w, h))
-    img_show.paste(img_right, (w, 0, w * 2, h))
-    return np.array(img_show)
-
-
-def str_count(s):
-    """
-    Count the number of Chinese characters,
-    a single English character and a single number
-    equal to half the length of Chinese characters.
-    args:
-        s(string): the input of string
-    return(int):
-        the number of Chinese characters
-    """
-    import string
-    count_zh = count_pu = 0
-    s_len = len(s)
-    en_dg_count = 0
-    for c in s:
-        if c in string.ascii_letters or c.isdigit() or c.isspace():
-            en_dg_count += 1
-        elif c.isalpha():
-            count_zh += 1
-        else:
-            count_pu += 1
-    return s_len - math.ceil(en_dg_count / 2)
-
-
-def text_visual(texts,
-                scores,
-                img_h=400,
-                img_w=600,
-                threshold=0.,
-                font_path="./doc/simfang.ttf"):
-    """
-    create new blank img and draw txt on it
-    args:
-        texts(list): the text will be draw
-        scores(list|None): corresponding score of each txt
-        img_h(int): the height of blank img
-        img_w(int): the width of blank img
-        font_path: the path of font which is used to draw text
-    return(array):
-    """
-    if scores is not None:
-        assert len(texts) == len(
-            scores), "The number of txts and corresponding scores must match"
-
-    def create_blank_img():
-        blank_img = np.ones(shape=[img_h, img_w], dtype=np.int8) * 255
-        blank_img[:, img_w - 1:] = 0
-        blank_img = Image.fromarray(blank_img).convert("RGB")
-        draw_txt = ImageDraw.Draw(blank_img)
-        return blank_img, draw_txt
-
-    blank_img, draw_txt = create_blank_img()
-
-    font_size = 20
-    txt_color = (0, 0, 0)
-    font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
-
-    gap = font_size + 5
-    txt_img_list = []
-    count, index = 1, 0
-    for idx, txt in enumerate(texts):
-        index += 1
-        if scores[idx] < threshold or math.isnan(scores[idx]):
-            index -= 1
-            continue
-        first_line = True
-        while str_count(txt) >= img_w // font_size - 4:
-            tmp = txt
-            txt = tmp[:img_w // font_size - 4]
-            if first_line:
-                new_txt = str(index) + ': ' + txt
-                first_line = False
-            else:
-                new_txt = '    ' + txt
-            draw_txt.text((0, gap * count), new_txt, txt_color, font=font)
-            txt = tmp[img_w // font_size - 4:]
-            if count >= img_h // gap - 1:
-                txt_img_list.append(np.array(blank_img))
-                blank_img, draw_txt = create_blank_img()
-                count = 0
-            count += 1
-        if first_line:
-            new_txt = str(index) + ': ' + txt + '   ' + '%.3f' % (scores[idx])
-        else:
-            new_txt = "  " + txt + "  " + '%.3f' % (scores[idx])
-        draw_txt.text((0, gap * count), new_txt, txt_color, font=font)
-        # whether add new blank img or not
-        if count >= img_h // gap - 1 and idx + 1 < len(texts):
-            txt_img_list.append(np.array(blank_img))
-            blank_img, draw_txt = create_blank_img()
-            count = 0
-        count += 1
-    txt_img_list.append(np.array(blank_img))
-    if len(txt_img_list) == 1:
-        blank_img = np.array(txt_img_list[0])
-    else:
-        blank_img = np.concatenate(txt_img_list, axis=1)
-    return np.array(blank_img)
-
-
-def base64_to_cv2(b64str):
-    import base64
-    data = base64.b64decode(b64str.encode('utf8'))
-    data = np.fromstring(data, np.uint8)
-    data = cv2.imdecode(data, cv2.IMREAD_COLOR)
-    return data
-
-
-def draw_boxes(image, boxes, scores=None, drop_score=0.5):
-    if scores is None:
-        scores = [1] * len(boxes)
-    for (box, score) in zip(boxes, scores):
-        if score < drop_score:
-            continue
-        box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64)
-        image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2)
-    return image
-
-
-if __name__ == '__main__':
-    test_img = "./doc/test_v2"
-    predict_txt = "./doc/predict.txt"
-    f = open(predict_txt, 'r')
-    data = f.readlines()
-    img_path, anno = data[0].strip().split('\t')
-    img_name = os.path.basename(img_path)
-    img_path = os.path.join(test_img, img_name)
-    image = Image.open(img_path)
-
-    data = json.loads(anno)
-    boxes, txts, scores = [], [], []
-    for dic in data:
-        boxes.append(dic['points'])
-        txts.append(dic['transcription'])
-        scores.append(round(dic['scores'], 3))
-
-    new_img = draw_ocr(image, boxes, txts, scores)
-
-    cv2.imwrite(img_name, new_img)
--- a/test_tipc/configs/ppocr_det_mobile_params.txt
+++ b/test_tipc/configs/ppocr_det_mobile_params.txt
@@ -116,7 +116,7 @@ infer_model:./models/ch_ppocr_mobile_v2.0_det_opt.nb|./models/ch_ppocr_mobile_v2
 --save_file:./inference/det_mobile_onnx/model.onnx
 --opset_version:10 
 --enable_onnx_checker:True
-inference:deploy/paddle2onnx/predict_det.py
+inference:tools/infer/predict_det.py
 --use_gpu:False
 --det_model_dir:
 --image_dir:./inference/ch_det_data_50/all-sum-510/
\ No newline at end of file
--- a/test_tipc/test_paddle2onnx.sh
+++ b/test_tipc/test_paddle2onnx.sh
@@ -11,7 +11,7 @@ python=$(func_parser_value "${lines[2]}")


 # parser params
-dataline=$(awk 'NR==111, NR==122{print}'  $FILENAME)
+dataline=$(awk 'NR==111, NR==123{print}'  $FILENAME)
 IFS=$'\n'
 lines=(${dataline})

@@ -63,7 +63,7 @@ function func_paddle2onnx(){
    set_gpu=$(func_set_params "${use_gpu_key}" "${use_gpu_value}")
    set_model_dir=$(func_set_params "${det_model_key}" "${save_file_value}")
    set_img_dir=$(func_set_params "${image_dir_key}" "${image_dir_value}")
-    infer_model_cmd="${python} ${inference_py} ${set_gpu} ${set_img_dir} ${set_model_dir} > ${_save_log_path} 2>&1 "
+    infer_model_cmd="${python} ${inference_py} ${set_gpu} ${set_img_dir} ${set_model_dir} --use_onnx=True > ${_save_log_path} 2>&1 "
    eval $infer_model_cmd
    status_check $last_status "${infer_model_cmd}" "${status_log}"
 }

--- a/tools/infer/predict_cls.py
+++ b/tools/infer/predict_cls.py
@@ -47,6 +47,7 @@ class TextClassifier(object):
        self.postprocess_op = build_post_process(postprocess_params)
        self.predictor, self.input_tensor, self.output_tensors, _ = \
            utility.create_predictor(args, 'cls', logger)
+        self.use_onnx = args.use_onnx

    def resize_norm_img(self, img):
        imgC, imgH, imgW = self.cls_image_shape
@@ -100,10 +101,16 @@ class TextClassifier(object):
            norm_img_batch = np.concatenate(norm_img_batch)
            norm_img_batch = norm_img_batch.copy()

-            self.input_tensor.copy_from_cpu(norm_img_batch)
-            self.predictor.run()
-            prob_out = self.output_tensors[0].copy_to_cpu()
-            self.predictor.try_shrink_memory()
+            if self.use_onnx:
+                input_dict = {}
+                input_dict[self.input_tensor.name] = norm_img_batch
+                outputs = self.predictor.run(self.output_tensors, input_dict)
+                prob_out = outputs[0]
+            else:
+                self.input_tensor.copy_from_cpu(norm_img_batch)
+                self.predictor.run()
+                prob_out = self.output_tensors[0].copy_to_cpu()
+                self.predictor.try_shrink_memory()
            cls_result = self.postprocess_op(prob_out)
            elapse += time.time() - starttime
            for rno in range(len(cls_result)):

--- a/tools/infer/predict_det.py
+++ b/tools/infer/predict_det.py
@@ -38,6 +38,7 @@ class TextDetector(object):
    def __init__(self, args):
        self.args = args
        self.det_algorithm = args.det_algorithm
+        self.use_onnx = args.use_onnx
        pre_process_list = [{
            'DetResizeForTest': {
                'limit_side_len': args.det_limit_side_len,
@@ -100,7 +101,12 @@ class TextDetector(object):
        else:
            logger.info("unknown det_algorithm:{}".format(self.det_algorithm))
            sys.exit(0)
-
+        if self.use_onnx:
+            pre_process_list[0] = {
+                'DetResizeForTest': {
+                    'image_shape': [640, 640]
+                }
+            }
        self.preprocess_op = create_operators(pre_process_list)
        self.postprocess_op = build_post_process(postprocess_params)
        self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor(
@@ -198,15 +204,19 @@ class TextDetector(object):

        if self.args.benchmark:
            self.autolog.times.stamp()
-
-        self.input_tensor.copy_from_cpu(img)
-        self.predictor.run()
-        outputs = []
-        for output_tensor in self.output_tensors:
-            output = output_tensor.copy_to_cpu()
-            outputs.append(output)
-        if self.args.benchmark:
-            self.autolog.times.stamp()
+        if self.use_onnx:
+            input_dict = {}
+            input_dict[self.input_tensor.name] = img
+            outputs = self.predictor.run(self.output_tensors, input_dict)
+        else:
+            self.input_tensor.copy_from_cpu(img)
+            self.predictor.run()
+            outputs = []
+            for output_tensor in self.output_tensors:
+                output = output_tensor.copy_to_cpu()
+                outputs.append(output)
+            if self.args.benchmark:
+                self.autolog.times.stamp()

        preds = {}
        if self.det_algorithm == "EAST":

--- a/tools/infer/predict_rec.py
+++ b/tools/infer/predict_rec.py
@@ -73,6 +73,7 @@ class TextRecognizer(object):
        self.predictor, self.input_tensor, self.output_tensors, self.config = \
            utility.create_predictor(args, 'rec', logger)
        self.benchmark = args.benchmark
+        self.use_onnx = args.use_onnx
        if args.benchmark:
            import auto_log
            pid = os.getpid()
@@ -108,6 +109,8 @@ class TextRecognizer(object):
        assert imgC == img.shape[2]
        max_wh_ratio = max(max_wh_ratio, imgW / imgH)
        imgW = int((32 * max_wh_ratio))
+        if self.use_onnx:
+            imgW = 100
        h, w = img.shape[:2]
        ratio = w / float(h)
        if math.ceil(imgH * ratio) > imgW:
@@ -297,51 +300,72 @@ class TextRecognizer(object):
                    gsrm_slf_attn_bias1_list,
                    gsrm_slf_attn_bias2_list,
                ]
-                input_names = self.predictor.get_input_names()
-                for i in range(len(input_names)):
-                    input_tensor = self.predictor.get_input_handle(input_names[
-                        i])
-                    input_tensor.copy_from_cpu(inputs[i])
-                self.predictor.run()
-                outputs = []
-                for output_tensor in self.output_tensors:
-                    output = output_tensor.copy_to_cpu()
-                    outputs.append(output)
-                if self.benchmark:
-                    self.autolog.times.stamp()
-                preds = {"predict": outputs[2]}
+                if self.use_onnx:
+                    input_dict = {}
+                    input_dict[self.input_tensor.name] = norm_img_batch
+                    outputs = self.predictor.run(self.output_tensors,
+                                                 input_dict)
+                    preds = {"predict": outputs[2]}
+                else:
+                    input_names = self.predictor.get_input_names()
+                    for i in range(len(input_names)):
+                        input_tensor = self.predictor.get_input_handle(
+                            input_names[i])
+                        input_tensor.copy_from_cpu(inputs[i])
+                    self.predictor.run()
+                    outputs = []
+                    for output_tensor in self.output_tensors:
+                        output = output_tensor.copy_to_cpu()
+                        outputs.append(output)
+                    if self.benchmark:
+                        self.autolog.times.stamp()
+                    preds = {"predict": outputs[2]}
            elif self.rec_algorithm == "SAR":
                valid_ratios = np.concatenate(valid_ratios)
                inputs = [
                    norm_img_batch,
                    valid_ratios,
                ]
-                input_names = self.predictor.get_input_names()
-                for i in range(len(input_names)):
-                    input_tensor = self.predictor.get_input_handle(input_names[
-                        i])
-                    input_tensor.copy_from_cpu(inputs[i])
-                self.predictor.run()
-                outputs = []
-                for output_tensor in self.output_tensors:
-                    output = output_tensor.copy_to_cpu()
-                    outputs.append(output)
-                if self.benchmark:
-                    self.autolog.times.stamp()
-                preds = outputs[0]
-            else:
-                self.input_tensor.copy_from_cpu(norm_img_batch)
-                self.predictor.run()
-                outputs = []
-                for output_tensor in self.output_tensors:
-                    output = output_tensor.copy_to_cpu()
-                    outputs.append(output)
-                if self.benchmark:
-                    self.autolog.times.stamp()
-                if len(outputs) != 1:
-                    preds = outputs
+                if self.use_onnx:
+                    input_dict = {}
+                    input_dict[self.input_tensor.name] = norm_img_batch
+                    outputs = self.predictor.run(self.output_tensors,
+                                                 input_dict)
+                    preds = outputs[0]
                else:
+                    input_names = self.predictor.get_input_names()
+                    for i in range(len(input_names)):
+                        input_tensor = self.predictor.get_input_handle(
+                            input_names[i])
+                        input_tensor.copy_from_cpu(inputs[i])
+                    self.predictor.run()
+                    outputs = []
+                    for output_tensor in self.output_tensors:
+                        output = output_tensor.copy_to_cpu()
+                        outputs.append(output)
+                    if self.benchmark:
+                        self.autolog.times.stamp()
                    preds = outputs[0]
+            else:
+                if self.use_onnx:
+                    input_dict = {}
+                    input_dict[self.input_tensor.name] = norm_img_batch
+                    outputs = self.predictor.run(self.output_tensors,
+                                                 input_dict)
+                    preds = outputs[0]
+                else:
+                    self.input_tensor.copy_from_cpu(norm_img_batch)
+                    self.predictor.run()
+                    outputs = []
+                    for output_tensor in self.output_tensors:
+                        output = output_tensor.copy_to_cpu()
+                        outputs.append(output)
+                    if self.benchmark:
+                        self.autolog.times.stamp()
+                    if len(outputs) != 1:
+                        preds = outputs
+                    else:
+                        preds = outputs[0]
            rec_result = self.postprocess_op(preds)
            for rno in range(len(rec_result)):
                rec_res[indices[beg_img_no + rno]] = rec_result[rno]

--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -121,6 +121,7 @@ def init_args():
    parser.add_argument("--save_log_path", type=str, default="./log_output/")

    parser.add_argument("--show_log", type=str2bool, default=True)
+    parser.add_argument("--use_onnx", type=str2bool, default=False)
    return parser


@@ -144,152 +145,163 @@ def create_predictor(args, mode, logger):
    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
-    model_file_path = model_dir + "/inference.pdmodel"
-    params_file_path = model_dir + "/inference.pdiparams"
-    if not os.path.exists(model_file_path):
-        raise ValueError("not find model file path {}".format(model_file_path))
-    if not os.path.exists(params_file_path):
-        raise ValueError("not find params file path {}".format(
-            params_file_path))
-
-    config = inference.Config(model_file_path, params_file_path)
-
-    if hasattr(args, 'precision'):
-        if args.precision == "fp16" and args.use_tensorrt:
-            precision = inference.PrecisionType.Half
-        elif args.precision == "int8":
-            precision = inference.PrecisionType.Int8
-        else:
-            precision = inference.PrecisionType.Float32
+    if args.use_onnx:
+        import onnxruntime as ort
+        model_file_path = model_dir
+        if not os.path.exists(model_file_path):
+            raise ValueError("not find model file path {}".format(
+                model_file_path))
+        sess = ort.InferenceSession(model_file_path)
+        return sess, sess.get_inputs()[0], None, None
+
    else:
-        precision = inference.PrecisionType.Float32
-
-    if args.use_gpu:
-        gpu_id = get_infer_gpuid()
-        if gpu_id is None:
-            raise ValueError(
-                "Not found GPU in current device. Please check your device or set args.use_gpu as False"
-            )
-        config.enable_use_gpu(args.gpu_mem, 0)
-        if args.use_tensorrt:
-            config.enable_tensorrt_engine(
-                precision_mode=precision,
-                max_batch_size=args.max_batch_size,
-                min_subgraph_size=args.min_subgraph_size)
-            # skip the minmum trt subgraph
-        if mode == "det":
-            min_input_shape = {
-                "x": [1, 3, 50, 50],
-                "conv2d_92.tmp_0": [1, 120, 20, 20],
-                "conv2d_91.tmp_0": [1, 24, 10, 10],
-                "conv2d_59.tmp_0": [1, 96, 20, 20],
-                "nearest_interp_v2_1.tmp_0": [1, 256, 10, 10],
-                "nearest_interp_v2_2.tmp_0": [1, 256, 20, 20],
-                "conv2d_124.tmp_0": [1, 256, 20, 20],
-                "nearest_interp_v2_3.tmp_0": [1, 64, 20, 20],
-                "nearest_interp_v2_4.tmp_0": [1, 64, 20, 20],
-                "nearest_interp_v2_5.tmp_0": [1, 64, 20, 20],
-                "elementwise_add_7": [1, 56, 2, 2],
-                "nearest_interp_v2_0.tmp_0": [1, 256, 2, 2]
-            }
-            max_input_shape = {
-                "x": [1, 3, 2000, 2000],
-                "conv2d_92.tmp_0": [1, 120, 400, 400],
-                "conv2d_91.tmp_0": [1, 24, 200, 200],
-                "conv2d_59.tmp_0": [1, 96, 400, 400],
-                "nearest_interp_v2_1.tmp_0": [1, 256, 200, 200],
-                "conv2d_124.tmp_0": [1, 256, 400, 400],
-                "nearest_interp_v2_2.tmp_0": [1, 256, 400, 400],
-                "nearest_interp_v2_3.tmp_0": [1, 64, 400, 400],
-                "nearest_interp_v2_4.tmp_0": [1, 64, 400, 400],
-                "nearest_interp_v2_5.tmp_0": [1, 64, 400, 400],
-                "elementwise_add_7": [1, 56, 400, 400],
-                "nearest_interp_v2_0.tmp_0": [1, 256, 400, 400]
-            }
-            opt_input_shape = {
-                "x": [1, 3, 640, 640],
-                "conv2d_92.tmp_0": [1, 120, 160, 160],
-                "conv2d_91.tmp_0": [1, 24, 80, 80],
-                "conv2d_59.tmp_0": [1, 96, 160, 160],
-                "nearest_interp_v2_1.tmp_0": [1, 256, 80, 80],
-                "nearest_interp_v2_2.tmp_0": [1, 256, 160, 160],
-                "conv2d_124.tmp_0": [1, 256, 160, 160],
-                "nearest_interp_v2_3.tmp_0": [1, 64, 160, 160],
-                "nearest_interp_v2_4.tmp_0": [1, 64, 160, 160],
-                "nearest_interp_v2_5.tmp_0": [1, 64, 160, 160],
-                "elementwise_add_7": [1, 56, 40, 40],
-                "nearest_interp_v2_0.tmp_0": [1, 256, 40, 40]
-            }
-            min_pact_shape = {
-                "nearest_interp_v2_26.tmp_0": [1, 256, 20, 20],
-                "nearest_interp_v2_27.tmp_0": [1, 64, 20, 20],
-                "nearest_interp_v2_28.tmp_0": [1, 64, 20, 20],
-                "nearest_interp_v2_29.tmp_0": [1, 64, 20, 20]
-            }
-            max_pact_shape = {
-                "nearest_interp_v2_26.tmp_0": [1, 256, 400, 400],
-                "nearest_interp_v2_27.tmp_0": [1, 64, 400, 400],
-                "nearest_interp_v2_28.tmp_0": [1, 64, 400, 400],
-                "nearest_interp_v2_29.tmp_0": [1, 64, 400, 400]
-            }
-            opt_pact_shape = {
-                "nearest_interp_v2_26.tmp_0": [1, 256, 160, 160],
-                "nearest_interp_v2_27.tmp_0": [1, 64, 160, 160],
-                "nearest_interp_v2_28.tmp_0": [1, 64, 160, 160],
-                "nearest_interp_v2_29.tmp_0": [1, 64, 160, 160]
-            }
-            min_input_shape.update(min_pact_shape)
-            max_input_shape.update(max_pact_shape)
-            opt_input_shape.update(opt_pact_shape)
-        elif mode == "rec":
-            min_input_shape = {"x": [1, 3, 32, 10]}
-            max_input_shape = {"x": [args.rec_batch_num, 3, 32, 2000]}
-            opt_input_shape = {"x": [args.rec_batch_num, 3, 32, 320]}
-        elif mode == "cls":
-            min_input_shape = {"x": [1, 3, 48, 10]}
-            max_input_shape = {"x": [args.rec_batch_num, 3, 48, 2000]}
-            opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
+        model_file_path = model_dir + "/inference.pdmodel"
+        params_file_path = model_dir + "/inference.pdiparams"
+        if not os.path.exists(model_file_path):
+            raise ValueError("not find model file path {}".format(
+                model_file_path))
+        if not os.path.exists(params_file_path):
+            raise ValueError("not find params file path {}".format(
+                params_file_path))
+
+        config = inference.Config(model_file_path, params_file_path)
+
+        if hasattr(args, 'precision'):
+            if args.precision == "fp16" and args.use_tensorrt:
+                precision = inference.PrecisionType.Half
+            elif args.precision == "int8":
+                precision = inference.PrecisionType.Int8
+            else:
+                precision = inference.PrecisionType.Float32
        else:
-            min_input_shape = {"x": [1, 3, 10, 10]}
-            max_input_shape = {"x": [1, 3, 1000, 1000]}
-            opt_input_shape = {"x": [1, 3, 500, 500]}
-        config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
-                                          opt_input_shape)
+            precision = inference.PrecisionType.Float32
+
+        if args.use_gpu:
+            gpu_id = get_infer_gpuid()
+            if gpu_id is None:
+                raise ValueError(
+                    "Not found GPU in current device. Please check your device or set args.use_gpu as False"
+                )
+            config.enable_use_gpu(args.gpu_mem, 0)
+            if args.use_tensorrt:
+                config.enable_tensorrt_engine(
+                    precision_mode=precision,
+                    max_batch_size=args.max_batch_size,
+                    min_subgraph_size=args.min_subgraph_size)
+                # skip the minmum trt subgraph
+            if mode == "det":
+                min_input_shape = {
+                    "x": [1, 3, 50, 50],
+                    "conv2d_92.tmp_0": [1, 120, 20, 20],
+                    "conv2d_91.tmp_0": [1, 24, 10, 10],
+                    "conv2d_59.tmp_0": [1, 96, 20, 20],
+                    "nearest_interp_v2_1.tmp_0": [1, 256, 10, 10],
+                    "nearest_interp_v2_2.tmp_0": [1, 256, 20, 20],
+                    "conv2d_124.tmp_0": [1, 256, 20, 20],
+                    "nearest_interp_v2_3.tmp_0": [1, 64, 20, 20],
+                    "nearest_interp_v2_4.tmp_0": [1, 64, 20, 20],
+                    "nearest_interp_v2_5.tmp_0": [1, 64, 20, 20],
+                    "elementwise_add_7": [1, 56, 2, 2],
+                    "nearest_interp_v2_0.tmp_0": [1, 256, 2, 2]
+                }
+                max_input_shape = {
+                    "x": [1, 3, 2000, 2000],
+                    "conv2d_92.tmp_0": [1, 120, 400, 400],
+                    "conv2d_91.tmp_0": [1, 24, 200, 200],
+                    "conv2d_59.tmp_0": [1, 96, 400, 400],
+                    "nearest_interp_v2_1.tmp_0": [1, 256, 200, 200],
+                    "conv2d_124.tmp_0": [1, 256, 400, 400],
+                    "nearest_interp_v2_2.tmp_0": [1, 256, 400, 400],
+                    "nearest_interp_v2_3.tmp_0": [1, 64, 400, 400],
+                    "nearest_interp_v2_4.tmp_0": [1, 64, 400, 400],
+                    "nearest_interp_v2_5.tmp_0": [1, 64, 400, 400],
+                    "elementwise_add_7": [1, 56, 400, 400],
+                    "nearest_interp_v2_0.tmp_0": [1, 256, 400, 400]
+                }
+                opt_input_shape = {
+                    "x": [1, 3, 640, 640],
+                    "conv2d_92.tmp_0": [1, 120, 160, 160],
+                    "conv2d_91.tmp_0": [1, 24, 80, 80],
+                    "conv2d_59.tmp_0": [1, 96, 160, 160],
+                    "nearest_interp_v2_1.tmp_0": [1, 256, 80, 80],
+                    "nearest_interp_v2_2.tmp_0": [1, 256, 160, 160],
+                    "conv2d_124.tmp_0": [1, 256, 160, 160],
+                    "nearest_interp_v2_3.tmp_0": [1, 64, 160, 160],
+                    "nearest_interp_v2_4.tmp_0": [1, 64, 160, 160],
+                    "nearest_interp_v2_5.tmp_0": [1, 64, 160, 160],
+                    "elementwise_add_7": [1, 56, 40, 40],
+                    "nearest_interp_v2_0.tmp_0": [1, 256, 40, 40]
+                }
+                min_pact_shape = {
+                    "nearest_interp_v2_26.tmp_0": [1, 256, 20, 20],
+                    "nearest_interp_v2_27.tmp_0": [1, 64, 20, 20],
+                    "nearest_interp_v2_28.tmp_0": [1, 64, 20, 20],
+                    "nearest_interp_v2_29.tmp_0": [1, 64, 20, 20]
+                }
+                max_pact_shape = {
+                    "nearest_interp_v2_26.tmp_0": [1, 256, 400, 400],
+                    "nearest_interp_v2_27.tmp_0": [1, 64, 400, 400],
+                    "nearest_interp_v2_28.tmp_0": [1, 64, 400, 400],
+                    "nearest_interp_v2_29.tmp_0": [1, 64, 400, 400]
+                }
+                opt_pact_shape = {
+                    "nearest_interp_v2_26.tmp_0": [1, 256, 160, 160],
+                    "nearest_interp_v2_27.tmp_0": [1, 64, 160, 160],
+                    "nearest_interp_v2_28.tmp_0": [1, 64, 160, 160],
+                    "nearest_interp_v2_29.tmp_0": [1, 64, 160, 160]
+                }
+                min_input_shape.update(min_pact_shape)
+                max_input_shape.update(max_pact_shape)
+                opt_input_shape.update(opt_pact_shape)
+            elif mode == "rec":
+                min_input_shape = {"x": [1, 3, 32, 10]}
+                max_input_shape = {"x": [args.rec_batch_num, 3, 32, 2000]}
+                opt_input_shape = {"x": [args.rec_batch_num, 3, 32, 320]}
+            elif mode == "cls":
+                min_input_shape = {"x": [1, 3, 48, 10]}
+                max_input_shape = {"x": [args.rec_batch_num, 3, 48, 2000]}
+                opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
+            else:
+                min_input_shape = {"x": [1, 3, 10, 10]}
+                max_input_shape = {"x": [1, 3, 1000, 1000]}
+                opt_input_shape = {"x": [1, 3, 500, 500]}
+            config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
+                                              opt_input_shape)

-    else:
-        config.disable_gpu()
-        if hasattr(args, "cpu_threads"):
-            config.set_cpu_math_library_num_threads(args.cpu_threads)
        else:
-            # default cpu threads as 10
-            config.set_cpu_math_library_num_threads(10)
-        if args.enable_mkldnn:
-            # cache 10 different shapes for mkldnn to avoid memory leak
-            config.set_mkldnn_cache_capacity(10)
-            config.enable_mkldnn()
-            if args.precision == "fp16":
-                config.enable_mkldnn_bfloat16()
-    # enable memory optim
-    config.enable_memory_optim()
-    config.disable_glog_info()
-
-    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
-    if mode == 'table':
-        config.delete_pass("fc_fuse_pass")  # not supported for table
-    config.switch_use_feed_fetch_ops(False)
-    config.switch_ir_optim(True)
-
-    # create predictor
-    predictor = inference.create_predictor(config)
-    input_names = predictor.get_input_names()
-    for name in input_names:
-        input_tensor = predictor.get_input_handle(name)
-    output_names = predictor.get_output_names()
-    output_tensors = []
-    for output_name in output_names:
-        output_tensor = predictor.get_output_handle(output_name)
-        output_tensors.append(output_tensor)
-    return predictor, input_tensor, output_tensors, config
+            config.disable_gpu()
+            if hasattr(args, "cpu_threads"):
+                config.set_cpu_math_library_num_threads(args.cpu_threads)
+            else:
+                # default cpu threads as 10
+                config.set_cpu_math_library_num_threads(10)
+            if args.enable_mkldnn:
+                # cache 10 different shapes for mkldnn to avoid memory leak
+                config.set_mkldnn_cache_capacity(10)
+                config.enable_mkldnn()
+                if args.precision == "fp16":
+                    config.enable_mkldnn_bfloat16()
+        # enable memory optim
+        config.enable_memory_optim()
+        config.disable_glog_info()
+
+        config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
+        if mode == 'table':
+            config.delete_pass("fc_fuse_pass")  # not supported for table
+        config.switch_use_feed_fetch_ops(False)
+        config.switch_ir_optim(True)
+
+        # create predictor
+        predictor = inference.create_predictor(config)
+        input_names = predictor.get_input_names()
+        for name in input_names:
+            input_tensor = predictor.get_input_handle(name)
+        output_names = predictor.get_output_names()
+        output_tensors = []
+        for output_name in output_names:
+            output_tensor = predictor.get_output_handle(output_name)
+            output_tensors.append(output_tensor)
+        return predictor, input_tensor, output_tensors, config


 def get_infer_gpuid():