dataset_traversal.py 4.1 KB
Newer Older
L
LDOUBLEV 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import os
T
tink2123 已提交
16
import sys
L
LDOUBLEV 已提交
17 18 19 20 21 22 23 24 25
import math
import random
import functools
import numpy as np
import cv2
import string
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.utils.utility import create_module
L
LDOUBLEV 已提交
26
from ppocr.utils.utility import get_image_file_list
L
LDOUBLEV 已提交
27 28 29 30 31 32 33 34
import time


class TrainReader(object):
    """Callable factory for a shuffled, batched training-sample reader.

    The constructor reads its configuration from ``params``:

    * ``num_workers`` -- stride used to split label lines among workers.
    * ``label_file_path`` -- path of the label file, read in binary mode.
    * ``train_batch_size_per_card`` -- number of samples per yielded batch.
    * ``drop_last`` -- when true, a trailing partial batch is discarded.
    * ``process_function`` -- passed to ``create_module``; the result is
      called with ``params`` and stored as the per-line processor.
    """

    def __init__(self, params):
        self.num_workers = params['num_workers']
        self.label_file_path = params['label_file_path']
        self.batch_size = params['train_batch_size_per_card']
        self.drop_last = params['drop_last']
        assert 'process_function' in params, \
            "absence process_function in Reader"
        process_cls = create_module(params['process_function'])
        self.process = process_cls(params)

    def __call__(self, process_id):
        """Return a generator function that yields lists of samples.

        Args:
            process_id: Offset of the first (shuffled) label line this
                worker handles; later lines are taken every
                ``self.num_workers`` positions.

        Returns:
            A zero-argument function; each call re-reads the label file,
            reshuffles it and yields batches (lists) of processed samples.
        """

        def sample_iter_reader():
            # Lines are kept as raw bytes; decoding is left to self.process.
            with open(self.label_file_path, "rb") as label_file:
                records = label_file.readlines()
            total = len(records)
            order = list(range(total))
            random.shuffle(order)
            if sys.platform == "win32":
                print("multiprocess is not fully compatible with Windows."
                      "num_workers will be 1.")
                # The override is stored on the instance, so it persists.
                self.num_workers = 1
            for position in range(process_id, total, self.num_workers):
                sample = self.process(records[order[position]])
                # A None result means the processor rejected the line.
                if sample is not None:
                    yield sample

        def batch_iter_reader():
            pending = []
            for sample in sample_iter_reader():
                pending.append(sample)
                if len(pending) != self.batch_size:
                    continue
                yield pending
                pending = []
            # Emit the incomplete tail only when drop_last is disabled.
            if not self.drop_last and pending:
                yield pending

        return batch_iter_reader


class EvalTestReader(object):
    """Callable factory for a batched evaluation / inference image reader.

    ``params`` is stored as-is and must contain ``process_function``; the
    remaining keys (``test_batch_size_per_card`` plus either ``infer_img``
    or ``img_set_dir`` and ``label_file_path``) are read when the instance
    is called.
    """

    def __init__(self, params):
        self.params = params
        assert 'process_function' in params, \
            "absence process_function in EvalTestReader"

    def __call__(self, mode):
        """Return a generator function that yields lists of samples.

        Args:
            mode: ``"test"`` takes the image list from
                ``get_image_file_list(params['infer_img'])``. Any other
                value reads ``params['label_file_path']`` and joins the
                first tab-separated field of each line onto
                ``params['img_set_dir']``.

        Returns:
            A zero-argument function yielding batches (lists) of
            processed samples, each with its image path appended. The
            final, possibly smaller, batch is always yielded.
        """
        process_cls = create_module(self.params['process_function'])
        process_function = process_cls(self.params)
        batch_size = self.params['test_batch_size_per_card']

        if mode == "test":
            img_list = get_image_file_list(self.params['infer_img'])
        else:
            img_set_dir = self.params['img_set_dir']
            with open(self.params['label_file_path'], "rb") as fin:
                # Only the first tab-separated field (the image name) is
                # used; the rest of the line is ignored here.
                img_list = [
                    os.path.join(
                        img_set_dir,
                        raw.decode().strip("\n").split("\t")[0])
                    for raw in fin.readlines()
                ]

        def loaded_samples():
            for path in img_list:
                image = cv2.imread(path)
                if image is None:
                    # Unreadable images are logged and skipped.
                    logger.info("load image error:" + path)
                    continue
                sample = process_function(image)
                # Append the source path to the processed sample.
                sample.append(path)
                yield sample

        def batch_iter_reader():
            pending = []
            for sample in loaded_samples():
                pending.append(sample)
                if len(pending) != batch_size:
                    continue
                yield pending
                pending = []
            if pending:
                yield pending

        return batch_iter_reader