diff --git a/ppocr/data/simple_dataset.py b/ppocr/data/simple_dataset.py index ce9e1b38675ae8df4a2e83b88c1adae4476a10b5..c8708a4ab94f1761551dc9ecbe17316ac0ab67f7 100644 --- a/ppocr/data/simple_dataset.py +++ b/ppocr/data/simple_dataset.py @@ -46,6 +46,7 @@ class SimpleDataSet(Dataset): self.seed = seed logger.info("Initialize indexs of datasets:%s" % label_file_list) self.data_lines = self.get_image_info_list(label_file_list, ratio_list) + self.check_data() self.data_idx_order_list = list(range(len(self.data_lines))) if self.mode == "train" and self.do_shuffle: self.shuffle_data_random() @@ -102,16 +103,8 @@ class SimpleDataSet(Dataset): def __getitem__(self, idx): file_idx = self.data_idx_order_list[idx] - data_line = self.data_lines[file_idx] + data = self.data_lines[file_idx] try: - data_line = data_line.decode('utf-8') - substr = data_line.strip("\n").strip("\r").split(self.delimiter) - file_name = substr[0] - label = substr[1] - img_path = os.path.join(self.data_dir, file_name) - data = {'img_path': img_path, 'label': label} - if not os.path.exists(img_path): - raise Exception("{} does not exist!".format(img_path)) with open(data['img_path'], 'rb') as f: img = f.read() data['image'] = img @@ -120,8 +113,8 @@ class SimpleDataSet(Dataset): except: error_meg = traceback.format_exc() self.logger.error( - "When parsing line {}, error happened with msg: {}".format( - data_line, error_meg)) + "When parsing file {} and label {}, error happened with msg: {}".format( + data['img_path'],data['label'], error_meg)) outs = None if outs is None: # during evaluation, we should fix the idx to get same results for many times of evaluation. 
# Tail of class SimpleDataSet (ppocr/data/simple_dataset.py): the two methods
# carried by this hunk, written out as regular Python.
def __len__(self):
    """Return the number of samples currently indexed by the dataset."""
    return len(self.data_idx_order_list)


def check_data(self):
    """Parse the raw label lines and keep only the usable samples.

    Every entry of ``self.data_lines`` is expected to be one label line of
    the form ``<file_name><delimiter><label>`` (UTF-8 bytes, or an already
    decoded ``str``).  Each line is turned into a dict
    ``{'img_path': ..., 'label': ...}`` where ``img_path`` is
    ``self.data_dir`` joined with the file name.

    Lines are dropped, with a warning, when:
      * they cannot be decoded as UTF-8,
      * they do not contain the delimiter (no label column),
      * the referenced image file does not exist.

    A bad line is skipped instead of raising so that a single broken entry
    in a label file does not abort construction of the whole dataset.

    Side effect: ``self.data_lines`` is replaced by the list of sample
    dicts, in the original order.
    """
    valid_samples = []
    for raw_line in self.data_lines:
        try:
            if isinstance(raw_line, (bytes, bytearray)):
                text = raw_line.decode('utf-8')
            else:
                text = raw_line
            substr = text.strip("\n").strip("\r").split(self.delimiter)
            file_name, label = substr[0], substr[1]
        except (UnicodeDecodeError, IndexError):
            # Undecodable bytes or a missing label column: report and move on.
            self.logger.warning(
                "Skip malformed label line: {!r}".format(raw_line))
            continue

        img_path = os.path.join(self.data_dir, file_name)
        if os.path.exists(img_path):
            valid_samples.append({'img_path': img_path, 'label': label})
        else:
            self.logger.warning("{} does not exist!".format(img_path))
    self.data_lines = valid_samples