Pass encoding="utf-8" only to text-mode open() calls.

open() raises "ValueError: binary mode doesn't take an encoding argument"
when encoding= is combined with a "b" mode, so every "rb" call site is left
untouched: the character-dictionary loaders already decode each line with
line.decode('utf-8'), and the dataset readers consume raw bytes.

NOTE(review): context-line indentation below was reconstructed from the
Python nesting of each hunk; confirm it against the working tree (or apply
with `git apply --ignore-whitespace`). The `index` line is omitted because
the post-image blob hash changes once the binary-mode hunks are dropped.

diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py
--- a/ppocr/data/imaug/label_ops.py
+++ b/ppocr/data/imaug/label_ops.py
@@ -278,7 +278,7 @@ class KieLabelEncode(object):
                 char = line.strip()
                 self.dict[char] = idx
                 idx += 1
-        with open(class_path, "r") as fin:
+        with open(class_path, "r", encoding="utf-8") as fin:
             lines = fin.readlines()
             for idx, line in enumerate(lines):
                 line = line.strip("\n")
@@ -1380,7 +1380,7 @@ class SRLabelEncode(BaseRecLabelEncode):
         super(SRLabelEncode, self).__init__(max_text_length,
                                             character_dict_path, use_space_char)
         self.dic = {}
-        with open(character_dict_path, 'r') as fin:
+        with open(character_dict_path, 'r', encoding="utf-8") as fin:
             for line in fin.readlines():
                 line = line.strip()
                 character, sequence = line.split()