diff --git a/MANIFEST.in b/MANIFEST.in index cd1c9636d4d23cc4d0f745403ec8ca407d1cc1a8..e16f157d6e9dd249d6c6a14ae54313759a6752c4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,7 @@ -include LICENSE +include LICENSE.txt include README.md -recursive-include ppocr/utils *.txt utility.py logging.py network.py +recursive-include ppocr/utils *.txt utility.py logging.py recursive-include ppocr/data/ *.py recursive-include ppocr/postprocess *.py recursive-include tools/infer *.py diff --git a/doc/table/1.png b/doc/table/1.png deleted file mode 100644 index 47df618ab1bef431a5dd94418c01be16b09d31aa..0000000000000000000000000000000000000000 Binary files a/doc/table/1.png and /dev/null differ diff --git a/paddleocr.py b/paddleocr.py index 48c8c9c6523dc3f813189477e641f0e51b740885..1e4d94ff4e72da951e1ffb92edb50715482581ae 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -19,16 +19,17 @@ __dir__ = os.path.dirname(__file__) sys.path.append(os.path.join(__dir__, '')) import cv2 -import logging import numpy as np from pathlib import Path +import tarfile +import requests +from tqdm import tqdm from tools.infer import predict_system from ppocr.utils.logging import get_logger logger = get_logger() from ppocr.utils.utility import check_and_read_gif, get_image_file_list -from ppocr.utils.network import maybe_download, download_with_progressbar from tools.infer.utility import draw_ocr, init_args, str2bool __all__ = ['PaddleOCR'] @@ -36,84 +37,84 @@ __all__ = ['PaddleOCR'] model_urls = { 'det': { 'ch': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar', 'en': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_ppocr_mobile_v2.0_det_infer.tar' + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_ppocr_mobile_v2.0_det_infer.tar' }, 'rec': { 'ch': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/ppocr_keys_v1.txt' }, 'en': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/en_dict.txt' }, 'french': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/french_dict.txt' }, 'german': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/german_dict.txt' }, 'korean': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/korean_dict.txt' }, 'japan': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/japan_dict.txt' }, 'chinese_cht': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/chinese_cht_dict.txt' }, 'ta': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/ta_dict.txt' }, 'te': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/te_dict.txt' }, 'ka': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/ka_dict.txt' }, 'latin': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/latin_dict.txt' }, 'arabic': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/arabic_dict.txt' }, 'cyrillic': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/cyrillic_dict.txt' }, 'devanagari': { 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar', + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar', 'dict_path': './ppocr/utils/dict/devanagari_dict.txt' } }, 'cls': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar' + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar' } SUPPORT_DET_MODEL = ['DB'] @@ -122,6 +123,50 @@ SUPPORT_REC_MODEL = ['CRNN'] BASE_DIR = os.path.expanduser("~/.paddleocr/") +def download_with_progressbar(url, save_path): + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get('content-length', 0)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) + with open(save_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes: + logger.error("Something went wrong while downloading models") + sys.exit(0) + + +def maybe_download(model_storage_directory, url): + # using custom model + tar_file_name_list = [ + 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel' + ] + if not os.path.exists( + os.path.join(model_storage_directory, 'inference.pdiparams') + ) or not os.path.exists( + os.path.join(model_storage_directory, 'inference.pdmodel')): + tmp_path = os.path.join(model_storage_directory, url.split('/')[-1]) + print('download {} to {}'.format(url, tmp_path)) + os.makedirs(model_storage_directory, exist_ok=True) + download_with_progressbar(url, tmp_path) + with tarfile.open(tmp_path, 'r') as tarObj: + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if tar_file_name in member.name: + filename = tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open( + os.path.join(model_storage_directory, filename), + 'wb') as f: + f.write(file.read()) + os.remove(tmp_path) + + def parse_args(mMain=True): import argparse parser = init_args() @@ -149,12 +194,10 @@ class PaddleOCR(predict_system.TextSystem): args: **kwargs: other params show in paddleocr --help """ - params = parse_args(mMain=False) - params.__dict__.update(**kwargs) - if params.show_log: - logger.setLevel(logging.DEBUG) - self.use_angle_cls = params.use_angle_cls - lang = params.lang + postprocess_params = parse_args(mMain=False) + postprocess_params.__dict__.update(**kwargs) + self.use_angle_cls = postprocess_params.use_angle_cls + lang = postprocess_params.lang latin_lang = [ 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', @@ -180,46 +223,46 @@ class PaddleOCR(predict_system.TextSystem): lang = "devanagari" assert lang in model_urls[ 'rec'], 'param lang must in {}, but got {}'.format( - model_urls['rec'].keys(), lang) + model_urls['rec'].keys(), lang) if lang == "ch": det_lang = "ch" else: det_lang = "en" use_inner_dict = False - if params.rec_char_dict_path is None: + if postprocess_params.rec_char_dict_path is None: use_inner_dict = True - params.rec_char_dict_path = model_urls['rec'][lang][ + postprocess_params.rec_char_dict_path = model_urls['rec'][lang][ 'dict_path'] # init model dir - if params.det_model_dir is None: - params.det_model_dir = os.path.join(BASE_DIR, VERSION, + if postprocess_params.det_model_dir is None: + postprocess_params.det_model_dir = os.path.join(BASE_DIR, VERSION, 'det', det_lang) - if params.rec_model_dir is None: - params.rec_model_dir = os.path.join(BASE_DIR, VERSION, + if postprocess_params.rec_model_dir is None: + postprocess_params.rec_model_dir = os.path.join(BASE_DIR, VERSION, 'rec', lang) - if params.cls_model_dir is None: - params.cls_model_dir = os.path.join(BASE_DIR, 'cls') + if postprocess_params.cls_model_dir is None: + postprocess_params.cls_model_dir = os.path.join(BASE_DIR, 'cls') + print(postprocess_params) # download model - maybe_download(params.det_model_dir, + maybe_download(postprocess_params.det_model_dir, model_urls['det'][det_lang]) - maybe_download(params.rec_model_dir, + maybe_download(postprocess_params.rec_model_dir, model_urls['rec'][lang]['url']) - maybe_download(params.cls_model_dir, model_urls['cls']) + maybe_download(postprocess_params.cls_model_dir, model_urls['cls']) - if params.det_algorithm not in SUPPORT_DET_MODEL: + if postprocess_params.det_algorithm not in SUPPORT_DET_MODEL: logger.error('det_algorithm must in {}'.format(SUPPORT_DET_MODEL)) sys.exit(0) - if params.rec_algorithm not in SUPPORT_REC_MODEL: + if postprocess_params.rec_algorithm not in SUPPORT_REC_MODEL: logger.error('rec_algorithm must in {}'.format(SUPPORT_REC_MODEL)) sys.exit(0) if use_inner_dict: - params.rec_char_dict_path = str( - Path(__file__).parent / params.rec_char_dict_path) + postprocess_params.rec_char_dict_path = str( + Path(__file__).parent / postprocess_params.rec_char_dict_path) - print(params) # init det_model and rec_model - super().__init__(params) + super().__init__(postprocess_params) def ocr(self, img, det=True, rec=True, cls=True): """ diff --git a/ppocr/data/imaug/operators.py b/ppocr/data/imaug/operators.py index 2535b4420c503f2e9e9cc5a677ef70c4dd9c36be..9c48b09647527cf718113ea1b5df152ff7befa04 100644 --- a/ppocr/data/imaug/operators.py +++ b/ppocr/data/imaug/operators.py @@ -81,7 +81,7 @@ class NormalizeImage(object): assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage" data['image'] = ( - img.astype('float32') * self.scale - self.mean) / self.std + img.astype('float32') * self.scale - self.mean) / self.std return data @@ -163,7 +163,7 @@ class DetResizeForTest(object): img, (ratio_h, ratio_w) """ limit_side_len = self.limit_side_len - h, w, c = img.shape + h, w, _ = img.shape # limit the max side if self.limit_type == 'max': @@ -174,7 +174,7 @@ class DetResizeForTest(object): ratio = float(limit_side_len) / w else: ratio = 1. - elif self.limit_type == 'min': + else: if min(h, w) < limit_side_len: if h < w: ratio = float(limit_side_len) / h @@ -182,10 +182,6 @@ class DetResizeForTest(object): ratio = float(limit_side_len) / w else: ratio = 1. - elif self.limit_type == 'resize_long': - ratio = float(limit_side_len) / max(h,w) - else: - raise Exception('not support limit type, image ') resize_h = int(h * ratio) resize_w = int(w * ratio) diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 164dec557a672842df868e2f6a01fc0fbc3e4946..ae5470a520eb90464c548bbe8a00d54840cc529c 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -44,16 +44,16 @@ class BaseRecLabelDecode(object): self.character_str = string.printable[:-6] dict_character = list(self.character_str) elif character_type in support_character_type: - self.character_str = [] + self.character_str = "" assert character_dict_path is not None, "character_dict_path should not be None when character_type is {}".format( character_type) with open(character_dict_path, "rb") as fin: lines = fin.readlines() for line in lines: line = line.decode('utf-8').strip("\n").strip("\r\n") - self.character_str.append(line) + self.character_str += line if use_space_char: - self.character_str.append(" ") + self.character_str += " " dict_character = list(self.character_str) else: @@ -319,156 +319,3 @@ class SRNLabelDecode(BaseRecLabelDecode): assert False, "unsupport type %s in get_beg_end_flag_idx" \ % beg_or_end return idx - - -class TableLabelDecode(object): - """ """ - - def __init__(self, - max_text_length, - max_elem_length, - max_cell_num, - character_dict_path, - **kwargs): - self.max_text_length = max_text_length - self.max_elem_length = max_elem_length - self.max_cell_num = max_cell_num - list_character, list_elem = self.load_char_elem_dict(character_dict_path) - list_character = self.add_special_char(list_character) - list_elem = self.add_special_char(list_elem) - self.dict_character = {} - self.dict_idx_character = {} - for i, char in enumerate(list_character): - self.dict_idx_character[i] = char - self.dict_character[char] = i - self.dict_elem = {} - self.dict_idx_elem = {} - for i, elem in enumerate(list_elem): - self.dict_idx_elem[i] = elem - self.dict_elem[elem] = i - - def load_char_elem_dict(self, character_dict_path): - list_character = [] - list_elem = [] - with open(character_dict_path, "rb") as fin: - lines = fin.readlines() - substr = lines[0].decode('utf-8').strip("\n").split("\t") - character_num = int(substr[0]) - elem_num = int(substr[1]) - for cno in range(1, 1 + character_num): - character = lines[cno].decode('utf-8').strip("\n") - list_character.append(character) - for eno in range(1 + character_num, 1 + character_num + elem_num): - elem = lines[eno].decode('utf-8').strip("\n") - list_elem.append(elem) - return list_character, list_elem - - def add_special_char(self, list_character): - self.beg_str = "sos" - self.end_str = "eos" - list_character = [self.beg_str] + list_character + [self.end_str] - return list_character - - def get_sp_tokens(self): - char_beg_idx = self.get_beg_end_flag_idx('beg', 'char') - char_end_idx = self.get_beg_end_flag_idx('end', 'char') - elem_beg_idx = self.get_beg_end_flag_idx('beg', 'elem') - elem_end_idx = self.get_beg_end_flag_idx('end', 'elem') - elem_char_idx1 = self.dict_elem[''] - elem_char_idx2 = self.dict_elem['', ' 0 and tmp_elem_idx == end_idx: - break - if tmp_elem_idx in ignored_tokens: - continue - - char_list.append(current_dict[tmp_elem_idx]) - elem_pos_list.append(idx) - score_list.append(structure_probs[batch_idx, idx]) - elem_idx_list.append(tmp_elem_idx) - result_list.append(char_list) - result_pos_list.append(elem_pos_list) - result_score_list.append(score_list) - result_elem_idx_list.append(elem_idx_list) - return result_list, result_pos_list, result_score_list, result_elem_idx_list - - def get_ignored_tokens(self, char_or_elem): - beg_idx = self.get_beg_end_flag_idx("beg", char_or_elem) - end_idx = self.get_beg_end_flag_idx("end", char_or_elem) - return [beg_idx, end_idx] - - def get_beg_end_flag_idx(self, beg_or_end, char_or_elem): - if char_or_elem == "char": - if beg_or_end == "beg": - idx = self.dict_character[self.beg_str] - elif beg_or_end == "end": - idx = self.dict_character[self.end_str] - else: - assert False, "Unsupport type %s in get_beg_end_flag_idx of char" \ - % beg_or_end - elif char_or_elem == "elem": - if beg_or_end == "beg": - idx = self.dict_elem[self.beg_str] - elif beg_or_end == "end": - idx = self.dict_elem[self.end_str] - else: - assert False, "Unsupport type %s in get_beg_end_flag_idx of elem" \ - % beg_or_end - else: - assert False, "Unsupport type %s in char_or_elem" \ - % char_or_elem - return idx diff --git a/ppocr/utils/dict/table_dict.txt b/ppocr/utils/dict/table_dict.txt deleted file mode 100644 index 2ef028c786cbce6d1e25856c62986d757b31f93b..0000000000000000000000000000000000000000 --- a/ppocr/utils/dict/table_dict.txt +++ /dev/null @@ -1,277 +0,0 @@ -← - -☆ -─ -α - - -⋅ -$ -ω -ψ -χ -( -υ -≥ -σ -, -ρ -ε -0 -■ -4 -8 -✗ -b -< -✓ -Ψ -Ω -€ -D -3 -Π -H -║ - -L -Φ -Χ -θ -P -κ -λ -μ -T -ξ -X -β -γ -δ -\ -ζ -η -` -d - -h -f -l -Θ -p -√ -t - -x -Β -Γ -Δ -| -ǂ -ɛ -j -̧ -➢ -⁡ -̌ -′ -« -△ -▲ -# - -' -Ι -+ -¶ -/ -▼ -⇑ -□ -· -7 -▪ -; -? -➔ -∩ -C -÷ -G -⇒ -K - -O -S -С -W -Α -[ -○ -_ -● -‡ -c -z -g - -o - -〈 -〉 -s -⩽ -w -φ -ʹ -{ -» -∣ -̆ -e -ˆ -∈ -τ -◆ -ι -∅ -∆ -∙ -∘ -Ø -ß -✔ -∞ -∑ -− -× -◊ -∗ -∖ -˃ -˂ -∫ -" -i -& -π -↔ -* -∥ -æ -∧ -. -⁄ -ø -Q -∼ -6 -⁎ -: -★ -> -a -B -≈ -F -J -̄ -N -♯ -R -V - -― -Z -♣ -^ -¤ -¥ -§ - -¢ -£ -≦ -­ -≤ -‖ -Λ -© -n -↓ -→ -↑ -r -° -± -v - -♂ -k -♀ -~ -ᅟ -̇ -@ -” -♦ -ł -® -⊕ -„ -! - -% -⇓ -) -- -1 -5 -9 -= -А -A -‰ -⋆ -Σ -E -◦ -I -※ -M -m -̨ -⩾ -† - -• -U -Y -
 -] -̸ -2 -‐ -– -‒ -̂ -— -̀ -́ -’ -‘ -⋮ -⋯ -̊ -“ -̈ -≧ -q -u -ı -y - -​ -̃ -} -ν diff --git a/ppocr/utils/dict/table_structure_dict.txt b/ppocr/utils/dict/table_structure_dict.txt deleted file mode 100644 index 9c4531e5f3b8c498e70d3c2ea0471e5e746a2c30..0000000000000000000000000000000000000000 --- a/ppocr/utils/dict/table_structure_dict.txt +++ /dev/null @@ -1,2759 +0,0 @@ -277 28 1267 1186 - -V -a -r -i -b -l -e - -H -z -d - -t -o -9 -5 -% -C -I - -p - -v -u -* -A -g -( -m -n -) -0 -. -7 -1 -6 -≤ -> -8 -3 -– -2 -G -4 -M -F -T -y -f -s -L -w -c -U -h -D -S -Q -R -x -P -- -E -O -/ -k -, -+ -N -K -q -′ -[ -] -< -≥ - -− - -μ -± -J -j -W -_ -Δ -B -“ -: -Y -α -λ -; - - -? -∼ -= -° -# -̊ -̈ -̂ -’ -Z -X -∗ -— -β -' -† -~ -@ -" -γ -↓ -↑ -& -‡ -χ -” -σ -§ -| -¶ -‐ -× -$ -→ -√ -✓ -‘ -\ -∞ -π -• -® -^ -∆ -≧ - - -́ -♀ -♂ -‒ -⁎ -▲ -· -£ -φ -Ψ -ß -△ -☆ -▪ -η -€ -∧ -̃ -Φ -ρ -̄ -δ -‰ -̧ -Ω -♦ -{ -} -̀ -∑ -∫ -ø -κ -ε -¥ -※ -` -ω -Σ -➔ -‖ -Β -̸ -
 -─ -● -⩾ -Χ -Α -⋅ -◆ -★ -■ -ψ -ǂ -□ -ζ -! -Γ -↔ -θ -⁄ -〈 -〉 -― -υ -τ -⋆ -Ø -© -∥ -С -˂ -➢ -ɛ -⁡ -✗ -← -○ -¢ -⩽ -∖ -˃ -­ -≈ -Π -̌ -≦ -∅ -ᅟ - - -∣ -¤ -♯ -̆ -ξ -÷ -▼ - -ι -ν -║ - - -◦ -​ -◊ -∙ -« -» -ł -ı -Θ -∈ -„ -∘ -✔ -̇ -æ -ʹ -ˆ -♣ -⇓ -∩ -⊕ -⇒ -⇑ -̨ -Ι -Λ -⋯ -А -⋮ - - - - - - - - - - colspan="2" - colspan="3" - rowspan="2" - colspan="4" - colspan="6" - rowspan="3" - colspan="9" - colspan="10" - colspan="7" - rowspan="4" - rowspan="5" - rowspan="9" - colspan="8" - rowspan="8" - rowspan="6" - rowspan="7" - rowspan="10" -0 2924682 -1 3405345 -2 2363468 -3 2709165 -4 4078680 -5 3250792 -6 1923159 -7 1617890 -8 1450532 -9 1717624 -10 1477550 -11 1489223 -12 915528 -13 819193 -14 593660 -15 518924 -16 682065 -17 494584 -18 400591 -19 396421 -20 340994 -21 280688 -22 250328 -23 226786 -24 199927 -25 182707 -26 164629 -27 141613 -28 127554 -29 116286 -30 107682 -31 96367 -32 88002 -33 79234 -34 72186 -35 65921 -36 60374 -37 55976 -38 52166 -39 47414 -40 44932 -41 41279 -42 38232 -43 35463 -44 33703 -45 30557 -46 29639 -47 27000 -48 25447 -49 23186 -50 22093 -51 20412 -52 19844 -53 18261 -54 17561 -55 16499 -56 15597 -57 14558 -58 14372 -59 13445 -60 13514 -61 12058 -62 11145 -63 10767 -64 10370 -65 9630 -66 9337 -67 8881 -68 8727 -69 8060 -70 7994 -71 7740 -72 7189 -73 6729 -74 6749 -75 6548 -76 6321 -77 5957 -78 5740 -79 5407 -80 5370 -81 5035 -82 4921 -83 4656 -84 4600 -85 4519 -86 4277 -87 4023 -88 3939 -89 3910 -90 3861 -91 3560 -92 3483 -93 3406 -94 3346 -95 3229 -96 3122 -97 3086 -98 3001 -99 2884 -100 2822 -101 2677 -102 2670 -103 2610 -104 2452 -105 2446 -106 2400 -107 2300 -108 2316 -109 2196 -110 2089 -111 2083 -112 2041 -113 1881 -114 1838 -115 1896 -116 1795 -117 1786 -118 1743 -119 1765 -120 1750 -121 1683 -122 1563 -123 1499 -124 1513 -125 1462 -126 1388 -127 1441 -128 1417 -129 1392 -130 1306 -131 1321 -132 1274 -133 1294 -134 1240 -135 1126 -136 1157 -137 1130 -138 1084 -139 1130 -140 1083 -141 1040 -142 980 -143 1031 -144 974 -145 980 -146 932 -147 898 -148 960 -149 907 -150 852 -151 912 -152 859 -153 847 -154 876 -155 792 -156 791 -157 765 -158 788 -159 787 -160 744 -161 673 -162 683 -163 697 -164 666 -165 680 -166 632 -167 677 -168 657 -169 618 -170 587 -171 585 -172 567 -173 549 -174 562 -175 548 -176 542 -177 539 -178 542 -179 549 -180 547 -181 526 -182 525 -183 514 -184 512 -185 505 -186 515 -187 467 -188 475 -189 458 -190 435 -191 443 -192 427 -193 424 -194 404 -195 389 -196 429 -197 404 -198 386 -199 351 -200 388 -201 408 -202 361 -203 346 -204 324 -205 361 -206 363 -207 364 -208 323 -209 336 -210 342 -211 315 -212 325 -213 328 -214 314 -215 327 -216 320 -217 300 -218 295 -219 315 -220 310 -221 295 -222 275 -223 248 -224 274 -225 232 -226 293 -227 259 -228 286 -229 263 -230 242 -231 214 -232 261 -233 231 -234 211 -235 250 -236 233 -237 206 -238 224 -239 210 -240 233 -241 223 -242 216 -243 222 -244 207 -245 212 -246 196 -247 205 -248 201 -249 202 -250 211 -251 201 -252 215 -253 179 -254 163 -255 179 -256 191 -257 188 -258 196 -259 150 -260 154 -261 176 -262 211 -263 166 -264 171 -265 165 -266 149 -267 182 -268 159 -269 161 -270 164 -271 161 -272 141 -273 151 -274 127 -275 129 -276 142 -277 158 -278 148 -279 135 -280 127 -281 134 -282 138 -283 131 -284 126 -285 125 -286 130 -287 126 -288 135 -289 125 -290 135 -291 131 -292 95 -293 135 -294 106 -295 117 -296 136 -297 128 -298 128 -299 118 -300 109 -301 112 -302 117 -303 108 -304 120 -305 100 -306 95 -307 108 -308 112 -309 77 -310 120 -311 104 -312 109 -313 89 -314 98 -315 82 -316 98 -317 93 -318 77 -319 93 -320 77 -321 98 -322 93 -323 86 -324 89 -325 73 -326 70 -327 71 -328 77 -329 87 -330 77 -331 93 -332 100 -333 83 -334 72 -335 74 -336 69 -337 77 -338 68 -339 78 -340 90 -341 98 -342 75 -343 80 -344 63 -345 71 -346 83 -347 66 -348 71 -349 70 -350 62 -351 62 -352 59 -353 63 -354 62 -355 52 -356 64 -357 64 -358 56 -359 49 -360 57 -361 63 -362 60 -363 68 -364 62 -365 55 -366 54 -367 40 -368 75 -369 70 -370 53 -371 58 -372 57 -373 55 -374 69 -375 57 -376 53 -377 43 -378 45 -379 47 -380 56 -381 51 -382 59 -383 51 -384 43 -385 34 -386 57 -387 49 -388 39 -389 46 -390 48 -391 43 -392 40 -393 54 -394 50 -395 41 -396 43 -397 33 -398 27 -399 49 -400 44 -401 44 -402 38 -403 30 -404 32 -405 37 -406 39 -407 42 -408 53 -409 39 -410 34 -411 31 -412 32 -413 52 -414 27 -415 41 -416 34 -417 36 -418 50 -419 35 -420 32 -421 33 -422 45 -423 35 -424 40 -425 29 -426 41 -427 40 -428 39 -429 32 -430 31 -431 34 -432 29 -433 27 -434 26 -435 22 -436 34 -437 28 -438 30 -439 38 -440 35 -441 36 -442 36 -443 27 -444 24 -445 33 -446 31 -447 25 -448 33 -449 27 -450 32 -451 46 -452 31 -453 35 -454 35 -455 34 -456 26 -457 21 -458 25 -459 26 -460 24 -461 27 -462 33 -463 30 -464 35 -465 21 -466 32 -467 19 -468 27 -469 16 -470 28 -471 26 -472 27 -473 26 -474 25 -475 25 -476 27 -477 20 -478 28 -479 22 -480 23 -481 16 -482 25 -483 27 -484 19 -485 23 -486 19 -487 15 -488 15 -489 23 -490 24 -491 19 -492 20 -493 18 -494 17 -495 30 -496 28 -497 20 -498 29 -499 17 -500 19 -501 21 -502 15 -503 24 -504 15 -505 19 -506 25 -507 16 -508 23 -509 26 -510 21 -511 15 -512 12 -513 16 -514 18 -515 24 -516 26 -517 18 -518 8 -519 25 -520 14 -521 8 -522 24 -523 20 -524 18 -525 15 -526 13 -527 17 -528 18 -529 22 -530 21 -531 9 -532 16 -533 17 -534 13 -535 17 -536 15 -537 13 -538 20 -539 13 -540 19 -541 29 -542 10 -543 8 -544 18 -545 13 -546 9 -547 18 -548 10 -549 18 -550 18 -551 9 -552 9 -553 15 -554 13 -555 15 -556 14 -557 14 -558 18 -559 8 -560 13 -561 9 -562 7 -563 12 -564 6 -565 9 -566 9 -567 18 -568 9 -569 10 -570 13 -571 14 -572 13 -573 21 -574 8 -575 16 -576 12 -577 9 -578 16 -579 17 -580 22 -581 6 -582 14 -583 13 -584 15 -585 11 -586 13 -587 5 -588 12 -589 13 -590 15 -591 13 -592 15 -593 12 -594 7 -595 18 -596 12 -597 13 -598 13 -599 13 -600 12 -601 12 -602 10 -603 11 -604 6 -605 6 -606 2 -607 9 -608 8 -609 12 -610 9 -611 12 -612 13 -613 12 -614 14 -615 9 -616 8 -617 9 -618 14 -619 13 -620 12 -621 6 -622 8 -623 8 -624 8 -625 12 -626 8 -627 7 -628 5 -629 8 -630 12 -631 6 -632 10 -633 10 -634 7 -635 8 -636 9 -637 6 -638 9 -639 4 -640 12 -641 4 -642 3 -643 11 -644 10 -645 6 -646 12 -647 12 -648 4 -649 4 -650 9 -651 8 -652 6 -653 5 -654 14 -655 10 -656 11 -657 8 -658 5 -659 5 -660 9 -661 13 -662 4 -663 5 -664 9 -665 11 -666 12 -667 7 -668 13 -669 2 -670 1 -671 7 -672 7 -673 7 -674 10 -675 9 -676 6 -677 5 -678 7 -679 6 -680 3 -681 3 -682 4 -683 9 -684 8 -685 5 -686 3 -687 11 -688 9 -689 2 -690 6 -691 5 -692 9 -693 5 -694 6 -695 5 -696 9 -697 8 -698 3 -699 7 -700 5 -701 9 -702 8 -703 7 -704 2 -705 3 -706 7 -707 6 -708 6 -709 10 -710 2 -711 10 -712 6 -713 7 -714 5 -715 6 -716 4 -717 6 -718 8 -719 4 -720 6 -721 7 -722 5 -723 7 -724 3 -725 10 -726 10 -727 3 -728 7 -729 7 -730 5 -731 2 -732 1 -733 5 -734 1 -735 5 -736 6 -737 2 -738 2 -739 3 -740 7 -741 2 -742 7 -743 4 -744 5 -745 4 -746 5 -747 3 -748 1 -749 4 -750 4 -751 2 -752 4 -753 6 -754 6 -755 6 -756 3 -757 2 -758 5 -759 5 -760 3 -761 4 -762 2 -763 1 -764 8 -765 3 -766 4 -767 3 -768 1 -769 5 -770 3 -771 3 -772 4 -773 4 -774 1 -775 3 -776 2 -777 2 -778 3 -779 3 -780 1 -781 4 -782 3 -783 4 -784 6 -785 3 -786 5 -787 4 -788 2 -789 4 -790 5 -791 4 -792 6 -794 4 -795 1 -796 1 -797 4 -798 2 -799 3 -800 3 -801 1 -802 5 -803 5 -804 3 -805 3 -806 3 -807 4 -808 4 -809 2 -811 5 -812 4 -813 6 -814 3 -815 2 -816 2 -817 3 -818 5 -819 3 -820 1 -821 1 -822 4 -823 3 -824 4 -825 8 -826 3 -827 5 -828 5 -829 3 -830 6 -831 3 -832 4 -833 8 -834 5 -835 3 -836 3 -837 2 -838 4 -839 2 -840 1 -841 3 -842 2 -843 1 -844 3 -846 4 -847 4 -848 3 -849 3 -850 2 -851 3 -853 1 -854 4 -855 4 -856 2 -857 4 -858 1 -859 2 -860 5 -861 1 -862 1 -863 4 -864 2 -865 2 -867 5 -868 1 -869 4 -870 1 -871 1 -872 1 -873 2 -875 5 -876 3 -877 1 -878 3 -879 3 -880 3 -881 2 -882 1 -883 6 -884 2 -885 2 -886 1 -887 1 -888 3 -889 2 -890 2 -891 3 -892 1 -893 3 -894 1 -895 5 -896 1 -897 3 -899 2 -900 2 -902 1 -903 2 -904 4 -905 4 -906 3 -907 1 -908 1 -909 2 -910 5 -911 2 -912 3 -914 1 -915 1 -916 2 -918 2 -919 2 -920 4 -921 4 -922 1 -923 1 -924 4 -925 5 -926 1 -928 2 -929 1 -930 1 -931 1 -932 1 -933 1 -934 2 -935 1 -936 1 -937 1 -938 2 -939 1 -941 1 -942 4 -944 2 -945 2 -946 2 -947 1 -948 1 -950 1 -951 2 -953 1 -954 2 -955 1 -956 1 -957 2 -958 1 -960 3 -962 4 -963 1 -964 1 -965 3 -966 2 -967 2 -968 1 -969 3 -970 3 -972 1 -974 4 -975 3 -976 3 -977 2 -979 2 -980 1 -981 1 -983 5 -984 1 -985 3 -986 1 -987 2 -988 4 -989 2 -991 2 -992 2 -993 1 -994 1 -996 2 -997 2 -998 1 -999 3 -1000 2 -1001 1 -1002 3 -1003 3 -1004 2 -1005 3 -1006 1 -1007 2 -1009 1 -1011 1 -1013 3 -1014 1 -1016 2 -1017 1 -1018 1 -1019 1 -1020 4 -1021 1 -1022 2 -1025 1 -1026 1 -1027 2 -1028 1 -1030 1 -1031 2 -1032 4 -1034 3 -1035 2 -1036 1 -1038 1 -1039 1 -1040 1 -1041 1 -1042 2 -1043 1 -1044 2 -1045 4 -1048 1 -1050 1 -1051 1 -1052 2 -1054 1 -1055 3 -1056 2 -1057 1 -1059 1 -1061 2 -1063 1 -1064 1 -1065 1 -1066 1 -1067 1 -1068 1 -1069 2 -1074 1 -1075 1 -1077 1 -1078 1 -1079 1 -1082 1 -1085 1 -1088 1 -1090 1 -1091 1 -1092 2 -1094 2 -1097 2 -1098 1 -1099 2 -1101 2 -1102 1 -1104 1 -1105 1 -1107 1 -1109 1 -1111 2 -1112 1 -1114 2 -1115 2 -1116 2 -1117 1 -1118 1 -1119 1 -1120 1 -1122 1 -1123 1 -1127 1 -1128 3 -1132 2 -1138 3 -1142 1 -1145 4 -1150 1 -1153 2 -1154 1 -1158 1 -1159 1 -1163 1 -1165 1 -1169 2 -1174 1 -1176 1 -1177 1 -1178 2 -1179 1 -1180 2 -1181 1 -1182 1 -1183 2 -1185 1 -1187 1 -1191 2 -1193 1 -1195 3 -1196 1 -1201 3 -1203 1 -1206 1 -1210 1 -1213 1 -1214 1 -1215 2 -1218 1 -1220 1 -1221 1 -1225 1 -1226 1 -1233 2 -1241 1 -1243 1 -1249 1 -1250 2 -1251 1 -1254 1 -1255 2 -1260 1 -1268 1 -1270 1 -1273 1 -1274 1 -1277 1 -1284 1 -1287 1 -1291 1 -1292 2 -1294 1 -1295 2 -1297 1 -1298 1 -1301 1 -1307 1 -1308 3 -1311 2 -1313 1 -1316 1 -1321 1 -1324 1 -1325 1 -1330 1 -1333 1 -1334 1 -1338 2 -1340 1 -1341 1 -1342 1 -1343 1 -1345 1 -1355 1 -1357 1 -1360 2 -1375 1 -1376 1 -1380 1 -1383 1 -1387 1 -1389 1 -1393 1 -1394 1 -1396 1 -1398 1 -1410 1 -1414 1 -1419 1 -1425 1 -1434 1 -1435 1 -1438 1 -1439 1 -1447 1 -1455 2 -1460 1 -1461 1 -1463 1 -1466 1 -1470 1 -1473 1 -1478 1 -1480 1 -1483 1 -1484 1 -1485 2 -1492 2 -1499 1 -1509 1 -1512 1 -1513 1 -1523 1 -1524 1 -1525 2 -1529 1 -1539 1 -1544 1 -1568 1 -1584 1 -1591 1 -1598 1 -1600 1 -1604 1 -1614 1 -1617 1 -1621 1 -1622 1 -1626 1 -1638 1 -1648 1 -1658 1 -1661 1 -1679 1 -1682 1 -1693 1 -1700 1 -1705 1 -1707 1 -1722 1 -1728 1 -1758 1 -1762 1 -1763 1 -1775 1 -1776 1 -1801 1 -1810 1 -1812 1 -1827 1 -1834 1 -1846 1 -1847 1 -1848 1 -1851 1 -1862 1 -1866 1 -1877 2 -1884 1 -1888 1 -1903 1 -1912 1 -1925 1 -1938 1 -1955 1 -1998 1 -2054 1 -2058 1 -2065 1 -2069 1 -2076 1 -2089 1 -2104 1 -2111 1 -2133 1 -2138 1 -2156 1 -2204 1 -2212 1 -2237 1 -2246 2 -2298 1 -2304 1 -2360 1 -2400 1 -2481 1 -2544 1 -2586 1 -2622 1 -2666 1 -2682 1 -2725 1 -2920 1 -3997 1 -4019 1 -5211 1 -12 19 -14 1 -16 401 -18 2 -20 421 -22 557 -24 625 -26 50 -28 4481 -30 52 -32 550 -34 5840 -36 4644 -38 87 -40 5794 -41 33 -42 571 -44 11805 -46 4711 -47 7 -48 597 -49 12 -50 678 -51 2 -52 14715 -53 3 -54 7322 -55 3 -56 508 -57 39 -58 3486 -59 11 -60 8974 -61 45 -62 1276 -63 4 -64 15693 -65 15 -66 657 -67 13 -68 6409 -69 10 -70 3188 -71 25 -72 1889 -73 27 -74 10370 -75 9 -76 12432 -77 23 -78 520 -79 15 -80 1534 -81 29 -82 2944 -83 23 -84 12071 -85 36 -86 1502 -87 10 -88 10978 -89 11 -90 889 -91 16 -92 4571 -93 17 -94 7855 -95 21 -96 2271 -97 33 -98 1423 -99 15 -100 11096 -101 21 -102 4082 -103 13 -104 5442 -105 25 -106 2113 -107 26 -108 3779 -109 43 -110 1294 -111 29 -112 7860 -113 29 -114 4965 -115 22 -116 7898 -117 25 -118 1772 -119 28 -120 1149 -121 38 -122 1483 -123 32 -124 10572 -125 25 -126 1147 -127 31 -128 1699 -129 22 -130 5533 -131 22 -132 4669 -133 34 -134 3777 -135 10 -136 5412 -137 21 -138 855 -139 26 -140 2485 -141 46 -142 1970 -143 27 -144 6565 -145 40 -146 933 -147 15 -148 7923 -149 16 -150 735 -151 23 -152 1111 -153 33 -154 3714 -155 27 -156 2445 -157 30 -158 3367 -159 10 -160 4646 -161 27 -162 990 -163 23 -164 5679 -165 25 -166 2186 -167 17 -168 899 -169 32 -170 1034 -171 22 -172 6185 -173 32 -174 2685 -175 17 -176 1354 -177 38 -178 1460 -179 15 -180 3478 -181 20 -182 958 -183 20 -184 6055 -185 23 -186 2180 -187 15 -188 1416 -189 30 -190 1284 -191 22 -192 1341 -193 21 -194 2413 -195 18 -196 4984 -197 13 -198 830 -199 22 -200 1834 -201 19 -202 2238 -203 9 -204 3050 -205 22 -206 616 -207 17 -208 2892 -209 22 -210 711 -211 30 -212 2631 -213 19 -214 3341 -215 21 -216 987 -217 26 -218 823 -219 9 -220 3588 -221 20 -222 692 -223 7 -224 2925 -225 31 -226 1075 -227 16 -228 2909 -229 18 -230 673 -231 20 -232 2215 -233 14 -234 1584 -235 21 -236 1292 -237 29 -238 1647 -239 25 -240 1014 -241 30 -242 1648 -243 19 -244 4465 -245 10 -246 787 -247 11 -248 480 -249 25 -250 842 -251 15 -252 1219 -253 23 -254 1508 -255 8 -256 3525 -257 16 -258 490 -259 12 -260 1678 -261 14 -262 822 -263 16 -264 1729 -265 28 -266 604 -267 11 -268 2572 -269 7 -270 1242 -271 15 -272 725 -273 18 -274 1983 -275 13 -276 1662 -277 19 -278 491 -279 12 -280 1586 -281 14 -282 563 -283 10 -284 2363 -285 10 -286 656 -287 14 -288 725 -289 28 -290 871 -291 9 -292 2606 -293 12 -294 961 -295 9 -296 478 -297 13 -298 1252 -299 10 -300 736 -301 19 -302 466 -303 13 -304 2254 -305 12 -306 486 -307 14 -308 1145 -309 13 -310 955 -311 13 -312 1235 -313 13 -314 931 -315 14 -316 1768 -317 11 -318 330 -319 10 -320 539 -321 23 -322 570 -323 12 -324 1789 -325 13 -326 884 -327 5 -328 1422 -329 14 -330 317 -331 11 -332 509 -333 13 -334 1062 -335 12 -336 577 -337 27 -338 378 -339 10 -340 2313 -341 9 -342 391 -343 13 -344 894 -345 17 -346 664 -347 9 -348 453 -349 6 -350 363 -351 15 -352 1115 -353 13 -354 1054 -355 8 -356 1108 -357 12 -358 354 -359 7 -360 363 -361 16 -362 344 -363 11 -364 1734 -365 12 -366 265 -367 10 -368 969 -369 16 -370 316 -371 12 -372 757 -373 7 -374 563 -375 15 -376 857 -377 9 -378 469 -379 9 -380 385 -381 12 -382 921 -383 15 -384 764 -385 14 -386 246 -387 6 -388 1108 -389 14 -390 230 -391 8 -392 266 -393 11 -394 641 -395 8 -396 719 -397 9 -398 243 -399 4 -400 1108 -401 7 -402 229 -403 7 -404 903 -405 7 -406 257 -407 12 -408 244 -409 3 -410 541 -411 6 -412 744 -413 8 -414 419 -415 8 -416 388 -417 19 -418 470 -419 14 -420 612 -421 6 -422 342 -423 3 -424 1179 -425 3 -426 116 -427 14 -428 207 -429 6 -430 255 -431 4 -432 288 -433 12 -434 343 -435 6 -436 1015 -437 3 -438 538 -439 10 -440 194 -441 6 -442 188 -443 15 -444 524 -445 7 -446 214 -447 7 -448 574 -449 6 -450 214 -451 5 -452 635 -453 9 -454 464 -455 5 -456 205 -457 9 -458 163 -459 2 -460 558 -461 4 -462 171 -463 14 -464 444 -465 11 -466 543 -467 5 -468 388 -469 6 -470 141 -471 4 -472 647 -473 3 -474 210 -475 4 -476 193 -477 7 -478 195 -479 7 -480 443 -481 10 -482 198 -483 3 -484 816 -485 6 -486 128 -487 9 -488 215 -489 9 -490 328 -491 7 -492 158 -493 11 -494 335 -495 8 -496 435 -497 6 -498 174 -499 1 -500 373 -501 5 -502 140 -503 7 -504 330 -505 9 -506 149 -507 5 -508 642 -509 3 -510 179 -511 3 -512 159 -513 8 -514 204 -515 7 -516 306 -517 4 -518 110 -519 5 -520 326 -521 6 -522 305 -523 6 -524 294 -525 7 -526 268 -527 5 -528 149 -529 4 -530 133 -531 2 -532 513 -533 10 -534 116 -535 5 -536 258 -537 4 -538 113 -539 4 -540 138 -541 6 -542 116 -544 485 -545 4 -546 93 -547 9 -548 299 -549 3 -550 256 -551 6 -552 92 -553 3 -554 175 -555 6 -556 253 -557 7 -558 95 -559 2 -560 128 -561 4 -562 206 -563 2 -564 465 -565 3 -566 69 -567 3 -568 157 -569 7 -570 97 -571 8 -572 118 -573 5 -574 130 -575 4 -576 301 -577 6 -578 177 -579 2 -580 397 -581 3 -582 80 -583 1 -584 128 -585 5 -586 52 -587 2 -588 72 -589 1 -590 84 -591 6 -592 323 -593 11 -594 77 -595 5 -596 205 -597 1 -598 244 -599 4 -600 69 -601 3 -602 89 -603 5 -604 254 -605 6 -606 147 -607 3 -608 83 -609 3 -610 77 -611 3 -612 194 -613 1 -614 98 -615 3 -616 243 -617 3 -618 50 -619 8 -620 188 -621 4 -622 67 -623 4 -624 123 -625 2 -626 50 -627 1 -628 239 -629 2 -630 51 -631 4 -632 65 -633 5 -634 188 -636 81 -637 3 -638 46 -639 3 -640 103 -641 1 -642 136 -643 3 -644 188 -645 3 -646 58 -648 122 -649 4 -650 47 -651 2 -652 155 -653 4 -654 71 -655 1 -656 71 -657 3 -658 50 -659 2 -660 177 -661 5 -662 66 -663 2 -664 183 -665 3 -666 50 -667 2 -668 53 -669 2 -670 115 -672 66 -673 2 -674 47 -675 1 -676 197 -677 2 -678 46 -679 3 -680 95 -681 3 -682 46 -683 3 -684 107 -685 1 -686 86 -687 2 -688 158 -689 4 -690 51 -691 1 -692 80 -694 56 -695 4 -696 40 -698 43 -699 3 -700 95 -701 2 -702 51 -703 2 -704 133 -705 1 -706 100 -707 2 -708 121 -709 2 -710 15 -711 3 -712 35 -713 2 -714 20 -715 3 -716 37 -717 2 -718 78 -720 55 -721 1 -722 42 -723 2 -724 218 -725 3 -726 23 -727 2 -728 26 -729 1 -730 64 -731 2 -732 65 -734 24 -735 2 -736 53 -737 1 -738 32 -739 1 -740 60 -742 81 -743 1 -744 77 -745 1 -746 47 -747 1 -748 62 -749 1 -750 19 -751 1 -752 86 -753 3 -754 40 -756 55 -757 2 -758 38 -759 1 -760 101 -761 1 -762 22 -764 67 -765 2 -766 35 -767 1 -768 38 -769 1 -770 22 -771 1 -772 82 -773 1 -774 73 -776 29 -777 1 -778 55 -780 23 -781 1 -782 16 -784 84 -785 3 -786 28 -788 59 -789 1 -790 33 -791 3 -792 24 -794 13 -795 1 -796 110 -797 2 -798 15 -800 22 -801 3 -802 29 -803 1 -804 87 -806 21 -808 29 -810 48 -812 28 -813 1 -814 58 -815 1 -816 48 -817 1 -818 31 -819 1 -820 66 -822 17 -823 2 -824 58 -826 10 -827 2 -828 25 -829 1 -830 29 -831 1 -832 63 -833 1 -834 26 -835 3 -836 52 -837 1 -838 18 -840 27 -841 2 -842 12 -843 1 -844 83 -845 1 -846 7 -847 1 -848 10 -850 26 -852 25 -853 1 -854 15 -856 27 -858 32 -859 1 -860 15 -862 43 -864 32 -865 1 -866 6 -868 39 -870 11 -872 25 -873 1 -874 10 -875 1 -876 20 -877 2 -878 19 -879 1 -880 30 -882 11 -884 53 -886 25 -887 1 -888 28 -890 6 -892 36 -894 10 -896 13 -898 14 -900 31 -902 14 -903 2 -904 43 -906 25 -908 9 -910 11 -911 1 -912 16 -913 1 -914 24 -916 27 -918 6 -920 15 -922 27 -923 1 -924 23 -926 13 -928 42 -929 1 -930 3 -932 27 -934 17 -936 8 -937 1 -938 11 -940 33 -942 4 -943 1 -944 18 -946 15 -948 13 -950 18 -952 12 -954 11 -956 21 -958 10 -960 13 -962 5 -964 32 -966 13 -968 8 -970 8 -971 1 -972 23 -973 2 -974 12 -975 1 -976 22 -978 7 -979 1 -980 14 -982 8 -984 22 -985 1 -986 6 -988 17 -989 1 -990 6 -992 13 -994 19 -996 11 -998 4 -1000 9 -1002 2 -1004 14 -1006 5 -1008 3 -1010 9 -1012 29 -1014 6 -1016 22 -1017 1 -1018 8 -1019 1 -1020 7 -1022 6 -1023 1 -1024 10 -1026 2 -1028 8 -1030 11 -1031 2 -1032 8 -1034 9 -1036 13 -1038 12 -1040 12 -1042 3 -1044 12 -1046 3 -1048 11 -1050 2 -1051 1 -1052 2 -1054 11 -1056 6 -1058 8 -1059 1 -1060 23 -1062 6 -1063 1 -1064 8 -1066 3 -1068 6 -1070 8 -1071 1 -1072 5 -1074 3 -1076 5 -1078 3 -1080 11 -1081 1 -1082 7 -1084 18 -1086 4 -1087 1 -1088 3 -1090 3 -1092 7 -1094 3 -1096 12 -1098 6 -1099 1 -1100 2 -1102 6 -1104 14 -1106 3 -1108 6 -1110 5 -1112 2 -1114 8 -1116 3 -1118 3 -1120 7 -1122 10 -1124 6 -1126 8 -1128 1 -1130 4 -1132 3 -1134 2 -1136 5 -1138 5 -1140 8 -1142 3 -1144 7 -1146 3 -1148 11 -1150 1 -1152 5 -1154 1 -1156 5 -1158 1 -1160 5 -1162 3 -1164 6 -1165 1 -1166 1 -1168 4 -1169 1 -1170 3 -1171 1 -1172 2 -1174 5 -1176 3 -1177 1 -1180 8 -1182 2 -1184 4 -1186 2 -1188 3 -1190 2 -1192 5 -1194 6 -1196 1 -1198 2 -1200 2 -1204 10 -1206 2 -1208 9 -1210 1 -1214 6 -1216 3 -1218 4 -1220 9 -1221 2 -1222 1 -1224 5 -1226 4 -1228 8 -1230 1 -1232 1 -1234 3 -1236 5 -1240 3 -1242 1 -1244 3 -1245 1 -1246 4 -1248 6 -1250 2 -1252 7 -1256 3 -1258 2 -1260 2 -1262 3 -1264 4 -1265 1 -1266 1 -1270 1 -1271 1 -1272 2 -1274 3 -1276 3 -1278 1 -1280 3 -1284 1 -1286 1 -1290 1 -1292 3 -1294 1 -1296 7 -1300 2 -1302 4 -1304 3 -1306 2 -1308 2 -1312 1 -1314 1 -1316 3 -1318 2 -1320 1 -1324 8 -1326 1 -1330 1 -1331 1 -1336 2 -1338 1 -1340 3 -1341 1 -1344 1 -1346 2 -1347 1 -1348 3 -1352 1 -1354 2 -1356 1 -1358 1 -1360 3 -1362 1 -1364 4 -1366 1 -1370 1 -1372 3 -1380 2 -1384 2 -1388 2 -1390 2 -1392 2 -1394 1 -1396 1 -1398 1 -1400 2 -1402 1 -1404 1 -1406 1 -1410 1 -1412 5 -1418 1 -1420 1 -1424 1 -1432 2 -1434 2 -1442 3 -1444 5 -1448 1 -1454 1 -1456 1 -1460 3 -1462 4 -1468 1 -1474 1 -1476 1 -1478 2 -1480 1 -1486 2 -1488 1 -1492 1 -1496 1 -1500 3 -1503 1 -1506 1 -1512 2 -1516 1 -1522 1 -1524 2 -1534 4 -1536 1 -1538 1 -1540 2 -1544 2 -1548 1 -1556 1 -1560 1 -1562 1 -1564 2 -1566 1 -1568 1 -1570 1 -1572 1 -1576 1 -1590 1 -1594 1 -1604 1 -1608 1 -1614 1 -1622 1 -1624 2 -1628 1 -1629 1 -1636 1 -1642 1 -1654 2 -1660 1 -1664 1 -1670 1 -1684 4 -1698 1 -1732 3 -1742 1 -1752 1 -1760 1 -1764 1 -1772 2 -1798 1 -1808 1 -1820 1 -1852 1 -1856 1 -1874 1 -1902 1 -1908 1 -1952 1 -2004 1 -2018 1 -2020 1 -2028 1 -2174 1 -2233 1 -2244 1 -2280 1 -2290 1 -2352 1 -2604 1 -4190 1 diff --git a/ppocr/utils/network.py b/ppocr/utils/network.py deleted file mode 100644 index 453abb693d4c0ed370c1031b677d5bf51661add9..0000000000000000000000000000000000000000 --- a/ppocr/utils/network.py +++ /dev/null @@ -1,82 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import tarfile -import requests -from tqdm import tqdm - -from ppocr.utils.logging import get_logger - - -def download_with_progressbar(url, save_path): - logger = get_logger() - response = requests.get(url, stream=True) - total_size_in_bytes = int(response.headers.get('content-length', 0)) - block_size = 1024 # 1 Kibibyte - progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) - with open(save_path, 'wb') as file: - for data in response.iter_content(block_size): - progress_bar.update(len(data)) - file.write(data) - progress_bar.close() - if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes: - logger.error("Something went wrong while downloading models") - sys.exit(0) - - -def maybe_download(model_storage_directory, url): - # using custom model - tar_file_name_list = [ - 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel' - ] - if not os.path.exists( - os.path.join(model_storage_directory, 'inference.pdiparams') - ) or not os.path.exists( - os.path.join(model_storage_directory, 'inference.pdmodel')): - assert url.endswith('.tar'), 'Only supports tar compressed package' - tmp_path = os.path.join(model_storage_directory, url.split('/')[-1]) - print('download {} to {}'.format(url, tmp_path)) - os.makedirs(model_storage_directory, exist_ok=True) - download_with_progressbar(url, tmp_path) - with tarfile.open(tmp_path, 'r') as tarObj: - for member in tarObj.getmembers(): - filename = None - for tar_file_name in tar_file_name_list: - if tar_file_name in member.name: - filename = tar_file_name - if filename is None: - continue - file = tarObj.extractfile(member) - with open( - os.path.join(model_storage_directory, filename), - 'wb') as f: - f.write(file.read()) - os.remove(tmp_path) - - -def is_link(s): - return s is not None and s.startswith('http') - - -def confirm_model_dir_url(model_dir, default_model_dir, default_url): - url = default_url - if model_dir is None or is_link(model_dir): - if is_link(model_dir): - url = model_dir - file_name = url.split('/')[-1][:-4] - model_dir = default_model_dir - model_dir = os.path.join(model_dir, file_name) - return model_dir, url diff --git a/ppstructure/MANIFEST.in b/ppstructure/MANIFEST.in deleted file mode 100644 index 2961e722b7cebe8e1912be2dd903fcdecb694019..0000000000000000000000000000000000000000 --- a/ppstructure/MANIFEST.in +++ /dev/null @@ -1,9 +0,0 @@ -include LICENSE -include README.md - -recursive-include ppocr/utils *.txt utility.py logging.py network.py -recursive-include ppocr/data/ *.py -recursive-include ppocr/postprocess *.py -recursive-include tools/infer *.py -recursive-include ppstructure *.py - diff --git a/ppstructure/README_ch.md b/ppstructure/README_ch.md index 22505ad83c6dc58adf472f3db94cbf608b9bbd01..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/ppstructure/README_ch.md +++ b/ppstructure/README_ch.md @@ -1,30 +0,0 @@ -# TableStructurer - -1. 代码使用 -```python -import cv2 -from paddlestructure import PaddleStructure,draw_result - -table_engine = PaddleStructure( - output='./output/table', - show_log=True) - -img_path = '../doc/table/1.png' -img = cv2.imread(img_path) -result = table_engine(img) -for line in result: - print(line) - -from PIL import Image - -font_path = 'path/tp/PaddleOCR/doc/fonts/simfang.ttf' -image = Image.open(img_path).convert('RGB') -im_show = draw_result(image, result,font_path=font_path) -im_show = Image.fromarray(im_show) -im_show.save('result.jpg') -``` - -2. 命令行使用 -```bash -paddlestructure --image_dir=../doc/table/1.png -``` diff --git a/ppstructure/__init__.py b/ppstructure/__init__.py deleted file mode 100644 index 7055bee443fb86648b80bcb892778a114bc47d71..0000000000000000000000000000000000000000 --- a/ppstructure/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .paddlestructure import PaddleStructure, draw_result, to_excel - -__all__ = ['PaddleStructure', 'draw_result', 'to_excel'] diff --git a/ppstructure/layout/README.md b/ppstructure/layout/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ppstructure/layout/README_ch.md b/ppstructure/layout/README_ch.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ppstructure/paddlestructure.py b/ppstructure/paddlestructure.py deleted file mode 100644 index 57a53d6496f66771f1f6f7628751b4f0ac0fc3b5..0000000000000000000000000000000000000000 --- a/ppstructure/paddlestructure.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import os -import sys - -__dir__ = os.path.dirname(__file__) -sys.path.append(__dir__) -sys.path.append(os.path.join(__dir__, '..')) - -import cv2 -import numpy as np -from pathlib import Path - -from ppocr.utils.logging import get_logger -from ppstructure.predict_system import OCRSystem, save_res -from ppstructure.table.predict_table import to_excel -from ppstructure.utility import init_args, draw_result - -logger = get_logger() -from ppocr.utils.utility import check_and_read_gif, get_image_file_list -from ppocr.utils.network import maybe_download, download_with_progressbar, confirm_model_dir_url, is_link - -__all__ = ['PaddleStructure', 'draw_result', 'to_excel'] - -VERSION = '2.1' -BASE_DIR = os.path.expanduser("~/.paddlestructure/") - -model_urls = { - 'det': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar', - 'rec': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar', - 'structure': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar' - -} - - -def parse_args(mMain=True): - import argparse - parser = init_args() - parser.add_help = mMain - - for action in parser._actions: - if action.dest in ['rec_char_dict_path', 'structure_char_dict_path']: - action.default = None - if mMain: - return parser.parse_args() - else: - inference_args_dict = {} - for action in parser._actions: - inference_args_dict[action.dest] = action.default - return argparse.Namespace(**inference_args_dict) - - -class PaddleStructure(OCRSystem): - def __init__(self, **kwargs): - params = parse_args(mMain=False) - params.__dict__.update(**kwargs) - if params.show_log: - logger.setLevel(logging.DEBUG) - params.use_angle_cls = False - # init model dir - params.det_model_dir, det_url = confirm_model_dir_url(params.det_model_dir, - os.path.join(BASE_DIR, VERSION, 'det'), - model_urls['det']) - params.rec_model_dir, rec_url = confirm_model_dir_url(params.rec_model_dir, - os.path.join(BASE_DIR, VERSION, 'rec'), - model_urls['rec']) - params.structure_model_dir, structure_url = confirm_model_dir_url(params.structure_model_dir, - os.path.join(BASE_DIR, VERSION, 'structure'), - model_urls['structure']) - # download model - maybe_download(params.det_model_dir, det_url) - maybe_download(params.rec_model_dir, rec_url) - maybe_download(params.structure_model_dir, structure_url) - - if params.rec_char_dict_path is None: - params.rec_char_type = 'EN' - if os.path.exists(str(Path(__file__).parent / 'ppocr/utils/dict/table_dict.txt')): - params.rec_char_dict_path = str(Path(__file__).parent / 'ppocr/utils/dict/table_dict.txt') - else: - params.rec_char_dict_path = str(Path(__file__).parent.parent / 'ppocr/utils/dict/table_dict.txt') - if params.structure_char_dict_path is None: - if os.path.exists(str(Path(__file__).parent / 'ppocr/utils/dict/table_structure_dict.txt')): - params.structure_char_dict_path = str( - Path(__file__).parent / 'ppocr/utils/dict/table_structure_dict.txt') - else: - params.structure_char_dict_path = str( - Path(__file__).parent.parent / 'ppocr/utils/dict/table_structure_dict.txt') - - print(params) - super().__init__(params) - - def __call__(self, img): - if isinstance(img, str): - # download net image - if img.startswith('http'): - download_with_progressbar(img, 'tmp.jpg') - img = 'tmp.jpg' - image_file = img - img, flag = check_and_read_gif(image_file) - if not flag: - with open(image_file, 'rb') as f: - np_arr = np.frombuffer(f.read(), dtype=np.uint8) - img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR) - if img is None: - logger.error("error in loading image:{}".format(image_file)) - return None - if isinstance(img, np.ndarray) and len(img.shape) == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - - res = super().__call__(img) - return res - - -def main(): - # for cmd - args = parse_args(mMain=True) - image_dir = args.image_dir - save_folder = args.output - if image_dir.startswith('http'): - download_with_progressbar(image_dir, 'tmp.jpg') - image_file_list = ['tmp.jpg'] - else: - image_file_list = get_image_file_list(args.image_dir) - if len(image_file_list) == 0: - logger.error('no images find in {}'.format(args.image_dir)) - return - - structure_engine = PaddleStructure(**(args.__dict__)) - for img_path in image_file_list: - img_name = os.path.basename(img_path).split('.')[0] - logger.info('{}{}{}'.format('*' * 10, img_path, '*' * 10)) - result = structure_engine(img_path) - for item in result: - logger.info(item['res']) - save_res(result, save_folder, img_name) - logger.info('result save to {}'.format(os.path.join(save_folder, img_name))) \ No newline at end of file diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index 2cdfcce2eb3ad4abe4407f781eb99e3591ecebde..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -1,132 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import subprocess - -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) - -os.environ["FLAGS_allocator_strategy"] = 'auto_growth' -import cv2 -import numpy as np -import time - -import layoutparser as lp - -from ppocr.utils.utility import get_image_file_list, check_and_read_gif -from ppocr.utils.logging import get_logger -from tools.infer.predict_system import TextSystem -from ppstructure.table.predict_table import TableSystem, to_excel -from ppstructure.utility import parse_args,draw_result - -logger = get_logger() - - -class OCRSystem(object): - def __init__(self, args): - args.det_limit_type = 'resize_long' - args.drop_score = 0 - self.text_system = TextSystem(args) - self.table_system = TableSystem(args, self.text_system.text_detector, self.text_system.text_recognizer) - self.table_layout = lp.PaddleDetectionLayoutModel("lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config", - threshold=0.5, enable_mkldnn=args.enable_mkldnn, - enforce_cpu=not args.use_gpu, thread_num=args.cpu_threads) - self.use_angle_cls = args.use_angle_cls - self.drop_score = args.drop_score - - def __call__(self, img): - ori_im = img.copy() - layout_res = self.table_layout.detect(img[..., ::-1]) - res_list = [] - for region in layout_res: - x1, y1, x2, y2 = region.coordinates - x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) - roi_img = ori_im[y1:y2, x1:x2, :] - if region.type == 'Table': - res = self.table_system(roi_img) - elif region.type == 'Figure': - continue - else: - filter_boxes, filter_rec_res = self.text_system(roi_img) - filter_boxes = [x + [x1, y1] for x in filter_boxes] - filter_boxes = [x.reshape(-1).tolist() for x in filter_boxes] - - res = (filter_boxes, filter_rec_res) - res_list.append({'type': region.type, 'bbox': [x1, y1, x2, y2], 'res': res}) - return res_list - -def save_res(res, save_folder, img_name): - excel_save_folder = os.path.join(save_folder, img_name) - os.makedirs(excel_save_folder, exist_ok=True) - # save res - for region in res: - if region['type'] == 'Table': - excel_path = os.path.join(excel_save_folder, '{}.xlsx'.format(region['bbox'])) - to_excel(region['res'], excel_path) - elif region['type'] == 'Figure': - pass - else: - with open(os.path.join(excel_save_folder, 'res.txt'), 'a', encoding='utf8') as f: - for box, rec_res in zip(region['res'][0], region['res'][1]): - f.write('{}\t{}\n'.format(np.array(box).reshape(-1).tolist(), rec_res)) - - -def main(args): - image_file_list = get_image_file_list(args.image_dir) - image_file_list = image_file_list - image_file_list = image_file_list[args.process_id::args.total_process_num] - save_folder = args.output - os.makedirs(save_folder, exist_ok=True) - - structure_sys = OCRSystem(args) - img_num = len(image_file_list) - for i, image_file in enumerate(image_file_list): - logger.info("[{}/{}] {}".format(i, img_num, image_file)) - img, flag = check_and_read_gif(image_file) - img_name = os.path.basename(image_file).split('.')[0] - - if not flag: - img = cv2.imread(image_file) - if img is None: - logger.error("error in loading image:{}".format(image_file)) - continue - starttime = time.time() - res = structure_sys(img) - save_res(res, save_folder, img_name) - draw_img = draw_result(img,res, args.vis_font_path) - cv2.imwrite(os.path.join(save_folder, img_name, 'show.jpg'), draw_img) - logger.info('result save to {}'.format(os.path.join(save_folder, img_name))) - elapse = time.time() - starttime - logger.info("Predict time : {:.3f}s".format(elapse)) - - -if __name__ == "__main__": - args = parse_args() - if args.use_mp: - p_list = [] - total_process_num = args.total_process_num - for process_id in range(total_process_num): - cmd = [sys.executable, "-u"] + sys.argv + [ - "--process_id={}".format(process_id), - "--use_mp={}".format(False) - ] - p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout) - p_list.append(p) - for p in p_list: - p.wait() - else: - main(args) diff --git a/ppstructure/setup.py b/ppstructure/setup.py deleted file mode 100644 index 8e68b2e44140f6ad5a13661349666d17cfe45524..0000000000000000000000000000000000000000 --- a/ppstructure/setup.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -from setuptools import setup -from io import open -import shutil - -with open('../requirements.txt', encoding="utf-8-sig") as f: - requirements = f.readlines() - requirements.append('tqdm') - requirements.append('layoutparser') - requirements.append('iopath') - - -def readme(): - with open('README_ch.md', encoding="utf-8-sig") as f: - README = f.read() - return README - - -shutil.copytree('../ppstructure/table', './ppstructure/table') -shutil.copyfile('../ppstructure/predict_system.py', './ppstructure/predict_system.py') -shutil.copyfile('../ppstructure/utility.py', './ppstructure/utility.py') -shutil.copytree('../ppocr', './ppocr') -shutil.copytree('../tools', './tools') -shutil.copyfile('../LICENSE', './LICENSE') - -setup( - name='paddlestructure', - packages=['paddlestructure'], - package_dir={'paddlestructure': ''}, - include_package_data=True, - entry_points={"console_scripts": ["paddlestructure= paddlestructure.paddlestructure:main"]}, - version='1.0', - install_requires=requirements, - license='Apache License 2.0', - description='Awesome OCR toolkits based on PaddlePaddle (8.6M ultra-lightweight pre-trained model, support training and deployment among server, mobile, embeded and IoT devices', - long_description=readme(), - long_description_content_type='text/markdown', - url='https://github.com/PaddlePaddle/PaddleOCR', - download_url='https://github.com/PaddlePaddle/PaddleOCR.git', - keywords=[ - 'ocr textdetection textrecognition paddleocr crnn east star-net rosetta ocrlite db chineseocr chinesetextdetection chinesetextrecognition' - ], - classifiers=[ - 'Intended Audience :: Developers', 'Operating System :: OS Independent', - 'Natural Language :: Chinese (Simplified)', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', 'Topic :: Utilities' - ], ) - -shutil.rmtree('ppocr') -shutil.rmtree('tools') -shutil.rmtree('ppstructure') -os.remove('LICENSE') diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md index 105231068a99eb6c012a125ba3fb65934c5d4ac6..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/ppstructure/table/README_ch.md +++ b/ppstructure/table/README_ch.md @@ -1,15 +0,0 @@ -# 表格结构和内容预测 - -先cd到PaddleOCR/ppstructure目录下 - -预测 -```python -python3 table/predict_table.py --det_model_dir=../inference/db --rec_model_dir=../inference/rec_mv3_large1.0/infer --table_model_dir=../inference/explite3/infer --image_dir=../table/imgs/PMC3006023_004_00.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --table_output ../output/table -``` -运行完成后,每张图片的excel表格会保存到table_output字段指定的目录下 - -评估 - -```python -python3 table/eval_table.py --det_model_dir=../inference/db --rec_model_dir=../inference/rec_mv3_large1.0/infer --table_model_dir=../inference/explite3/infer --image_dir=../table/imgs --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json -``` diff --git a/ppstructure/table/__init__.py b/ppstructure/table/__init__.py deleted file mode 100644 index 1d11e265597c7c8e39098a228108da3bb954b892..0000000000000000000000000000000000000000 --- a/ppstructure/table/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppstructure/table/eval_table.py b/ppstructure/table/eval_table.py deleted file mode 100755 index 1bcbaa8d0d0b2669828dc6b19c3370a30c522ede..0000000000000000000000000000000000000000 --- a/ppstructure/table/eval_table.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import sys -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) - -import cv2 -import json -from tqdm import tqdm -from ppstructure.table.table_metric import TEDS -from ppstructure.table.predict_table import TableSystem -from ppstructure.utility import init_args - - -def parse_args(): - parser = init_args() - parser.add_argument("--gt_path", type=str) - return parser.parse_args() - -def main(gt_path, img_root, args): - teds = TEDS(n_jobs=16) - - text_sys = TableSystem(args) - jsons_gt = json.load(open(gt_path)) # gt - pred_htmls = [] - gt_htmls = [] - for img_name in tqdm(jsons_gt): - # read image - img = cv2.imread(os.path.join(img_root,img_name)) - pred_html = text_sys(img) - pred_htmls.append(pred_html) - - gt_structures, gt_bboxes, gt_contents, contents_with_block = jsons_gt[img_name] - gt_html, gt = get_gt_html(gt_structures, contents_with_block) - gt_htmls.append(gt_html) - scores = teds.batch_evaluate_html(gt_htmls, pred_htmls) - print('teds:', sum(scores) / len(scores)) - - -def get_gt_html(gt_structures, contents_with_block): - end_html = [] - td_index = 0 - for tag in gt_structures: - if '' in tag: - if contents_with_block[td_index] != []: - end_html.extend(contents_with_block[td_index]) - end_html.append(tag) - td_index += 1 - else: - end_html.append(tag) - return ''.join(end_html), end_html - - -if __name__ == '__main__': - args = parse_args() - main(args.gt_path,args.image_dir, args) diff --git a/ppstructure/table/matcher.py b/ppstructure/table/matcher.py deleted file mode 100755 index c3b56384403f5fd92a8db4b4bb378a6d55e5a76c..0000000000000000000000000000000000000000 --- a/ppstructure/table/matcher.py +++ /dev/null @@ -1,192 +0,0 @@ -import json -def distance(box_1, box_2): - x1, y1, x2, y2 = box_1 - x3, y3, x4, y4 = box_2 - dis = abs(x3 - x1) + abs(y3 - y1) + abs(x4- x2) + abs(y4 - y2) - dis_2 = abs(x3 - x1) + abs(y3 - y1) - dis_3 = abs(x4- x2) + abs(y4 - y2) - return dis + min(dis_2, dis_3) - -def compute_iou(rec1, rec2): - """ - computing IoU - :param rec1: (y0, x0, y1, x1), which reflects - (top, left, bottom, right) - :param rec2: (y0, x0, y1, x1) - :return: scala value of IoU - """ - # computing area of each rectangles - S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1]) - S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1]) - - # computing the sum_area - sum_area = S_rec1 + S_rec2 - - # find the each edge of intersect rectangle - left_line = max(rec1[1], rec2[1]) - right_line = min(rec1[3], rec2[3]) - top_line = max(rec1[0], rec2[0]) - bottom_line = min(rec1[2], rec2[2]) - - # judge if there is an intersect - if left_line >= right_line or top_line >= bottom_line: - return 0.0 - else: - intersect = (right_line - left_line) * (bottom_line - top_line) - return (intersect / (sum_area - intersect))*1.0 - - - -def matcher_merge(ocr_bboxes, pred_bboxes): - all_dis = [] - ious = [] - matched = {} - for i, gt_box in enumerate(ocr_bboxes): - distances = [] - for j, pred_box in enumerate(pred_bboxes): - # compute l1 distence and IOU between two boxes - distances.append((distance(gt_box, pred_box), 1. - compute_iou(gt_box, pred_box))) - sorted_distances = distances.copy() - # select nearest cell - sorted_distances = sorted(sorted_distances, key = lambda item: (item[1], item[0])) - if distances.index(sorted_distances[0]) not in matched.keys(): - matched[distances.index(sorted_distances[0])] = [i] - else: - matched[distances.index(sorted_distances[0])].append(i) - return matched#, sum(ious) / len(ious) - -def complex_num(pred_bboxes): - complex_nums = [] - for bbox in pred_bboxes: - distances = [] - temp_ious = [] - for pred_bbox in pred_bboxes: - if bbox != pred_bbox: - distances.append(distance(bbox, pred_bbox)) - temp_ious.append(compute_iou(bbox, pred_bbox)) - complex_nums.append(temp_ious[distances.index(min(distances))]) - return sum(complex_nums) / len(complex_nums) - -def get_rows(pred_bboxes): - pre_bbox = pred_bboxes[0] - res = [] - step = 0 - for i in range(len(pred_bboxes)): - bbox = pred_bboxes[i] - if bbox[1] - pre_bbox[1] > 2 or bbox[0] - pre_bbox[0] < 0: - break - else: - res.append(bbox) - step += 1 - for i in range(step): - pred_bboxes.pop(0) - return res, pred_bboxes -def refine_rows(pred_bboxes): # 微调整行的框,使在一条水平线上 - ys_1 = [] - ys_2 = [] - for box in pred_bboxes: - ys_1.append(box[1]) - ys_2.append(box[3]) - min_y_1 = sum(ys_1) / len(ys_1) - min_y_2 = sum(ys_2) / len(ys_2) - re_boxes = [] - for box in pred_bboxes: - box[1] = min_y_1 - box[3] = min_y_2 - re_boxes.append(box) - return re_boxes - -def matcher_refine_row(gt_bboxes, pred_bboxes): - before_refine_pred_bboxes = pred_bboxes.copy() - pred_bboxes = [] - while(len(before_refine_pred_bboxes) != 0): - row_bboxes, before_refine_pred_bboxes = get_rows(before_refine_pred_bboxes) - print(row_bboxes) - pred_bboxes.extend(refine_rows(row_bboxes)) - all_dis = [] - ious = [] - matched = {} - for i, gt_box in enumerate(gt_bboxes): - distances = [] - #temp_ious = [] - for j, pred_box in enumerate(pred_bboxes): - distances.append(distance(gt_box, pred_box)) - #temp_ious.append(compute_iou(gt_box, pred_box)) - #all_dis.append(min(distances)) - #ious.append(temp_ious[distances.index(min(distances))]) - if distances.index(min(distances)) not in matched.keys(): - matched[distances.index(min(distances))] = [i] - else: - matched[distances.index(min(distances))].append(i) - return matched#, sum(ious) / len(ious) - - - -#先挑选出一行,再进行匹配 -def matcher_structure_1(gt_bboxes, pred_bboxes_rows, pred_bboxes): - gt_box_index = 0 - delete_gt_bboxes = gt_bboxes.copy() - match_bboxes_ready = [] - matched = {} - while(len(delete_gt_bboxes) != 0): - row_bboxes, delete_gt_bboxes = get_rows(delete_gt_bboxes) - row_bboxes = sorted(row_bboxes, key = lambda key: key[0]) - if len(pred_bboxes_rows) > 0: - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - print(row_bboxes) - for i, gt_box in enumerate(row_bboxes): - #print(gt_box) - pred_distances = [] - distances = [] - for pred_bbox in pred_bboxes: - pred_distances.append(distance(gt_box, pred_bbox)) - for j, pred_box in enumerate(match_bboxes_ready): - distances.append(distance(gt_box, pred_box)) - index = pred_distances.index(min(distances)) - #print('index', index) - if index not in matched.keys(): - matched[index] = [gt_box_index] - else: - matched[index].append(gt_box_index) - gt_box_index += 1 - return matched - -def matcher_structure(gt_bboxes, pred_bboxes_rows, pred_bboxes): - ''' - gt_bboxes: 排序后 - pred_bboxes: - ''' - pre_bbox = gt_bboxes[0] - matched = {} - match_bboxes_ready = [] - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - for i, gt_box in enumerate(gt_bboxes): - - pred_distances = [] - for pred_bbox in pred_bboxes: - pred_distances.append(distance(gt_box, pred_bbox)) - distances = [] - gap_pre = gt_box[1] - pre_bbox[1] - gap_pre_1 = gt_box[0] - pre_bbox[2] - #print(gap_pre, len(pred_bboxes_rows)) - if (gap_pre_1 < 0 and len(pred_bboxes_rows) > 0): - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - if len(pred_bboxes_rows) == 1: - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - if len(match_bboxes_ready) == 0 and len(pred_bboxes_rows) > 0: - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - if len(match_bboxes_ready) == 0 and len(pred_bboxes_rows) == 0: - break - #print(match_bboxes_ready) - for j, pred_box in enumerate(match_bboxes_ready): - distances.append(distance(gt_box, pred_box)) - index = pred_distances.index(min(distances)) - #print(gt_box, index) - #match_bboxes_ready.pop(distances.index(min(distances))) - print(gt_box, match_bboxes_ready[distances.index(min(distances))]) - if index not in matched.keys(): - matched[index] = [i] - else: - matched[index].append(i) - pre_bbox = gt_box - return matched diff --git a/ppstructure/table/predict_structure.py b/ppstructure/table/predict_structure.py deleted file mode 100755 index 6e680b3574ba28b439acad34424b51dfdc02078c..0000000000000000000000000000000000000000 --- a/ppstructure/table/predict_structure.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import sys - -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) - -os.environ["FLAGS_allocator_strategy"] = 'auto_growth' - -import cv2 -import numpy as np -import math -import time -import traceback -import paddle - -import tools.infer.utility as utility -from ppocr.data import create_operators, transform -from ppocr.postprocess import build_post_process -from ppocr.utils.logging import get_logger -from ppocr.utils.utility import get_image_file_list, check_and_read_gif - -logger = get_logger() - - -class TableStructurer(object): - def __init__(self, args): - pre_process_list = [{ - 'ResizeTableImage': { - 'max_len': args.structure_max_len - } - }, { - 'NormalizeImage': { - 'std': [0.229, 0.224, 0.225], - 'mean': [0.485, 0.456, 0.406], - 'scale': '1./255.', - 'order': 'hwc' - } - }, { - 'PaddingTableImage': None - }, { - 'ToCHWImage': None - }, { - 'KeepKeys': { - 'keep_keys': ['image'] - } - }] - postprocess_params = { - 'name': 'TableLabelDecode', - "character_type": args.structure_char_type, - "character_dict_path": args.structure_char_dict_path, - "max_text_length": args.structure_max_text_length, - "max_elem_length": args.structure_max_elem_length, - "max_cell_num": args.structure_max_cell_num - } - - self.preprocess_op = create_operators(pre_process_list) - self.postprocess_op = build_post_process(postprocess_params) - self.predictor, self.input_tensor, self.output_tensors = \ - utility.create_predictor(args, 'structure', logger) - - def __call__(self, img): - ori_im = img.copy() - data = {'image': img} - data = transform(data, self.preprocess_op) - img = data[0] - if img is None: - return None, 0 - img = np.expand_dims(img, axis=0) - img = img.copy() - starttime = time.time() - - self.input_tensor.copy_from_cpu(img) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - - preds = {} - preds['structure_probs'] = outputs[1] - preds['loc_preds'] = outputs[0] - - post_result = self.postprocess_op(preds) - - structure_str_list = post_result['structure_str_list'] - res_loc = post_result['res_loc'] - imgh, imgw = ori_im.shape[0:2] - res_loc_final = [] - for rno in range(len(res_loc[0])): - x0, y0, x1, y1 = res_loc[0][rno] - left = max(int(imgw * x0), 0) - top = max(int(imgh * y0), 0) - right = min(int(imgw * x1), imgw - 1) - bottom = min(int(imgh * y1), imgh - 1) - res_loc_final.append([left, top, right, bottom]) - - structure_str_list = structure_str_list[0][:-1] - structure_str_list = ['', '', ''] + structure_str_list + ['
', '', ''] - - elapse = time.time() - starttime - return (structure_str_list, res_loc_final), elapse - - -def main(args): - image_file_list = get_image_file_list(args.image_dir) - table_structurer = TableStructurer(args) - count = 0 - total_time = 0 - for image_file in image_file_list: - img, flag = check_and_read_gif(image_file) - if not flag: - img = cv2.imread(image_file) - if img is None: - logger.info("error in loading image:{}".format(image_file)) - continue - structure_res, elapse = table_structurer(img) - - logger.info("result: {}".format(structure_res)) - - if count > 0: - total_time += elapse - count += 1 - logger.info("Predict time of {}: {}".format(image_file, elapse)) - - -if __name__ == "__main__": - main(utility.parse_args()) diff --git a/ppstructure/table/predict_table.py b/ppstructure/table/predict_table.py deleted file mode 100644 index c4edd22c3de4df5f0ba3e0a1e28a8c346a48d4ee..0000000000000000000000000000000000000000 --- a/ppstructure/table/predict_table.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import subprocess - -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) -sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) - -os.environ["FLAGS_allocator_strategy"] = 'auto_growth' -import cv2 -import copy -import numpy as np -import time -import tools.infer.predict_rec as predict_rec -import tools.infer.predict_det as predict_det -from ppocr.utils.utility import get_image_file_list, check_and_read_gif -from ppocr.utils.logging import get_logger -from ppstructure.table.matcher import distance, compute_iou -from ppstructure.utility import parse_args -import ppstructure.table.predict_structure as predict_strture - -logger = get_logger() - - -def expand(pix, det_box, shape): - x0, y0, x1, y1 = det_box - # print(shape) - h, w, c = shape - tmp_x0 = x0 - pix - tmp_x1 = x1 + pix - tmp_y0 = y0 - pix - tmp_y1 = y1 + pix - x0_ = tmp_x0 if tmp_x0 >= 0 else 0 - x1_ = tmp_x1 if tmp_x1 <= w else w - y0_ = tmp_y0 if tmp_y0 >= 0 else 0 - y1_ = tmp_y1 if tmp_y1 <= h else h - return x0_, y0_, x1_, y1_ - - -class TableSystem(object): - def __init__(self, args, text_detector=None, text_recognizer=None): - self.text_detector = predict_det.TextDetector(args) if text_detector is None else text_detector - self.text_recognizer = predict_rec.TextRecognizer(args) if text_recognizer is None else text_recognizer - self.table_structurer = predict_strture.TableStructurer(args) - - def __call__(self, img): - ori_im = img.copy() - structure_res, elapse = self.table_structurer(copy.deepcopy(img)) - dt_boxes, elapse = self.text_detector(copy.deepcopy(img)) - dt_boxes = sorted_boxes(dt_boxes) - - r_boxes = [] - for box in dt_boxes: - x_min = box[:, 0].min() - 1 - x_max = box[:, 0].max() + 1 - y_min = box[:, 1].min() - 1 - y_max = box[:, 1].max() + 1 - box = [x_min, y_min, x_max, y_max] - r_boxes.append(box) - dt_boxes = np.array(r_boxes) - - logger.debug("dt_boxes num : {}, elapse : {}".format( - len(dt_boxes), elapse)) - if dt_boxes is None: - return None, None - img_crop_list = [] - - for i in range(len(dt_boxes)): - det_box = dt_boxes[i] - x0, y0, x1, y1 = expand(2, det_box, ori_im.shape) - text_rect = ori_im[int(y0):int(y1), int(x0):int(x1), :] - img_crop_list.append(text_rect) - rec_res, elapse = self.text_recognizer(img_crop_list) - logger.debug("rec_res num : {}, elapse : {}".format( - len(rec_res), elapse)) - - pred_html, pred = self.rebuild_table(structure_res, dt_boxes, rec_res) - return pred_html - - def rebuild_table(self, structure_res, dt_boxes, rec_res): - pred_structures, pred_bboxes = structure_res - matched_index = self.match_result(dt_boxes, pred_bboxes) - pred_html, pred = self.get_pred_html(pred_structures, matched_index, rec_res) - return pred_html, pred - - def match_result(self, dt_boxes, pred_bboxes): - matched = {} - for i, gt_box in enumerate(dt_boxes): - # gt_box = [np.min(gt_box[:, 0]), np.min(gt_box[:, 1]), np.max(gt_box[:, 0]), np.max(gt_box[:, 1])] - distances = [] - for j, pred_box in enumerate(pred_bboxes): - distances.append( - (distance(gt_box, pred_box), 1. - compute_iou(gt_box, pred_box))) # 获取两两cell之间的L1距离和 1- IOU - sorted_distances = distances.copy() - # 根据距离和IOU挑选最"近"的cell - sorted_distances = sorted(sorted_distances, key=lambda item: (item[1], item[0])) - if distances.index(sorted_distances[0]) not in matched.keys(): - matched[distances.index(sorted_distances[0])] = [i] - else: - matched[distances.index(sorted_distances[0])].append(i) - return matched - - def get_pred_html(self, pred_structures, matched_index, ocr_contents): - end_html = [] - td_index = 0 - for tag in pred_structures: - if '' in tag: - if td_index in matched_index.keys(): - b_with = False - if '' in ocr_contents[matched_index[td_index][0]] and len(matched_index[td_index]) > 1: - b_with = True - end_html.extend('') - for i, td_index_index in enumerate(matched_index[td_index]): - content = ocr_contents[td_index_index][0] - if len(matched_index[td_index]) > 1: - if len(content) == 0: - continue - if content[0] == ' ': - content = content[1:] - if '' in content: - content = content[3:] - if '' in content: - content = content[:-4] - if len(content) == 0: - continue - if i != len(matched_index[td_index]) - 1 and ' ' != content[-1]: - content += ' ' - end_html.extend(content) - if b_with: - end_html.extend('') - - end_html.append(tag) - td_index += 1 - else: - end_html.append(tag) - return ''.join(end_html), end_html - - -def sorted_boxes(dt_boxes): - """ - Sort text boxes in order from top to bottom, left to right - args: - dt_boxes(array):detected text boxes with shape [4, 2] - return: - sorted boxes(array) with shape [4, 2] - """ - num_boxes = dt_boxes.shape[0] - sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) - _boxes = list(sorted_boxes) - - for i in range(num_boxes - 1): - if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \ - (_boxes[i + 1][0][0] < _boxes[i][0][0]): - tmp = _boxes[i] - _boxes[i] = _boxes[i + 1] - _boxes[i + 1] = tmp - return _boxes - - -def to_excel(html_table, excel_path): - from tablepyxl import tablepyxl - tablepyxl.document_to_xl(html_table, excel_path) - - -def main(args): - image_file_list = get_image_file_list(args.image_dir) - image_file_list = image_file_list[args.process_id::args.total_process_num] - os.makedirs(args.output, exist_ok=True) - - text_sys = TableSystem(args) - img_num = len(image_file_list) - for i, image_file in enumerate(image_file_list): - logger.info("[{}/{}] {}".format(i, img_num, image_file)) - img, flag = check_and_read_gif(image_file) - excel_path = os.path.join(args.table_output, os.path.basename(image_file).split('.')[0] + '.xlsx') - if not flag: - img = cv2.imread(image_file) - if img is None: - logger.error("error in loading image:{}".format(image_file)) - continue - starttime = time.time() - pred_html = text_sys(img) - - to_excel(pred_html, excel_path) - logger.info('excel saved to {}'.format(excel_path)) - logger.info(pred_html) - elapse = time.time() - starttime - logger.info("Predict time : {:.3f}s".format(elapse)) - - -if __name__ == "__main__": - args = parse_args() - if args.use_mp: - p_list = [] - total_process_num = args.total_process_num - for process_id in range(total_process_num): - cmd = [sys.executable, "-u"] + sys.argv + [ - "--process_id={}".format(process_id), - "--use_mp={}".format(False) - ] - p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout) - p_list.append(p) - for p in p_list: - p.wait() - else: - main(args) diff --git a/ppstructure/table/table_metric/__init__.py b/ppstructure/table/table_metric/__init__.py deleted file mode 100755 index de2d307430f68881ece1e41357d3b2f423e07ddd..0000000000000000000000000000000000000000 --- a/ppstructure/table/table_metric/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__all__ = ['TEDS'] -from .table_metric import TEDS \ No newline at end of file diff --git a/ppstructure/table/table_metric/parallel.py b/ppstructure/table/table_metric/parallel.py deleted file mode 100755 index f7326a1f506ca5fb7b3e97b0d077dc016e7eb7c7..0000000000000000000000000000000000000000 --- a/ppstructure/table/table_metric/parallel.py +++ /dev/null @@ -1,51 +0,0 @@ -from tqdm import tqdm -from concurrent.futures import ProcessPoolExecutor, as_completed - - -def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=0): - """ - A parallel version of the map function with a progress bar. - Args: - array (array-like): An array to iterate over. - function (function): A python function to apply to the elements of array - n_jobs (int, default=16): The number of cores to use - use_kwargs (boolean, default=False): Whether to consider the elements of array as dictionaries of - keyword arguments to function - front_num (int, default=3): The number of iterations to run serially before kicking off the parallel job. - Useful for catching bugs - Returns: - [function(array[0]), function(array[1]), ...] - """ - # We run the first few iterations serially to catch bugs - if front_num > 0: - front = [function(**a) if use_kwargs else function(a) - for a in array[:front_num]] - else: - front = [] - # If we set n_jobs to 1, just run a list comprehension. This is useful for benchmarking and debugging. - if n_jobs == 1: - return front + [function(**a) if use_kwargs else function(a) for a in tqdm(array[front_num:])] - # Assemble the workers - with ProcessPoolExecutor(max_workers=n_jobs) as pool: - # Pass the elements of array into function - if use_kwargs: - futures = [pool.submit(function, **a) for a in array[front_num:]] - else: - futures = [pool.submit(function, a) for a in array[front_num:]] - kwargs = { - 'total': len(futures), - 'unit': 'it', - 'unit_scale': True, - 'leave': True - } - # Print out the progress as tasks complete - for f in tqdm(as_completed(futures), **kwargs): - pass - out = [] - # Get the results from the futures. - for i, future in tqdm(enumerate(futures)): - try: - out.append(future.result()) - except Exception as e: - out.append(e) - return front + out diff --git a/ppstructure/table/table_metric/table_metric.py b/ppstructure/table/table_metric/table_metric.py deleted file mode 100755 index 9aca98ad785d4614a803fa5a277a6e4a27b3b078..0000000000000000000000000000000000000000 --- a/ppstructure/table/table_metric/table_metric.py +++ /dev/null @@ -1,247 +0,0 @@ -# Copyright 2020 IBM -# Author: peter.zhong@au1.ibm.com -# -# This is free software; you can redistribute it and/or modify -# it under the terms of the Apache 2.0 License. -# -# This software is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Apache 2.0 License for more details. - -import distance -from apted import APTED, Config -from apted.helpers import Tree -from lxml import etree, html -from collections import deque -from .parallel import parallel_process -from tqdm import tqdm - - -class TableTree(Tree): - def __init__(self, tag, colspan=None, rowspan=None, content=None, *children): - self.tag = tag - self.colspan = colspan - self.rowspan = rowspan - self.content = content - self.children = list(children) - - def bracket(self): - """Show tree using brackets notation""" - if self.tag == 'td': - result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \ - (self.tag, self.colspan, self.rowspan, self.content) - else: - result = '"tag": %s' % self.tag - for child in self.children: - result += child.bracket() - return "{{{}}}".format(result) - - -class CustomConfig(Config): - @staticmethod - def maximum(*sequences): - """Get maximum possible value - """ - return max(map(len, sequences)) - - def normalized_distance(self, *sequences): - """Get distance from 0 to 1 - """ - return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) - - def rename(self, node1, node2): - """Compares attributes of trees""" - #print(node1.tag) - if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): - return 1. - if node1.tag == 'td': - if node1.content or node2.content: - #print(node1.content, ) - return self.normalized_distance(node1.content, node2.content) - return 0. - - - -class CustomConfig_del_short(Config): - @staticmethod - def maximum(*sequences): - """Get maximum possible value - """ - return max(map(len, sequences)) - - def normalized_distance(self, *sequences): - """Get distance from 0 to 1 - """ - return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) - - def rename(self, node1, node2): - """Compares attributes of trees""" - if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): - return 1. - if node1.tag == 'td': - if node1.content or node2.content: - #print('before') - #print(node1.content, node2.content) - #print('after') - node1_content = node1.content - node2_content = node2.content - if len(node1_content) < 3: - node1_content = ['####'] - if len(node2_content) < 3: - node2_content = ['####'] - return self.normalized_distance(node1_content, node2_content) - return 0. - -class CustomConfig_del_block(Config): - @staticmethod - def maximum(*sequences): - """Get maximum possible value - """ - return max(map(len, sequences)) - - def normalized_distance(self, *sequences): - """Get distance from 0 to 1 - """ - return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) - - def rename(self, node1, node2): - """Compares attributes of trees""" - if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): - return 1. - if node1.tag == 'td': - if node1.content or node2.content: - - node1_content = node1.content - node2_content = node2.content - while ' ' in node1_content: - print(node1_content.index(' ')) - node1_content.pop(node1_content.index(' ')) - while ' ' in node2_content: - print(node2_content.index(' ')) - node2_content.pop(node2_content.index(' ')) - return self.normalized_distance(node1_content, node2_content) - return 0. - -class TEDS(object): - ''' Tree Edit Distance basead Similarity - ''' - - def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None): - assert isinstance(n_jobs, int) and ( - n_jobs >= 1), 'n_jobs must be an integer greather than 1' - self.structure_only = structure_only - self.n_jobs = n_jobs - self.ignore_nodes = ignore_nodes - self.__tokens__ = [] - - def tokenize(self, node): - ''' Tokenizes table cells - ''' - self.__tokens__.append('<%s>' % node.tag) - if node.text is not None: - self.__tokens__ += list(node.text) - for n in node.getchildren(): - self.tokenize(n) - if node.tag != 'unk': - self.__tokens__.append('' % node.tag) - if node.tag != 'td' and node.tail is not None: - self.__tokens__ += list(node.tail) - - def load_html_tree(self, node, parent=None): - ''' Converts HTML tree to the format required by apted - ''' - global __tokens__ - if node.tag == 'td': - if self.structure_only: - cell = [] - else: - self.__tokens__ = [] - self.tokenize(node) - cell = self.__tokens__[1:-1].copy() - new_node = TableTree(node.tag, - int(node.attrib.get('colspan', '1')), - int(node.attrib.get('rowspan', '1')), - cell, *deque()) - else: - new_node = TableTree(node.tag, None, None, None, *deque()) - if parent is not None: - parent.children.append(new_node) - if node.tag != 'td': - for n in node.getchildren(): - self.load_html_tree(n, new_node) - if parent is None: - return new_node - - def evaluate(self, pred, true): - ''' Computes TEDS score between the prediction and the ground truth of a - given sample - ''' - if (not pred) or (not true): - return 0.0 - parser = html.HTMLParser(remove_comments=True, encoding='utf-8') - pred = html.fromstring(pred, parser=parser) - true = html.fromstring(true, parser=parser) - if pred.xpath('body/table') and true.xpath('body/table'): - pred = pred.xpath('body/table')[0] - true = true.xpath('body/table')[0] - if self.ignore_nodes: - etree.strip_tags(pred, *self.ignore_nodes) - etree.strip_tags(true, *self.ignore_nodes) - n_nodes_pred = len(pred.xpath(".//*")) - n_nodes_true = len(true.xpath(".//*")) - n_nodes = max(n_nodes_pred, n_nodes_true) - tree_pred = self.load_html_tree(pred) - tree_true = self.load_html_tree(true) - distance = APTED(tree_pred, tree_true, - CustomConfig()).compute_edit_distance() - return 1.0 - (float(distance) / n_nodes) - else: - return 0.0 - - def batch_evaluate(self, pred_json, true_json): - ''' Computes TEDS score between the prediction and the ground truth of - a batch of samples - @params pred_json: {'FILENAME': 'HTML CODE', ...} - @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...} - @output: {'FILENAME': 'TEDS SCORE', ...} - ''' - samples = true_json.keys() - if self.n_jobs == 1: - scores = [self.evaluate(pred_json.get( - filename, ''), true_json[filename]['html']) for filename in tqdm(samples)] - else: - inputs = [{'pred': pred_json.get( - filename, ''), 'true': true_json[filename]['html']} for filename in samples] - scores = parallel_process( - inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1) - scores = dict(zip(samples, scores)) - return scores - - def batch_evaluate_html(self, pred_htmls, true_htmls): - ''' Computes TEDS score between the prediction and the ground truth of - a batch of samples - ''' - if self.n_jobs == 1: - scores = [self.evaluate(pred_html, true_html) for ( - pred_html, true_html) in zip(pred_htmls, true_htmls)] - else: - inputs = [{"pred": pred_html, "true": true_html} for( - pred_html, true_html) in zip(pred_htmls, true_htmls)] - - scores = parallel_process( - inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1) - return scores - - -if __name__ == '__main__': - import json - import pprint - with open('sample_pred.json') as fp: - pred_json = json.load(fp) - with open('sample_gt.json') as fp: - true_json = json.load(fp) - teds = TEDS(n_jobs=4) - scores = teds.batch_evaluate(pred_json, true_json) - pp = pprint.PrettyPrinter() - pp.pprint(scores) diff --git a/ppstructure/table/tablepyxl/__init__.py b/ppstructure/table/tablepyxl/__init__.py deleted file mode 100644 index dc0085071cf4497b01fc648e7c38f2e8d9d173d0..0000000000000000000000000000000000000000 --- a/ppstructure/table/tablepyxl/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file diff --git a/ppstructure/table/tablepyxl/style.py b/ppstructure/table/tablepyxl/style.py deleted file mode 100644 index ebd794b1b47d7f9e4f9294dde7330f592d613656..0000000000000000000000000000000000000000 --- a/ppstructure/table/tablepyxl/style.py +++ /dev/null @@ -1,283 +0,0 @@ -# This is where we handle translating css styles into openpyxl styles -# and cascading those from parent to child in the dom. - -from openpyxl.cell import cell -from openpyxl.styles import Font, Alignment, PatternFill, NamedStyle, Border, Side, Color -from openpyxl.styles.fills import FILL_SOLID -from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE -from openpyxl.styles.colors import BLACK - -FORMAT_DATE_MMDDYYYY = 'mm/dd/yyyy' - - -def colormap(color): - """ - Convenience for looking up known colors - """ - cmap = {'black': BLACK} - return cmap.get(color, color) - - -def style_string_to_dict(style): - """ - Convert css style string to a python dictionary - """ - def clean_split(string, delim): - return (s.strip() for s in string.split(delim)) - styles = [clean_split(s, ":") for s in style.split(";") if ":" in s] - return dict(styles) - - -def get_side(style, name): - return {'border_style': style.get('border-{}-style'.format(name)), - 'color': colormap(style.get('border-{}-color'.format(name)))} - -known_styles = {} - - -def style_dict_to_named_style(style_dict, number_format=None): - """ - Change css style (stored in a python dictionary) to openpyxl NamedStyle - """ - - style_and_format_string = str({ - 'style_dict': style_dict, - 'parent': style_dict.parent, - 'number_format': number_format, - }) - - if style_and_format_string not in known_styles: - # Font - font = Font(bold=style_dict.get('font-weight') == 'bold', - color=style_dict.get_color('color', None), - size=style_dict.get('font-size')) - - # Alignment - alignment = Alignment(horizontal=style_dict.get('text-align', 'general'), - vertical=style_dict.get('vertical-align'), - wrap_text=style_dict.get('white-space', 'nowrap') == 'normal') - - # Fill - bg_color = style_dict.get_color('background-color') - fg_color = style_dict.get_color('foreground-color', Color()) - fill_type = style_dict.get('fill-type') - if bg_color and bg_color != 'transparent': - fill = PatternFill(fill_type=fill_type or FILL_SOLID, - start_color=bg_color, - end_color=fg_color) - else: - fill = PatternFill() - - # Border - border = Border(left=Side(**get_side(style_dict, 'left')), - right=Side(**get_side(style_dict, 'right')), - top=Side(**get_side(style_dict, 'top')), - bottom=Side(**get_side(style_dict, 'bottom')), - diagonal=Side(**get_side(style_dict, 'diagonal')), - diagonal_direction=None, - outline=Side(**get_side(style_dict, 'outline')), - vertical=None, - horizontal=None) - - name = 'Style {}'.format(len(known_styles) + 1) - - pyxl_style = NamedStyle(name=name, font=font, fill=fill, alignment=alignment, border=border, - number_format=number_format) - - known_styles[style_and_format_string] = pyxl_style - - return known_styles[style_and_format_string] - - -class StyleDict(dict): - """ - It's like a dictionary, but it looks for items in the parent dictionary - """ - def __init__(self, *args, **kwargs): - self.parent = kwargs.pop('parent', None) - super(StyleDict, self).__init__(*args, **kwargs) - - def __getitem__(self, item): - if item in self: - return super(StyleDict, self).__getitem__(item) - elif self.parent: - return self.parent[item] - else: - raise KeyError('{} not found'.format(item)) - - def __hash__(self): - return hash(tuple([(k, self.get(k)) for k in self._keys()])) - - # Yielding the keys avoids creating unnecessary data structures - # and happily works with both python2 and python3 where the - # .keys() method is a dictionary_view in python3 and a list in python2. - def _keys(self): - yielded = set() - for k in self.keys(): - yielded.add(k) - yield k - if self.parent: - for k in self.parent._keys(): - if k not in yielded: - yielded.add(k) - yield k - - def get(self, k, d=None): - try: - return self[k] - except KeyError: - return d - - def get_color(self, k, d=None): - """ - Strip leading # off colors if necessary - """ - color = self.get(k, d) - if hasattr(color, 'startswith') and color.startswith('#'): - color = color[1:] - if len(color) == 3: # Premailers reduces colors like #00ff00 to #0f0, openpyxl doesn't like that - color = ''.join(2 * c for c in color) - return color - - -class Element(object): - """ - Our base class for representing an html element along with a cascading style. - The element is created along with a parent so that the StyleDict that we store - can point to the parent's StyleDict. - """ - def __init__(self, element, parent=None): - self.element = element - self.number_format = None - parent_style = parent.style_dict if parent else None - self.style_dict = StyleDict(style_string_to_dict(element.get('style', '')), parent=parent_style) - self._style_cache = None - - def style(self): - """ - Turn the css styles for this element into an openpyxl NamedStyle. - """ - if not self._style_cache: - self._style_cache = style_dict_to_named_style(self.style_dict, number_format=self.number_format) - return self._style_cache - - def get_dimension(self, dimension_key): - """ - Extracts the dimension from the style dict of the Element and returns it as a float. - """ - dimension = self.style_dict.get(dimension_key) - if dimension: - if dimension[-2:] in ['px', 'em', 'pt', 'in', 'cm']: - dimension = dimension[:-2] - dimension = float(dimension) - return dimension - - -class Table(Element): - """ - The concrete implementations of Elements are semantically named for the types of elements we are interested in. - This defines a very concrete tree structure for html tables that we expect to deal with. I prefer this compared to - allowing Element to have an arbitrary number of children and dealing with an abstract element tree. - """ - def __init__(self, table): - """ - takes an html table object (from lxml) - """ - super(Table, self).__init__(table) - table_head = table.find('thead') - self.head = TableHead(table_head, parent=self) if table_head is not None else None - table_body = table.find('tbody') - self.body = TableBody(table_body if table_body is not None else table, parent=self) - - -class TableHead(Element): - """ - This class maps to the `` element of the html table. - """ - def __init__(self, head, parent=None): - super(TableHead, self).__init__(head, parent=parent) - self.rows = [TableRow(tr, parent=self) for tr in head.findall('tr')] - - -class TableBody(Element): - """ - This class maps to the `` element of the html table. - """ - def __init__(self, body, parent=None): - super(TableBody, self).__init__(body, parent=parent) - self.rows = [TableRow(tr, parent=self) for tr in body.findall('tr')] - - -class TableRow(Element): - """ - This class maps to the `` element of the html table. - """ - def __init__(self, tr, parent=None): - super(TableRow, self).__init__(tr, parent=parent) - self.cells = [TableCell(cell, parent=self) for cell in tr.findall('th') + tr.findall('td')] - - -def element_to_string(el): - return _element_to_string(el).strip() - - -def _element_to_string(el): - string = '' - - for x in el.iterchildren(): - string += '\n' + _element_to_string(x) - - text = el.text.strip() if el.text else '' - tail = el.tail.strip() if el.tail else '' - - return text + string + '\n' + tail - - -class TableCell(Element): - """ - This class maps to the `` element of the html table. - """ - CELL_TYPES = {'TYPE_STRING', 'TYPE_FORMULA', 'TYPE_NUMERIC', 'TYPE_BOOL', 'TYPE_CURRENCY', 'TYPE_PERCENTAGE', - 'TYPE_NULL', 'TYPE_INLINE', 'TYPE_ERROR', 'TYPE_FORMULA_CACHE_STRING', 'TYPE_INTEGER'} - - def __init__(self, cell, parent=None): - super(TableCell, self).__init__(cell, parent=parent) - self.value = element_to_string(cell) - self.number_format = self.get_number_format() - - def data_type(self): - cell_types = self.CELL_TYPES & set(self.element.get('class', '').split()) - if cell_types: - if 'TYPE_FORMULA' in cell_types: - # Make sure TYPE_FORMULA takes precedence over the other classes in the set. - cell_type = 'TYPE_FORMULA' - elif cell_types & {'TYPE_CURRENCY', 'TYPE_INTEGER', 'TYPE_PERCENTAGE'}: - cell_type = 'TYPE_NUMERIC' - else: - cell_type = cell_types.pop() - else: - cell_type = 'TYPE_STRING' - return getattr(cell, cell_type) - - def get_number_format(self): - if 'TYPE_CURRENCY' in self.element.get('class', '').split(): - return FORMAT_CURRENCY_USD_SIMPLE - if 'TYPE_INTEGER' in self.element.get('class', '').split(): - return '#,##0' - if 'TYPE_PERCENTAGE' in self.element.get('class', '').split(): - return FORMAT_PERCENTAGE - if 'TYPE_DATE' in self.element.get('class', '').split(): - return FORMAT_DATE_MMDDYYYY - if self.data_type() == cell.TYPE_NUMERIC: - try: - int(self.value) - except ValueError: - return '#,##0.##' - else: - return '#,##0' - - def format(self, cell): - cell.style = self.style() - data_type = self.data_type() - if data_type: - cell.data_type = data_type \ No newline at end of file diff --git a/ppstructure/table/tablepyxl/tablepyxl.py b/ppstructure/table/tablepyxl/tablepyxl.py deleted file mode 100644 index ba3cc0fc499fccd93ffe3993a99296bc6603ed8a..0000000000000000000000000000000000000000 --- a/ppstructure/table/tablepyxl/tablepyxl.py +++ /dev/null @@ -1,118 +0,0 @@ -# Do imports like python3 so our package works for 2 and 3 -from __future__ import absolute_import - -from lxml import html -from openpyxl import Workbook -from openpyxl.utils import get_column_letter -from premailer import Premailer -from tablepyxl.style import Table - - -def string_to_int(s): - if s.isdigit(): - return int(s) - return 0 - - -def get_Tables(doc): - tree = html.fromstring(doc) - comments = tree.xpath('//comment()') - for comment in comments: - comment.drop_tag() - return [Table(table) for table in tree.xpath('//table')] - - -def write_rows(worksheet, elem, row, column=1): - """ - Writes every tr child element of elem to a row in the worksheet - returns the next row after all rows are written - """ - from openpyxl.cell.cell import MergedCell - - initial_column = column - for table_row in elem.rows: - for table_cell in table_row.cells: - cell = worksheet.cell(row=row, column=column) - while isinstance(cell, MergedCell): - column += 1 - cell = worksheet.cell(row=row, column=column) - - colspan = string_to_int(table_cell.element.get("colspan", "1")) - rowspan = string_to_int(table_cell.element.get("rowspan", "1")) - if rowspan > 1 or colspan > 1: - worksheet.merge_cells(start_row=row, start_column=column, - end_row=row + rowspan - 1, end_column=column + colspan - 1) - - cell.value = table_cell.value - table_cell.format(cell) - min_width = table_cell.get_dimension('min-width') - max_width = table_cell.get_dimension('max-width') - - if colspan == 1: - # Initially, when iterating for the first time through the loop, the width of all the cells is None. - # As we start filling in contents, the initial width of the cell (which can be retrieved by: - # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous - # cell in the same column (i.e. width of A2 = width of A1) - width = max(worksheet.column_dimensions[get_column_letter(column)].width or 0, len(table_cell.value) + 2) - if max_width and width > max_width: - width = max_width - elif min_width and width < min_width: - width = min_width - worksheet.column_dimensions[get_column_letter(column)].width = width - column += colspan - row += 1 - column = initial_column - return row - - -def table_to_sheet(table, wb): - """ - Takes a table and workbook and writes the table to a new sheet. - The sheet title will be the same as the table attribute name. - """ - ws = wb.create_sheet(title=table.element.get('name')) - insert_table(table, ws, 1, 1) - - -def document_to_workbook(doc, wb=None, base_url=None): - """ - Takes a string representation of an html document and writes one sheet for - every table in the document. - The workbook is returned - """ - if not wb: - wb = Workbook() - wb.remove(wb.active) - - inline_styles_doc = Premailer(doc, base_url=base_url, remove_classes=False).transform() - tables = get_Tables(inline_styles_doc) - - for table in tables: - table_to_sheet(table, wb) - - return wb - - -def document_to_xl(doc, filename, base_url=None): - """ - Takes a string representation of an html document and writes one sheet for - every table in the document. The workbook is written out to a file called filename - """ - wb = document_to_workbook(doc, base_url=base_url) - wb.save(filename) - - -def insert_table(table, worksheet, column, row): - if table.head: - row = write_rows(worksheet, table.head, row, column) - if table.body: - row = write_rows(worksheet, table.body, row, column) - - -def insert_table_at_cell(table, cell): - """ - Inserts a table at the location of an openpyxl Cell object. - """ - ws = cell.parent - column, row = cell.column, cell.row - insert_table(table, ws, column, row) \ No newline at end of file diff --git a/ppstructure/utility.py b/ppstructure/utility.py deleted file mode 100644 index 8112b9efd2155d69784ebc9915d9c3ec30e94f9c..0000000000000000000000000000000000000000 --- a/ppstructure/utility.py +++ /dev/null @@ -1,59 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from PIL import Image -import numpy as np -from tools.infer.utility import draw_ocr_box_txt, init_args as infer_args - - -def init_args(): - parser = infer_args() - - # params for output - parser.add_argument("--output", type=str, default='./output/table') - # params for table structure - parser.add_argument("--structure_max_len", type=int, default=488) - parser.add_argument("--structure_max_text_length", type=int, default=100) - parser.add_argument("--structure_max_elem_length", type=int, default=800) - parser.add_argument("--structure_max_cell_num", type=int, default=500) - parser.add_argument("--structure_model_dir", type=str) - parser.add_argument("--structure_char_type", type=str, default='en') - parser.add_argument("--structure_char_dict_path", type=str, default="../ppocr/utils/dict/table_structure_dict.txt") - - # params for layout detector - parser.add_argument("--layout_model_dir", type=str) - return parser - - -def parse_args(): - parser = init_args() - return parser.parse_args() - - -def draw_result(image, result, font_path): - if isinstance(image, np.ndarray): - image = Image.fromarray(image) - boxes, txts, scores = [], [], [] - for region in result: - if region['type'] == 'Table': - pass - elif region['type'] == 'Figure': - pass - else: - for box, rec_res in zip(region['res'][0], region['res'][1]): - boxes.append(np.array(box).reshape(-1, 2)) - txts.append(rec_res[0]) - scores.append(rec_res[1]) - im_show = draw_ocr_box_txt(image, boxes, txts, scores, font_path=font_path,drop_score=0) - return im_show \ No newline at end of file diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py index baa89be130084d98628656fe4e309728a0e9f661..f5bade36315fbe321927df82cdd7cd8bf40b2ae5 100755 --- a/tools/infer/predict_det.py +++ b/tools/infer/predict_det.py @@ -43,7 +43,7 @@ class TextDetector(object): pre_process_list = [{ 'DetResizeForTest': { 'limit_side_len': args.det_limit_side_len, - 'limit_type': args.det_limit_type, + 'limit_type': args.det_limit_type } }, { 'NormalizeImage': { diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py index 58a363e3e8fa852ce37cd5a44a19e460da00c2bc..ad1b7d4ef432924f1781a16eae07c171e571826b 100755 --- a/tools/infer/predict_system.py +++ b/tools/infer/predict_system.py @@ -88,7 +88,8 @@ class TextSystem(object): ori_im = img.copy() dt_boxes, elapse = self.text_detector(img) - logger.debug("dt_boxes num : {}, elapse : {}".format( + logger.info("dt_boxes num : {}, elapse : {}".format( + len(dt_boxes), elapse)) if dt_boxes is None: return None, None @@ -103,11 +104,11 @@ class TextSystem(object): if self.use_angle_cls and cls: img_crop_list, angle_list, elapse = self.text_classifier( img_crop_list) - logger.debug("cls num : {}, elapse : {}".format( + logger.info("cls num : {}, elapse : {}".format( len(img_crop_list), elapse)) rec_res, elapse = self.text_recognizer(img_crop_list) - logger.debug("rec_res num : {}, elapse : {}".format( + logger.info("rec_res num : {}, elapse : {}".format( len(rec_res), elapse)) # self.print_draw_crop_rec_res(img_crop_list, rec_res) filter_boxes, filter_rec_res = [], [] @@ -155,7 +156,7 @@ def main(args): if not flag: img = cv2.imread(image_file) if img is None: - logger.error("error in loading image:{}".format(image_file)) + logger.info("error in loading image:{}".format(image_file)) continue starttime = time.time() dt_boxes, rec_res = text_sys(img) diff --git a/tools/infer/utility.py b/tools/infer/utility.py index 9210c45783029996f8d6fd105c8413d01c768806..69f28e00b39b657954902e1b4c851fe357ea3619 100755 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -113,7 +113,6 @@ def init_args(): parser.add_argument("--benchmark", type=bool, default=False) parser.add_argument("--save_log_path", type=str, default="./log_output/") - parser.add_argument("--show_log", type=str2bool, default=True) return parser @@ -200,8 +199,6 @@ def create_predictor(args, mode, logger): model_dir = args.cls_model_dir elif mode == 'rec': model_dir = args.rec_model_dir - elif mode == 'structure': - model_dir = args.structure_model_dir else: model_dir = args.e2e_model_dir @@ -331,9 +328,7 @@ def create_predictor(args, mode, logger): config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") config.switch_use_feed_fetch_ops(False) - config.switch_ir_optim(True) - if mode == 'structure': - config.switch_ir_optim(False) + # create predictor predictor = inference.create_predictor(config) input_names = predictor.get_input_names()