diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index c17db91a5b5cd9d3cbb4b5bf6c87afd745d0870d..d0d2bb721be41fe2c4042fbea1b55e4e76bdd664 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -2531,7 +2531,7 @@ class MainWindow(QMainWindow): split = 'test' # save dict - html = {'structure': {'tokens': token_list}, 'cell': cells} + html = {'structure': {'tokens': token_list}, 'cells': cells} json_results.append({'filename': os.path.basename(image_path), 'split': split, 'imgid': imgid, 'html': html}) imgid += 1 diff --git a/README.md b/README.md index 202362c60911f6c510ec16c59f6a1f934baa65d3..81d37237af9a420fb71a0647ba8b6cfe3f30f6e5 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - [Dive Into OCR ](./doc/doc_en/ocr_book_en.md) +## Community ## 👫 Community @@ -126,7 +127,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - [Inference and Deployment](./deploy/README.md) - [Python Inference](./ppstructure/docs/inference_en.md) - [C++ Inference](./deploy/cpp_infer/readme.md) - - [Serving](./deploy/pdserving/README.md) + - [Serving](./deploy/hubserving/readme_en.md) - [Academic Algorithms](./doc/doc_en/algorithm_overview_en.md) - [Text detection](./doc/doc_en/algorithm_overview_en.md) - [Text recognition](./doc/doc_en/algorithm_overview_en.md) diff --git a/README_ch.md b/README_ch.md index 2b7f3a39b7277d40d37589ad5ed3af77fd7dd98b..1d3f62ba071bb8d46172047f53bb4c7550482227 100755 --- a/README_ch.md +++ b/README_ch.md @@ -152,7 +152,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 - [推理部署](./deploy/README_ch.md) - [基于Python预测引擎推理](./ppstructure/docs/inference.md) - [基于C++预测引擎推理](./deploy/cpp_infer/readme_ch.md) - - [服务化部署](./deploy/pdserving/README_CN.md) + - [服务化部署](./deploy/hubserving/readme.md) - [前沿算法与模型🚀](./doc/doc_ch/algorithm_overview.md) - [文本检测算法](./doc/doc_ch/algorithm_overview.md) - 
[文本识别算法](./doc/doc_ch/algorithm_overview.md) diff --git "a/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" "b/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" index 14a6a1c8f1dd2350767afa162063b06791e79dd4..82f5b8d48600c6bebb4d3183ee801305d305d531 100644 --- "a/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" +++ "b/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" @@ -30,7 +30,7 @@ cd PaddleOCR # 安装PaddleOCR的依赖 pip install -r requirements.txt # 安装关键信息抽取任务的依赖 -pip install -r ./ppstructure/vqa/requirements.txt +pip install -r ./ppstructure/kie/requirements.txt ``` ## 4. 关键信息抽取 @@ -94,7 +94,7 @@ VI-LayoutXLM的配置为[ser_vi_layoutxlm_xfund_zh_udml.yml](../configs/kie/vi_l ```yml Architecture: - model_type: &model_type "vqa" + model_type: &model_type "kie" name: DistillationModel algorithm: Distillation Models: @@ -177,7 +177,7 @@ python3 tools/eval.py -c ./fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone. 
使用下面的命令进行预测。 ```bash -python3 tools/infer_vqa_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False +python3 tools/infer_kie_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False ``` 预测结果会保存在配置文件中的`Global.save_res_path`目录中。 @@ -195,7 +195,7 @@ python3 tools/infer_vqa_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architect ```bash -python3 tools/infer_vqa_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/imgs/b25.jpg Global.infer_mode=True +python3 tools/infer_kie_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/imgs/b25.jpg Global.infer_mode=True ``` 结果如下所示。 @@ -211,7 +211,7 @@ python3 tools/infer_vqa_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architect 如果希望构建基于你在垂类场景训练得到的OCR检测与识别模型,可以使用下面的方法传入检测与识别的inference 模型路径,即可完成OCR文本检测与识别以及SER的串联过程。 ```bash -python3 tools/infer_vqa_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/imgs/b25.jpg Global.infer_mode=True Global.kie_rec_model_dir="your_rec_model" Global.kie_det_model_dir="your_det_model" +python3 tools/infer_kie_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/imgs/b25.jpg Global.infer_mode=True Global.kie_rec_model_dir="your_rec_model" Global.kie_det_model_dir="your_det_model" ``` ### 4.4 关系抽取(Relation 
Extraction) @@ -316,7 +316,7 @@ python3 tools/eval.py -c ./fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.c # -o 后面的字段是RE任务的配置 # -c_ser 后面的是SER任务的配置文件 # -c_ser 后面的字段是SER任务的配置 -python3 tools/infer_vqa_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=False -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy +python3 tools/infer_kie_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_trained/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=False -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_trained/best_accuracy ``` 预测结果会保存在配置文件中的`Global.save_res_path`目录中。 @@ -333,11 +333,11 @@ python3 tools/infer_vqa_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Archite 如果希望使用OCR引擎结果得到的结果进行推理,则可以使用下面的命令进行推理。 ```bash -python3 tools/infer_vqa_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=True -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy +python3 tools/infer_kie_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=True -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy ``` 如果希望构建基于你在垂类场景训练得到的OCR检测与识别模型,可以使用下面的方法传入,即可完成SER + RE的串联过程。 ```bash -python3 tools/infer_vqa_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o 
Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=True -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.kie_rec_model_dir="your_rec_model" Global.kie_det_model_dir="your_det_model" +python3 tools/infer_kie_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=True -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.kie_rec_model_dir="your_rec_model" Global.kie_det_model_dir="your_det_model" ``` diff --git a/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml b/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml index acf438950a43af3356c7ab0aadf956fdf226814e..6167b6e13f9b75b87890ba008de5dd216b18917e 100644 --- a/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml +++ b/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml @@ -94,14 +94,11 @@ Loss: - ["Student", "Student2"] maps_name: "thrink_maps" weight: 1.0 - # act: None model_name_pairs: ["Student", "Student2"] key: maps - DistillationDBLoss: weight: 1.0 model_name_list: ["Student", "Student2"] - # key: maps - # name: DBLoss balance_loss: true main_loss_type: DiceLoss alpha: 5 @@ -191,7 +188,6 @@ Eval: channel_first: False - DetLabelEncode: # Class handling label - DetResizeForTest: -# image_shape: [736, 1280] - NormalizeImage: scale: 1./255. 
mean: [0.485, 0.456, 0.406] diff --git a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml index ef58befd694e26704c734d7fd072ebc3370c8554..88514e76a501fc9fac887cb170eb870523b31b8e 100644 --- a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml +++ b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml @@ -24,6 +24,7 @@ Architecture: model_type: det Models: Student: + pretrained: model_type: det algorithm: DB Transform: null @@ -40,6 +41,7 @@ Architecture: name: DBHead k: 50 Student2: + pretrained: model_type: det algorithm: DB Transform: null @@ -56,6 +58,7 @@ Architecture: name: DBHead k: 50 Teacher: + pretrained: freeze_params: true return_all_feats: false model_type: det @@ -91,14 +94,11 @@ Loss: - ["Student", "Student2"] maps_name: "thrink_maps" weight: 1.0 - # act: None model_name_pairs: ["Student", "Student2"] key: maps - DistillationDBLoss: weight: 1.0 model_name_list: ["Student", "Student2"] - # key: maps - # name: DBLoss balance_loss: true main_loss_type: DiceLoss alpha: 5 @@ -204,31 +204,21 @@ Eval: label_file_list: - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt transforms: - - DecodeImage: - img_mode: BGR - channel_first: false - - DetLabelEncode: null - - DetResizeForTest: null - - NormalizeImage: - scale: 1./255. - mean: - - 0.485 - - 0.456 - - 0.406 - std: - - 0.229 - - 0.224 - - 0.225 - order: hwc - - ToCHWImage: null - - KeepKeys: - keep_keys: - - image - - shape - - polys - - ignore_tags + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] loader: - shuffle: false - drop_last: false - batch_size_per_card: 1 + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 num_workers: 2 diff --git a/configs/table/SLANet_ch.yml b/configs/table/SLANet_ch.yml index 997ff0a77b5ea824957abc1d32a7ba7f70abc12c..a3fc1c68ddd2287e6bbaa6c53c5e20961950d23f 100644 --- a/configs/table/SLANet_ch.yml +++ b/configs/table/SLANet_ch.yml @@ -107,7 +107,7 @@ Train: Eval: dataset: name: PubTabDataSet - data_dir: train_data/table/val/ + data_dir: train_data/table/val/ label_file_list: [train_data/table/val.txt] transforms: - DecodeImage: diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md index 511e0421f1e249e340f2002a900b59633e31880e..315329464f15aa1127e34a38d3407a9c81dbc627 100644 --- a/doc/doc_ch/whl.md +++ b/doc/doc_ch/whl.md @@ -390,6 +390,7 @@ im_show.save('result.jpg') | det_db_thresh | DB模型输出预测图的二值化阈值 | 0.3 | | det_db_box_thresh | DB模型输出框的阈值,低于此值的预测框会被丢弃 | 0.5 | | det_db_unclip_ratio | DB模型输出框扩大的比例 | 2 | +| det_db_score_mode | 计算检测框score的方式,有'fast'和'slow',如果要检测的文字有弯曲,建议用'slow','slow'模式计算的box的score偏大,box不容易被过滤掉 | 'fast' | | det_east_score_thresh | EAST模型输出预测图的二值化阈值 | 0.8 | | det_east_cover_thresh | EAST模型输出框的阈值,低于此值的预测框会被丢弃 | 0.1 | | det_east_nms_thresh | EAST模型输出框NMS的阈值 | 0.2 | diff --git a/doc/doc_en/ocr_book_en.md b/doc/doc_en/ocr_book_en.md index b0455fe61afe8ae456f224e57d346b1fed553eb4..63162be566d515dd7f4f181f80a140cdd0376f91 100644 --- a/doc/doc_en/ocr_book_en.md +++ b/doc/doc_en/ocr_book_en.md @@ -1,6 +1,6 @@ # E-book: *Dive Into OCR* -"Dive Into OCR" is a textbook that combines OCR theory and practice, written by the PaddleOCR team, Chen Zhineng, a Pre-tenure Professor at Fudan University, Huang Wenhui, a senior expert in the field of vision at China Mobile Research Institute, and other industry-university-research colleagues, as well as OCR 
developers. The main features are as follows: +"Dive Into OCR" is a textbook that combines OCR theory and practice, written by the PaddleOCR community. The main features are as follows: - OCR full-stack technology covering text detection, recognition and document analysis - Closely integrate theory and practice, cross the code implementation gap, and supporting instructional videos @@ -8,6 +8,10 @@ ## Structure +
+ +
+ - The first part is the preliminary knowledge of the book, including the knowledge index and resource links needed in the process of positioning and using the book content of the book - The second part is chapters 4-8 of the book, which introduce the concepts, applications, and industry practices related to the detection and identification capabilities of the OCR engine. In the "Introduction to OCR Technology", the application scenarios and challenges of OCR, the basic concepts of technology, and the pain points in industrial applications are comprehensively explained. Then, in the two chapters of "Text Detection" and "Text Recognition", the two basic tasks of OCR are introduced. In each chapter, an algorithm is accompanied by a detailed explanation of the code and practical exercises. Chapters 6 and 7 are a detailed introduction to the PP-OCR series model, PP-OCR is a set of OCR systems for industrial applications, on the basis of the basic detection and identification model, after a series of optimization strategies to achieve the general field of industrial SOTA model, while opening up a variety of predictive deployment solutions, enabling enterprises to quickly land OCR applications. 
@@ -16,6 +20,11 @@ ## Address -- [E-book: *Dive Into OCR* (link generating)]() -- [Jupyter notebook](../../notebook/notebook_en/) -- [videos (Chinese only)](https://aistudio.baidu.com/aistudio/education/group/info/25207) +- [E-book: *Dive Into OCR* (PDF)](https://paddleocr.bj.bcebos.com/ebook/Dive_into_OCR.pdf) +- [Notebook (.ipynb)](https://github.com/PaddleOCR-Community/Dive-into-OCR) +- [Videos (Chinese only)](https://aistudio.baidu.com/aistudio/education/group/info/25207) + + +trackgit-views + + diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md index d81e5532cf1db0193abf61b972420bdc3bacfd0b..da2dff67c16b4a9a0a653934b1f1df64cb6e9707 100644 --- a/doc/doc_en/whl_en.md +++ b/doc/doc_en/whl_en.md @@ -342,6 +342,7 @@ im_show.save('result.jpg') | det_db_thresh | Binarization threshold value of DB output map | 0.3 | | det_db_box_thresh | The threshold value of the DB output box. Boxes score lower than this value will be discarded | 0.5 | | det_db_unclip_ratio | The expanded ratio of DB output box | 2 | +| det_db_score_mode | The parameter that controls how the score of a detection box is calculated. There are 'fast' and 'slow' options. If the text to be detected is curved, it is recommended to use 'slow'. The 'slow' mode tends to produce higher box scores, so boxes are less likely to be filtered out | 'fast' | | det_east_score_thresh | Binarization threshold value of EAST output map | 0.8 | | det_east_cover_thresh | The threshold value of the EAST output box. 
Boxes score lower than this value will be discarded | 0.1 | | det_east_nms_thresh | The NMS threshold value of EAST model output box | 0.2 | diff --git a/paddleocr.py b/paddleocr.py index 0b7aed36279081f50208f75272fc54c5081929a7..03c9ba189a8ce0edba39f5fcb72c7ddc0611241a 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -414,6 +414,33 @@ def get_model_config(type, version, model_type, lang): return model_urls[version][model_type][lang] +def img_decode(content: bytes): + np_arr = np.frombuffer(content, dtype=np.uint8) + return cv2.imdecode(np_arr, cv2.IMREAD_COLOR) + + +def check_img(img): + if isinstance(img, bytes): + img = img_decode(img) + if isinstance(img, str): + # download net image + if is_link(img): + download_with_progressbar(img, 'tmp.jpg') + img = 'tmp.jpg' + image_file = img + img, flag, _ = check_and_read(image_file) + if not flag: + with open(image_file, 'rb') as f: + img = img_decode(f.read()) + if img is None: + logger.error("error in loading image:{}".format(image_file)) + return None + if isinstance(img, np.ndarray) and len(img.shape) == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + + return img + + class PaddleOCR(predict_system.TextSystem): def __init__(self, **kwargs): """ @@ -482,7 +509,7 @@ class PaddleOCR(predict_system.TextSystem): rec: use text recognition or not. If false, only det will be exec. Default is True cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. 
""" - assert isinstance(img, (np.ndarray, list, str)) + assert isinstance(img, (np.ndarray, list, str, bytes)) if isinstance(img, list) and det == True: logger.error('When input a list of images, det must be false') exit(0) @@ -491,22 +518,8 @@ class PaddleOCR(predict_system.TextSystem): 'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process' ) - if isinstance(img, str): - # download net image - if img.startswith('http'): - download_with_progressbar(img, 'tmp.jpg') - img = 'tmp.jpg' - image_file = img - img, flag, _ = check_and_read(image_file) - if not flag: - with open(image_file, 'rb') as f: - np_arr = np.frombuffer(f.read(), dtype=np.uint8) - img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR) - if img is None: - logger.error("error in loading image:{}".format(image_file)) - return None - if isinstance(img, np.ndarray) and len(img.shape) == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + img = check_img(img) + if det and rec: dt_boxes, rec_res, _ = self.__call__(img, cls) return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] @@ -585,23 +598,7 @@ class PPStructure(StructureSystem): super().__init__(params) def __call__(self, img, return_ocr_result_in_table=False, img_idx=0): - if isinstance(img, str): - # download net image - if img.startswith('http'): - download_with_progressbar(img, 'tmp.jpg') - img = 'tmp.jpg' - image_file = img - img, flag, _ = check_and_read(image_file) - if not flag: - with open(image_file, 'rb') as f: - np_arr = np.frombuffer(f.read(), dtype=np.uint8) - img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR) - if img is None: - logger.error("error in loading image:{}".format(image_file)) - return None - if isinstance(img, np.ndarray) and len(img.shape) == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - + img = check_img(img) res, _ = super().__call__( img, return_ocr_result_in_table, img_idx=img_idx) return res diff --git a/ppocr/data/imaug/operators.py 
b/ppocr/data/imaug/operators.py index f8ed28929707eb750ad6e8499a73568cae3a8e6b..5e84b1aac9c54d8a8283468af6826ca917ba0384 100644 --- a/ppocr/data/imaug/operators.py +++ b/ppocr/data/imaug/operators.py @@ -225,6 +225,8 @@ class DetResizeForTest(object): def __call__(self, data): img = data['image'] src_h, src_w, _ = img.shape + if sum([src_h, src_w]) < 64: + img = self.image_padding(img) if self.resize_type == 0: # img, shape = self.resize_image_type0(img) @@ -238,6 +240,12 @@ class DetResizeForTest(object): data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) return data + def image_padding(self, im, value=0): + h, w, c = im.shape + im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value + im_pad[:h, :w, :] = im + return im_pad + def resize_image_type1(self, img): resize_h, resize_w = self.image_shape ori_h, ori_w = img.shape[:2] # (h, w, c) diff --git a/ppocr/losses/basic_loss.py b/ppocr/losses/basic_loss.py index da9faa08bc5ca35c5d65f7a7bfbbdd67192f052b..a6f0472ecd0cf3f443aeb474ca6dd5487111f8f0 100644 --- a/ppocr/losses/basic_loss.py +++ b/ppocr/losses/basic_loss.py @@ -60,19 +60,19 @@ class KLJSLoss(object): ], "mode can only be one of ['kl', 'KL', 'js', 'JS']" self.mode = mode - def __call__(self, p1, p2, reduction="mean"): + def __call__(self, p1, p2, reduction="mean", eps=1e-5): if self.mode.lower() == 'kl': loss = paddle.multiply(p2, - paddle.log((p2 + 1e-5) / (p1 + 1e-5) + 1e-5)) + paddle.log((p2 + eps) / (p1 + eps) + eps)) loss += paddle.multiply( - p1, paddle.log((p1 + 1e-5) / (p2 + 1e-5) + 1e-5)) + p1, paddle.log((p1 + eps) / (p2 + eps) + eps)) loss *= 0.5 elif self.mode.lower() == "js": loss = paddle.multiply( - p2, paddle.log((2 * p2 + 1e-5) / (p1 + p2 + 1e-5) + 1e-5)) + p2, paddle.log((2 * p2 + eps) / (p1 + p2 + eps) + eps)) loss += paddle.multiply( - p1, paddle.log((2 * p1 + 1e-5) / (p1 + p2 + 1e-5) + 1e-5)) + p1, paddle.log((2 * p1 + eps) / (p1 + p2 + eps) + eps)) loss *= 0.5 else: raise ValueError( @@ -125,7 +125,7 @@ class 
DMLLoss(nn.Layer): loss = ( self._kldiv(log_out1, out2) + self._kldiv(log_out2, out1)) / 2.0 else: - # for detection distillation log is not needed + # distillation log is not needed for detection loss = self.jskl_loss(out1, out2) return loss diff --git a/ppstructure/README.md b/ppstructure/README.md index fb3697bc1066262833ee20bcbb8f79833f264f14..f3f2d4a931d611003102da4b80bfb1b12d96cbab 100644 --- a/ppstructure/README.md +++ b/ppstructure/README.md @@ -1,5 +1,7 @@ English | [简体中文](README_ch.md) +# PP-Structure + - [1. Introduction](#1-introduction) - [2. Features](#2-features) - [3. Results](#3-results) diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md index 60642f78b6691c3ac2eeba99680a2af23299ddc9..f1f9cb8b09cd87b35fec0e7f09ff1d813e3d44db 100644 --- a/ppstructure/docs/quickstart.md +++ b/ppstructure/docs/quickstart.md @@ -95,7 +95,7 @@ paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout -#### 2.1.6 版面恢复 +#### 2.1.6 版面恢复(PDF转Word) ```bash # 中文测试图 diff --git a/ppstructure/kie/README.md b/ppstructure/kie/README.md index d9471fb18d140704fdeb76c321f8a001426f872d..b3b4d47d86d0cf2871ff96951afa0007306a572b 100644 --- a/ppstructure/kie/README.md +++ b/ppstructure/kie/README.md @@ -1,17 +1,18 @@ English | [简体中文](README_ch.md) -- [1. Introduction](#1-introduction) +# Key Information Extraction (KIE) -- [2. Accuracy and performance](#2-Accuracy-and-performance) -- [3. Visualization](#3-Visualization) +- [1. Introduction](#1-introduction) +- [2. Performance](#2-performance) +- [3. Visualization](#3-visualization) - [3.1 SER](#31-ser) - [3.2 RE](#32-re) - [4. Usage](#4-usage) - - [4.1 Prepare for the environment](#41-Prepare-for-the-environment) - - [4.2 Quick start](#42-Quick-start) - - [4.3 More](#43-More) -- [5. Reference](#5-Reference) -- [6. License](#6-License) + - [4.1 Prepare for the environment](#41-prepare-for-the-environment) + - [4.2 Quick start](#42-quick-start) + - [4.3 More](#43-more) +- [5. 
Reference](#5-reference) +- [6. License](#6-license) ## 1. Introduction @@ -31,7 +32,7 @@ The main features of the key information extraction module in PP-Structure are a - Support SER model export and inference using PaddleInference. -## 2. Accuracy and performance +## 2. Performance We evaluate the methods on the Chinese dataset of [XFUND](https://github.com/doc-analysis/XFUND), and the performance is as follows @@ -171,16 +172,16 @@ If you want to use OCR engine to obtain end-to-end prediction results, you can u # just predict using SER trained model python3 tools/infer_kie_token_ser.py \ -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ - -o Architecture.Backbone.checkpoints=./pretrain_models/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg # predict using SER and RE trained model at the same time python3 ./tools/infer_kie_token_ser_re.py \ -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ - -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ Global.infer_img=./train_data/XFUND/zh_val/image/zh_val_42.jpg \ -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ - -o_ser Architecture.Backbone.checkpoints=./pretrain_models/ser_vi_layoutxlm_xfund_pretrained/best_accuracy + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy ``` The visual result images and the predicted text file will be saved in the `Global.save_res_path` directory. 
@@ -192,18 +193,18 @@ If you want to load the text detection and recognition results collected before, # just predict using SER trained model python3 tools/infer_kie_token_ser.py \ -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ - -o Architecture.Backbone.checkpoints=./pretrain_models/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ Global.infer_img=./train_data/XFUND/zh_val/val.json \ Global.infer_mode=False # predict using SER and RE trained model at the same time python3 ./tools/infer_kie_token_ser_re.py \ -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ - -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ Global.infer_img=./train_data/XFUND/zh_val/val.json \ Global.infer_mode=False \ -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ - -o_ser Architecture.Backbone.checkpoints=./pretrain_models/ser_vi_layoutxlm_xfund_pretrained/best_accuracy + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy ``` #### 4.2.3 Inference using PaddleInference diff --git a/ppstructure/kie/README_ch.md b/ppstructure/kie/README_ch.md index 56c99ab73abe2b33ccfa18d4181312cd5f4d3622..cc8c60009f4cb83d349c45573a9fa03832665374 100644 --- a/ppstructure/kie/README_ch.md +++ b/ppstructure/kie/README_ch.md @@ -156,16 +156,16 @@ wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layou # 仅预测SER模型 python3 tools/infer_kie_token_ser.py \ -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ - -o Architecture.Backbone.checkpoints=./pretrain_models/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ 
Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg # SER + RE模型串联 python3 ./tools/infer_kie_token_ser_re.py \ -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ - -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ Global.infer_img=./train_data/XFUND/zh_val/image/zh_val_42.jpg \ -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ - -o_ser Architecture.Backbone.checkpoints=./pretrain_models/ser_vi_layoutxlm_xfund_pretrained/best_accuracy + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy ``` `Global.save_res_path`目录中会保存可视化的结果图像以及预测的文本文件。 @@ -177,18 +177,18 @@ python3 ./tools/infer_kie_token_ser_re.py \ # 仅预测SER模型 python3 tools/infer_kie_token_ser.py \ -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ - -o Architecture.Backbone.checkpoints=./pretrain_models/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ Global.infer_img=./train_data/XFUND/zh_val/val.json \ Global.infer_mode=False # SER + RE模型串联 python3 ./tools/infer_kie_token_ser_re.py \ -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ - -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ Global.infer_img=./train_data/XFUND/zh_val/val.json \ Global.infer_mode=False \ -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ - -o_ser Architecture.Backbone.checkpoints=./pretrain_models/ser_vi_layoutxlm_xfund_pretrained/best_accuracy + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy ``` #### 4.2.3 基于PaddleInference的预测 diff --git 
a/ppstructure/layout/README.md b/ppstructure/layout/README.md index 7615953b4651a34b2afa741f6290062a9b5956c7..84b977fdd760e6de43d355b802731b5d43eb2cf5 100644 --- a/ppstructure/layout/README.md +++ b/ppstructure/layout/README.md @@ -3,7 +3,7 @@ English | [简体中文](README_ch.md) # Layout analysis - [1. Introduction](#1-Introduction) -- [2. Quick start](#3-Quick-start) +- [2. Quick start](#2-Quick-start) - [3. Install](#3-Install) - [3.1 Install PaddlePaddle](#31-Install-paddlepaddle) - [3.2 Install PaddleDetection](#32-Install-paddledetection) @@ -12,11 +12,11 @@ English | [简体中文](README_ch.md) - [4.2 More datasets](#42-More-datasets) - [5. Start training](#5-Start-training) - [5.1 Train](#51-Train) - - [5.2 FGD Distillation training](#52-FGD-Distillation-training) + - [5.2 FGD Distillation training](#52-Fgd-distillation-training) - [6. Model evaluation and prediction](#6-Model-evaluation-and-prediction) - [6.1 Indicator evaluation](#61-Indicator-evaluation) - [6.2 Test layout analysis results](#62-Test-layout-analysis-results) -- [7 Model export and inference](#7-Model-export-and-inference) +- [7. Model export and inference](#7-Model-export-and-inference) - [7.1 Model export](#71-Model-export) - [7.2 Model inference](#72-Model-inference) @@ -249,7 +249,7 @@ After starting training normally, you will see the following log output: **Note that the configuration file for prediction / evaluation must be consistent with the training.** -### 5.2. FGD Distillation Training +### 5.2. FGD Distillation Training PaddleDetection supports FGD-based [Focal and Global Knowledge Distillation for Detectors]( https://arxiv.org/abs/2111.11837v1) The training process of the target detection model of distillation, FGD distillation is divided into two parts `Focal` and `Global`. 
`Focal` Distillation separates the foreground and background of the image, allowing the student model to focus on the key pixels of the foreground and background features of the teacher model respectively;` Global`Distillation section reconstructs the relationships between different pixels and transfers them from the teacher to the student to compensate for the global information lost in `Focal`Distillation. diff --git a/ppstructure/layout/README_ch.md b/ppstructure/layout/README_ch.md index 49ddeb70ae5650b0a0d96fc6c6581ef7e346cdd0..46d2ba74b2d5c579d4b25cf0cadac22ebc32e5b2 100644 --- a/ppstructure/layout/README_ch.md +++ b/ppstructure/layout/README_ch.md @@ -12,7 +12,7 @@ - [4.2 更多数据集](#42-更多数据集) - [5. 开始训练](#5-开始训练) - [5.1 启动训练](#51-启动训练) - - [5.2 FGD蒸馏训练](#52-FGD蒸馏训练) + - [5.2 FGD蒸馏训练](#52-fgd蒸馏训练) - [6. 模型评估与预测](#6-模型评估与预测) - [6.1 指标评估](#61-指标评估) - [6.2 测试版面分析结果](#62-测试版面分析结果) @@ -27,12 +27,13 @@
+ ## 2. 快速开始 PP-Structure目前提供了中文、英文、表格三类文档版面分析模型,模型链接见 [models_list](../docs/models_list.md#1-版面分析模型)。也提供了whl包的形式方便快速使用,详见 [quickstart](../docs/quickstart.md)。 -## 3. 安装依赖 +## 3. 安装 ### 3.1. 安装PaddlePaddle diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md index 0d419966579890ecbca1425646a17a16cecc88a7..011d6e12fda1b09c7a87367fb887a5c99a4ae00a 100644 --- a/ppstructure/recovery/README.md +++ b/ppstructure/recovery/README.md @@ -1,14 +1,15 @@ English | [简体中文](README_ch.md) -- [Getting Started](#getting-started) - - [1. Introduction](#1) - - [2. Install](#2) - - [2.1 Installation dependencies](#2.1) +# Layout Recovery + +- [1. Introduction](#1) +- [2. Install](#2) + - [2.1 Install PaddlePaddle](#2.1) - [2.2 Install PaddleOCR](#2.2) - - [3. Quick Start](#3) +- [3. Quick Start](#3) - [3.1 Download models](#3.1) - [3.2 Layout recovery](#3.2) - - [4. More](#4) +- [4. More](#4) @@ -31,9 +32,7 @@ Layout recovery combines [layout analysis](../layout/README.md)、[table recogni -### 2.1 Install dependencies - -- **(1) Install PaddlePaddle** +### 2.1 Install PaddlePaddle ```bash python3 -m pip install --upgrade pip diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md index c67efff09c78835c10e8338aab96507332ff20d3..fd2e649024ec88e2ea5c88536ccac2e259538886 100644 --- a/ppstructure/recovery/README_ch.md +++ b/ppstructure/recovery/README_ch.md @@ -1,10 +1,10 @@ [English](README.md) | 简体中文 -# 版面恢复使用说明 +# 版面恢复 - [1. 简介](#1) - [2. 安装](#2) - - [2.1 安装依赖](#2.1) + - [2.1 安装PaddlePaddle](#2.1) - [2.2 安装PaddleOCR](#2.2) - [3. 使用](#3) - [3.1 下载模型](#3.1) @@ -32,9 +32,7 @@ -### 2.1 安装依赖 - -- **(1) 安装PaddlePaddle** +### 2.1 安装PaddlePaddle ```bash python3 -m pip install --upgrade pip