diff --git a/configs/picodet/legacy_model/application/layout_analysis/README.md b/configs/picodet/legacy_model/application/layout_analysis/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9fe3d361825beaddff761b8f0a4f43d5dbffed09 --- /dev/null +++ b/configs/picodet/legacy_model/application/layout_analysis/README.md @@ -0,0 +1,48 @@ +# 更多应用 + + +## 1. 版面分析任务 + +版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等。版面分析示意图如下图所示。 + +
+ <img src="images/layout_demo.png"> +
+ +### 1.1 数据集 + +训练版面分析模型时主要用到了以下几个数据集。 + +| dataset | 简介 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cTDaR2019_cTDaR](https://cndplab-founder.github.io/cTDaR2019/) | 用于表格检测(TRACKA)和表格识别(TRACKB)。图片类型包含历史数据集(以cTDaR_t0开头,如cTDaR_t00872.jpg)和现代数据集(以cTDaR_t1开头,如cTDaR_t10482.jpg)。 | +| [IIIT-AR-13K](http://cvit.iiit.ac.in/usodi/iiitar13k.php) | 通过手动标注公开年度报告中的图形或页面而构建的数据集,包含5类:table、figure、natural image、logo、signature | +| [CDLA](https://github.com/buptlihang/CDLA) | 中文文档版面分析数据集,面向中文文献类(论文)场景,包含10类:Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation | +| [TableBank](https://github.com/doc-analysis/TableBank) | 用于表格检测和识别的大型数据集,包含Word和Latex两种文档格式 | +| [DocBank](https://github.com/doc-analysis/DocBank) | 使用弱监督方法构建的大规模数据集(500K文档页面),用于文档布局分析,包含12类:Author、Caption、Date、Equation、Figure、Footer、List、Paragraph、Reference、Section、Table、Title | + + +### 1.2 模型库 + +| 模型 | 图像输入尺寸 | mAPval
0.5 | 下载地址 | 配置文件 | +| :-------- | :--------: | :----------------: | :---------------: | ----------------- | +| PicoDet-LCNet_x1_0 | 800*608 | 93.5 | [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout.pdparams) / [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar) | [config](./picodet_lcnet_x1_0_layout.yml) | +| PicoDet-LCNet_x1_0 + FGD | 800*608 | 94 | [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) / [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) | [teacher config](./picodet_lcnet_x2_5_layout.yml) / [student config](./picodet_lcnet_x1_0_layout.yml) | + +### 1.3 模型推理 + +下载模型库中的 inference model 后,可以执行如下命令对版面分析任务进行推理: + +```bash +python3 deploy/python/infer.py \ + --model_dir=picodet_lcnet_x1_0_layout/ \ + --image_file=docs/images/layout.jpg \ + --device=CPU +``` + +可视化版面结果如下图所示: + +
+ <img src="images/layout_res.jpg"> +
+ diff --git a/configs/picodet/legacy_model/application/layout_analysis/images/layout_demo.png b/configs/picodet/legacy_model/application/layout_analysis/images/layout_demo.png new file mode 100644 index 0000000000000000000000000000000000000000..da9640e245e34659771353e328bf97da129bd622 Binary files /dev/null and b/configs/picodet/legacy_model/application/layout_analysis/images/layout_demo.png differ diff --git a/configs/picodet/legacy_model/application/layout_analysis/images/layout_res.jpg b/configs/picodet/legacy_model/application/layout_analysis/images/layout_res.jpg new file mode 100644 index 0000000000000000000000000000000000000000..93b3a8bef3bfc9f5c80a9505239af05d526b45a7 Binary files /dev/null and b/configs/picodet/legacy_model/application/layout_analysis/images/layout_res.jpg differ diff --git a/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml b/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml new file mode 100644 index 0000000000000000000000000000000000000000..25acd05d37ecdb0dc886a92bb19b05789a5c0c85 --- /dev/null +++ b/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml @@ -0,0 +1,85 @@ +_BASE_: [ + '../../../../runtime.yml', + '../../_base_/picodet_esnet.yml', + '../../_base_/optimizer_100e.yml', + '../../_base_/picodet_640_reader.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/LCNet_x1_0_pretrained.pdparams +weights: output/picodet_lcnet_x1_0_layout/model_final +find_unused_parameters: True +use_ema: true +cycle_epoch: 10 +snapshot_epoch: 1 +epoch: 100 + +PicoDet: + backbone: LCNet + neck: CSPPAN + head: PicoHead + +LCNet: + scale: 1.0 + feature_maps: [3, 4, 5] + +metric: COCO +num_classes: 5 + +TrainDataset: + !COCODataSet + image_dir: train + anno_path: train.json + dataset_dir: ./dataset/publaynet/ + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + image_dir: val + 
anno_path: val.json + dataset_dir: ./dataset/publaynet/ + +TestDataset: + !ImageFolder + anno_path: ./dataset/publaynet/val.json + + +worker_num: 8 +TrainReader: + sample_transforms: + - Decode: {} + - RandomCrop: {} + - RandomFlip: {prob: 0.5} + - RandomDistort: {} + batch_transforms: + - BatchRandomResize: {target_size: [[768, 576], [800, 608], [832, 640]], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_size: 24 + shuffle: true + drop_last: true + collate_batch: false + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 608], keep_ratio: False} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 8 + shuffle: false + + +TestReader: + inputs_def: + image_shape: [1, 3, 800, 608] + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 608], keep_ratio: False} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 1 + shuffle: false diff --git a/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml b/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml new file mode 100644 index 0000000000000000000000000000000000000000..b6d771f618b5f4b9b4569f55930863cf6644c2b2 --- /dev/null +++ b/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml @@ -0,0 +1,33 @@ +_BASE_: [ + '../../_base_/picodet_esnet.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/LCNet_x2_5_ssld_pretrained.pdparams +weights: output/picodet_lcnet_x2_5_layout/model_final +find_unused_parameters: True + +PicoDet: + backbone: LCNet + neck: CSPPAN + head: 
PicoHead + +LCNet: + scale: 2.5 + feature_maps: [3, 4, 5] + +CSPPAN: + spatial_scales: [0.125, 0.0625, 0.03125] + +slim: Distill +slim_method: FGD +distill_loss: FGDFeatureLoss +distill_loss_name: ['neck_f_3', 'neck_f_2', 'neck_f_1', 'neck_f_0'] + +FGDFeatureLoss: + student_channels: 128 + teacher_channels: 128 + temp: 0.5 + alpha_fgd: 0.001 + beta_fgd: 0.0005 + gamma_fgd: 0.0005 + lambda_fgd: 0.000005 diff --git a/docs/images/layout.jpg b/docs/images/layout.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1c3ca618d30c4c04f062a7db382326ebb4d4e599 Binary files /dev/null and b/docs/images/layout.jpg differ