diff --git a/configs/vqa/re/layoutlmv2_funsd.yml b/configs/vqa/re/layoutlmv2_funsd.yml new file mode 100644 index 0000000000000000000000000000000000000000..1c3d8f7854cab2fe71a2c22738c0ea1252753998 --- /dev/null +++ b/configs/vqa/re/layoutlmv2_funsd.yml @@ -0,0 +1,125 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 200 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/re_layoutlmv2_funsd + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 57 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: train_data/FUNSD/testing_data/images/83624198.png + save_res_path: ./output/re_layoutlmv2_funsd/res/ + +Architecture: + model_type: vqa + algorithm: &algorithm "LayoutLMv2" + Transform: + Backbone: + name: LayoutLMv2ForRe + pretrained: True + checkpoints: + +Loss: + name: LossFromOutput + key: loss + reduction: mean + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + clip_norm: 10 + lr: + learning_rate: 0.00005 + warmup_epoch: 10 + regularizer: + name: L2 + factor: 0.00000 + +PostProcess: + name: VQAReTokenLayoutLMPostProcess + +Metric: + name: VQAReTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/FUNSD/training_data/images/ + label_file_list: + - ./train_data/FUNSD/train.json + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: True + algorithm: *algorithm + class_path: &class_path train_data/FUNSD/class_list.txt + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQAReTokenRelation: + - VQAReTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations'] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 8 + collate_fn: ListCollator + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/FUNSD/testing_data/images/ + label_file_list: + - ./train_data/FUNSD/test.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: True + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQAReTokenRelation: + - VQAReTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1./255. 
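+          # NOTE: this config normalizes with scale: 1./255. and ImageNet mean/std in the [0, 1] range;
+          # the other new configs in this PR use scale: 1 with the same statistics expressed in
+          # [0, 255] (123.675 = 0.485 * 255, etc.), so both conventions produce identical tensors.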
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 8 + collate_fn: ListCollator diff --git a/configs/vqa/re/layoutlmv2.yml b/configs/vqa/re/layoutlmv2_xfund_zh.yml similarity index 95% rename from configs/vqa/re/layoutlmv2.yml rename to configs/vqa/re/layoutlmv2_xfund_zh.yml index 737dbf6b600b1b414a7f66f422e59f46154d91a9..986b9b5cef17bf6b8347ee47f7e045ac0ed13124 100644 --- a/configs/vqa/re/layoutlmv2.yml +++ b/configs/vqa/re/layoutlmv2_xfund_zh.yml @@ -3,16 +3,16 @@ Global: epoch_num: &epoch_num 200 log_smooth_window: 10 print_batch_step: 10 - save_model_dir: ./output/re_layoutlmv2/ + save_model_dir: ./output/re_layoutlmv2_xfund_zh save_epoch_step: 2000 # evaluation is run every 10 iterations after the 0th iteration - eval_batch_step: [ 0, 19 ] + eval_batch_step: [ 0, 57 ] cal_metric_during_train: False save_inference_dir: use_visualdl: False seed: 2048 infer_img: ppstructure/docs/vqa/input/zh_val_21.jpg - save_res_path: ./output/re/ + save_res_path: ./output/re_layoutlmv2_xfund_zh/res/ Architecture: model_type: vqa diff --git a/configs/vqa/re/layoutxlm_funsd.yml b/configs/vqa/re/layoutxlm_funsd.yml new file mode 100644 index 0000000000000000000000000000000000000000..af28be10d6e0390f106d0f06d5e9c26b20dcabb8 --- /dev/null +++ b/configs/vqa/re/layoutxlm_funsd.yml @@ -0,0 +1,129 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 200 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/re_layoutxlm_funsd + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 57 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: train_data/FUNSD/testing_data/images/83624198.png + save_res_path: ./output/re_layoutxlm_funsd/res/ + +Architecture: + model_type: vqa + algorithm: &algorithm "LayoutXLM" + Transform: + Backbone: + name: LayoutXLMForRe + pretrained: True + checkpoints: + +Loss: + name: LossFromOutput + key: loss + reduction: mean + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + clip_norm: 10 + lr: + learning_rate: 0.00005 + warmup_epoch: 10 + regularizer: + name: L2 + factor: 0.00000 + +PostProcess: + name: VQAReTokenLayoutLMPostProcess + +Metric: + name: VQAReTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/FUNSD/training_data/images/ + label_file_list: + - ./train_data/FUNSD/train_v4.json + # - ./train_data/FUNSD/train.json + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: True + algorithm: *algorithm + class_path: &class_path ./train_data/FUNSD/class_list.txt + use_textline_bbox_info: &use_textline_bbox_info True + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQAReTokenRelation: + - VQAReTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations'] + loader: + shuffle:
False + drop_last: False + batch_size_per_card: 8 + num_workers: 16 + collate_fn: ListCollator + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/FUNSD/testing_data/images/ + label_file_list: + - ./train_data/FUNSD/test_v4.json + # - ./train_data/FUNSD/test.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: True + algorithm: *algorithm + class_path: *class_path + use_textline_bbox_info: *use_textline_bbox_info + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQAReTokenRelation: + - VQAReTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 8 + collate_fn: ListCollator diff --git a/configs/vqa/re/layoutxlm.yml b/configs/vqa/re/layoutxlm_xfund_zh.yml similarity index 100% rename from configs/vqa/re/layoutxlm.yml rename to configs/vqa/re/layoutxlm_xfund_zh.yml diff --git a/configs/vqa/ser/layoutlm_funsd.yml b/configs/vqa/ser/layoutlm_funsd.yml new file mode 100644 index 0000000000000000000000000000000000000000..0ef3502bc8923df6a04662f6eca032f064896a6b --- /dev/null +++ b/configs/vqa/ser/layoutlm_funsd.yml @@ -0,0 +1,124 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 200 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/ser_layoutlm_funsd + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 57 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: train_data/FUNSD/testing_data/images/83624198.png + save_res_path: ./output/ser_layoutlm_funsd/res/ + +Architecture: + model_type: vqa + algorithm: &algorithm "LayoutLM" + Transform: + Backbone: + name: LayoutLMForSer + pretrained: True + checkpoints: + num_classes: &num_classes 7 + +Loss: + name: VQASerTokenLayoutLMLoss + num_classes: *num_classes + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + lr: + name: Linear + learning_rate: 0.00005 + epochs: *epoch_num + warmup_epoch: 2 + regularizer: + name: L2 + factor: 0.00000 + +PostProcess: + name: VQASerTokenLayoutLMPostProcess + class_path: &class_path ./train_data/FUNSD/class_list.txt + +Metric: + name: VQASerTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/FUNSD/training_data/images/ + label_file_list: + - ./train_data/FUNSD/train.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + use_textline_bbox_info: &use_textline_bbox_info True + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: True + drop_last: False + 
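+    # SER loaders use the default batch collation; only the RE configs set collate_fn: ListCollator,
+    # because entities and relations are variable-length per sample and cannot be stacked into
+    # fixed-shape tensors.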
batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/FUNSD/testing_data/images/ + label_file_list: + - ./train_data/FUNSD/test.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + use_textline_bbox_info: *use_textline_bbox_info + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 4 diff --git a/configs/vqa/ser/layoutlm_sroie.yml b/configs/vqa/ser/layoutlm_sroie.yml new file mode 100644 index 0000000000000000000000000000000000000000..6abb1151e5102710d54fc163283ab7ec14f85ff4 --- /dev/null +++ b/configs/vqa/ser/layoutlm_sroie.yml @@ -0,0 +1,124 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 200 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/ser_layoutlm_sroie + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 200 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: train_data/SROIE/test/X00016469670.jpg + save_res_path: ./output/ser_layoutlm_sroie/res/ + +Architecture: + model_type: vqa + algorithm: &algorithm "LayoutLM" + Transform: + Backbone: + name: LayoutLMForSer + pretrained: True + checkpoints: + num_classes: &num_classes 9 + +Loss: + name: VQASerTokenLayoutLMLoss + num_classes: *num_classes + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + lr: + name: Linear + learning_rate: 0.00005 + epochs: *epoch_num + warmup_epoch: 2 + regularizer: + name: L2 + factor: 0.00000 + +PostProcess: + name: VQASerTokenLayoutLMPostProcess + class_path: &class_path ./train_data/SROIE/class_list.txt + +Metric: + name: VQASerTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/SROIE/train + label_file_list: + - ./train_data/SROIE/train.txt + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + use_textline_bbox_info: &use_textline_bbox_info True + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/SROIE/test + label_file_list: + - ./train_data/SROIE/test.txt + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + 
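+          # use_textline_bbox_info: True assigns every token the bbox of its whole text line;
+          # False splits the line bbox per word via the new split_bbox helper added to
+          # ppocr/data/imaug/label_ops.py below.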
use_textline_bbox_info: *use_textline_bbox_info + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 4 diff --git a/configs/vqa/ser/layoutlm.yml b/configs/vqa/ser/layoutlm_xfund_zh.yml similarity index 95% rename from configs/vqa/ser/layoutlm.yml rename to configs/vqa/ser/layoutlm_xfund_zh.yml index 53e114defd4cdfa427ae27b647603744302eb0e8..99763c1963e92a010e5f2d2dc795d0ab90755426 100644 --- a/configs/vqa/ser/layoutlm.yml +++ b/configs/vqa/ser/layoutlm_xfund_zh.yml @@ -3,16 +3,16 @@ Global: epoch_num: &epoch_num 200 log_smooth_window: 10 print_batch_step: 10 - save_model_dir: ./output/ser_layoutlm/ + save_model_dir: ./output/ser_layoutlm_xfund_zh save_epoch_step: 2000 # evaluation is run every 10 iterations after the 0th iteration - eval_batch_step: [ 0, 19 ] + eval_batch_step: [ 0, 57 ] cal_metric_during_train: False save_inference_dir: use_visualdl: False seed: 2022 infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg - save_res_path: ./output/ser/ + save_res_path: ./output/ser_layoutlm_xfund_zh/res/ Architecture: model_type: vqa diff --git a/configs/vqa/ser/layoutlmv2_funsd.yml b/configs/vqa/ser/layoutlmv2_funsd.yml new file mode 100644 index 0000000000000000000000000000000000000000..438edc1aa7148a641645a00493ad9073e9239eab --- /dev/null +++ b/configs/vqa/ser/layoutlmv2_funsd.yml @@ -0,0 +1,123 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 200 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/ser_layoutlmv2_funsd + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 100 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: train_data/FUNSD/testing_data/images/83624198.png + save_res_path: ./output/ser_layoutlmv2_funsd/res/ + +Architecture: + model_type: vqa + algorithm: &algorithm "LayoutLMv2" + Transform: + Backbone: + name: LayoutLMv2ForSer + pretrained: True + checkpoints: + num_classes: &num_classes 7 + +Loss: + name: VQASerTokenLayoutLMLoss + num_classes: *num_classes + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + lr: + name: Linear + learning_rate: 0.00005 + epochs: *epoch_num + warmup_epoch: 2 + regularizer: + + name: L2 + factor: 0.00000 + +PostProcess: + name: VQASerTokenLayoutLMPostProcess + class_path: &class_path train_data/FUNSD/class_list.txt + +Metric: + name: VQASerTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/FUNSD/training_data/images/ + label_file_list: + - ./train_data/FUNSD/train.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will 
return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/FUNSD/testing_data/images/ + label_file_list: + - ./train_data/FUNSD/test.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 4 diff --git a/configs/vqa/ser/layoutlmv2_sroie.yml b/configs/vqa/ser/layoutlmv2_sroie.yml new file mode 100644 index 0000000000000000000000000000000000000000..549beb8ec520483fe59db0e6caf509339fb90f76 --- /dev/null +++ b/configs/vqa/ser/layoutlmv2_sroie.yml @@ -0,0 +1,123 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 200 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/ser_layoutlmv2_sroie + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 200 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: train_data/SROIE/test/X00016469670.jpg + save_res_path: ./output/ser_layoutlmv2_sroie/res/ + +Architecture: + model_type: vqa + algorithm: &algorithm "LayoutLMv2" + Transform: + Backbone: + name: LayoutLMv2ForSer + pretrained: True + checkpoints: + num_classes: &num_classes 9 + +Loss: + name: VQASerTokenLayoutLMLoss + num_classes: *num_classes + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + lr: + name: Linear + learning_rate: 0.00005 + epochs: *epoch_num + warmup_epoch: 2 + regularizer: + + name: L2 + factor: 0.00000 + +PostProcess: + name: VQASerTokenLayoutLMPostProcess + class_path: &class_path ./train_data/SROIE/class_list.txt + +Metric: + name: VQASerTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/SROIE/train + label_file_list: + - ./train_data/SROIE/train.txt + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/SROIE/test + label_file_list: + - ./train_data/SROIE/test.txt + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + 
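+          # maps each transcription to token ids and BIO label ids taken from class_list.txt;
+          # num_classes follows 2 * num_entity_types + 1 (SROIE: 4 -> 9, FUNSD: 3 -> 7,
+          # WildReceipt: 25 -> 51).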
contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 4 diff --git a/configs/vqa/ser/layoutlmv2.yml b/configs/vqa/ser/layoutlmv2_xfund_zh.yml similarity index 96% rename from configs/vqa/ser/layoutlmv2.yml rename to configs/vqa/ser/layoutlmv2_xfund_zh.yml index e48c7469567a740ca74240f0ca9f782ed5bb3c6d..ebdc5f31695d74d13219c65b420e3da3e1adfe92 100644 --- a/configs/vqa/ser/layoutlmv2.yml +++ b/configs/vqa/ser/layoutlmv2_xfund_zh.yml @@ -3,7 +3,7 @@ Global: epoch_num: &epoch_num 200 log_smooth_window: 10 print_batch_step: 10 - save_model_dir: ./output/ser_layoutlmv2/ + save_model_dir: ./output/ser_layoutlmv2_xfund_zh/ save_epoch_step: 2000 # evaluation is run every 10 iterations after the 0th iteration eval_batch_step: [ 0, 19 ] @@ -12,7 +12,7 @@ Global: use_visualdl: False seed: 2022 infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg - save_res_path: ./output/ser/ + save_res_path: ./output/ser_layoutlmv2_xfund_zh/res/ Architecture: model_type: vqa diff --git a/configs/vqa/ser/layoutxlm_funsd.yml b/configs/vqa/ser/layoutxlm_funsd.yml new file mode 100644 index 0000000000000000000000000000000000000000..be1e9d4f1e986864fc57891d56b4a459df63fd78 --- /dev/null +++ b/configs/vqa/ser/layoutxlm_funsd.yml @@ -0,0 +1,123 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 200 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/ser_layoutxlm_funsd + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 57 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: train_data/FUNSD/testing_data/images/83624198.png + save_res_path: output/ser_layoutxlm_funsd/res/ + +Architecture: + model_type: vqa + algorithm: &algorithm "LayoutXLM" + Transform: + Backbone: + name: LayoutXLMForSer + pretrained: True + checkpoints: + num_classes: &num_classes 7 + +Loss: + name: VQASerTokenLayoutLMLoss + num_classes: *num_classes + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + lr: + name: Linear + learning_rate: 0.00005 + epochs: *epoch_num + warmup_epoch: 2 + regularizer: + name: L2 + factor: 0.00000 + +PostProcess: + name: VQASerTokenLayoutLMPostProcess + class_path: &class_path ./train_data/FUNSD/class_list.txt + +Metric: + name: VQASerTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/FUNSD/training_data/images/ + label_file_list: + - ./train_data/FUNSD/train.json + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will 
return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/FUNSD/testing_data/images/ + label_file_list: + - ./train_data/FUNSD/test.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 4 diff --git a/configs/vqa/ser/layoutxlm_sroie.yml b/configs/vqa/ser/layoutxlm_sroie.yml new file mode 100644 index 0000000000000000000000000000000000000000..dd63d888d0110d98314858c9189ebcefca16e8e2 --- /dev/null +++ b/configs/vqa/ser/layoutxlm_sroie.yml @@ -0,0 +1,123 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 200 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/ser_layoutxlm_sroie + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 200 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: train_data/SROIE/test/X00016469670.jpg + save_res_path: ./output/ser_layoutxlm_sroie/res/ + +Architecture: + model_type: vqa + algorithm: &algorithm "LayoutXLM" + Transform: + Backbone: + name: LayoutXLMForSer + pretrained: True + checkpoints: + num_classes: &num_classes 9 + +Loss: + name: VQASerTokenLayoutLMLoss + num_classes: *num_classes + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + lr: + name: Linear + learning_rate: 0.00005 + epochs: *epoch_num + warmup_epoch: 2 + regularizer: + name: L2 + factor: 0.00000 + +PostProcess: + name: VQASerTokenLayoutLMPostProcess + class_path: &class_path ./train_data/SROIE/class_list.txt + +Metric: + name: VQASerTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/SROIE/train + label_file_list: + - ./train_data/SROIE/train.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/SROIE/test + label_file_list: + - ./train_data/SROIE/test.txt + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label +
contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 4 diff --git a/configs/vqa/ser/layoutxlm_wildreceipt.yml b/configs/vqa/ser/layoutxlm_wildreceipt.yml new file mode 100644 index 0000000000000000000000000000000000000000..92c039429646f6a4bee6fb0d5dd94e142246b526 --- /dev/null +++ b/configs/vqa/ser/layoutxlm_wildreceipt.yml @@ -0,0 +1,123 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 100 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/ser_layoutxlm_wildreceipt + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 200 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: train_data//wildreceipt/image_files/Image_12/10/845be0dd6f5b04866a2042abd28d558032ef2576.jpeg + save_res_path: ./output/ser_layoutxlm_wildreceipt/res + +Architecture: + model_type: vqa + algorithm: &algorithm "LayoutXLM" + Transform: + Backbone: + name: LayoutXLMForSer + pretrained: True + checkpoints: + num_classes: &num_classes 51 + +Loss: + name: VQASerTokenLayoutLMLoss + num_classes: *num_classes + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + lr: + name: Linear + learning_rate: 0.00005 + epochs: *epoch_num + warmup_epoch: 2 + regularizer: + name: L2 + factor: 0.00000 + +PostProcess: + name: VQASerTokenLayoutLMPostProcess + class_path: &class_path ./train_data/wildreceipt/class_list.txt + +Metric: + name: VQASerTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/wildreceipt/ + label_file_list: + - ./train_data/wildreceipt/wildreceipt_train.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/wildreceipt + label_file_list: + - ./train_data/wildreceipt/wildreceipt_test.txt + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - 
ToCHWImage: + - KeepKeys: + # dataloader will return list in this order + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 4 diff --git a/configs/vqa/ser/layoutxlm.yml b/configs/vqa/ser/layoutxlm_xfund_zh.yml similarity index 96% rename from configs/vqa/ser/layoutxlm.yml rename to configs/vqa/ser/layoutxlm_xfund_zh.yml index fa9df192afbc1d638c220cba3ef3640715585b37..68df7d9f035bf1359951fb3a6fb30b47a929717a 100644 --- a/configs/vqa/ser/layoutxlm.yml +++ b/configs/vqa/ser/layoutxlm_xfund_zh.yml @@ -3,7 +3,7 @@ Global: epoch_num: &epoch_num 200 log_smooth_window: 10 print_batch_step: 10 - save_model_dir: ./output/ser_layoutxlm/ + save_model_dir: ./output/ser_layoutxlm_xfund_zh save_epoch_step: 2000 # evaluation is run every 10 iterations after the 0th iteration eval_batch_step: [ 0, 19 ] @@ -12,7 +12,7 @@ Global: use_visualdl: False seed: 2022 infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg - save_res_path: ./output/ser + save_res_path: ./output/ser_layoutxlm_xfund_zh/res Architecture: model_type: vqa diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index 7cc4cef46a8298001ac4089dbd5e32dbca009caf..a4087d53287fcd57f9c4992ba712c700f33b9981 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -869,6 +869,7 @@ class VQATokenLabelEncode(object): contains_re=False, add_special_ids=False, algorithm='LayoutXLM', + use_textline_bbox_info=True, infer_mode=False, ocr_engine=None, **kwargs): @@ -897,11 +898,51 @@ class VQATokenLabelEncode(object): self.add_special_ids = add_special_ids self.infer_mode = infer_mode self.ocr_engine = ocr_engine + self.use_textline_bbox_info = use_textline_bbox_info + + def split_bbox(self, bbox, text, tokenizer): + words = text.split() + token_bboxes = [] + curr_word_idx = 0 + x1, y1, x2, y2 = bbox + unit_w = (x2 - x1) / len(text) + for idx, word in enumerate(words): + curr_w = len(word) * unit_w + word_bbox = [x1, y1, x1 + curr_w, y2] + token_bboxes.extend([word_bbox] * len(tokenizer.tokenize(word))) + x1 += (len(word) + 1) * unit_w + return token_bboxes + + def filter_empty_contents(self, ocr_info): + """ + find out the empty texts and remove the links + """ + new_ocr_info = [] + empty_index = [] + for idx, info in enumerate(ocr_info): + if len(info["transcription"]) > 0: + new_ocr_info.append(copy.deepcopy(info)) + else: + empty_index.append(info["id"]) + + for idx, info in enumerate(new_ocr_info): + new_link = [] + for link in info["linking"]: + if link[0] in empty_index or link[1] in empty_index: + continue + new_link.append(link) + new_ocr_info[idx]["linking"] = new_link + return new_ocr_info def __call__(self, data): # load bbox and label info ocr_info = self._load_ocr_info(data) + # for re + train_re = self.contains_re and not self.infer_mode + if train_re: + ocr_info = self.filter_empty_contents(ocr_info) + height, width, _ = data['image'].shape words_list = [] @@ -913,8 +954,6 @@ class VQATokenLabelEncode(object): entities = [] - # for re - train_re = self.contains_re and not self.infer_mode if train_re: relations = [] id2label = {} @@ -924,18 +963,19 @@ class VQATokenLabelEncode(object): data['ocr_info'] = copy.deepcopy(ocr_info) for info in ocr_info: + text = info["transcription"] + if len(text) <= 0: + continue if train_re: # for re - if len(info["transcription"]) == 0: + if len(text) == 0: empty_entity.add(info["id"]) continue id2label[info["id"]] = info["label"] 
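+                # normalize each link to a sorted (head, tail) tuple so that reversed
+                # annotations of the same pair count as one relation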
relations.extend([tuple(sorted(l)) for l in info["linking"]]) # smooth_box info["bbox"] = self.trans_poly_to_bbox(info["points"]) - bbox = self._smooth_box(info["bbox"], height, width) - text = info["transcription"] encode_res = self.tokenizer.encode( text, pad_to_max_seq_len=False, return_attention_mask=True) @@ -946,6 +986,19 @@ class VQATokenLabelEncode(object): -1] encode_res["attention_mask"] = encode_res["attention_mask"][1: -1] + + if self.use_textline_bbox_info: + bbox = [info["bbox"]] * len(encode_res["input_ids"]) + else: + bbox = self.split_bbox(info["bbox"], info["transcription"], + self.tokenizer) + if len(bbox) <= 0: + continue + bbox = self._smooth_box(bbox, height, width) + if self.add_special_ids: + bbox.insert(0, [0, 0, 0, 0]) + bbox.append([0, 0, 0, 0]) + # parse label if not self.infer_mode: label = info['label'] @@ -970,7 +1023,7 @@ class VQATokenLabelEncode(object): }) input_ids_list.extend(encode_res["input_ids"]) token_type_ids_list.extend(encode_res["token_type_ids"]) - bbox_list.extend([bbox] * len(encode_res["input_ids"])) + bbox_list.extend(bbox) words_list.append(text) segment_offset_id.append(len(input_ids_list)) if not self.infer_mode: @@ -1019,12 +1072,14 @@ class VQATokenLabelEncode(object): info_dict = json.loads(info) return info_dict - def _smooth_box(self, bbox, height, width): - bbox[0] = int(bbox[0] * 1000.0 / width) - bbox[2] = int(bbox[2] * 1000.0 / width) - bbox[1] = int(bbox[1] * 1000.0 / height) - bbox[3] = int(bbox[3] * 1000.0 / height) - return bbox + def _smooth_box(self, bboxes, height, width): + bboxes = np.array(bboxes) + bboxes[:, 0] = bboxes[:, 0] * 1000 / width + bboxes[:, 2] = bboxes[:, 2] * 1000 / width + bboxes[:, 1] = bboxes[:, 1] * 1000 / height + bboxes[:, 3] = bboxes[:, 3] * 1000 / height + bboxes = bboxes.astype("int64").tolist() + return bboxes def _parse_label(self, label, encode_res): gt_label = [] diff --git a/ppocr/metrics/vqa_token_re_metric.py b/ppocr/metrics/vqa_token_re_metric.py index 8a13bc081298284194d365933cd67d5633957ee8..f84387d8beb729bcc4b420ceea24a5e9b2993c64 100644 --- a/ppocr/metrics/vqa_token_re_metric.py +++ b/ppocr/metrics/vqa_token_re_metric.py @@ -37,23 +37,26 @@ class VQAReTokenMetric(object): gt_relations = [] for b in range(len(self.relations_list)): rel_sent = [] - for head, tail in zip(self.relations_list[b]["head"], - self.relations_list[b]["tail"]): - rel = {} - rel["head_id"] = head - rel["head"] = (self.entities_list[b]["start"][rel["head_id"]], - self.entities_list[b]["end"][rel["head_id"]]) - rel["head_type"] = self.entities_list[b]["label"][rel[ - "head_id"]] - - rel["tail_id"] = tail - rel["tail"] = (self.entities_list[b]["start"][rel["tail_id"]], - self.entities_list[b]["end"][rel["tail_id"]]) - rel["tail_type"] = self.entities_list[b]["label"][rel[ - "tail_id"]] - - rel["type"] = 1 - rel_sent.append(rel) + if "head" in self.relations_list[b]: + for head, tail in zip(self.relations_list[b]["head"], + self.relations_list[b]["tail"]): + rel = {} + rel["head_id"] = head + rel["head"] = ( + self.entities_list[b]["start"][rel["head_id"]], + self.entities_list[b]["end"][rel["head_id"]]) + rel["head_type"] = self.entities_list[b]["label"][rel[ + "head_id"]] + + rel["tail_id"] = tail + rel["tail"] = ( + self.entities_list[b]["start"][rel["tail_id"]], + self.entities_list[b]["end"][rel["tail_id"]]) + rel["tail_type"] = self.entities_list[b]["label"][rel[ + "tail_id"]] + + rel["type"] = 1 + rel_sent.append(rel) gt_relations.append(rel_sent) re_metrics = self.re_score( self.pred_relations_list, 
gt_relations, mode="boundaries") diff --git a/ppocr/modeling/backbones/vqa_layoutlm.py b/ppocr/modeling/backbones/vqa_layoutlm.py index 2fd1b1b2a78a98dba1930378f4a06783aadd8834..34dd9d10ea36758059448d96674d4d2c249d3ad0 100644 --- a/ppocr/modeling/backbones/vqa_layoutlm.py +++ b/ppocr/modeling/backbones/vqa_layoutlm.py @@ -43,9 +43,11 @@ class NLPBaseModel(nn.Layer): super(NLPBaseModel, self).__init__() if checkpoints is not None: self.model = model_class.from_pretrained(checkpoints) + elif isinstance(pretrained, (str, )) and os.path.exists(pretrained): + self.model = model_class.from_pretrained(pretrained) else: pretrained_model_name = pretrained_model_dict[base_model_class] - if pretrained: + if pretrained is True: base_model = base_model_class.from_pretrained( pretrained_model_name) else: diff --git a/ppstructure/vqa/tools/trans_funsd_label.py b/ppstructure/vqa/tools/trans_funsd_label.py new file mode 100644 index 0000000000000000000000000000000000000000..ef7d1db010a925b37d285befe77aa202db2141d9 --- /dev/null +++ b/ppstructure/vqa/tools/trans_funsd_label.py @@ -0,0 +1,151 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import sys +import cv2 +import numpy as np +from copy import deepcopy + + +def trans_poly_to_bbox(poly): + x1 = np.min([p[0] for p in poly]) + x2 = np.max([p[0] for p in poly]) + y1 = np.min([p[1] for p in poly]) + y2 = np.max([p[1] for p in poly]) + return [x1, y1, x2, y2] + + +def get_outer_poly(bbox_list): + x1 = min([bbox[0] for bbox in bbox_list]) + y1 = min([bbox[1] for bbox in bbox_list]) + x2 = max([bbox[2] for bbox in bbox_list]) + y2 = max([bbox[3] for bbox in bbox_list]) + return [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] + + +def load_funsd_label(image_dir, anno_dir): + imgs = os.listdir(image_dir) + annos = os.listdir(anno_dir) + + imgs = [img.replace(".png", "") for img in imgs] + annos = [anno.replace(".json", "") for anno in annos] + + fn_info_map = dict() + for anno_fn in annos: + res = [] + with open(os.path.join(anno_dir, anno_fn + ".json"), "r") as fin: + infos = json.load(fin) + infos = infos["form"] + old_id2new_id_map = dict() + global_new_id = 0 + for info in infos: + if info["text"] is None: + continue + words = info["words"] + if len(words) <= 0: + continue + word_idx = 1 + curr_bboxes = [words[0]["box"]] + curr_texts = [words[0]["text"]] + while word_idx < len(words): + # switch to a new link + if words[word_idx]["box"][0] + 10 <= words[word_idx - 1][ + "box"][2]: + if len("".join(curr_texts[0])) > 0: + res.append({ + "transcription": " ".join(curr_texts), + "label": info["label"], + "points": get_outer_poly(curr_bboxes), + "linking": info["linking"], + "id": global_new_id, + }) + if info["id"] not in old_id2new_id_map: + old_id2new_id_map[info["id"]] = [] + old_id2new_id_map[info["id"]].append(global_new_id) + global_new_id += 1 + curr_bboxes = [words[word_idx]["box"]] + curr_texts = [words[word_idx]["text"]] + else: + 
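+                        # still on the same visual line: keep accumulating this word
+                        # into the current text chunk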
curr_bboxes.append(words[word_idx]["box"]) + curr_texts.append(words[word_idx]["text"]) + word_idx += 1 + if len("".join(curr_texts[0])) > 0: + res.append({ + "transcription": " ".join(curr_texts), + "label": info["label"], + "points": get_outer_poly(curr_bboxes), + "linking": info["linking"], + "id": global_new_id, + }) + if info["id"] not in old_id2new_id_map: + old_id2new_id_map[info["id"]] = [] + old_id2new_id_map[info["id"]].append(global_new_id) + global_new_id += 1 + # restore reading order: sort by (y, x), then bubble entries that sit on the + # same visual line (|dy| < 20) into left-to-right order + res = sorted( + res, key=lambda r: (r["points"][0][1], r["points"][0][0])) + for i in range(len(res) - 1): + # j must reach 0 so that res[0] also takes part in the swaps + for j in range(i, -1, -1): + if abs(res[j + 1]["points"][0][1] - res[j]["points"][0][1]) < 20 and \ + (res[j + 1]["points"][0][0] < res[j]["points"][0][0]): + tmp = deepcopy(res[j]) + res[j] = deepcopy(res[j + 1]) + res[j + 1] = deepcopy(tmp) + else: + break + # re-generate unique ids + for idx, r in enumerate(res): + new_links = [] + for link in r["linking"]: + # illegal links will be removed + if link[0] not in old_id2new_id_map or link[ + 1] not in old_id2new_id_map: + continue + for src in old_id2new_id_map[link[0]]: + for dst in old_id2new_id_map[link[1]]: + new_links.append([src, dst]) + res[idx]["linking"] = deepcopy(new_links) + + fn_info_map[anno_fn] = res + + return fn_info_map + + +def main(): + test_image_dir = "train_data/FUNSD/testing_data/images/" + test_anno_dir = "train_data/FUNSD/testing_data/annotations/" + test_output_dir = "train_data/FUNSD/test.json" + + fn_info_map = load_funsd_label(test_image_dir, test_anno_dir) + with open(test_output_dir, "w") as fout: + for fn in fn_info_map: + fout.write(fn + ".png" + "\t" + json.dumps( + fn_info_map[fn], ensure_ascii=False) + "\n") + + train_image_dir = "train_data/FUNSD/training_data/images/" + train_anno_dir = "train_data/FUNSD/training_data/annotations/" + train_output_dir = "train_data/FUNSD/train.json" + + fn_info_map = load_funsd_label(train_image_dir, train_anno_dir) + with open(train_output_dir, "w") as fout: + for fn in fn_info_map: + fout.write(fn + ".png" + "\t" + json.dumps( + fn_info_map[fn], ensure_ascii=False) + "\n") + print("FUNSD label conversion finished.") + + +if __name__ == "__main__": + main()
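
A minimal, self-contained sketch of what the new VQATokenLabelEncode.split_bbox computes when use_textline_bbox_info is False. The whitespace-based tokenize argument and the sample line are illustrative stand-ins; the real code passes the model's subword tokenizer.

def split_bbox(bbox, text, tokenize):
    # assume each character occupies an equal share of the line width, then give
    # every sub-token of a word that word's horizontal slice of the line bbox
    x1, y1, x2, y2 = bbox
    unit_w = (x2 - x1) / len(text)
    token_bboxes = []
    for word in text.split():
        curr_w = len(word) * unit_w
        word_bbox = [x1, y1, x1 + curr_w, y2]
        token_bboxes.extend([word_bbox] * len(tokenize(word)))
        # advance past the word plus one trailing space
        x1 += (len(word) + 1) * unit_w
    return token_bboxes

if __name__ == "__main__":
    # a text line "TOTAL 12.30" spanning x in [100, 210] and y in [40, 60]
    print(split_bbox([100, 40, 210, 60], "TOTAL 12.30", lambda w: [w]))
    # -> [[100, 40, 150.0, 60], [160.0, 40, 210.0, 60]]

Once the official FUNSD dataset sits under train_data/FUNSD/, running ppstructure/vqa/tools/trans_funsd_label.py from the repository root regenerates train.json and test.json in the single-line "image<TAB>json" label format that SimpleDataSet and VQATokenLabelEncode consume.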