Unverified commit 7a99588d authored by littletomatodonkey, committed by GitHub

add more dataset yamls and fix re exceptions (#6791)

Parent 5a0108b8
Global:
use_gpu: True
epoch_num: &epoch_num 200
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/re_layoutlmv2_funsd
save_epoch_step: 2000
# evaluation is run every 57 iterations after the 0th iteration
eval_batch_step: [ 0, 57 ]
cal_metric_during_train: False
save_inference_dir:
use_visualdl: False
seed: 2022
infer_img: train_data/FUNSD/testing_data/images/83624198.png
save_res_path: ./output/re_layoutlmv2_funsd/res/
Architecture:
model_type: vqa
algorithm: &algorithm "LayoutLMv2"
Transform:
Backbone:
name: LayoutLMv2ForRe
pretrained: True
checkpoints:
Loss:
name: LossFromOutput
key: loss
reduction: mean
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
clip_norm: 10
lr:
learning_rate: 0.00005
warmup_epoch: 10
regularizer:
name: L2
factor: 0.00000
PostProcess:
name: VQAReTokenLayoutLMPostProcess
Metric:
name: VQAReTokenMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/FUNSD/training_data/images/
label_file_list:
- ./train_data/FUNSD/train.json
ratio_list: [ 1.0 ]
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: True
algorithm: *algorithm
class_path: &class_path train_data/FUNSD/class_list.txt
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQAReTokenRelation:
- VQAReTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations']
loader:
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 8
collate_fn: ListCollator
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/FUNSD/testing_data/images/
label_file_list:
- ./train_data/FUNSD/test.json
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: True
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQAReTokenRelation:
- VQAReTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 8
collate_fn: ListCollator
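A typical way to exercise a config like the one above (the yaml's on-disk path is not shown on this page, so the command is a sketch) is python3 tools/train.py -c <path-to-this-yaml>, with tools/eval.py and -o overrides such as Architecture.Backbone.checkpoints=<ckpt> for evaluation. One non-obvious entry is collate_fn: ListCollator: entities and relations differ in length from sample to sample, so the batch must come back as per-field lists rather than stacked tensors. A minimal sketch of a list-style collator (illustrative; not necessarily PaddleOCR's exact implementation):

def list_collate(batch):
    # batch: list of samples, each an ordered sequence of the keep_keys fields;
    # transpose into per-field lists so variable-length fields survive batching
    return [list(field) for field in zip(*batch)]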
@@ -3,16 +3,16 @@ Global:
   epoch_num: &epoch_num 200
   log_smooth_window: 10
   print_batch_step: 10
-  save_model_dir: ./output/re_layoutlmv2/
+  save_model_dir: ./output/re_layoutlmv2_xfund_zh
   save_epoch_step: 2000
   # evaluation is run every 10 iterations after the 0th iteration
-  eval_batch_step: [ 0, 19 ]
+  eval_batch_step: [ 0, 57 ]
   cal_metric_during_train: False
   save_inference_dir:
   use_visualdl: False
   seed: 2048
   infer_img: ppstructure/docs/vqa/input/zh_val_21.jpg
-  save_res_path: ./output/re/
+  save_res_path: ./output/re_layoutlmv2_xfund_zh/res/

 Architecture:
   model_type: vqa
Global:
use_gpu: True
epoch_num: &epoch_num 200
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/re_layoutxlm_funsd
save_epoch_step: 2000
# evaluation is run every 57 iterations after the 0th iteration
eval_batch_step: [ 0, 57 ]
cal_metric_during_train: False
save_inference_dir:
use_visualdl: False
seed: 2022
infer_img: train_data/FUNSD/testing_data/images/83624198.png
save_res_path: ./output/re_layoutxlm_funsd/res/
Architecture:
model_type: vqa
algorithm: &algorithm "LayoutXLM"
Transform:
Backbone:
name: LayoutXLMForRe
pretrained: True
checkpoints:
Loss:
name: LossFromOutput
key: loss
reduction: mean
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
clip_norm: 10
lr:
learning_rate: 0.00005
warmup_epoch: 10
regularizer:
name: L2
factor: 0.00000
PostProcess:
name: VQAReTokenLayoutLMPostProcess
Metric:
name: VQAReTokenMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/FUNSD/training_data/images/
label_file_list:
- ./train_data/FUNSD/train_v4.json
# - ./train_data/FUNSD/train.json
ratio_list: [ 1.0 ]
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: True
algorithm: *algorithm
class_path: &class_path ./train_data/FUNSD/class_list.txt
use_textline_bbox_info: &use_textline_bbox_info True
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQAReTokenRelation:
- VQAReTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 16
collate_fn: ListCollator
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/FUNSD/testing_data/images/
label_file_list:
- ./train_data/FUNSD/test_v4.json
# - ./train_data/FUNSD/test.json
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: True
algorithm: *algorithm
class_path: *class_path
use_textline_bbox_info: *use_textline_bbox_info
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQAReTokenRelation:
- VQAReTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 8
collate_fn: ListCollator
Global:
use_gpu: True
epoch_num: &epoch_num 200
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/ser_layoutlm_funsd
save_epoch_step: 2000
# evaluation is run every 57 iterations after the 0th iteration
eval_batch_step: [ 0, 57 ]
cal_metric_during_train: False
save_inference_dir:
use_visualdl: False
seed: 2022
infer_img: train_data/FUNSD/testing_data/images/83624198.png
save_res_path: ./output/ser_layoutlm_funsd/res/
Architecture:
model_type: vqa
algorithm: &algorithm "LayoutLM"
Transform:
Backbone:
name: LayoutLMForSer
pretrained: True
checkpoints:
num_classes: &num_classes 7
Loss:
name: VQASerTokenLayoutLMLoss
num_classes: *num_classes
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
lr:
name: Linear
learning_rate: 0.00005
epochs: *epoch_num
warmup_epoch: 2
regularizer:
name: L2
factor: 0.00000
PostProcess:
name: VQASerTokenLayoutLMPostProcess
class_path: &class_path ./train_data/FUNSD/class_list.txt
Metric:
name: VQASerTokenMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/FUNSD/training_data/images/
label_file_list:
- ./train_data/FUNSD/train.json
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
use_textline_bbox_info: &use_textline_bbox_info True
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 4
Eval:
dataset:
name: SimpleDataSet
data_dir: train_data/FUNSD/testing_data/images/
label_file_list:
- ./train_data/FUNSD/test.json
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
use_textline_bbox_info: *use_textline_bbox_info
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 4
Global:
use_gpu: True
epoch_num: &epoch_num 200
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/ser_layoutlm_sroie
save_epoch_step: 2000
# evaluation is run every 200 iterations after the 0th iteration
eval_batch_step: [ 0, 200 ]
cal_metric_during_train: False
save_inference_dir:
use_visualdl: False
seed: 2022
infer_img: train_data/SROIE/test/X00016469670.jpg
save_res_path: ./output/ser_layoutlm_sroie/res/
Architecture:
model_type: vqa
algorithm: &algorithm "LayoutLM"
Transform:
Backbone:
name: LayoutLMForSer
pretrained: True
checkpoints:
num_classes: &num_classes 9
Loss:
name: VQASerTokenLayoutLMLoss
num_classes: *num_classes
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
lr:
name: Linear
learning_rate: 0.00005
epochs: *epoch_num
warmup_epoch: 2
regularizer:
name: L2
factor: 0.00000
PostProcess:
name: VQASerTokenLayoutLMPostProcess
class_path: &class_path ./train_data/SROIE/class_list.txt
Metric:
name: VQASerTokenMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/SROIE/train
label_file_list:
- ./train_data/SROIE/train.txt
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
use_textline_bbox_info: &use_textline_bbox_info True
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 4
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/SROIE/test
label_file_list:
- ./train_data/SROIE/test.txt
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
use_textline_bbox_info: *use_textline_bbox_info
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 4
@@ -3,16 +3,16 @@ Global:
   epoch_num: &epoch_num 200
   log_smooth_window: 10
   print_batch_step: 10
-  save_model_dir: ./output/ser_layoutlm/
+  save_model_dir: ./output/ser_layoutlm_xfund_zh
   save_epoch_step: 2000
   # evaluation is run every 10 iterations after the 0th iteration
-  eval_batch_step: [ 0, 19 ]
+  eval_batch_step: [ 0, 57 ]
   cal_metric_during_train: False
   save_inference_dir:
   use_visualdl: False
   seed: 2022
   infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg
-  save_res_path: ./output/ser/
+  save_res_path: ./output/ser_layoutlm_xfund_zh/res/

 Architecture:
   model_type: vqa
Global:
use_gpu: True
epoch_num: &epoch_num 200
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/ser_layoutlmv2_funsd
save_epoch_step: 2000
# evaluation is run every 100 iterations after the 0th iteration
eval_batch_step: [ 0, 100 ]
cal_metric_during_train: False
save_inference_dir:
use_visualdl: False
seed: 2022
infer_img: train_data/FUNSD/testing_data/images/83624198.png
save_res_path: ./output/ser_layoutlmv2_funsd/res/
Architecture:
model_type: vqa
algorithm: &algorithm "LayoutLMv2"
Transform:
Backbone:
name: LayoutLMv2ForSer
pretrained: True
checkpoints:
num_classes: &num_classes 7
Loss:
name: VQASerTokenLayoutLMLoss
num_classes: *num_classes
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
lr:
name: Linear
learning_rate: 0.00005
epochs: *epoch_num
warmup_epoch: 2
regularizer:
name: L2
factor: 0.00000
PostProcess:
name: VQASerTokenLayoutLMPostProcess
class_path: &class_path train_data/FUNSD/class_list.txt
Metric:
name: VQASerTokenMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/FUNSD/training_data/images/
label_file_list:
- ./train_data/FUNSD/train.json
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 4
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/FUNSD/testing_data/images/
label_file_list:
- ./train_data/FUNSD/test.json
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 4
Global:
use_gpu: True
epoch_num: &epoch_num 200
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/ser_layoutlmv2_sroie
save_epoch_step: 2000
# evaluation is run every 200 iterations after the 0th iteration
eval_batch_step: [ 0, 200 ]
cal_metric_during_train: False
save_inference_dir:
use_visualdl: False
seed: 2022
infer_img: train_data/SROIE/test/X00016469670.jpg
save_res_path: ./output/ser_layoutlmv2_sroie/res/
Architecture:
model_type: vqa
algorithm: &algorithm "LayoutLMv2"
Transform:
Backbone:
name: LayoutLMv2ForSer
pretrained: True
checkpoints:
num_classes: &num_classes 9
Loss:
name: VQASerTokenLayoutLMLoss
num_classes: *num_classes
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
lr:
name: Linear
learning_rate: 0.00005
epochs: *epoch_num
warmup_epoch: 2
regularizer:
name: L2
factor: 0.00000
PostProcess:
name: VQASerTokenLayoutLMPostProcess
class_path: &class_path ./train_data/SROIE/class_list.txt
Metric:
name: VQASerTokenMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/SROIE/train
label_file_list:
- ./train_data/SROIE/train.txt
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 4
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/SROIE/test
label_file_list:
- ./train_data/SROIE/test.txt
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 4
@@ -3,7 +3,7 @@ Global:
   epoch_num: &epoch_num 200
   log_smooth_window: 10
   print_batch_step: 10
-  save_model_dir: ./output/ser_layoutlmv2/
+  save_model_dir: ./output/ser_layoutlmv2_xfund_zh/
   save_epoch_step: 2000
   # evaluation is run every 10 iterations after the 0th iteration
   eval_batch_step: [ 0, 19 ]
@@ -12,7 +12,7 @@ Global:
   use_visualdl: False
   seed: 2022
   infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg
-  save_res_path: ./output/ser/
+  save_res_path: ./output/ser_layoutlmv2_xfund_zh/res/

 Architecture:
   model_type: vqa
Global:
use_gpu: True
epoch_num: &epoch_num 200
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/ser_layoutxlm_funsd
save_epoch_step: 2000
# evaluation is run every 57 iterations after the 0th iteration
eval_batch_step: [ 0, 57 ]
cal_metric_during_train: False
save_inference_dir:
use_visualdl: False
seed: 2022
infer_img: train_data/FUNSD/testing_data/images/83624198.png
save_res_path: output/ser_layoutxlm_funsd/res/
Architecture:
model_type: vqa
algorithm: &algorithm "LayoutXLM"
Transform:
Backbone:
name: LayoutXLMForSer
pretrained: True
checkpoints:
num_classes: &num_classes 7
Loss:
name: VQASerTokenLayoutLMLoss
num_classes: *num_classes
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
lr:
name: Linear
learning_rate: 0.00005
epochs: *epoch_num
warmup_epoch: 2
regularizer:
name: L2
factor: 0.00000
PostProcess:
name: VQASerTokenLayoutLMPostProcess
class_path: &class_path ./train_data/FUNSD/class_list.txt
Metric:
name: VQASerTokenMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/FUNSD/training_data/images/
label_file_list:
- ./train_data/FUNSD/train.json
ratio_list: [ 1.0 ]
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 4
Eval:
dataset:
name: SimpleDataSet
data_dir: train_data/FUNSD/testing_data/images/
label_file_list:
- ./train_data/FUNSD/test.json
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 4
Global:
use_gpu: True
epoch_num: &epoch_num 200
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/ser_layoutxlm_sroie
save_epoch_step: 2000
# evaluation is run every 200 iterations after the 0th iteration
eval_batch_step: [ 0, 200 ]
cal_metric_during_train: False
save_inference_dir:
use_visualdl: False
seed: 2022
infer_img: train_data/SROIE/test/X00016469670.jpg
save_res_path: ./output/ser_layoutxlm_sroie/res/
Architecture:
model_type: vqa
algorithm: &algorithm "LayoutXLM"
Transform:
Backbone:
name: LayoutXLMForSer
pretrained: True
checkpoints:
num_classes: &num_classes 9
Loss:
name: VQASerTokenLayoutLMLoss
num_classes: *num_classes
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
lr:
name: Linear
learning_rate: 0.00005
epochs: *epoch_num
warmup_epoch: 2
regularizer:
name: L2
factor: 0.00000
PostProcess:
name: VQASerTokenLayoutLMPostProcess
class_path: &class_path ./train_data/SROIE/class_list.txt
Metric:
name: VQASerTokenMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/SROIE/train
label_file_list:
- ./train_data/SROIE/train.txt
ratio_list: [ 1.0 ]
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 4
Eval:
dataset:
name: SimpleDataSet
data_dir: train_data/SROIE/test
label_file_list:
- ./train_data/SROIE/test.txt
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 4
Global:
use_gpu: True
epoch_num: &epoch_num 100
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/ser_layoutxlm_wildreceipt
save_epoch_step: 2000
# evaluation is run every 200 iterations after the 0th iteration
eval_batch_step: [ 0, 200 ]
cal_metric_during_train: False
save_inference_dir:
use_visualdl: False
seed: 2022
infer_img: train_data/wildreceipt/image_files/Image_12/10/845be0dd6f5b04866a2042abd28d558032ef2576.jpeg
save_res_path: ./output/ser_layoutxlm_wildreceipt/res
Architecture:
model_type: vqa
algorithm: &algorithm "LayoutXLM"
Transform:
Backbone:
name: LayoutXLMForSer
pretrained: True
checkpoints:
num_classes: &num_classes 51
Loss:
name: VQASerTokenLayoutLMLoss
num_classes: *num_classes
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
lr:
name: Linear
learning_rate: 0.00005
epochs: *epoch_num
warmup_epoch: 2
regularizer:
name: L2
factor: 0.00000
PostProcess:
name: VQASerTokenLayoutLMPostProcess
class_path: &class_path ./train_data/wildreceipt/class_list.txt
Metric:
name: VQASerTokenMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/wildreceipt/
label_file_list:
- ./train_data/wildreceipt/wildreceipt_train.txt
ratio_list: [ 1.0 ]
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 4
Eval:
dataset:
name: SimpleDataSet
data_dir: train_data/wildreceipt
label_file_list:
- ./train_data/wildreceipt/wildreceipt_test.txt
transforms:
- DecodeImage: # load image
img_mode: RGB
channel_first: False
- VQATokenLabelEncode: # Class handling label
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- Resize:
size: [224,224]
- NormalizeImage:
scale: 1
mean: [ 123.675, 116.28, 103.53 ]
std: [ 58.395, 57.12, 57.375 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
# dataloader will return list in this order
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 8
num_workers: 4
@@ -3,7 +3,7 @@ Global:
   epoch_num: &epoch_num 200
   log_smooth_window: 10
   print_batch_step: 10
-  save_model_dir: ./output/ser_layoutxlm/
+  save_model_dir: ./output/ser_layoutxlm_xfund_zh
   save_epoch_step: 2000
   # evaluation is run every 10 iterations after the 0th iteration
   eval_batch_step: [ 0, 19 ]
@@ -12,7 +12,7 @@ Global:
   use_visualdl: False
   seed: 2022
   infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg
-  save_res_path: ./output/ser
+  save_res_path: ./output/ser_layoutxlm_xfund_zh/res

 Architecture:
   model_type: vqa
@@ -869,6 +869,7 @@ class VQATokenLabelEncode(object):
                  contains_re=False,
                  add_special_ids=False,
                  algorithm='LayoutXLM',
+                 use_textline_bbox_info=True,
                  infer_mode=False,
                  ocr_engine=None,
                  **kwargs):
@@ -897,11 +898,51 @@ class VQATokenLabelEncode(object):
         self.add_special_ids = add_special_ids
         self.infer_mode = infer_mode
         self.ocr_engine = ocr_engine
+        self.use_textline_bbox_info = use_textline_bbox_info
+
+    def split_bbox(self, bbox, text, tokenizer):
+        words = text.split()
+        token_bboxes = []
+        curr_word_idx = 0
+        x1, y1, x2, y2 = bbox
+        unit_w = (x2 - x1) / len(text)
+        for idx, word in enumerate(words):
+            curr_w = len(word) * unit_w
+            word_bbox = [x1, y1, x1 + curr_w, y2]
+            token_bboxes.extend([word_bbox] * len(tokenizer.tokenize(word)))
+            x1 += (len(word) + 1) * unit_w
+        return token_bboxes
+
+    def filter_empty_contents(self, ocr_info):
+        """
+        find out the empty texts and remove the links
+        """
+        new_ocr_info = []
+        empty_index = []
+        for idx, info in enumerate(ocr_info):
+            if len(info["transcription"]) > 0:
+                new_ocr_info.append(copy.deepcopy(info))
+            else:
+                empty_index.append(info["id"])
+
+        for idx, info in enumerate(new_ocr_info):
+            new_link = []
+            for link in info["linking"]:
+                if link[0] in empty_index or link[1] in empty_index:
+                    continue
+                new_link.append(link)
+            new_ocr_info[idx]["linking"] = new_link
+        return new_ocr_info
+
     def __call__(self, data):
         # load bbox and label info
         ocr_info = self._load_ocr_info(data)

+        # for re
+        train_re = self.contains_re and not self.infer_mode
+        if train_re:
+            ocr_info = self.filter_empty_contents(ocr_info)
+
         height, width, _ = data['image'].shape

         words_list = []
@@ -913,8 +954,6 @@ class VQATokenLabelEncode(object):
         entities = []

-        # for re
-        train_re = self.contains_re and not self.infer_mode
         if train_re:
             relations = []
             id2label = {}
@@ -924,18 +963,19 @@ class VQATokenLabelEncode(object):
         data['ocr_info'] = copy.deepcopy(ocr_info)

         for info in ocr_info:
+            text = info["transcription"]
+            if len(text) <= 0:
+                continue
+
             if train_re:
                 # for re
-                if len(info["transcription"]) == 0:
+                if len(text) == 0:
                     empty_entity.add(info["id"])
                     continue
                 id2label[info["id"]] = info["label"]
                 relations.extend([tuple(sorted(l)) for l in info["linking"]])
             # smooth_box
             info["bbox"] = self.trans_poly_to_bbox(info["points"])
-            bbox = self._smooth_box(info["bbox"], height, width)

-            text = info["transcription"]
             encode_res = self.tokenizer.encode(
                 text, pad_to_max_seq_len=False, return_attention_mask=True)
@@ -946,6 +986,19 @@ class VQATokenLabelEncode(object):
                     -1]
                 encode_res["attention_mask"] = encode_res["attention_mask"][1:
                     -1]
+
+            if self.use_textline_bbox_info:
+                bbox = [info["bbox"]] * len(encode_res["input_ids"])
+            else:
+                bbox = self.split_bbox(info["bbox"], info["transcription"],
+                                       self.tokenizer)
+            if len(bbox) <= 0:
+                continue
+            bbox = self._smooth_box(bbox, height, width)
+            if self.add_special_ids:
+                bbox.insert(0, [0, 0, 0, 0])
+                bbox.append([0, 0, 0, 0])
+
             # parse label
             if not self.infer_mode:
                 label = info['label']
@@ -970,7 +1023,7 @@ class VQATokenLabelEncode(object):
                 })
             input_ids_list.extend(encode_res["input_ids"])
             token_type_ids_list.extend(encode_res["token_type_ids"])
-            bbox_list.extend([bbox] * len(encode_res["input_ids"]))
+            bbox_list.extend(bbox)
             words_list.append(text)
             segment_offset_id.append(len(input_ids_list))
             if not self.infer_mode:
@@ -1019,12 +1072,14 @@ class VQATokenLabelEncode(object):
             info_dict = json.loads(info)
         return info_dict

-    def _smooth_box(self, bbox, height, width):
-        bbox[0] = int(bbox[0] * 1000.0 / width)
-        bbox[2] = int(bbox[2] * 1000.0 / width)
-        bbox[1] = int(bbox[1] * 1000.0 / height)
-        bbox[3] = int(bbox[3] * 1000.0 / height)
-        return bbox
+    def _smooth_box(self, bboxes, height, width):
+        bboxes = np.array(bboxes)
+        bboxes[:, 0] = bboxes[:, 0] * 1000 / width
+        bboxes[:, 2] = bboxes[:, 2] * 1000 / width
+        bboxes[:, 1] = bboxes[:, 1] * 1000 / height
+        bboxes[:, 3] = bboxes[:, 3] * 1000 / height
+        bboxes = bboxes.astype("int64").tolist()
+        return bboxes

     def _parse_label(self, label, encode_res):
         gt_label = []
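Two behaviors are worth unpacking here. First, the new split_bbox apportions a text line's box across its sub-word tokens by character count: each word receives a slice of width len(word) * (x2 - x1) / len(text), duplicated once per token the tokenizer produces for it, and the cursor then skips one extra character-width for the inter-word space. A self-contained sketch of the same arithmetic, with a toy tokenizer standing in for the real sub-word tokenizer (an assumption for illustration):

def split_bbox(bbox, text, tokenizer):
    # apportion one text-line box over its tokens, proportional to character counts
    words = text.split()
    token_bboxes = []
    x1, y1, x2, y2 = bbox
    unit_w = (x2 - x1) / len(text)  # width of one character
    for word in words:
        curr_w = len(word) * unit_w
        token_bboxes.extend([[x1, y1, x1 + curr_w, y2]] * len(tokenizer.tokenize(word)))
        x1 += (len(word) + 1) * unit_w  # advance past the word plus one space
    return token_bboxes

class ToyTokenizer:
    # assumption: a stand-in that splits words longer than 3 chars into two sub-words
    def tokenize(self, word):
        return [word[:3], word[3:]] if len(word) > 3 else [word]

print(split_bbox([0, 0, 110, 20], "hello world", ToyTokenizer()))
# -> [[0, 0, 50.0, 20], [0, 0, 50.0, 20], [60.0, 0, 110.0, 20], [60.0, 0, 110.0, 20]]

Second, the reworked _smooth_box now takes the whole per-token box list at once and, as before, scales pixel coordinates into the 0-1000 range that LayoutLM-family models expect for bbox inputs.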
@@ -37,23 +37,26 @@ class VQAReTokenMetric(object):
         gt_relations = []
         for b in range(len(self.relations_list)):
             rel_sent = []
-            for head, tail in zip(self.relations_list[b]["head"],
-                                  self.relations_list[b]["tail"]):
-                rel = {}
-                rel["head_id"] = head
-                rel["head"] = (self.entities_list[b]["start"][rel["head_id"]],
-                               self.entities_list[b]["end"][rel["head_id"]])
-                rel["head_type"] = self.entities_list[b]["label"][rel[
-                    "head_id"]]
-
-                rel["tail_id"] = tail
-                rel["tail"] = (self.entities_list[b]["start"][rel["tail_id"]],
-                               self.entities_list[b]["end"][rel["tail_id"]])
-                rel["tail_type"] = self.entities_list[b]["label"][rel[
-                    "tail_id"]]
-
-                rel["type"] = 1
-                rel_sent.append(rel)
+            if "head" in self.relations_list[b]:
+                for head, tail in zip(self.relations_list[b]["head"],
+                                      self.relations_list[b]["tail"]):
+                    rel = {}
+                    rel["head_id"] = head
+                    rel["head"] = (
+                        self.entities_list[b]["start"][rel["head_id"]],
+                        self.entities_list[b]["end"][rel["head_id"]])
+                    rel["head_type"] = self.entities_list[b]["label"][rel[
+                        "head_id"]]
+
+                    rel["tail_id"] = tail
+                    rel["tail"] = (
+                        self.entities_list[b]["start"][rel["tail_id"]],
+                        self.entities_list[b]["end"][rel["tail_id"]])
+                    rel["tail_type"] = self.entities_list[b]["label"][rel[
+                        "tail_id"]]
+
+                    rel["type"] = 1
+                    rel_sent.append(rel)
             gt_relations.append(rel_sent)
         re_metrics = self.re_score(
             self.pred_relations_list, gt_relations, mode="boundaries")
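This guard is one of the "fix re exceptions" changes from the commit title: my reading of the diff is that a sample whose relations did not survive chunking is represented by a dict without the "head"/"tail" keys, so the old unconditional indexing raised a KeyError during evaluation. A minimal reproduction of the failure mode, using the names from the diff above:

relations_list = [{"head": [0], "tail": [1]}, {}]  # second sample: no relations
for b in range(len(relations_list)):
    rel_sent = []
    if "head" in relations_list[b]:  # the added guard; without it, {}["head"] raises KeyError
        for head, tail in zip(relations_list[b]["head"], relations_list[b]["tail"]):
            rel_sent.append((head, tail))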
@@ -43,9 +43,11 @@ class NLPBaseModel(nn.Layer):
         super(NLPBaseModel, self).__init__()
         if checkpoints is not None:
             self.model = model_class.from_pretrained(checkpoints)
+        elif isinstance(pretrained, (str, )) and os.path.exists(pretrained):
+            self.model = model_class.from_pretrained(pretrained)
         else:
             pretrained_model_name = pretrained_model_dict[base_model_class]
-            if pretrained:
+            if pretrained is True:
                 base_model = base_model_class.from_pretrained(
                     pretrained_model_name)
             else:
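With this change, Backbone.pretrained is no longer treated as a plain boolean: a string that points to an existing local path now loads weights from that path, while pretrained: True keeps the download-by-name behavior. Assuming locally downloaded weights (the path below is hypothetical), a config could point at them directly:

Backbone:
  name: LayoutXLMForSer
  pretrained: ./pretrain_models/layoutxlm-base  # hypothetical local path; True downloads the default weights
  checkpoints: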
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
import cv2
import numpy as np
from copy import deepcopy
def trans_poly_to_bbox(poly):
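    # axis-aligned bounding box [x1, y1, x2, y2] of a polygon given as a list of points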
x1 = np.min([p[0] for p in poly])
x2 = np.max([p[0] for p in poly])
y1 = np.min([p[1] for p in poly])
y2 = np.max([p[1] for p in poly])
return [x1, y1, x2, y2]
def get_outer_poly(bbox_list):
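    # outer quadrilateral covering every [x1, y1, x2, y2] box in the list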
x1 = min([bbox[0] for bbox in bbox_list])
y1 = min([bbox[1] for bbox in bbox_list])
x2 = max([bbox[2] for bbox in bbox_list])
y2 = max([bbox[3] for bbox in bbox_list])
return [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
def load_funsd_label(image_dir, anno_dir):
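    # convert raw FUNSD annotations into one PaddleOCR-style record list per image,
    # merging word-level boxes into line-level segments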
imgs = os.listdir(image_dir)
annos = os.listdir(anno_dir)
imgs = [img.replace(".png", "") for img in imgs]
annos = [anno.replace(".json", "") for anno in annos]
fn_info_map = dict()
for anno_fn in annos:
res = []
with open(os.path.join(anno_dir, anno_fn + ".json"), "r") as fin:
infos = json.load(fin)
infos = infos["form"]
old_id2new_id_map = dict()
global_new_id = 0
for info in infos:
if info["text"] is None:
continue
words = info["words"]
if len(words) <= 0:
continue
word_idx = 1
curr_bboxes = [words[0]["box"]]
curr_texts = [words[0]["text"]]
while word_idx < len(words):
                    # the word starts left of the previous word's right edge:
                    # it wraps to a new line, so close the current segment
if words[word_idx]["box"][0] + 10 <= words[word_idx - 1][
"box"][2]:
if len("".join(curr_texts[0])) > 0:
res.append({
"transcription": " ".join(curr_texts),
"label": info["label"],
"points": get_outer_poly(curr_bboxes),
"linking": info["linking"],
"id": global_new_id,
})
if info["id"] not in old_id2new_id_map:
old_id2new_id_map[info["id"]] = []
old_id2new_id_map[info["id"]].append(global_new_id)
global_new_id += 1
curr_bboxes = [words[word_idx]["box"]]
curr_texts = [words[word_idx]["text"]]
else:
curr_bboxes.append(words[word_idx]["box"])
curr_texts.append(words[word_idx]["text"])
word_idx += 1
if len("".join(curr_texts[0])) > 0:
res.append({
"transcription": " ".join(curr_texts),
"label": info["label"],
"points": get_outer_poly(curr_bboxes),
"linking": info["linking"],
"id": global_new_id,
})
if info["id"] not in old_id2new_id_map:
old_id2new_id_map[info["id"]] = []
old_id2new_id_map[info["id"]].append(global_new_id)
global_new_id += 1
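        # sort segments top-to-bottom, then bubble left-to-right within a ~20px row band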
res = sorted(
res, key=lambda r: (r["points"][0][1], r["points"][0][0]))
for i in range(len(res) - 1):
for j in range(i, 0, -1):
if abs(res[j + 1]["points"][0][1] - res[j]["points"][0][1]) < 20 and \
(res[j + 1]["points"][0][0] < res[j]["points"][0][0]):
tmp = deepcopy(res[j])
res[j] = deepcopy(res[j + 1])
res[j + 1] = deepcopy(tmp)
else:
break
# re-generate unique ids
for idx, r in enumerate(res):
new_links = []
for link in r["linking"]:
# illegal links will be removed
if link[0] not in old_id2new_id_map or link[
1] not in old_id2new_id_map:
continue
for src in old_id2new_id_map[link[0]]:
for dst in old_id2new_id_map[link[1]]:
new_links.append([src, dst])
res[idx]["linking"] = deepcopy(new_links)
fn_info_map[anno_fn] = res
return fn_info_map
def main():
test_image_dir = "train_data/FUNSD/testing_data/images/"
test_anno_dir = "train_data/FUNSD/testing_data/annotations/"
test_output_dir = "train_data/FUNSD/test.json"
fn_info_map = load_funsd_label(test_image_dir, test_anno_dir)
with open(test_output_dir, "w") as fout:
for fn in fn_info_map:
fout.write(fn + ".png" + "\t" + json.dumps(
fn_info_map[fn], ensure_ascii=False) + "\n")
train_image_dir = "train_data/FUNSD/training_data/images/"
train_anno_dir = "train_data/FUNSD/training_data/annotations/"
train_output_dir = "train_data/FUNSD/train.json"
fn_info_map = load_funsd_label(train_image_dir, train_anno_dir)
with open(train_output_dir, "w") as fout:
for fn in fn_info_map:
fout.write(fn + ".png" + "\t" + json.dumps(
fn_info_map[fn], ensure_ascii=False) + "\n")
print("====ok====")
return
if __name__ == "__main__":
main()
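The script above is meant to be run from the PaddleOCR root after unpacking the raw FUNSD release under train_data/FUNSD/ (its input and output paths are hard-coded in main). It merges FUNSD's word-level boxes into line-level segments, rewrites entity ids and links accordingly, and emits train.json / test.json in the tab-separated "image_name\tjson_annotation" format that the SimpleDataSet configs above read. Assuming the file is saved as trans_funsd_label.py (a hypothetical name), it runs with:

python3 trans_funsd_label.py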