Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
weixin_41840029
PaddleOCR
提交
7a99588d
P
PaddleOCR
项目概览
weixin_41840029
/
PaddleOCR
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleOCR
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleOCR
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7a99588d
编写于
7月 06, 2022
作者:
littletomatodonkey
提交者:
GitHub
7月 06, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add more dataset yamls and fix re exceptions (#6791)
* add more dataset yamls and fix re exceptions
上级
5a0108b8
变更
18
显示空白变更内容
内联
并排
Showing
18 changed file
with
1368 addition
and
40 deletion
+1368
-40
configs/vqa/re/layoutlmv2_funsd.yml
configs/vqa/re/layoutlmv2_funsd.yml
+125
-0
configs/vqa/re/layoutlmv2_xund_zh.yml
configs/vqa/re/layoutlmv2_xund_zh.yml
+3
-3
configs/vqa/re/layoutxlm_funsd.yml
configs/vqa/re/layoutxlm_funsd.yml
+129
-0
configs/vqa/re/layoutxlm_xfund_zh.yml
configs/vqa/re/layoutxlm_xfund_zh.yml
+0
-0
configs/vqa/ser/layoutlm_funsd.yml
configs/vqa/ser/layoutlm_funsd.yml
+124
-0
configs/vqa/ser/layoutlm_sroie.yml
configs/vqa/ser/layoutlm_sroie.yml
+124
-0
configs/vqa/ser/layoutlm_xfund_zh.yml
configs/vqa/ser/layoutlm_xfund_zh.yml
+3
-3
configs/vqa/ser/layoutlmv2_funsd.yml
configs/vqa/ser/layoutlmv2_funsd.yml
+123
-0
configs/vqa/ser/layoutlmv2_sroie.yml
configs/vqa/ser/layoutlmv2_sroie.yml
+123
-0
configs/vqa/ser/layoutlmv2_xfund_zh.yml
configs/vqa/ser/layoutlmv2_xfund_zh.yml
+2
-2
configs/vqa/ser/layoutxlm_funsd.yml
configs/vqa/ser/layoutxlm_funsd.yml
+123
-0
configs/vqa/ser/layoutxlm_sroie.yml
configs/vqa/ser/layoutxlm_sroie.yml
+123
-0
configs/vqa/ser/layoutxlm_wildreceipt.yml
configs/vqa/ser/layoutxlm_wildreceipt.yml
+123
-0
configs/vqa/ser/layoutxlm_xfund_zh.yml
configs/vqa/ser/layoutxlm_xfund_zh.yml
+2
-2
ppocr/data/imaug/label_ops.py
ppocr/data/imaug/label_ops.py
+67
-12
ppocr/metrics/vqa_token_re_metric.py
ppocr/metrics/vqa_token_re_metric.py
+20
-17
ppocr/modeling/backbones/vqa_layoutlm.py
ppocr/modeling/backbones/vqa_layoutlm.py
+3
-1
ppstructure/vqa/tools/trans_funsd_label.py
ppstructure/vqa/tools/trans_funsd_label.py
+151
-0
未找到文件。
configs/vqa/re/layoutlmv2_funsd.yml
0 → 100644
浏览文件 @
7a99588d
Global
:
use_gpu
:
True
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
print_batch_step
:
10
save_model_dir
:
./output/re_layoutlmv2_funsd
save_epoch_step
:
2000
# evaluation is run every 57 iterations after the 0th iteration
eval_batch_step
:
[
0
,
57
]
cal_metric_during_train
:
False
save_inference_dir
:
use_visualdl
:
False
seed
:
2022
infer_img
:
train_data/FUNSD/testing_data/images/83624198.png
save_res_path
:
./output/re_layoutlmv2_funsd/res/
Architecture
:
model_type
:
vqa
algorithm
:
&algorithm
"
LayoutLMv2"
Transform
:
Backbone
:
name
:
LayoutLMv2ForRe
pretrained
:
True
checkpoints
:
Loss
:
name
:
LossFromOutput
key
:
loss
reduction
:
mean
Optimizer
:
name
:
AdamW
beta1
:
0.9
beta2
:
0.999
clip_norm
:
10
lr
:
learning_rate
:
0.00005
warmup_epoch
:
10
regularizer
:
name
:
L2
factor
:
0.00000
PostProcess
:
name
:
VQAReTokenLayoutLMPostProcess
Metric
:
name
:
VQAReTokenMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/FUNSD/training_data/images/
label_file_list
:
-
./train_data/FUNSD/train.json
ratio_list
:
[
1.0
]
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
True
algorithm
:
*algorithm
class_path
:
&class_path
train_data/FUNSD/class_list.txt
-
VQATokenPad
:
max_seq_len
:
&max_seq_len
512
return_attention_mask
:
True
-
VQAReTokenRelation
:
-
VQAReTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1./255.
mean
:
[
0.485
,
0.456
,
0.406
]
std
:
[
0.229
,
0.224
,
0.225
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
entities'
,
'
relations'
]
loader
:
shuffle
:
True
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
8
collate_fn
:
ListCollator
Eval
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/FUNSD/testing_data/images/
label_file_list
:
-
./train_data/FUNSD/test.json
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
True
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
*max_seq_len
return_attention_mask
:
True
-
VQAReTokenRelation
:
-
VQAReTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1./255.
mean
:
[
0.485
,
0.456
,
0.406
]
std
:
[
0.229
,
0.224
,
0.225
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
entities'
,
'
relations'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
8
collate_fn
:
ListCollator
configs/vqa/re/layoutlmv2.yml
→
configs/vqa/re/layoutlmv2
_xund_zh
.yml
浏览文件 @
7a99588d
...
@@ -3,16 +3,16 @@ Global:
...
@@ -3,16 +3,16 @@ Global:
epoch_num
:
&epoch_num
200
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
log_smooth_window
:
10
print_batch_step
:
10
print_batch_step
:
10
save_model_dir
:
./output/re_layoutlmv2
/
save_model_dir
:
./output/re_layoutlmv2
_xfund_zh
save_epoch_step
:
2000
save_epoch_step
:
2000
# evaluation is run every 10 iterations after the 0th iteration
# evaluation is run every 10 iterations after the 0th iteration
eval_batch_step
:
[
0
,
19
]
eval_batch_step
:
[
0
,
57
]
cal_metric_during_train
:
False
cal_metric_during_train
:
False
save_inference_dir
:
save_inference_dir
:
use_visualdl
:
False
use_visualdl
:
False
seed
:
2048
seed
:
2048
infer_img
:
ppstructure/docs/vqa/input/zh_val_21.jpg
infer_img
:
ppstructure/docs/vqa/input/zh_val_21.jpg
save_res_path
:
./output/re/
save_res_path
:
./output/re
_layoutlmv2_xfund_zh/res
/
Architecture
:
Architecture
:
model_type
:
vqa
model_type
:
vqa
...
...
configs/vqa/re/layoutxlm_funsd.yml
0 → 100644
浏览文件 @
7a99588d
Global
:
use_gpu
:
True
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
print_batch_step
:
10
save_model_dir
:
./output/re_layoutxlm_funsd
save_epoch_step
:
2000
# evaluation is run every 57 iterations after the 0th iteration
eval_batch_step
:
[
0
,
57
]
cal_metric_during_train
:
False
save_inference_dir
:
use_visualdl
:
False
seed
:
2022
infer_img
:
train_data/FUNSD/testing_data/images/83624198.png
save_res_path
:
./output/re_layoutxlm_funsd/res/
Architecture
:
model_type
:
vqa
algorithm
:
&algorithm
"
LayoutXLM"
Transform
:
Backbone
:
name
:
LayoutXLMForRe
pretrained
:
True
checkpoints
:
Loss
:
name
:
LossFromOutput
key
:
loss
reduction
:
mean
Optimizer
:
name
:
AdamW
beta1
:
0.9
beta2
:
0.999
clip_norm
:
10
lr
:
learning_rate
:
0.00005
warmup_epoch
:
10
regularizer
:
name
:
L2
factor
:
0.00000
PostProcess
:
name
:
VQAReTokenLayoutLMPostProcess
Metric
:
name
:
VQAReTokenMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/FUNSD/training_data/images/
label_file_list
:
-
./train_data/FUNSD/train_v4.json
# - ./train_data/FUNSD/train.json
ratio_list
:
[
1.0
]
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
True
algorithm
:
*algorithm
class_path
:
&class_path
./train_data/FUNSD/class_list.txt
use_textline_bbox_info
:
&use_textline_bbox_info
True
-
VQATokenPad
:
max_seq_len
:
&max_seq_len
512
return_attention_mask
:
True
-
VQAReTokenRelation
:
-
VQAReTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
entities'
,
'
relations'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
16
collate_fn
:
ListCollator
Eval
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/FUNSD/testing_data/images/
label_file_list
:
-
./train_data/FUNSD/test_v4.json
# - ./train_data/FUNSD/test.json
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
True
algorithm
:
*algorithm
class_path
:
*class_path
use_textline_bbox_info
:
*use_textline_bbox_info
-
VQATokenPad
:
max_seq_len
:
*max_seq_len
return_attention_mask
:
True
-
VQAReTokenRelation
:
-
VQAReTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
entities'
,
'
relations'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
8
collate_fn
:
ListCollator
configs/vqa/re/layoutxlm.yml
→
configs/vqa/re/layoutxlm
_xfund_zh
.yml
浏览文件 @
7a99588d
文件已移动
configs/vqa/ser/layoutlm_funsd.yml
0 → 100644
浏览文件 @
7a99588d
Global
:
use_gpu
:
True
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutlm_funsd
save_epoch_step
:
2000
# evaluation is run every 57 iterations after the 0th iteration
eval_batch_step
:
[
0
,
57
]
cal_metric_during_train
:
False
save_inference_dir
:
use_visualdl
:
False
seed
:
2022
infer_img
:
train_data/FUNSD/testing_data/images/83624198.png
save_res_path
:
./output/ser_layoutlm_funsd/res/
Architecture
:
model_type
:
vqa
algorithm
:
&algorithm
"
LayoutLM"
Transform
:
Backbone
:
name
:
LayoutLMForSer
pretrained
:
True
checkpoints
:
num_classes
:
&num_classes
7
Loss
:
name
:
VQASerTokenLayoutLMLoss
num_classes
:
*num_classes
Optimizer
:
name
:
AdamW
beta1
:
0.9
beta2
:
0.999
lr
:
name
:
Linear
learning_rate
:
0.00005
epochs
:
*epoch_num
warmup_epoch
:
2
regularizer
:
name
:
L2
factor
:
0.00000
PostProcess
:
name
:
VQASerTokenLayoutLMPostProcess
class_path
:
&class_path
./train_data/FUNSD/class_list.txt
Metric
:
name
:
VQASerTokenMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/FUNSD/training_data/images/
label_file_list
:
-
./train_data/FUNSD/train.json
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
use_textline_bbox_info
:
&use_textline_bbox_info
True
-
VQATokenPad
:
max_seq_len
:
&max_seq_len
512
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
True
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
Eval
:
dataset
:
name
:
SimpleDataSet
data_dir
:
train_data/FUNSD/testing_data/images/
label_file_list
:
-
./train_data/FUNSD/test.json
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
use_textline_bbox_info
:
*use_textline_bbox_info
-
VQATokenPad
:
max_seq_len
:
*max_seq_len
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
configs/vqa/ser/layoutlm_sroie.yml
0 → 100644
浏览文件 @
7a99588d
Global
:
use_gpu
:
True
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutlm_sroie
save_epoch_step
:
2000
# evaluation is run every 200 iterations after the 0th iteration
eval_batch_step
:
[
0
,
200
]
cal_metric_during_train
:
False
save_inference_dir
:
use_visualdl
:
False
seed
:
2022
infer_img
:
train_data/SROIE/test/X00016469670.jpg
save_res_path
:
./output/ser_layoutlm_sroie/res/
Architecture
:
model_type
:
vqa
algorithm
:
&algorithm
"
LayoutLM"
Transform
:
Backbone
:
name
:
LayoutLMForSer
pretrained
:
True
checkpoints
:
num_classes
:
&num_classes
9
Loss
:
name
:
VQASerTokenLayoutLMLoss
num_classes
:
*num_classes
Optimizer
:
name
:
AdamW
beta1
:
0.9
beta2
:
0.999
lr
:
name
:
Linear
learning_rate
:
0.00005
epochs
:
*epoch_num
warmup_epoch
:
2
regularizer
:
name
:
L2
factor
:
0.00000
PostProcess
:
name
:
VQASerTokenLayoutLMPostProcess
class_path
:
&class_path
./train_data/SROIE/class_list.txt
Metric
:
name
:
VQASerTokenMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/SROIE/train
label_file_list
:
-
./train_data/SROIE/train.txt
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
use_textline_bbox_info
:
&use_textline_bbox_info
True
-
VQATokenPad
:
max_seq_len
:
&max_seq_len
512
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
True
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
Eval
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/SROIE/test
label_file_list
:
-
./train_data/SROIE/test.txt
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
use_textline_bbox_info
:
*use_textline_bbox_info
-
VQATokenPad
:
max_seq_len
:
*max_seq_len
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
configs/vqa/ser/layoutlm.yml
→
configs/vqa/ser/layoutlm
_xfund_zh
.yml
浏览文件 @
7a99588d
...
@@ -3,16 +3,16 @@ Global:
...
@@ -3,16 +3,16 @@ Global:
epoch_num
:
&epoch_num
200
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
log_smooth_window
:
10
print_batch_step
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutlm
/
save_model_dir
:
./output/ser_layoutlm
_xfund_zh
save_epoch_step
:
2000
save_epoch_step
:
2000
# evaluation is run every 10 iterations after the 0th iteration
# evaluation is run every 10 iterations after the 0th iteration
eval_batch_step
:
[
0
,
19
]
eval_batch_step
:
[
0
,
57
]
cal_metric_during_train
:
False
cal_metric_during_train
:
False
save_inference_dir
:
save_inference_dir
:
use_visualdl
:
False
use_visualdl
:
False
seed
:
2022
seed
:
2022
infer_img
:
ppstructure/docs/vqa/input/zh_val_42.jpg
infer_img
:
ppstructure/docs/vqa/input/zh_val_42.jpg
save_res_path
:
./output/ser/
save_res_path
:
./output/ser
_layoutlm_xfund_zh/res
/
Architecture
:
Architecture
:
model_type
:
vqa
model_type
:
vqa
...
...
configs/vqa/ser/layoutlmv2_funsd.yml
0 → 100644
浏览文件 @
7a99588d
Global
:
use_gpu
:
True
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutlmv2_funsd
save_epoch_step
:
2000
# evaluation is run every 100 iterations after the 0th iteration
eval_batch_step
:
[
0
,
100
]
cal_metric_during_train
:
False
save_inference_dir
:
use_visualdl
:
False
seed
:
2022
infer_img
:
train_data/FUNSD/testing_data/images/83624198.png
save_res_path
:
./output/ser_layoutlmv2_funsd/res/
Architecture
:
model_type
:
vqa
algorithm
:
&algorithm
"
LayoutLMv2"
Transform
:
Backbone
:
name
:
LayoutLMv2ForSer
pretrained
:
True
checkpoints
:
num_classes
:
&num_classes
7
Loss
:
name
:
VQASerTokenLayoutLMLoss
num_classes
:
*num_classes
Optimizer
:
name
:
AdamW
beta1
:
0.9
beta2
:
0.999
lr
:
name
:
Linear
learning_rate
:
0.00005
epochs
:
*epoch_num
warmup_epoch
:
2
regularizer
:
name
:
L2
factor
:
0.00000
PostProcess
:
name
:
VQASerTokenLayoutLMPostProcess
class_path
:
&class_path
train_data/FUNSD/class_list.txt
Metric
:
name
:
VQASerTokenMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/FUNSD/training_data/images/
label_file_list
:
-
./train_data/FUNSD/train.json
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
&max_seq_len
512
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
True
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
Eval
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/FUNSD/testing_data/images/
label_file_list
:
-
./train_data/FUNSD/test.json
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
*max_seq_len
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
configs/vqa/ser/layoutlmv2_sroie.yml
0 → 100644
浏览文件 @
7a99588d
Global
:
use_gpu
:
True
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutlmv2_sroie
save_epoch_step
:
2000
# evaluation is run every 200 iterations after the 0th iteration
eval_batch_step
:
[
0
,
200
]
cal_metric_during_train
:
False
save_inference_dir
:
use_visualdl
:
False
seed
:
2022
infer_img
:
train_data/SROIE/test/X00016469670.jpg
save_res_path
:
./output/ser_layoutlmv2_sroie/res/
Architecture
:
model_type
:
vqa
algorithm
:
&algorithm
"
LayoutLMv2"
Transform
:
Backbone
:
name
:
LayoutLMv2ForSer
pretrained
:
True
checkpoints
:
num_classes
:
&num_classes
9
Loss
:
name
:
VQASerTokenLayoutLMLoss
num_classes
:
*num_classes
Optimizer
:
name
:
AdamW
beta1
:
0.9
beta2
:
0.999
lr
:
name
:
Linear
learning_rate
:
0.00005
epochs
:
*epoch_num
warmup_epoch
:
2
regularizer
:
name
:
L2
factor
:
0.00000
PostProcess
:
name
:
VQASerTokenLayoutLMPostProcess
class_path
:
&class_path
./train_data/SROIE/class_list.txt
Metric
:
name
:
VQASerTokenMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/SROIE/train
label_file_list
:
-
./train_data/SROIE/train.txt
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
&max_seq_len
512
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
True
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
Eval
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/SROIE/test
label_file_list
:
-
./train_data/SROIE/test.txt
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
*max_seq_len
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
configs/vqa/ser/layoutlmv2.yml
→
configs/vqa/ser/layoutlmv2
_xfund_zh
.yml
浏览文件 @
7a99588d
...
@@ -3,7 +3,7 @@ Global:
...
@@ -3,7 +3,7 @@ Global:
epoch_num
:
&epoch_num
200
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
log_smooth_window
:
10
print_batch_step
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutlmv2/
save_model_dir
:
./output/ser_layoutlmv2
_xfund_zh
/
save_epoch_step
:
2000
save_epoch_step
:
2000
# evaluation is run every 10 iterations after the 0th iteration
# evaluation is run every 10 iterations after the 0th iteration
eval_batch_step
:
[
0
,
19
]
eval_batch_step
:
[
0
,
19
]
...
@@ -12,7 +12,7 @@ Global:
...
@@ -12,7 +12,7 @@ Global:
use_visualdl
:
False
use_visualdl
:
False
seed
:
2022
seed
:
2022
infer_img
:
ppstructure/docs/vqa/input/zh_val_42.jpg
infer_img
:
ppstructure/docs/vqa/input/zh_val_42.jpg
save_res_path
:
./output/ser/
save_res_path
:
./output/ser
_layoutlmv2_xfund_zh/res
/
Architecture
:
Architecture
:
model_type
:
vqa
model_type
:
vqa
...
...
configs/vqa/ser/layoutxlm_funsd.yml
0 → 100644
浏览文件 @
7a99588d
Global
:
use_gpu
:
True
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutxlm_funsd
save_epoch_step
:
2000
# evaluation is run every 57 iterations after the 0th iteration
eval_batch_step
:
[
0
,
57
]
cal_metric_during_train
:
False
save_inference_dir
:
use_visualdl
:
False
seed
:
2022
infer_img
:
train_data/FUNSD/testing_data/images/83624198.png
save_res_path
:
output/ser_layoutxlm_funsd/res/
Architecture
:
model_type
:
vqa
algorithm
:
&algorithm
"
LayoutXLM"
Transform
:
Backbone
:
name
:
LayoutXLMForSer
pretrained
:
True
checkpoints
:
num_classes
:
&num_classes
7
Loss
:
name
:
VQASerTokenLayoutLMLoss
num_classes
:
*num_classes
Optimizer
:
name
:
AdamW
beta1
:
0.9
beta2
:
0.999
lr
:
name
:
Linear
learning_rate
:
0.00005
epochs
:
*epoch_num
warmup_epoch
:
2
regularizer
:
name
:
L2
factor
:
0.00000
PostProcess
:
name
:
VQASerTokenLayoutLMPostProcess
class_path
:
&class_path
./train_data/FUNSD/class_list.txt
Metric
:
name
:
VQASerTokenMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/FUNSD/training_data/images/
label_file_list
:
-
./train_data/FUNSD/train.json
ratio_list
:
[
1.0
]
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
&max_seq_len
512
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
True
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
Eval
:
dataset
:
name
:
SimpleDataSet
data_dir
:
train_data/FUNSD/testing_data/images/
label_file_list
:
-
./train_data/FUNSD/test.json
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
*max_seq_len
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
configs/vqa/ser/layoutxlm_sroie.yml
0 → 100644
浏览文件 @
7a99588d
Global
:
use_gpu
:
True
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutxlm_sroie
save_epoch_step
:
2000
# evaluation is run every 200 iterations after the 0th iteration
eval_batch_step
:
[
0
,
200
]
cal_metric_during_train
:
False
save_inference_dir
:
use_visualdl
:
False
seed
:
2022
infer_img
:
train_data/SROIE/test/X00016469670.jpg
save_res_path
:
res_img_aug_with_gt
Architecture
:
model_type
:
vqa
algorithm
:
&algorithm
"
LayoutXLM"
Transform
:
Backbone
:
name
:
LayoutXLMForSer
pretrained
:
True
checkpoints
:
num_classes
:
&num_classes
9
Loss
:
name
:
VQASerTokenLayoutLMLoss
num_classes
:
*num_classes
Optimizer
:
name
:
AdamW
beta1
:
0.9
beta2
:
0.999
lr
:
name
:
Linear
learning_rate
:
0.00005
epochs
:
*epoch_num
warmup_epoch
:
2
regularizer
:
name
:
L2
factor
:
0.00000
PostProcess
:
name
:
VQASerTokenLayoutLMPostProcess
class_path
:
&class_path
./train_data/SROIE/class_list.txt
Metric
:
name
:
VQASerTokenMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/SROIE/train
label_file_list
:
-
./train_data/SROIE/train.txt
ratio_list
:
[
1.0
]
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
&max_seq_len
512
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
True
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
Eval
:
dataset
:
name
:
SimpleDataSet
data_dir
:
train_data/SROIE/test
label_file_list
:
-
./train_data/SROIE/test.txt
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
*max_seq_len
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
configs/vqa/ser/layoutxlm_wildreceipt.yml
0 → 100644
浏览文件 @
7a99588d
Global
:
use_gpu
:
True
epoch_num
:
&epoch_num
100
log_smooth_window
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutxlm_wildreceipt
save_epoch_step
:
2000
# evaluation is run every 200 iterations after the 0th iteration
eval_batch_step
:
[
0
,
200
]
cal_metric_during_train
:
False
save_inference_dir
:
use_visualdl
:
False
seed
:
2022
infer_img
:
train_data//wildreceipt/image_files/Image_12/10/845be0dd6f5b04866a2042abd28d558032ef2576.jpeg
save_res_path
:
./output/ser_layoutxlm_wildreceipt/res
Architecture
:
model_type
:
vqa
algorithm
:
&algorithm
"
LayoutXLM"
Transform
:
Backbone
:
name
:
LayoutXLMForSer
pretrained
:
True
checkpoints
:
num_classes
:
&num_classes
51
Loss
:
name
:
VQASerTokenLayoutLMLoss
num_classes
:
*num_classes
Optimizer
:
name
:
AdamW
beta1
:
0.9
beta2
:
0.999
lr
:
name
:
Linear
learning_rate
:
0.00005
epochs
:
*epoch_num
warmup_epoch
:
2
regularizer
:
name
:
L2
factor
:
0.00000
PostProcess
:
name
:
VQASerTokenLayoutLMPostProcess
class_path
:
&class_path
./train_data/wildreceipt/class_list.txt
Metric
:
name
:
VQASerTokenMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
SimpleDataSet
data_dir
:
./train_data/wildreceipt/
label_file_list
:
-
./train_data/wildreceipt/wildreceipt_train.txt
ratio_list
:
[
1.0
]
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
&max_seq_len
512
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
True
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
Eval
:
dataset
:
name
:
SimpleDataSet
data_dir
:
train_data/wildreceipt
label_file_list
:
-
./train_data/wildreceipt/wildreceipt_test.txt
transforms
:
-
DecodeImage
:
# load image
img_mode
:
RGB
channel_first
:
False
-
VQATokenLabelEncode
:
# Class handling label
contains_re
:
False
algorithm
:
*algorithm
class_path
:
*class_path
-
VQATokenPad
:
max_seq_len
:
*max_seq_len
return_attention_mask
:
True
-
VQASerTokenChunk
:
max_seq_len
:
*max_seq_len
-
Resize
:
size
:
[
224
,
224
]
-
NormalizeImage
:
scale
:
1
mean
:
[
123.675
,
116.28
,
103.53
]
std
:
[
58.395
,
57.12
,
57.375
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
# dataloader will return list in this order
keep_keys
:
[
'
input_ids'
,
'
bbox'
,
'
attention_mask'
,
'
token_type_ids'
,
'
image'
,
'
labels'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
8
num_workers
:
4
configs/vqa/ser/layoutxlm.yml
→
configs/vqa/ser/layoutxlm
_xfund_zh
.yml
浏览文件 @
7a99588d
...
@@ -3,7 +3,7 @@ Global:
...
@@ -3,7 +3,7 @@ Global:
epoch_num
:
&epoch_num
200
epoch_num
:
&epoch_num
200
log_smooth_window
:
10
log_smooth_window
:
10
print_batch_step
:
10
print_batch_step
:
10
save_model_dir
:
./output/ser_layoutxlm
/
save_model_dir
:
./output/ser_layoutxlm
_xfund_zh
save_epoch_step
:
2000
save_epoch_step
:
2000
# evaluation is run every 10 iterations after the 0th iteration
# evaluation is run every 10 iterations after the 0th iteration
eval_batch_step
:
[
0
,
19
]
eval_batch_step
:
[
0
,
19
]
...
@@ -12,7 +12,7 @@ Global:
...
@@ -12,7 +12,7 @@ Global:
use_visualdl
:
False
use_visualdl
:
False
seed
:
2022
seed
:
2022
infer_img
:
ppstructure/docs/vqa/input/zh_val_42.jpg
infer_img
:
ppstructure/docs/vqa/input/zh_val_42.jpg
save_res_path
:
./output/ser
save_res_path
:
./output/ser
_layoutxlm_xfund_zh/res
Architecture
:
Architecture
:
model_type
:
vqa
model_type
:
vqa
...
...
ppocr/data/imaug/label_ops.py
浏览文件 @
7a99588d
...
@@ -869,6 +869,7 @@ class VQATokenLabelEncode(object):
...
@@ -869,6 +869,7 @@ class VQATokenLabelEncode(object):
contains_re
=
False
,
contains_re
=
False
,
add_special_ids
=
False
,
add_special_ids
=
False
,
algorithm
=
'LayoutXLM'
,
algorithm
=
'LayoutXLM'
,
use_textline_bbox_info
=
True
,
infer_mode
=
False
,
infer_mode
=
False
,
ocr_engine
=
None
,
ocr_engine
=
None
,
**
kwargs
):
**
kwargs
):
...
@@ -897,11 +898,51 @@ class VQATokenLabelEncode(object):
...
@@ -897,11 +898,51 @@ class VQATokenLabelEncode(object):
self
.
add_special_ids
=
add_special_ids
self
.
add_special_ids
=
add_special_ids
self
.
infer_mode
=
infer_mode
self
.
infer_mode
=
infer_mode
self
.
ocr_engine
=
ocr_engine
self
.
ocr_engine
=
ocr_engine
self
.
use_textline_bbox_info
=
use_textline_bbox_info
def split_bbox(self, bbox, text, tokenizer):
    """Spread one text-line box over the tokens of its words.

    The line box is divided horizontally in proportion to character
    counts: every character of ``text`` is given the same width, each
    whitespace-separated word gets a box covering its own characters, and
    that box is repeated once per token the tokenizer yields for the word.

    Args:
        bbox: ``[x1, y1, x2, y2]`` box of the whole text line.
        text: transcription of the text line.
        tokenizer: object exposing ``tokenize(word)``; only the number of
            tokens it returns for each word is used.

    Returns:
        A list with one ``[x1, y1, x2, y2]`` box per token, in word order.
        Empty when ``text`` contains no words.
    """
    words = text.split()
    if not words:
        # Empty or whitespace-only text: there is nothing to place, and
        # dividing by len(text) below would raise ZeroDivisionError for "".
        return []

    x1, y1, x2, y2 = bbox
    unit_w = (x2 - x1) / len(text)
    token_bboxes = []
    for word in words:
        word_bbox = [x1, y1, x1 + len(word) * unit_w, y2]
        # One independent copy per token, so that editing one token's box
        # later cannot silently change the boxes of its sibling tokens.
        token_bboxes.extend(
            list(word_bbox) for _ in range(len(tokenizer.tokenize(word))))
        # Advance past the word plus one separator character.
        x1 += (len(word) + 1) * unit_w
    return token_bboxes
def filter_empty_contents(self, ocr_info):
    """Drop entries with empty text and every link that refers to them.

    Args:
        ocr_info: list of dicts, each providing the keys
            ``"transcription"``, ``"id"`` and ``"linking"`` (an iterable of
            two-element ``[id, id]`` pairs). Ids must be hashable.

    Returns:
        A new list with deep copies of the entries whose transcription is
        non-empty, in the original order. In each copy, ``"linking"`` keeps
        only the pairs in which neither id belongs to a dropped entry. The
        input list and its entries are left unmodified.
    """
    # A set gives O(1) membership tests in the link filter below.
    empty_ids = {
        info["id"] for info in ocr_info if len(info["transcription"]) == 0
    }

    new_ocr_info = []
    for info in ocr_info:
        if len(info["transcription"]) == 0:
            continue
        kept = copy.deepcopy(info)
        kept["linking"] = [
            link for link in kept["linking"]
            if link[0] not in empty_ids and link[1] not in empty_ids
        ]
        new_ocr_info.append(kept)
    return new_ocr_info
def
__call__
(
self
,
data
):
def
__call__
(
self
,
data
):
# load bbox and label info
# load bbox and label info
ocr_info
=
self
.
_load_ocr_info
(
data
)
ocr_info
=
self
.
_load_ocr_info
(
data
)
# for re
train_re
=
self
.
contains_re
and
not
self
.
infer_mode
if
train_re
:
ocr_info
=
self
.
filter_empty_contents
(
ocr_info
)
height
,
width
,
_
=
data
[
'image'
].
shape
height
,
width
,
_
=
data
[
'image'
].
shape
words_list
=
[]
words_list
=
[]
...
@@ -913,8 +954,6 @@ class VQATokenLabelEncode(object):
...
@@ -913,8 +954,6 @@ class VQATokenLabelEncode(object):
entities
=
[]
entities
=
[]
# for re
train_re
=
self
.
contains_re
and
not
self
.
infer_mode
if
train_re
:
if
train_re
:
relations
=
[]
relations
=
[]
id2label
=
{}
id2label
=
{}
...
@@ -924,18 +963,19 @@ class VQATokenLabelEncode(object):
...
@@ -924,18 +963,19 @@ class VQATokenLabelEncode(object):
data
[
'ocr_info'
]
=
copy
.
deepcopy
(
ocr_info
)
data
[
'ocr_info'
]
=
copy
.
deepcopy
(
ocr_info
)
for
info
in
ocr_info
:
for
info
in
ocr_info
:
text
=
info
[
"transcription"
]
if
len
(
text
)
<=
0
:
continue
if
train_re
:
if
train_re
:
# for re
# for re
if
len
(
info
[
"transcription"
]
)
==
0
:
if
len
(
text
)
==
0
:
empty_entity
.
add
(
info
[
"id"
])
empty_entity
.
add
(
info
[
"id"
])
continue
continue
id2label
[
info
[
"id"
]]
=
info
[
"label"
]
id2label
[
info
[
"id"
]]
=
info
[
"label"
]
relations
.
extend
([
tuple
(
sorted
(
l
))
for
l
in
info
[
"linking"
]])
relations
.
extend
([
tuple
(
sorted
(
l
))
for
l
in
info
[
"linking"
]])
# smooth_box
# smooth_box
info
[
"bbox"
]
=
self
.
trans_poly_to_bbox
(
info
[
"points"
])
info
[
"bbox"
]
=
self
.
trans_poly_to_bbox
(
info
[
"points"
])
bbox
=
self
.
_smooth_box
(
info
[
"bbox"
],
height
,
width
)
text
=
info
[
"transcription"
]
encode_res
=
self
.
tokenizer
.
encode
(
encode_res
=
self
.
tokenizer
.
encode
(
text
,
pad_to_max_seq_len
=
False
,
return_attention_mask
=
True
)
text
,
pad_to_max_seq_len
=
False
,
return_attention_mask
=
True
)
...
@@ -946,6 +986,19 @@ class VQATokenLabelEncode(object):
...
@@ -946,6 +986,19 @@ class VQATokenLabelEncode(object):
-
1
]
-
1
]
encode_res
[
"attention_mask"
]
=
encode_res
[
"attention_mask"
][
1
:
encode_res
[
"attention_mask"
]
=
encode_res
[
"attention_mask"
][
1
:
-
1
]
-
1
]
if
self
.
use_textline_bbox_info
:
bbox
=
[
info
[
"bbox"
]]
*
len
(
encode_res
[
"input_ids"
])
else
:
bbox
=
self
.
split_bbox
(
info
[
"bbox"
],
info
[
"transcription"
],
self
.
tokenizer
)
if
len
(
bbox
)
<=
0
:
continue
bbox
=
self
.
_smooth_box
(
bbox
,
height
,
width
)
if
self
.
add_special_ids
:
bbox
.
insert
(
0
,
[
0
,
0
,
0
,
0
])
bbox
.
append
([
0
,
0
,
0
,
0
])
# parse label
# parse label
if
not
self
.
infer_mode
:
if
not
self
.
infer_mode
:
label
=
info
[
'label'
]
label
=
info
[
'label'
]
...
@@ -970,7 +1023,7 @@ class VQATokenLabelEncode(object):
...
@@ -970,7 +1023,7 @@ class VQATokenLabelEncode(object):
})
})
input_ids_list
.
extend
(
encode_res
[
"input_ids"
])
input_ids_list
.
extend
(
encode_res
[
"input_ids"
])
token_type_ids_list
.
extend
(
encode_res
[
"token_type_ids"
])
token_type_ids_list
.
extend
(
encode_res
[
"token_type_ids"
])
bbox_list
.
extend
(
[
bbox
]
*
len
(
encode_res
[
"input_ids"
])
)
bbox_list
.
extend
(
bbox
)
words_list
.
append
(
text
)
words_list
.
append
(
text
)
segment_offset_id
.
append
(
len
(
input_ids_list
))
segment_offset_id
.
append
(
len
(
input_ids_list
))
if
not
self
.
infer_mode
:
if
not
self
.
infer_mode
:
...
@@ -1019,12 +1072,14 @@ class VQATokenLabelEncode(object):
...
@@ -1019,12 +1072,14 @@ class VQATokenLabelEncode(object):
info_dict
=
json
.
loads
(
info
)
info_dict
=
json
.
loads
(
info
)
return
info_dict
return
info_dict
def
_smooth_box
(
self
,
bbox
,
height
,
width
):
def
_smooth_box
(
self
,
bboxes
,
height
,
width
):
bbox
[
0
]
=
int
(
bbox
[
0
]
*
1000.0
/
width
)
bboxes
=
np
.
array
(
bboxes
)
bbox
[
2
]
=
int
(
bbox
[
2
]
*
1000.0
/
width
)
bboxes
[:,
0
]
=
bboxes
[:,
0
]
*
1000
/
width
bbox
[
1
]
=
int
(
bbox
[
1
]
*
1000.0
/
height
)
bboxes
[:,
2
]
=
bboxes
[:,
2
]
*
1000
/
width
bbox
[
3
]
=
int
(
bbox
[
3
]
*
1000.0
/
height
)
bboxes
[:,
1
]
=
bboxes
[:,
1
]
*
1000
/
height
return
bbox
bboxes
[:,
3
]
=
bboxes
[:,
3
]
*
1000
/
height
bboxes
=
bboxes
.
astype
(
"int64"
).
tolist
()
return
bboxes
def
_parse_label
(
self
,
label
,
encode_res
):
def
_parse_label
(
self
,
label
,
encode_res
):
gt_label
=
[]
gt_label
=
[]
...
...
ppocr/metrics/vqa_token_re_metric.py
浏览文件 @
7a99588d
...
@@ -37,17 +37,20 @@ class VQAReTokenMetric(object):
...
@@ -37,17 +37,20 @@ class VQAReTokenMetric(object):
gt_relations
=
[]
gt_relations
=
[]
for
b
in
range
(
len
(
self
.
relations_list
)):
for
b
in
range
(
len
(
self
.
relations_list
)):
rel_sent
=
[]
rel_sent
=
[]
if
"head"
in
self
.
relations_list
[
b
]:
for
head
,
tail
in
zip
(
self
.
relations_list
[
b
][
"head"
],
for
head
,
tail
in
zip
(
self
.
relations_list
[
b
][
"head"
],
self
.
relations_list
[
b
][
"tail"
]):
self
.
relations_list
[
b
][
"tail"
]):
rel
=
{}
rel
=
{}
rel
[
"head_id"
]
=
head
rel
[
"head_id"
]
=
head
rel
[
"head"
]
=
(
self
.
entities_list
[
b
][
"start"
][
rel
[
"head_id"
]],
rel
[
"head"
]
=
(
self
.
entities_list
[
b
][
"start"
][
rel
[
"head_id"
]],
self
.
entities_list
[
b
][
"end"
][
rel
[
"head_id"
]])
self
.
entities_list
[
b
][
"end"
][
rel
[
"head_id"
]])
rel
[
"head_type"
]
=
self
.
entities_list
[
b
][
"label"
][
rel
[
rel
[
"head_type"
]
=
self
.
entities_list
[
b
][
"label"
][
rel
[
"head_id"
]]
"head_id"
]]
rel
[
"tail_id"
]
=
tail
rel
[
"tail_id"
]
=
tail
rel
[
"tail"
]
=
(
self
.
entities_list
[
b
][
"start"
][
rel
[
"tail_id"
]],
rel
[
"tail"
]
=
(
self
.
entities_list
[
b
][
"start"
][
rel
[
"tail_id"
]],
self
.
entities_list
[
b
][
"end"
][
rel
[
"tail_id"
]])
self
.
entities_list
[
b
][
"end"
][
rel
[
"tail_id"
]])
rel
[
"tail_type"
]
=
self
.
entities_list
[
b
][
"label"
][
rel
[
rel
[
"tail_type"
]
=
self
.
entities_list
[
b
][
"label"
][
rel
[
"tail_id"
]]
"tail_id"
]]
...
...
ppocr/modeling/backbones/vqa_layoutlm.py
浏览文件 @
7a99588d
...
@@ -43,9 +43,11 @@ class NLPBaseModel(nn.Layer):
...
@@ -43,9 +43,11 @@ class NLPBaseModel(nn.Layer):
super
(
NLPBaseModel
,
self
).
__init__
()
super
(
NLPBaseModel
,
self
).
__init__
()
if
checkpoints
is
not
None
:
if
checkpoints
is
not
None
:
self
.
model
=
model_class
.
from_pretrained
(
checkpoints
)
self
.
model
=
model_class
.
from_pretrained
(
checkpoints
)
elif
isinstance
(
pretrained
,
(
str
,
))
and
os
.
path
.
exists
(
pretrained
):
self
.
model
=
model_class
.
from_pretrained
(
pretrained
)
else
:
else
:
pretrained_model_name
=
pretrained_model_dict
[
base_model_class
]
pretrained_model_name
=
pretrained_model_dict
[
base_model_class
]
if
pretrained
:
if
pretrained
is
True
:
base_model
=
base_model_class
.
from_pretrained
(
base_model
=
base_model_class
.
from_pretrained
(
pretrained_model_name
)
pretrained_model_name
)
else
:
else
:
...
...
ppstructure/vqa/tools/trans_funsd_label.py
0 → 100644
浏览文件 @
7a99588d
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
os
import
sys
import
cv2
import
numpy
as
np
from
copy
import
deepcopy
def trans_poly_to_bbox(poly):
    """Return the axis-aligned bounding box of a polygon.

    Args:
        poly: non-empty iterable of points; element 0 of each point is read
            as x and element 1 as y.

    Returns:
        ``[x1, y1, x2, y2]`` with the minimum and maximum x / y over all
        points. The builtin ``min`` / ``max`` are used (as in
        ``get_outer_poly``), so plain Python numbers in give plain Python
        numbers out and the result can be passed to ``json.dumps``.

    Raises:
        ValueError: if ``poly`` is empty.
    """
    xs = [p[0] for p in poly]
    ys = [p[1] for p in poly]
    return [min(xs), min(ys), max(xs), max(ys)]
def get_outer_poly(bbox_list):
    """Return the rectangle enclosing every box in ``bbox_list``.

    Args:
        bbox_list: non-empty sequence of ``[x1, y1, x2, y2]`` boxes.

    Returns:
        Four corner points ``[[x1, y1], [x2, y1], [x2, y2], [x1, y2]]``
        where ``x1`` / ``y1`` are the smallest first / second coordinates
        and ``x2`` / ``y2`` the largest third / fourth coordinates found.

    Raises:
        ValueError: if ``bbox_list`` is empty.
    """
    x1 = min(bbox[0] for bbox in bbox_list)
    y1 = min(bbox[1] for bbox in bbox_list)
    x2 = max(bbox[2] for bbox in bbox_list)
    y2 = max(bbox[3] for bbox in bbox_list)
    return [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
def _append_segment(res, old_id2new_id_map, info, texts, bboxes, new_id):
    """Emit one text segment of ``info`` into ``res``; return the next free id.

    Nothing is emitted, and ``new_id`` is returned unchanged, when the first
    word of the segment is empty.
    """
    # NOTE: only the first word is tested, exactly as the original check
    # ``len("".join(curr_texts[0])) > 0`` did.
    if len(texts[0]) == 0:
        return new_id
    res.append({
        "transcription": " ".join(texts),
        "label": info["label"],
        "points": get_outer_poly(bboxes),
        "linking": info["linking"],
        "id": new_id,
    })
    # One source entity may be split into several segments, so the map is
    # one-to-many: original id -> list of new segment ids.
    old_id2new_id_map.setdefault(info["id"], []).append(new_id)
    return new_id + 1


def _reorder_same_line(res):
    """Move entries leftwards past neighbours that are within 20 units in y.

    ``res`` is modified in place. An entry is swapped with its predecessor
    while the two first points differ by less than 20 in y and the entry
    lies further left in x.
    """
    for i in range(len(res) - 1):
        # Walk back down to index 0 inclusive; stopping at 1 would mean the
        # first two entries are never compared.
        for j in range(i, -1, -1):
            prev_pt = res[j]["points"][0]
            next_pt = res[j + 1]["points"][0]
            if abs(next_pt[1] - prev_pt[1]) < 20 and next_pt[0] < prev_pt[0]:
                res[j], res[j + 1] = res[j + 1], res[j]
            else:
                break


def load_funsd_label(image_dir, anno_dir):
    """Load FUNSD-style annotations and convert them to segment records.

    Every ``*.json`` file in ``anno_dir`` is read; its ``"form"`` list is
    walked entity by entity. The words of an entity are grouped into
    segments: a new segment starts whenever a word begins at least 10 units
    to the left of the right edge of the previous word. Each segment
    becomes a dict with the keys ``"transcription"``, ``"label"``,
    ``"points"``, ``"linking"`` and ``"id"`` (a fresh id, counted per file).
    Segments are then sorted by the y, then x, of their first point, nearby
    rows are re-ordered left to right, and links are rewritten from original
    entity ids to the new segment ids. Links that mention an entity which
    produced no segment are dropped.

    Args:
        image_dir: directory of the images. Kept for interface
            compatibility; it is not read here.
        anno_dir: directory holding the ``*.json`` annotation files.

    Returns:
        dict mapping each annotation file name (without ``.json``) to its
        list of segment dicts.
    """
    # Only real annotation files are considered, so stray directory entries
    # do not turn into a failing open() on a made-up "<name>.json" path.
    anno_names = [
        os.path.splitext(fn)[0] for fn in os.listdir(anno_dir)
        if fn.endswith(".json")
    ]

    fn_info_map = dict()
    for anno_fn in anno_names:
        anno_path = os.path.join(anno_dir, anno_fn + ".json")
        # JSON text is read as UTF-8 rather than the locale default.
        with open(anno_path, "r", encoding="utf-8") as fin:
            infos = json.load(fin)["form"]

        res = []
        old_id2new_id_map = dict()
        global_new_id = 0
        for info in infos:
            if info["text"] is None:
                continue
            words = info["words"]
            if len(words) <= 0:
                continue

            curr_bboxes = [words[0]["box"]]
            curr_texts = [words[0]["text"]]
            for prev_word, word in zip(words, words[1:]):
                # The word starts well to the left of where the previous one
                # ended: close the current segment and open a new one.
                if word["box"][0] + 10 <= prev_word["box"][2]:
                    global_new_id = _append_segment(
                        res, old_id2new_id_map, info, curr_texts,
                        curr_bboxes, global_new_id)
                    curr_bboxes = [word["box"]]
                    curr_texts = [word["text"]]
                else:
                    curr_bboxes.append(word["box"])
                    curr_texts.append(word["text"])
            # Flush the segment that was still open when the words ran out.
            global_new_id = _append_segment(res, old_id2new_id_map, info,
                                            curr_texts, curr_bboxes,
                                            global_new_id)

        res = sorted(
            res, key=lambda r: (r["points"][0][1], r["points"][0][0]))
        _reorder_same_line(res)

        # Re-generate links in terms of the new segment ids.
        for r in res:
            new_links = []
            for link in r["linking"]:
                # Illegal links (an endpoint without any segment) are removed.
                if (link[0] not in old_id2new_id_map or
                        link[1] not in old_id2new_id_map):
                    continue
                for src in old_id2new_id_map[link[0]]:
                    for dst in old_id2new_id_map[link[1]]:
                        new_links.append([src, dst])
            r["linking"] = new_links

        fn_info_map[anno_fn] = res

    return fn_info_map
def main():
    """Convert the FUNSD test and train splits into label files.

    For each split the annotations are loaded with ``load_funsd_label`` and
    written to the output path, one line per annotation file:
    ``<name>.png``, a tab, then the JSON-encoded list of segment records.
    """
    # (image dir, annotation dir, output label file) for each split.
    splits = (
        ("train_data/FUNSD/testing_data/images/",
         "train_data/FUNSD/testing_data/annotations/",
         "train_data/FUNSD/test.json"),
        ("train_data/FUNSD/training_data/images/",
         "train_data/FUNSD/training_data/annotations/",
         "train_data/FUNSD/train.json"),
    )

    for image_dir, anno_dir, output_path in splits:
        fn_info_map = load_funsd_label(image_dir, anno_dir)
        # ensure_ascii=False emits raw non-ASCII characters, so the file is
        # written explicitly as UTF-8 instead of the locale encoding.
        with open(output_path, "w", encoding="utf-8") as fout:
            for fn in fn_info_map:
                fout.write(fn + ".png" + "\t" + json.dumps(
                    fn_info_map[fn], ensure_ascii=False) + "\n")

    print("====ok====")


if __name__ == "__main__":
    main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录