Commit 1c75ff63 authored by: T Topdu

delete yml file, fix quant and svtr

Parent 68fb057d
...@@ -3,7 +3,7 @@ Global: ...@@ -3,7 +3,7 @@ Global:
epoch_num: 20 epoch_num: 20
log_smooth_window: 20 log_smooth_window: 20
print_batch_step: 10 print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_tiny_en/ save_model_dir: ./output/rec/svtr_tiny/
save_epoch_step: 1 save_epoch_step: 1
# evaluation is run every 2000 iterations after the 0th iteration # evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000] eval_batch_step: [0, 2000]
...@@ -47,16 +47,16 @@ Architecture: ...@@ -47,16 +47,16 @@ Architecture:
stn_activation: none stn_activation: none
Backbone: Backbone:
name: SVTRNet name: SVTRNet
img_size: [32, 100] # input size 可以尝试[64,200] img_size: [32, 100]
out_char_num: 25 # output char patch out_char_num: 25
out_channels: 192 # char patch dim out_channels: 192
patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None patch_merging: 'Conv'
embed_dim: [64, 128, 256] # 三个阶段的sub-patch dim embed_dim: [64, 128, 256]
depth: [3, 6, 3] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数 depth: [3, 6, 3]
num_heads: [2, 4, 8] # 三个阶段中的sub-patch heads num_heads: [2, 4, 8]
mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global']
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围 local_mixer: [[7, 11], [7, 11], [7, 11]]
last_stage: True # 三个阶段中的sub-patch heads last_stage: True
prenorm: false prenorm: false
Neck: Neck:
name: SequenceEncoder name: SequenceEncoder
...@@ -93,12 +93,12 @@ Train: ...@@ -93,12 +93,12 @@ Train:
shuffle: True shuffle: True
batch_size_per_card: 512 batch_size_per_card: 512
drop_last: True drop_last: True
num_workers: 2 num_workers: 4
Eval: Eval:
dataset: dataset:
name: LMDBDataSet name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation/ data_dir: ./train_data/data_lmdb_release/validation/
transforms: transforms:
- DecodeImage: # load image - DecodeImage: # load image
img_mode: BGR img_mode: BGR
......
Global:
use_gpu: True
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_base_stn_ch/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 40
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_svtr_base_ch.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0003
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 320]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 320] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output character patches
out_channels: 256 # character patch dimension
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [128, 256, 384] # sub-patch embedding dims for the three stages
depth: [3, 6, 9] # number of layers in each of the three stages (patch merging is applied between stages)
num_heads: [4, 8, 12] # attention heads for the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # per-layer mixer type: Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window: 7 = height range, 11 = width range
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/ch_scene
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/scene_test
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
Global:
use_gpu: True
epoch_num: 20
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_base_stn_en/
save_epoch_step: 1
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path:
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_svtr_base.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.00025
warmup_epoch: 2
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [48, 160]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [48, 160] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output character patches
out_channels: 256 # character patch dimension
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [128, 256, 384] # sub-patch embedding dims for the three stages
depth: [3, 6, 9] # number of layers in each of the three stages (patch merging is applied between stages)
num_heads: [4, 8, 12] # attention heads for the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # per-layer mixer type: Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window: 7 = height range, 11 = width range
last_stage: True
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/training
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 256
drop_last: True
num_workers: 4
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation/
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 128
num_workers: 2
Global:
use_gpu: True
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_large_ch/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 40
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_svtr_large_ch.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0003
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 320]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 320] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output character patches
out_channels: 384 # character patch dimension
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [192, 256, 512] # sub-patch embedding dims for the three stages
depth: [3, 9, 9] # number of layers in each of the three stages (patch merging is applied between stages)
num_heads: [6, 8, 16] # attention heads for the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # per-layer mixer type: Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window: 7 = height range, 11 = width range
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/ch_scene
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/scene_test
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
Global:
use_gpu: True
epoch_num: 20
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_large_en/
save_epoch_step: 1
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path:
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_svtr_large.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.000125
warmup_epoch: 2
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [48, 160]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [48, 160] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output character patches
out_channels: 384 # character patch dimension
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [192, 256, 512] # sub-patch embedding dims for the three stages
depth: [3, 9, 9] # number of layers in each of the three stages (patch merging is applied between stages)
num_heads: [6, 8, 16] # attention heads for the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # per-layer mixer type: Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window: 7 = height range, 11 = width range
prenorm: false
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/training
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- RecAug:
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation/
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 128
num_workers: 2
Global:
use_gpu: True
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_small_ch/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 40
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_svtr_small_ch.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0003
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 320]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 320] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output character patches
out_channels: 192 # character patch dimension
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [96, 192, 256] # sub-patch embedding dims for the three stages
depth: [3, 6, 6] # number of layers in each of the three stages (patch merging is applied between stages)
num_heads: [3, 6, 8] # attention heads for the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # per-layer mixer type: Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window: 7 = height range, 11 = width range
last_stage: True
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/ch_scene
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/scene_test
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
Global:
use_gpu: True
epoch_num: 20
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_small_stn_en/
save_epoch_step: 1
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path:
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_svtr_small.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0005
warmup_epoch: 2
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 100]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 100] # input size; [64, 200] can also be tried
out_char_num: 25 # number of output character patches
out_channels: 192 # character patch dimension
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [96, 192, 256] # sub-patch embedding dims for the three stages
depth: [3, 6, 6] # number of layers in each of the three stages (patch merging is applied between stages)
num_heads: [3, 6, 8] # attention heads for the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # per-layer mixer type: Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window: 7 = height range, 11 = width range
last_stage: True
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/training
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 512
drop_last: True
num_workers: 4
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
Global:
use_gpu: True
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_tiny_ch/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 40
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_svtr_tiny_ch.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0003
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 320]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 320] # input size
out_char_num: 40 # number of output character patches
out_channels: 192 # character patch dimension
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [64, 128, 256] # sub-patch embedding dims for the three stages
depth: [3, 6, 3] # number of layers in each of the three stages (patch merging is applied between stages)
num_heads: [2, 4, 8] # attention heads for the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # per-layer mixer type: Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window: 7 = height range, 11 = width range
last_stage: True # whether to enable the final combining stage (original comment was a copy-paste of the num_heads note — corrected)
prenorm: false
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/ch_scene
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/scene_test
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
...@@ -137,7 +137,7 @@ def main(config, device, logger, vdl_writer): ...@@ -137,7 +137,7 @@ def main(config, device, logger, vdl_writer):
config['Optimizer'], config['Optimizer'],
epochs=config['Global']['epoch_num'], epochs=config['Global']['epoch_num'],
step_each_epoch=len(train_dataloader), step_each_epoch=len(train_dataloader),
parameters=model.parameters()) model=model)
# resume PACT training process # resume PACT training process
if config["Global"]["checkpoints"] is not None: if config["Global"]["checkpoints"] is not None:
......
...@@ -219,7 +219,6 @@ class SVTRRecResizeImg(object): ...@@ -219,7 +219,6 @@ class SVTRRecResizeImg(object):
self.character_dict_path = character_dict_path self.character_dict_path = character_dict_path
self.padding = padding self.padding = padding
def __call__(self, data): def __call__(self, data):
img = data['image'] img = data['image']
norm_img = resize_norm_img_svtr(img, self.image_shape, self.padding) norm_img = resize_norm_img_svtr(img, self.image_shape, self.padding)
...@@ -227,7 +226,6 @@ class SVTRRecResizeImg(object): ...@@ -227,7 +226,6 @@ class SVTRRecResizeImg(object):
return data return data
def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
imgC, imgH, imgW_min, imgW_max = image_shape imgC, imgH, imgW_min, imgW_max = image_shape
h = img.shape[0] h = img.shape[0]
...@@ -346,13 +344,11 @@ def resize_norm_img_srn(img, image_shape): ...@@ -346,13 +344,11 @@ def resize_norm_img_srn(img, image_shape):
return np.reshape(img_black, (c, row, col)).astype(np.float32) return np.reshape(img_black, (c, row, col)).astype(np.float32)
def resize_norm_img_svtr(img, image_shape, padding=False):
def resize_norm_img_svtr(img, image_shape, padding=True):
imgC, imgH, imgW = image_shape imgC, imgH, imgW = image_shape
h = img.shape[0] h = img.shape[0]
w = img.shape[1] w = img.shape[1]
if not padding: if not padding:
if h > 2.0 * w: if h > 2.0 * w:
image = Image.fromarray(img) image = Image.fromarray(img)
image1 = image.rotate(90, expand=True) image1 = image.rotate(90, expand=True)
......
...@@ -296,47 +296,49 @@ class PatchEmbed(nn.Layer): ...@@ -296,47 +296,49 @@ class PatchEmbed(nn.Layer):
if sub_num == 2: if sub_num == 2:
self.proj = nn.Sequential( self.proj = nn.Sequential(
ConvBNLayer( ConvBNLayer(
in_channels, in_channels=in_channels,
embed_dim // 2, out_channels=embed_dim // 2,
3, kernel_size=3,
2, stride=2,
1, padding=1,
act=nn.GELU, act=nn.GELU,
bias_attr=None), bias_attr=None),
ConvBNLayer( ConvBNLayer(
embed_dim // 2, in_channels=embed_dim // 2,
embed_dim, out_channels=embed_dim,
3, kernel_size=3,
2, stride=2,
1, padding=1,
act=nn.GELU, act=nn.GELU,
bias_attr=None)) bias_attr=None))
if sub_num == 3: if sub_num == 3:
self.proj = nn.Sequential( self.proj = nn.Sequential(
ConvBNLayer( ConvBNLayer(
in_channels, in_channels=in_channels,
embed_dim // 4, out_channels=embed_dim // 4,
3, kernel_size=3,
2, stride=2,
1, padding=1,
act=nn.GELU, act=nn.GELU,
bias_attr=None), bias_attr=None),
ConvBNLayer( ConvBNLayer(
embed_dim // 4, in_channels=embed_dim // 4,
embed_dim // 2, out_channels=embed_dim // 2,
3, kernel_size=3,
2, stride=2,
1, padding=1,
act=nn.GELU, act=nn.GELU,
bias_attr=None), bias_attr=None),
ConvBNLayer( ConvBNLayer(
embed_dim // 2, embed_dim // 2,
embed_dim, embed_dim,
3, in_channels=embed_dim // 2,
2, out_channels=embed_dim,
1, kernel_size=3,
stride=2,
padding=1,
act=nn.GELU, act=nn.GELU,
bias_attr=None), ) bias_attr=None))
def forward(self, x): def forward(self, x):
B, C, H, W = x.shape B, C, H, W = x.shape
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register