Commit 1c75ff63 authored by: T Topdu

delete yml file, fix quant and svtr

Parent 68fb057d
......@@ -3,7 +3,7 @@ Global:
epoch_num: 20
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_tiny_en/
save_model_dir: ./output/rec/svtr_tiny/
save_epoch_step: 1
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
......@@ -47,16 +47,16 @@ Architecture:
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 100] # input size 可以尝试[64,200]
out_char_num: 25 # output char patch
out_channels: 192 # char patch dim
patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None
embed_dim: [64, 128, 256] # 三个阶段的sub-patch dim
depth: [3, 6, 3] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数
num_heads: [2, 4, 8] # 三个阶段中的sub-patch heads
mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围
last_stage: True # 三个阶段中的sub-patch heads
img_size: [32, 100]
out_char_num: 25
out_channels: 192
patch_merging: 'Conv'
embed_dim: [64, 128, 256]
depth: [3, 6, 3]
num_heads: [2, 4, 8]
mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global']
local_mixer: [[7, 11], [7, 11], [7, 11]]
last_stage: True
prenorm: false
Neck:
name: SequenceEncoder
......@@ -93,12 +93,12 @@ Train:
shuffle: True
batch_size_per_card: 512
drop_last: True
num_workers: 2
num_workers: 4
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation/
data_dir: ./train_data/data_lmdb_release/validation/
transforms:
- DecodeImage: # load image
img_mode: BGR
......
Global:
use_gpu: True
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_base_stn_ch/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 40
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_svtr_base_ch.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0003
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 320]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 320] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output char patches
out_channels: 256 # char patch dim
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [128, 256, 384] # sub-patch dim of each of the three stages
depth: [3, 6, 9] # number of layers in each of the three stages (stage boundaries set by patch merging)
num_heads: [4, 8, 12] # sub-patch attention heads of the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 is the width range
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/ch_scene
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/scene_test
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
Global:
use_gpu: True
epoch_num: 20
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_base_stn_en/
save_epoch_step: 1
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path:
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_svtr_base.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.00025
warmup_epoch: 2
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [48, 160]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [48, 160] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output char patches
out_channels: 256 # char patch dim
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [128, 256, 384] # sub-patch dim of each of the three stages
depth: [3, 6, 9] # number of layers in each of the three stages (stage boundaries set by patch merging)
num_heads: [4, 8, 12] # sub-patch attention heads of the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 is the width range
last_stage: True
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/training
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 256
drop_last: True
num_workers: 4
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation/
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 128
num_workers: 2
Global:
use_gpu: True
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_large_ch/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 40
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_svtr_large_ch.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0003
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 320]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 320] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output char patches
out_channels: 384 # char patch dim
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [192, 256, 512] # sub-patch dim of each of the three stages
depth: [3, 9, 9] # number of layers in each of the three stages (stage boundaries set by patch merging)
num_heads: [6, 8, 16] # sub-patch attention heads of the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 is the width range
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/ch_scene
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/scene_test
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
Global:
use_gpu: True
epoch_num: 20
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_large_en/
save_epoch_step: 1
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path:
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_svtr_large.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.000125
warmup_epoch: 2
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [48, 160]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [48, 160] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output char patches
out_channels: 384 # char patch dim
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [192, 256, 512] # sub-patch dim of each of the three stages
depth: [3, 9, 9] # number of layers in each of the three stages (stage boundaries set by patch merging)
num_heads: [6, 8, 16] # sub-patch attention heads of the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 is the width range
prenorm: false
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/training
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- RecAug:
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation/
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 128
num_workers: 2
Global:
use_gpu: True
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_small_ch/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 40
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_svtr_small_ch.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0003
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 320]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 320] # input size; [64, 200] can also be tried
out_char_num: 40 # number of output char patches
out_channels: 192 # char patch dim
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [96, 192, 256] # sub-patch dim of each of the three stages
depth: [3, 6, 6] # number of layers in each of the three stages (stage boundaries set by patch merging)
num_heads: [3, 6, 8] # sub-patch attention heads of the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 is the width range
last_stage: True
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/ch_scene
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/scene_test
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
Global:
use_gpu: True
epoch_num: 20
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_small_stn_en/
save_epoch_step: 1
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_dict_path:
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
save_res_path: ./output/rec/predicts_svtr_small.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0005
warmup_epoch: 2
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 100]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 100] # input size; [64, 200] can also be tried
out_char_num: 25 # number of output char patches
out_channels: 192 # char patch dim
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [96, 192, 256] # sub-patch dim of each of the three stages
depth: [3, 6, 6] # number of layers in each of the three stages (stage boundaries set by patch merging)
num_heads: [3, 6, 8] # sub-patch attention heads of the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 is the width range
last_stage: True
prenorm: False
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/training
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 512
drop_last: True
num_workers: 4
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/data_lmdb_release/evaluation
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
character_dict_path:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
Global:
use_gpu: True
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/rec_svtr_tiny_ch/
save_epoch_step: 10
# evaluation is run every 2000 iterations after the 0th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 40
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_svtr_tiny_ch.txt
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.99
epsilon: 0.00000008
weight_decay: 0.05
no_weight_decay_name: norm pos_embed
one_dim_param_no_weight_decay: true
lr:
name: Cosine
learning_rate: 0.0003
warmup_epoch: 5
Architecture:
model_type: rec
algorithm: SVTR
Transform:
name: STN_ON
tps_inputsize: [32, 64]
tps_outputsize: [32, 320]
num_control_points: 20
tps_margins: [0.05,0.05]
stn_activation: none
Backbone:
name: SVTRNet
img_size: [32, 320] # input size
out_char_num: 40 # number of output char patches
out_channels: 192 # char patch dim
patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
embed_dim: [64, 128, 256] # sub-patch dim of each of the three stages
depth: [3, 6, 3] # number of layers in each of the three stages (stage boundaries set by patch merging)
num_heads: [2, 4, 8] # sub-patch attention heads of the three stages
mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local attention, Global attention, or Conv
local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 is the width range
last_stage: True # whether to use the last stage (original comment was mistakenly copied from num_heads)
prenorm: false
Neck:
name: SequenceEncoder
encoder_type: reshape
Head:
name: CTCHead
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/ch_scene
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 128
drop_last: True
num_workers: 2
Eval:
dataset:
name: LMDBDataSet
data_dir: ./train_data/scene_ch/scene_test
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 64, 256]
padding: False
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 2
......@@ -137,7 +137,7 @@ def main(config, device, logger, vdl_writer):
config['Optimizer'],
epochs=config['Global']['epoch_num'],
step_each_epoch=len(train_dataloader),
parameters=model.parameters())
model=model)
# resume PACT training process
if config["Global"]["checkpoints"] is not None:
......
......@@ -219,7 +219,6 @@ class SVTRRecResizeImg(object):
self.character_dict_path = character_dict_path
self.padding = padding
def __call__(self, data):
img = data['image']
norm_img = resize_norm_img_svtr(img, self.image_shape, self.padding)
......@@ -227,7 +226,6 @@ class SVTRRecResizeImg(object):
return data
def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
imgC, imgH, imgW_min, imgW_max = image_shape
h = img.shape[0]
......@@ -346,13 +344,11 @@ def resize_norm_img_srn(img, image_shape):
return np.reshape(img_black, (c, row, col)).astype(np.float32)
def resize_norm_img_svtr(img, image_shape, padding=True):
def resize_norm_img_svtr(img, image_shape, padding=False):
imgC, imgH, imgW = image_shape
h = img.shape[0]
w = img.shape[1]
if not padding:
if h > 2.0 * w:
image = Image.fromarray(img)
image1 = image.rotate(90, expand=True)
......
......@@ -296,47 +296,49 @@ class PatchEmbed(nn.Layer):
if sub_num == 2:
self.proj = nn.Sequential(
ConvBNLayer(
in_channels,
embed_dim // 2,
3,
2,
1,
in_channels=in_channels,
out_channels=embed_dim // 2,
kernel_size=3,
stride=2,
padding=1,
act=nn.GELU,
bias_attr=None),
ConvBNLayer(
embed_dim // 2,
embed_dim,
3,
2,
1,
in_channels=embed_dim // 2,
out_channels=embed_dim,
kernel_size=3,
stride=2,
padding=1,
act=nn.GELU,
bias_attr=None))
if sub_num == 3:
self.proj = nn.Sequential(
ConvBNLayer(
in_channels,
embed_dim // 4,
3,
2,
1,
in_channels=in_channels,
out_channels=embed_dim // 4,
kernel_size=3,
stride=2,
padding=1,
act=nn.GELU,
bias_attr=None),
ConvBNLayer(
embed_dim // 4,
embed_dim // 2,
3,
2,
1,
in_channels=embed_dim // 4,
out_channels=embed_dim // 2,
kernel_size=3,
stride=2,
padding=1,
act=nn.GELU,
bias_attr=None),
ConvBNLayer(
embed_dim // 2,
embed_dim,
3,
2,
1,
in_channels=embed_dim // 2,
out_channels=embed_dim,
kernel_size=3,
stride=2,
padding=1,
act=nn.GELU,
bias_attr=None), )
bias_attr=None))
def forward(self, x):
B, C, H, W = x.shape
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册