提交 c907a8de 编写于 作者: H huangyuxin

change all recipes

上级 5d6494de
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev
max_input_len: 27.0 # second test_manifest: data/manifest.test
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 27.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 64 # one gpu # Dataloader #
mean_std_filepath: data/mean_std.json ###########################################
unit_type: char batch_size: 64 # one gpu
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
feat_dim: spm_model_prefix:
delta_delta: False spectrum_type: linear
stride_ms: 10.0 feat_dim:
window_ms: 20.0 delta_delta: False
n_fft: None stride_ms: 10.0
max_freq: None window_ms: 20.0
target_sample_rate: 16000 n_fft: None
use_dB_normalization: True max_freq: None
target_dB: -20 target_sample_rate: 16000
dither: 1.0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True dither: 1.0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 1024 num_conv_layers: 2
use_gru: True num_rnn_layers: 3
share_rnn_weights: False rnn_layer_size: 1024
blank_id: 0 use_gru: True
ctc_grad_norm_type: instance share_rnn_weights: False
blank_id: 0
ctc_grad_norm_type: instance
training: ###########################################
n_epoch: 80 # Training #
accum_grad: 1 ###########################################
lr: 2e-3 n_epoch: 80
lr_decay: 0.83 accum_grad: 1
weight_decay: 1e-06 lr: 2e-3
global_grad_clip: 3.0 lr_decay: 0.83
log_interval: 100 weight_decay: 1e-06
checkpoint: global_grad_clip: 3.0
kbest_n: 50 log_interval: 100
latest_n: 5 checkpoint:
kbest_n: 50
decoding: latest_n: 5
batch_size: 128
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 1.9
beta: 5.0
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev
max_input_len: 27.0 # second test_manifest: data/manifest.test
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 27.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 64 # one gpu # Dataloader #
mean_std_filepath: data/mean_std.json ###########################################
unit_type: char batch_size: 64 # one gpu
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear #linear, mfcc, fbank random_seed: 0
feat_dim: spm_model_prefix:
delta_delta: False spectrum_type: linear #linear, mfcc, fbank
stride_ms: 10.0 feat_dim:
window_ms: 20.0 delta_delta: False
n_fft: None stride_ms: 10.0
max_freq: None window_ms: 20.0
target_sample_rate: 16000 n_fft: None
use_dB_normalization: True max_freq: None
target_dB: -20 target_sample_rate: 16000
dither: 1.0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True dither: 1.0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 0 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 5 ############################################
rnn_layer_size: 1024 num_conv_layers: 2
rnn_direction: forward # [forward, bidirect] num_rnn_layers: 5
num_fc_layers: 0 rnn_layer_size: 1024
fc_layers_size_list: -1, rnn_direction: forward # [forward, bidirect]
use_gru: False num_fc_layers: 0
blank_id: 0 fc_layers_size_list: -1,
use_gru: False
blank_id: 0
training: ###########################################
n_epoch: 65 # Training #
accum_grad: 1 ###########################################
lr: 5e-4 n_epoch: 65
lr_decay: 0.93 accum_grad: 1
weight_decay: 1e-06 lr: 5e-4
global_grad_clip: 3.0 lr_decay: 0.93
log_interval: 100 weight_decay: 1e-06
checkpoint: global_grad_clip: 3.0
kbest_n: 50 log_interval: 100
latest_n: 5 checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 32
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.2 #1.9
beta: 4.3
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10
chunk_batch_size: 32
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.2 #1.9
beta: 4.3
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 1.9
beta: 5.0
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
model_type=$4
# download language model # download language model
bash local/download_lm_ch.sh bash local/download_lm_ch.sh
...@@ -21,6 +22,7 @@ fi ...@@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} --model_type ${model_type}
......
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
jit_model_export_path=$2 decode_config_path=$2
model_type=$3 jit_model_export_path=$3
model_type=$4
# download language model # download language model
bash local/download_lm_ch.sh > /dev/null 2>&1 bash local/download_lm_ch.sh > /dev/null 2>&1
...@@ -21,6 +22,7 @@ fi ...@@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test_export.py \ python3 -u ${BIN_DIR}/test_export.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${jit_model_export_path}.rsl \ --result_file ${jit_model_export_path}.rsl \
--export_path ${jit_model_export_path} \ --export_path ${jit_model_export_path} \
--model_type ${model_type} --model_type ${model_type}
......
#!/bin/bash
# Decode a single audio file with a trained checkpoint via ${BIN_DIR}/test_hub.py.
#
# Usage: test_hub.sh config_path ckpt_path_prefix model_type audio_file
#
# BIN_DIR and CUDA_VISIBLE_DEVICES are read from the environment.
# Exits non-zero when the arguments are wrong, a download fails, the audio
# file is missing, or the evaluation itself fails.

if [ $# -ne 4 ]; then
    echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
    exit -1
fi

# Number of comma-separated fields in CUDA_VISIBLE_DEVICES.
ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_prefix=$2
model_type=$3
audio_file=$4

# Fetch the demo wav into data/ (-nc: skip if it is already there).
mkdir -p data
if ! wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/; then
    exit 1
fi

# Quoted so that paths containing spaces or glob characters are tested as-is.
if [ ! -f "${audio_file}" ]; then
    echo "Please input the right audio_file path"
    exit 1
fi

# download language model
if ! bash local/download_lm_ch.sh; then
    exit 1
fi

# The result file is written next to the checkpoint as <ckpt_prefix>.rsl.
if ! python3 -u "${BIN_DIR}/test_hub.py" \
    --nproc "${ngpu}" \
    --config "${config_path}" \
    --result_file "${ckpt_prefix}.rsl" \
    --checkpoint_path "${ckpt_prefix}" \
    --model_type "${model_type}" \
    --audio_file "${audio_file}"; then
    echo "Failed in evaluation!"
    exit 1
fi

exit 0
#!/bin/bash #!/bin/bash
if [ $# != 4 ];then if [ $# != 5 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
exit -1 exit -1
fi fi
...@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
audio_file=$4 model_type=$4
audio_file=$5
mkdir -p data mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
...@@ -33,6 +34,7 @@ fi ...@@ -33,6 +34,7 @@ fi
python3 -u ${BIN_DIR}/test_wav.py \ python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} \ --model_type ${model_type} \
......
...@@ -6,6 +6,7 @@ gpus=0,1,2,3 ...@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=1 avg_num=1
model_type=offline # offline or online model_type=offline # offline or online
audio_file=data/demo_01_03.wav audio_file=data/demo_01_03.wav
...@@ -34,7 +35,7 @@ fi ...@@ -34,7 +35,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
...@@ -44,11 +45,11 @@ fi ...@@ -44,11 +45,11 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test export ckpt avg_n # test export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
fi fi
# Optionally, you can add LM and test it with runtime. # Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# test a single .wav file # test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
fi fi
...@@ -54,8 +54,9 @@ test_manifest: data/manifest.test ...@@ -54,8 +54,9 @@ test_manifest: data/manifest.test
########################################### ###########################################
vocab_filepath: data/lang_char/vocab.txt vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char' unit_type: 'char'
augmentation_config: conf/preprocess.yaml preprocess_config: conf/preprocess.yaml
feat_dim: 80 feat_dim: 80
stride_ms: 10.0 stride_ms: 10.0
window_ms: 25.0 window_ms: 25.0
...@@ -74,7 +75,7 @@ subsampling_factor: 1 ...@@ -74,7 +75,7 @@ subsampling_factor: 1
num_encs: 1 num_encs: 1
########################################### ###########################################
# training # # Training #
########################################### ###########################################
n_epoch: 240 n_epoch: 240
accum_grad: 2 accum_grad: 2
...@@ -82,7 +83,7 @@ global_grad_clip: 5.0 ...@@ -82,7 +83,7 @@ global_grad_clip: 5.0
optim: adam optim: adam
optim_conf: optim_conf:
lr: 0.002 lr: 0.002
weight_decay: 1e-6 weight_decay: 1.0e-6
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
......
...@@ -49,8 +49,9 @@ test_manifest: data/manifest.test ...@@ -49,8 +49,9 @@ test_manifest: data/manifest.test
# Dataloader # # Dataloader #
########################################### ###########################################
vocab_filepath: data/lang_char/vocab.txt vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char' unit_type: 'char'
augmentation_config: conf/preprocess.yaml preprocess_config: conf/preprocess.yaml
feat_dim: 80 feat_dim: 80
stride_ms: 10.0 stride_ms: 10.0
window_ms: 25.0 window_ms: 25.0
...@@ -69,7 +70,7 @@ subsampling_factor: 1 ...@@ -69,7 +70,7 @@ subsampling_factor: 1
num_encs: 1 num_encs: 1
########################################### ###########################################
# training # # Training #
########################################### ###########################################
n_epoch: 240 n_epoch: 240
accum_grad: 2 accum_grad: 2
......
...@@ -46,6 +46,7 @@ test_manifest: data/manifest.test ...@@ -46,6 +46,7 @@ test_manifest: data/manifest.test
########################################### ###########################################
unit_type: 'char' unit_type: 'char'
vocab_filepath: data/lang_char/vocab.txt vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
feat_dim: 80 feat_dim: 80
stride_ms: 10.0 stride_ms: 10.0
window_ms: 25.0 window_ms: 25.0
...@@ -59,13 +60,13 @@ batch_bins: 0 ...@@ -59,13 +60,13 @@ batch_bins: 0
batch_frames_in: 0 batch_frames_in: 0
batch_frames_out: 0 batch_frames_out: 0
batch_frames_inout: 0 batch_frames_inout: 0
augmentation_config: conf/preprocess.yaml preprocess_config: conf/preprocess.yaml
num_workers: 0 num_workers: 0
subsampling_factor: 1 subsampling_factor: 1
num_encs: 1 num_encs: 1
########################################### ###########################################
# training # # Training #
########################################### ###########################################
n_epoch: 240 n_epoch: 240
accum_grad: 2 accum_grad: 2
...@@ -73,7 +74,7 @@ global_grad_clip: 5.0 ...@@ -73,7 +74,7 @@ global_grad_clip: 5.0
optim: adam optim: adam
optim_conf: optim_conf:
lr: 0.002 lr: 0.002
weight_decay: 1e-6 weight_decay: 1.0e-6
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
......
...@@ -21,7 +21,7 @@ mkdir -p ${output_dir} ...@@ -21,7 +21,7 @@ mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \ --decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decode.decode_batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
......
...@@ -30,14 +30,14 @@ for type in attention ctc_greedy_search; do ...@@ -30,14 +30,14 @@ for type in attention ctc_greedy_search; do
# stream decoding only support batchsize=1 # stream decoding only support batchsize=1
batch_size=1 batch_size=1
else else
batch_size=1 batch_size=64
fi fi
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
mkdir -p ${output_dir} mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \ --decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \ --opts decode.decoding_method ${type} \
...@@ -57,7 +57,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do ...@@ -57,7 +57,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \ --decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \ --opts decode.decoding_method ${type} \
......
...@@ -43,7 +43,7 @@ for type in attention_rescoring; do ...@@ -43,7 +43,7 @@ for type in attention_rescoring; do
python3 -u ${BIN_DIR}/test_wav.py \ python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \ --decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \ --opts decode.decoding_method ${type} \
......
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.5 dev_manifest: data/manifest.dev
max_input_len: 20.0 # second test_manifest: data/manifest.test
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 8000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: True
use_cnn_module: True
cnn_module_kernel: 15
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
preprocess_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 8000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
training:
n_epoch: 240
accum_grad: 4
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1e-6
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: True
use_cnn_module: True
cnn_module_kernel: 15
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
decoding: # decoder related
batch_size: 128 decoder: transformer
error_rate_type: cer decoder_conf:
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' attention_heads: 4
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm linear_units: 2048
alpha: 2.5 num_blocks: 6
beta: 0.3 dropout_rate: 0.1
beam_size: 10 positional_dropout_rate: 0.1
cutoff_prob: 1.0 self_attention_dropout_rate: 0.0
cutoff_top_n: 0 src_attention_dropout_rate: 0.0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 240
accum_grad: 4
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.5 dev_manifest: data/manifest.dev
max_input_len: 20.0 # second test_manifest: data/manifest.test
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.0
max_output_input_ratio: .inf
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'char' ###########################################
spm_model_prefix: '' vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/preprocess.yaml unit_type: 'char'
batch_size: 32 spm_model_prefix: ''
raw_wav: True # use raw_wav or kaldi feature preprocess_config: conf/preprocess.yaml
spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80
feat_dim: 80 stride_ms: 10.0
delta_delta: False window_ms: 25.0
dither: 1.0 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
target_sample_rate: 8000 batch_size: 64
max_freq: None maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
n_fft: None maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
stride_ms: 10.0 minibatches: 0 # for debug
window_ms: 25.0 batch_count: auto
use_dB_normalization: True batch_bins: 0
target_dB: -20 batch_frames_in: 0
random_seed: 0 batch_frames_out: 0
keep_transcription_text: False batch_frames_inout: 0
sortagrad: True num_workers: 0
shuffle_method: batch_shuffle subsampling_factor: 1
num_workers: 2 num_encs: 1
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: conformer # encoder related
encoder_conf: encoder: conformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: True input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
use_cnn_module: True normalize_before: True
cnn_module_kernel: 15 use_cnn_module: True
activation_type: 'swish' cnn_module_kernel: 15
pos_enc_layer_type: 'rel_pos' activation_type: 'swish'
selfattention_layer_type: 'rel_selfattn' pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
training:
n_epoch: 100 # 50 will be lowest
accum_grad: 4
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1e-6
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
###########################################
# Training #
###########################################
n_epoch: 100 # 50 will be lowest
accum_grad: 4
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
process: process:
# extract kaldi fbank from PCM # extract kaldi fbank from PCM
- type: fbank_kaldi - type: fbank_kaldi
fs: 16000 fs: 8000
n_mels: 80 n_mels: 80
n_shift: 160 n_shift: 160
win_length: 400 win_length: 400
......
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.
\ No newline at end of file
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
#! /usr/bin/env bash #! /usr/bin/env bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
# ckpt_name is the last path component of the checkpoint prefix assigned above.
ckpt_name=$(basename "${ckpt_prefix}") ckpt_name=$(basename "${ckpt_prefix}")
...@@ -25,9 +26,10 @@ mkdir -p ${output_dir} ...@@ -25,9 +26,10 @@ mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!" echo "Failed in ctc alignment!"
......
#! /usr/bin/env bash #! /usr/bin/env bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
ckpt_name=$(basename ${ckpt_prefxi}) ckpt_name=$(basename ${ckpt_prefxi})
...@@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do ...@@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
...@@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do ...@@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
......
...@@ -6,6 +6,7 @@ gpus=0,1,2,3 ...@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/conformer.yaml conf_path=conf/conformer.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=20 avg_num=20
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
...@@ -31,12 +32,12 @@ fi ...@@ -31,12 +32,12 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data # ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
......
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev-clean ###########################################
test_manifest: data/manifest.test-clean train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev-clean
max_input_len: 30.0 # second test_manifest: data/manifest.test-clean
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 30.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 20 # Dataloader #
mean_std_filepath: data/mean_std.json ###########################################
unit_type: char batch_size: 20
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
target_sample_rate: 16000 spm_model_prefix:
max_freq: None spectrum_type: linear
n_fft: None feat_dim:
stride_ms: 10.0 target_sample_rate: 16000
window_ms: 20.0 max_freq: None
delta_delta: False n_fft: None
dither: 1.0 stride_ms: 10.0
use_dB_normalization: True window_ms: 20.0
target_dB: -20 delta_delta: False
random_seed: 0 dither: 1.0
keep_transcription_text: False use_dB_normalization: True
sortagrad: True target_dB: -20
shuffle_method: batch_shuffle random_seed: 0
num_workers: 2 keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 2048 num_conv_layers: 2
use_gru: False num_rnn_layers: 3
share_rnn_weights: True rnn_layer_size: 2048
blank_id: 0 use_gru: False
share_rnn_weights: True
blank_id: 0
training: ###########################################
n_epoch: 50 # Training #
accum_grad: 1 ###########################################
lr: 1e-3 n_epoch: 50
lr_decay: 0.83 accum_grad: 1
weight_decay: 1e-06 lr: 1e-3
global_grad_clip: 5.0 lr_decay: 0.83
log_interval: 100 weight_decay: 1e-06
checkpoint: global_grad_clip: 5.0
kbest_n: 50 log_interval: 100
latest_n: 5 checkpoint:
kbest_n: 50
decoding: latest_n: 5
batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.9
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev-clean ###########################################
test_manifest: data/manifest.test-clean train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev-clean
max_input_len: 30.0 # second test_manifest: data/manifest.test-clean
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 30.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 15 # Dataloader #
mean_std_filepath: data/mean_std.json ###########################################
unit_type: char batch_size: 15
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
target_sample_rate: 16000 spm_model_prefix:
max_freq: None spectrum_type: linear
n_fft: None feat_dim:
stride_ms: 10.0 target_sample_rate: 16000
window_ms: 20.0 max_freq: None
delta_delta: False n_fft: None
dither: 1.0 stride_ms: 10.0
use_dB_normalization: True window_ms: 20.0
target_dB: -20 delta_delta: False
random_seed: 0 dither: 1.0
keep_transcription_text: False use_dB_normalization: True
sortagrad: True target_dB: -20
shuffle_method: batch_shuffle random_seed: 0
num_workers: 0 keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 2048 num_conv_layers: 2
rnn_direction: forward num_rnn_layers: 3
num_fc_layers: 2 rnn_layer_size: 2048
fc_layers_size_list: 512, 256 rnn_direction: forward
use_gru: False num_fc_layers: 2
blank_id: 0 fc_layers_size_list: 512, 256
use_gru: False
blank_id: 0
training: ###########################################
n_epoch: 50 # Training #
accum_grad: 4 ###########################################
lr: 1e-3 n_epoch: 50
lr_decay: 0.83 accum_grad: 4
weight_decay: 1e-06 lr: 1e-3
global_grad_clip: 5.0 lr_decay: 0.83
log_interval: 100 weight_decay: 1e-06
checkpoint: global_grad_clip: 5.0
kbest_n: 50 log_interval: 100
latest_n: 5 checkpoint:
kbest_n: 50
decoding: latest_n: 5
batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.9
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
decode_batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.9
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
\ No newline at end of file
decode_batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.9
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
\ No newline at end of file
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
model_type=$4
# download language model # download language model
bash local/download_lm_en.sh bash local/download_lm_en.sh
...@@ -21,6 +22,7 @@ fi ...@@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} --model_type ${model_type}
......
#!/bin/bash #!/bin/bash
if [ $# != 4 ];then if [ $# != 5 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
exit -1 exit -1
fi fi
...@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
audio_file=$4 model_type=$4
audio_file=$5
mkdir -p data mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
...@@ -33,6 +34,7 @@ fi ...@@ -33,6 +34,7 @@ fi
python3 -u ${BIN_DIR}/test_wav.py \ python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} \ --model_type ${model_type} \
......
...@@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7 ...@@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml conf_path=conf/deepspeech2.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=30 avg_num=30
model_type=offline model_type=offline
audio_file=data/demo_002_en.wav audio_file=data/demo_002_en.wav
...@@ -33,7 +34,7 @@ fi ...@@ -33,7 +34,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
...@@ -43,5 +44,5 @@ fi ...@@ -43,5 +44,5 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test a single .wav file # test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
fi fi
...@@ -57,7 +57,7 @@ vocab_filepath: data/lang_char/vocab.txt ...@@ -57,7 +57,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_5000' spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml preprocess_config: conf/preprocess.yaml
feat_dim: 80 feat_dim: 80
stride_ms: 10.0 stride_ms: 10.0
window_ms: 25.0 window_ms: 25.0
...@@ -70,8 +70,7 @@ batch_count: auto ...@@ -70,8 +70,7 @@ batch_count: auto
batch_bins: 0 batch_bins: 0
batch_frames_in: 0 batch_frames_in: 0
batch_frames_out: 0 batch_frames_out: 0
batch_frames_inout: 0 batch_frames_inout: 0
augmentation_config: conf/preprocess.yaml
num_workers: 0 num_workers: 0
subsampling_factor: 1 subsampling_factor: 1
num_encs: 1 num_encs: 1
...@@ -85,10 +84,11 @@ global_grad_clip: 5.0 ...@@ -85,10 +84,11 @@ global_grad_clip: 5.0
optim: adam optim: adam
optim_conf: optim_conf:
lr: 0.001 lr: 0.001
weight_decay: 1e-06 weight_decay: 1.0e-06
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
lr_decay: 1.0
log_interval: 100 log_interval: 100
checkpoint: checkpoint:
kbest_n: 50 kbest_n: 50
......
...@@ -50,7 +50,7 @@ vocab_filepath: data/lang_char/vocab.txt ...@@ -50,7 +50,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_5000' spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml preprocess_config: conf/preprocess.yaml
feat_dim: 80 feat_dim: 80
stride_ms: 10.0 stride_ms: 10.0
window_ms: 25.0 window_ms: 25.0
...@@ -64,7 +64,6 @@ batch_bins: 0 ...@@ -64,7 +64,6 @@ batch_bins: 0
batch_frames_in: 0 batch_frames_in: 0
batch_frames_out: 0 batch_frames_out: 0
batch_frames_inout: 0 batch_frames_inout: 0
augmentation_config: conf/preprocess.yaml
num_workers: 0 num_workers: 0
subsampling_factor: 1 subsampling_factor: 1
num_encs: 1 num_encs: 1
...@@ -79,7 +78,7 @@ global_grad_clip: 5.0 ...@@ -79,7 +78,7 @@ global_grad_clip: 5.0
optim: adam optim: adam
optim_conf: optim_conf:
lr: 0.001 lr: 0.001
weight_decay: 1e-06 weight_decay: 1.0e-06
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
......
...@@ -55,7 +55,7 @@ vocab_filepath: data/lang_char/vocab.txt ...@@ -55,7 +55,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_5000' spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml preprocess_config: conf/preprocess.yaml
feat_dim: 80 feat_dim: 80
stride_ms: 10.0 stride_ms: 10.0
window_ms: 25.0 window_ms: 25.0
...@@ -69,7 +69,6 @@ batch_bins: 0 ...@@ -69,7 +69,6 @@ batch_bins: 0
batch_frames_in: 0 batch_frames_in: 0
batch_frames_out: 0 batch_frames_out: 0
batch_frames_inout: 0 batch_frames_inout: 0
augmentation_config: conf/preprocess.yaml
num_workers: 0 num_workers: 0
subsampling_factor: 1 subsampling_factor: 1
num_encs: 1 num_encs: 1
...@@ -84,7 +83,7 @@ global_grad_clip: 3.0 ...@@ -84,7 +83,7 @@ global_grad_clip: 3.0
optim: adam optim: adam
optim_conf: optim_conf:
lr: 0.004 lr: 0.004
weight_decay: 1e-06 weight_decay: 1.0e-06
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
......
...@@ -49,7 +49,7 @@ vocab_filepath: data/lang_char/vocab.txt ...@@ -49,7 +49,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_5000' spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/preprocess.yaml preprocess_config: conf/preprocess.yaml
feat_dim: 80 feat_dim: 80
stride_ms: 10.0 stride_ms: 10.0
window_ms: 25.0 window_ms: 25.0
...@@ -63,7 +63,6 @@ batch_bins: 0 ...@@ -63,7 +63,6 @@ batch_bins: 0
batch_frames_in: 0 batch_frames_in: 0
batch_frames_out: 0 batch_frames_out: 0
batch_frames_inout: 0 batch_frames_inout: 0
augmentation_config: conf/preprocess.yaml
num_workers: 0 num_workers: 0
subsampling_factor: 1 subsampling_factor: 1
num_encs: 1 num_encs: 1
...@@ -78,7 +77,7 @@ global_grad_clip: 5.0 ...@@ -78,7 +77,7 @@ global_grad_clip: 5.0
optim: adam optim: adam
optim_conf: optim_conf:
lr: 0.004 lr: 0.004
weight_decay: 1e-06 weight_decay: 1.0e-06
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
......
...@@ -21,7 +21,7 @@ mkdir -p ${output_dir} ...@@ -21,7 +21,7 @@ mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \ --decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decode.decode_batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
......
...@@ -53,7 +53,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -53,7 +53,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \ --decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \ --opts decode.decoding_method ${type} \
...@@ -78,7 +78,7 @@ for type in ctc_greedy_search; do ...@@ -78,7 +78,7 @@ for type in ctc_greedy_search; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \ --decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \ --opts decode.decoding_method ${type} \
...@@ -99,7 +99,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do ...@@ -99,7 +99,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \ --decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \ --opts decode.decoding_method ${type} \
......
...@@ -50,7 +50,7 @@ for type in attention_rescoring; do ...@@ -50,7 +50,7 @@ for type in attention_rescoring; do
python3 -u ${BIN_DIR}/test_wav.py \ python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \ --decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \ --opts decode.decoding_method ${type} \
......
decode_batch_size: 1
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: transformer # encoder related
encoder_conf: encoder: transformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: true input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test-clean train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test-clean
collator: ###########################################
vocab_filepath: data/lang_char/train_960_unigram5000_units.txt # Dataloader #
unit_type: spm ###########################################
spm_model_prefix: data/lang_char/train_960_unigram5000 vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
feat_dim: 83 unit_type: spm
stride_ms: 10.0 spm_model_prefix: data/lang_char/train_960_unigram5000
window_ms: 25.0 feat_dim: 83
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs stride_ms: 10.0
batch_size: 30 window_ms: 25.0
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced batch_size: 30
minibatches: 0 # for debug maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
batch_count: auto maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
batch_bins: 0 minibatches: 0 # for debug
batch_frames_in: 0 batch_count: auto
batch_frames_out: 0 batch_bins: 0
batch_frames_inout: 0 batch_frames_in: 0
augmentation_config: conf/preprocess.yaml batch_frames_out: 0
num_workers: 0 batch_frames_inout: 0
subsampling_factor: 1 preprocess_config: conf/preprocess.yaml
num_encs: 1 num_workers: 0
subsampling_factor: 1
num_encs: 1
training: ###########################################
n_epoch: 120 # Training #
accum_grad: 2 ###########################################
log_interval: 100 n_epoch: 120
checkpoint: accum_grad: 2
kbest_n: 50 log_interval: 1
latest_n: 5 checkpoint:
kbest_n: 50
latest_n: 5
optim: adam optim: adam
optim_conf: optim_conf:
...@@ -79,23 +86,5 @@ scheduler_conf: ...@@ -79,23 +86,5 @@ scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
lr_decay: 1.0 lr_decay: 1.0
decoding:
batch_size: 1
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path dict_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path dict_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
dict_path=$2 decode_config_path=$2
ckpt_prefix=$3 dict_path=$3
ckpt_prefix=$4
batch_size=1 batch_size=1
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
...@@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \ ...@@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \
--dict-path ${dict_path} \ --dict-path ${dict_path} \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result-file ${output_dir}/${type}.align \ --result-file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!" echo "Failed in ctc alignment!"
......
...@@ -19,8 +19,9 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe} ...@@ -19,8 +19,9 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe}
bpemodel=${bpeprefix}.model bpemodel=${bpeprefix}.model
config_path=conf/transformer.yaml config_path=conf/transformer.yaml
decode_config_path=conf/decode/decode_base.yaml
dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
ckpt_prefix= ckpt_prefix=exp/transformer/checkpoints/init
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
...@@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco ...@@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--dict-path ${dict} \ --dict-path ${dict} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--result-file ${decode_dir}/data.JOB.json \ --result-file ${decode_dir}/data.JOB.json \
--opts decoding.decoding_method ${dmethd} \ --opts decode.decoding_method ${dmethd} \
--opts decoding.batch_size ${batch_size} \ --opts decode.decode_batch_size ${batch_size} \
--opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} --opts test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict} score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict}
......
...@@ -9,12 +9,14 @@ gpus=0,1,2,3,4,5,6,7 ...@@ -9,12 +9,14 @@ gpus=0,1,2,3,4,5,6,7
stage=0 stage=0
stop_stage=50 stop_stage=50
conf_path=conf/transformer.yaml conf_path=conf/transformer.yaml
dict_path=lang_char/train_960_unigram5000_units.txt decode_conf_path=conf/decode/decode_base.yaml
dict_path=data/lang_char/train_960_unigram5000_units.txt
avg_num=10 avg_num=10
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num} avg_ckpt=avg_${avg_num}
avg_ckpt=init
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}" echo "checkpoint name ${ckpt}"
...@@ -35,7 +37,7 @@ fi ...@@ -35,7 +37,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# attention rescore decoder # attention rescore decoder
./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 ./local/test.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
...@@ -45,7 +47,7 @@ fi ...@@ -45,7 +47,7 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# ctc alignment of test data # ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
......
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev
max_input_len: 27.0 # second test_manifest: data/manifest.test
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 27.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 64 # one gpu # Dataloader #
mean_std_filepath: data/mean_std.npz ###########################################
unit_type: char batch_size: 64 # one gpu
vocab_filepath: data/vocab.txt mean_std_filepath: data/mean_std.npz
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
feat_dim: spm_model_prefix:
delta_delta: False spectrum_type: linear
stride_ms: 10.0 feat_dim:
window_ms: 20.0 delta_delta: False
n_fft: None stride_ms: 10.0
max_freq: None window_ms: 20.0
target_sample_rate: 16000 n_fft: None
use_dB_normalization: True max_freq: None
target_dB: -20 target_sample_rate: 16000
dither: 1.0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True dither: 1.0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 1024 num_conv_layers: 2
use_gru: True num_rnn_layers: 3
share_rnn_weights: False rnn_layer_size: 1024
blank_id: 4333 use_gru: True
share_rnn_weights: False
blank_id: 4333
training: ###########################################
n_epoch: 80 # Training #
accum_grad: 1 ###########################################
lr: 2e-3 n_epoch: 80
lr_decay: 0.83 accum_grad: 1
weight_decay: 1e-06 lr: 2e-3
global_grad_clip: 3.0 lr_decay: 0.83
log_interval: 100 weight_decay: 1e-06
checkpoint: global_grad_clip: 3.0
kbest_n: 50 log_interval: 100
latest_n: 5 checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 32
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.6
beta: 5.0
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 8
decode_batch_size: 32
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.6
beta: 5.0
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 8
\ No newline at end of file
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
model_type=$4
# download language model # download language model
bash local/download_lm_ch.sh bash local/download_lm_ch.sh
...@@ -21,6 +22,7 @@ fi ...@@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} --model_type ${model_type}
......
...@@ -5,6 +5,7 @@ source path.sh ...@@ -5,6 +5,7 @@ source path.sh
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml conf_path=conf/deepspeech2.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=1 avg_num=1
model_type=offline model_type=offline
gpus=2 gpus=2
...@@ -23,6 +24,6 @@ fi ...@@ -23,6 +24,6 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
fi fi
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test-clean train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev
max_input_len: .inf # second test_manifest: data/manifest.test-clean
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: .inf # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 64 # one gpu # Dataloader #
mean_std_filepath: data/mean_std.npz ###########################################
unit_type: char batch_size: 64 # one gpu
vocab_filepath: data/vocab.txt mean_std_filepath: data/mean_std.npz
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
feat_dim: spm_model_prefix:
delta_delta: False spectrum_type: linear
stride_ms: 10.0 feat_dim:
window_ms: 20.0 delta_delta: False
n_fft: None stride_ms: 10.0
max_freq: None window_ms: 20.0
target_sample_rate: 16000 n_fft: None
use_dB_normalization: True max_freq: None
target_dB: -20 target_sample_rate: 16000
dither: 1.0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True dither: 1.0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 1024 num_conv_layers: 2
use_gru: True num_rnn_layers: 3
share_rnn_weights: False rnn_layer_size: 1024
blank_id: 28 use_gru: True
share_rnn_weights: False
blank_id: 28
###########################################
# Training #
###########################################
n_epoch: 80
accum_grad: 1
lr: 2e-3
lr_decay: 0.83
weight_decay: 1e-06
global_grad_clip: 3.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
training:
n_epoch: 80
accum_grad: 1
lr: 2e-3
lr_decay: 0.83
weight_decay: 1e-06
global_grad_clip: 3.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 32
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.4
beta: 0.35
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
decode_batch_size: 32
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.4
beta: 0.35
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
\ No newline at end of file
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
model_type=$4
# download language model # download language model
bash local/download_lm_en.sh bash local/download_lm_en.sh
...@@ -21,6 +22,7 @@ fi ...@@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} --model_type ${model_type}
......
...@@ -5,6 +5,7 @@ source path.sh ...@@ -5,6 +5,7 @@ source path.sh
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml conf_path=conf/deepspeech2.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=1 avg_num=1
model_type=offline model_type=offline
gpus=0 gpus=0
...@@ -23,6 +24,6 @@ fi ...@@ -23,6 +24,6 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
fi fi
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test-clean train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev
max_input_len: 1000.0 # second test_manifest: data/manifest.test-clean
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 1000.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 64 # one gpu # Dataloader #
mean_std_filepath: data/mean_std.npz ###########################################
unit_type: char batch_size: 64 # one gpu
vocab_filepath: data/vocab.txt mean_std_filepath: data/mean_std.npz
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
feat_dim: spm_model_prefix:
delta_delta: False spectrum_type: linear
stride_ms: 10.0 feat_dim:
window_ms: 20.0 delta_delta: False
n_fft: None stride_ms: 10.0
max_freq: None window_ms: 20.0
target_sample_rate: 16000 n_fft: None
use_dB_normalization: True max_freq: None
target_dB: -20 target_sample_rate: 16000
dither: 1.0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True dither: 1.0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 2048 num_conv_layers: 2
use_gru: False num_rnn_layers: 3
share_rnn_weights: True rnn_layer_size: 2048
blank_id: 28 use_gru: False
share_rnn_weights: True
blank_id: 28
###########################################
# Training #
###########################################
n_epoch: 80
accum_grad: 1
lr: 2e-3
lr_decay: 0.83
weight_decay: 1e-06
global_grad_clip: 3.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
training:
n_epoch: 80
accum_grad: 1
lr: 2e-3
lr_decay: 0.83
weight_decay: 1e-06
global_grad_clip: 3.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 32
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
decode_batch_size: 32
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
\ No newline at end of file
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
model_type=$4
# download language model # download language model
bash local/download_lm_en.sh bash local/download_lm_en.sh
...@@ -21,6 +22,7 @@ fi ...@@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} --model_type ${model_type}
......
...@@ -5,6 +5,7 @@ source path.sh ...@@ -5,6 +5,7 @@ source path.sh
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml conf_path=conf/deepspeech2.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=1 avg_num=1
model_type=offline model_type=offline
gpus=1 gpus=1
...@@ -23,5 +24,5 @@ fi ...@@ -23,5 +24,5 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
fi fi
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
"""Evaluation for DeepSpeech2 model.""" """Evaluation for DeepSpeech2 model."""
from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.training.cli import default_argument_parser
...@@ -44,6 +45,10 @@ if __name__ == "__main__": ...@@ -44,6 +45,10 @@ if __name__ == "__main__":
config = get_cfg_defaults(args.model_type) config = get_cfg_defaults(args.model_type)
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()
......
...@@ -233,11 +233,11 @@ class DeepSpeech2Model(nn.Layer): ...@@ -233,11 +233,11 @@ class DeepSpeech2Model(nn.Layer):
""" """
model = cls(feat_size=dataloader.collate_fn.feature_size, model = cls(feat_size=dataloader.collate_fn.feature_size,
dict_size=len(dataloader.collate_fn.vocab_list), dict_size=len(dataloader.collate_fn.vocab_list),
num_conv_layers=config.model.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.model.rnn_layer_size, rnn_size=config.rnn_layer_size,
use_gru=config.model.use_gru, use_gru=config.use_gru,
share_rnn_weights=config.model.share_rnn_weights) share_rnn_weights=config.share_rnn_weights)
infos = Checkpoint().load_parameters( infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path) model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}") logger.info(f"checkpoint info: {infos}")
...@@ -250,7 +250,7 @@ class DeepSpeech2Model(nn.Layer): ...@@ -250,7 +250,7 @@ class DeepSpeech2Model(nn.Layer):
Parameters Parameters
config: yacs.config.CfgNode config: yacs.config.CfgNode
config.model config
Returns Returns
------- -------
DeepSpeech2Model DeepSpeech2Model
......
...@@ -64,7 +64,7 @@ class DeepSpeech2Trainer(Trainer): ...@@ -64,7 +64,7 @@ class DeepSpeech2Trainer(Trainer):
super().__init__(config, args) super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training train_conf = self.config
start = time.time() start = time.time()
# forward # forward
...@@ -98,7 +98,7 @@ class DeepSpeech2Trainer(Trainer): ...@@ -98,7 +98,7 @@ class DeepSpeech2Trainer(Trainer):
iteration_time = time.time() - start iteration_time = time.time() - start
msg += "train time: {:>.3f}s, ".format(iteration_time) msg += "train time: {:>.3f}s, ".format(iteration_time)
msg += "batch size: {}, ".format(self.config.collator.batch_size) msg += "batch size: {}, ".format(self.config.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad) msg += "accum: {}, ".format(train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v) msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items()) for k, v in losses_np.items())
...@@ -126,7 +126,7 @@ class DeepSpeech2Trainer(Trainer): ...@@ -126,7 +126,7 @@ class DeepSpeech2Trainer(Trainer):
total_loss += float(loss) * num_utts total_loss += float(loss) * num_utts
valid_losses['val_loss'].append(float(loss)) valid_losses['val_loss'].append(float(loss))
if (i + 1) % self.config.training.log_interval == 0: if (i + 1) % self.config.log_interval == 0:
valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
valid_dump['val_history_loss'] = total_loss / num_seen_utts valid_dump['val_history_loss'] = total_loss / num_seen_utts
...@@ -146,15 +146,15 @@ class DeepSpeech2Trainer(Trainer): ...@@ -146,15 +146,15 @@ class DeepSpeech2Trainer(Trainer):
def setup_model(self): def setup_model(self):
config = self.config.clone() config = self.config.clone()
config.defrost() config.defrost()
config.model.feat_size = self.train_loader.collate_fn.feature_size config.feat_size = self.train_loader.collate_fn.feature_size
#config.model.dict_size = self.train_loader.collate_fn.vocab_size #config.dict_size = self.train_loader.collate_fn.vocab_size
config.model.dict_size = len(self.train_loader.collate_fn.vocab_list) config.dict_size = len(self.train_loader.collate_fn.vocab_list)
config.freeze() config.freeze()
if self.args.model_type == 'offline': if self.args.model_type == 'offline':
model = DeepSpeech2Model.from_config(config.model) model = DeepSpeech2Model.from_config(config)
elif self.args.model_type == 'online': elif self.args.model_type == 'online':
model = DeepSpeech2ModelOnline.from_config(config.model) model = DeepSpeech2ModelOnline.from_config(config)
else: else:
raise Exception("wrong model type") raise Exception("wrong model type")
if self.parallel: if self.parallel:
...@@ -163,17 +163,13 @@ class DeepSpeech2Trainer(Trainer): ...@@ -163,17 +163,13 @@ class DeepSpeech2Trainer(Trainer):
logger.info(f"{model}") logger.info(f"{model}")
layer_tools.print_params(model, logger.info) layer_tools.print_params(model, logger.info)
grad_clip = ClipGradByGlobalNormWithLog( grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip)
config.training.global_grad_clip)
lr_scheduler = paddle.optimizer.lr.ExponentialDecay( lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
learning_rate=config.training.lr, learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
gamma=config.training.lr_decay,
verbose=True)
optimizer = paddle.optimizer.Adam( optimizer = paddle.optimizer.Adam(
learning_rate=lr_scheduler, learning_rate=lr_scheduler,
parameters=model.parameters(), parameters=model.parameters(),
weight_decay=paddle.regularizer.L2Decay( weight_decay=paddle.regularizer.L2Decay(config.weight_decay),
config.training.weight_decay),
grad_clip=grad_clip) grad_clip=grad_clip)
self.model = model self.model = model
...@@ -184,59 +180,59 @@ class DeepSpeech2Trainer(Trainer): ...@@ -184,59 +180,59 @@ class DeepSpeech2Trainer(Trainer):
def setup_dataloader(self): def setup_dataloader(self):
config = self.config.clone() config = self.config.clone()
config.defrost() config.defrost()
config.collator.keep_transcription_text = False config.keep_transcription_text = False
config.data.manifest = config.data.train_manifest config.manifest = config.train_manifest
train_dataset = ManifestDataset.from_config(config) train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest config.manifest = config.dev_manifest
dev_dataset = ManifestDataset.from_config(config) dev_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.test_manifest config.manifest = config.test_manifest
test_dataset = ManifestDataset.from_config(config) test_dataset = ManifestDataset.from_config(config)
if self.parallel: if self.parallel:
batch_sampler = SortagradDistributedBatchSampler( batch_sampler = SortagradDistributedBatchSampler(
train_dataset, train_dataset,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
num_replicas=None, num_replicas=None,
rank=None, rank=None,
shuffle=True, shuffle=True,
drop_last=True, drop_last=True,
sortagrad=config.collator.sortagrad, sortagrad=config.sortagrad,
shuffle_method=config.collator.shuffle_method) shuffle_method=config.shuffle_method)
else: else:
batch_sampler = SortagradBatchSampler( batch_sampler = SortagradBatchSampler(
train_dataset, train_dataset,
shuffle=True, shuffle=True,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
drop_last=True, drop_last=True,
sortagrad=config.collator.sortagrad, sortagrad=config.sortagrad,
shuffle_method=config.collator.shuffle_method) shuffle_method=config.shuffle_method)
collate_fn_train = SpeechCollator.from_config(config) collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = "" config.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config) collate_fn_dev = SpeechCollator.from_config(config)
config.collator.keep_transcription_text = True config.keep_transcription_text = True
config.collator.augmentation_config = "" config.augmentation_config = ""
collate_fn_test = SpeechCollator.from_config(config) collate_fn_test = SpeechCollator.from_config(config)
self.train_loader = DataLoader( self.train_loader = DataLoader(
train_dataset, train_dataset,
batch_sampler=batch_sampler, batch_sampler=batch_sampler,
collate_fn=collate_fn_train, collate_fn=collate_fn_train,
num_workers=config.collator.num_workers) num_workers=config.num_workers)
self.valid_loader = DataLoader( self.valid_loader = DataLoader(
dev_dataset, dev_dataset,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=collate_fn_dev) collate_fn=collate_fn_dev)
self.test_loader = DataLoader( self.test_loader = DataLoader(
test_dataset, test_dataset,
batch_size=config.decoding.batch_size, batch_size=config.decode.decode_batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=collate_fn_test) collate_fn=collate_fn_test)
...@@ -274,7 +270,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): ...@@ -274,7 +270,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def __init__(self, config, args): def __init__(self, config, args):
self._text_featurizer = TextFeaturizer( self._text_featurizer = TextFeaturizer(
unit_type=config.collator.unit_type, vocab_filepath=None) unit_type=config.unit_type, vocab=None)
super().__init__(config, args) super().__init__(config, args)
def ordid2token(self, texts, texts_len): def ordid2token(self, texts, texts_len):
...@@ -293,7 +289,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): ...@@ -293,7 +289,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
texts, texts,
texts_len, texts_len,
fout=None): fout=None):
cfg = self.config.decoding cfg = self.config.decode
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
...@@ -399,31 +395,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): ...@@ -399,31 +395,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
self.export() self.export()
except KeyboardInterrupt: except KeyboardInterrupt:
exit(-1) exit(-1)
def setup(self):
"""Setup the experiment.
"""
paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
self.setup_output_dir()
self.setup_checkpointer()
self.setup_dataloader()
self.setup_model()
self.iteration = 0
self.epoch = 0
def setup_output_dir(self):
"""Create a directory used for output.
"""
# output dir
if self.args.output:
output_dir = Path(self.args.output).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
else:
output_dir = Path(
self.args.checkpoint_path).expanduser().parent.parent
output_dir.mkdir(parents=True, exist_ok=True)
self.output_dir = output_dir
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train.tiny # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train.tiny
min_input_len: 0.05 # second dev_manifest: data/manifest.dev
max_input_len: 30.0 # second test_manifest: data/manifest.test
min_output_len: 0.0 # tokens min_input_len: 0.05 # second
max_output_len: 400.0 # tokens max_input_len: 30.0 # second
min_output_input_ratio: 0.01 min_output_len: 0.0 # tokens
max_output_input_ratio: 20.0 max_output_len: 400.0 # tokens
min_output_input_ratio: 0.01
max_output_input_ratio: 20.0
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'spm' ###########################################
spm_model_prefix: data/lang_char/bpe_unigram_8000 vocab_filepath: data/lang_char/vocab.txt
mean_std_filepath: "" unit_type: 'spm'
# augmentation_config: conf/augmentation.json spm_model_prefix: data/lang_char/bpe_unigram_8000
batch_size: 10 mean_std_filepath: ""
raw_wav: True # use raw_wav or kaldi feature # augmentation_config: conf/augmentation.json
spectrum_type: fbank #linear, mfcc, fbank batch_size: 10
feat_dim: 80 raw_wav: True # use raw_wav or kaldi feature
delta_delta: False spectrum_type: fbank #linear, mfcc, fbank
dither: 1.0 feat_dim: 80
target_sample_rate: 16000 delta_delta: False
max_freq: None dither: 1.0
n_fft: None target_sample_rate: 16000
stride_ms: 10.0 max_freq: None
window_ms: 25.0 n_fft: None
use_dB_normalization: True stride_ms: 10.0
target_dB: -20 window_ms: 25.0
random_seed: 0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True random_seed: 0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture ############################################
model: # Network Architecture #
cmvn_file: "data/mean_std.json" ############################################
cmvn_file_type: "json" cmvn_file: "data/mean_std.json"
# encoder related cmvn_file_type: "json"
encoder: transformer # encoder related
encoder_conf: encoder: transformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: true input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
asr_weight: 0.0 asr_weight: 0.0
ctc_weight: 0.0 ctc_weight: 0.0
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
###########################################
training: # Training #
n_epoch: 120 ###########################################
accum_grad: 2 n_epoch: 120
global_grad_clip: 5.0 accum_grad: 2
optim: adam global_grad_clip: 5.0
optim_conf: optim: adam
lr: 0.004 optim_conf:
weight_decay: 1e-06 lr: 0.004
scheduler: warmuplr weight_decay: 1.0e-06
scheduler_conf: scheduler: warmuplr
warmup_steps: 25000 scheduler_conf:
lr_decay: 1.0 warmup_steps: 25000
log_interval: 5 lr_decay: 1.0
checkpoint: log_interval: 5
kbest_n: 50 checkpoint:
latest_n: 5 kbest_n: 50
latest_n: 5
decoding:
batch_size: 5
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.05 # second dev_manifest: data/manifest.dev
max_input_len: 30.0 # second test_manifest: data/manifest.test
min_output_len: 0.0 # tokens min_input_len: 0.05 # second
max_output_len: 400.0 # tokens max_input_len: 30.0 # second
min_output_input_ratio: 0.01 min_output_len: 0.0 # tokens
max_output_input_ratio: 20.0 max_output_len: 400.0 # tokens
min_output_input_ratio: 0.01
max_output_input_ratio: 20.0
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'spm' ###########################################
spm_model_prefix: data/lang_char/bpe_unigram_8000 vocab_filepath: data/lang_char/vocab.txt
mean_std_filepath: "" unit_type: 'spm'
# augmentation_config: conf/augmentation.json spm_model_prefix: data/lang_char/bpe_unigram_8000
batch_size: 10 mean_std_filepath: ""
raw_wav: True # use raw_wav or kaldi feature # augmentation_config: conf/augmentation.json
spectrum_type: fbank #linear, mfcc, fbank batch_size: 10
feat_dim: 80 raw_wav: True # use raw_wav or kaldi feature
delta_delta: False spectrum_type: fbank #linear, mfcc, fbank
dither: 1.0 feat_dim: 80
target_sample_rate: 16000 delta_delta: False
max_freq: None dither: 1.0
n_fft: None target_sample_rate: 16000
stride_ms: 10.0 max_freq: None
window_ms: 25.0 n_fft: None
use_dB_normalization: True stride_ms: 10.0
target_dB: -20 window_ms: 25.0
random_seed: 0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True random_seed: 0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture ############################################
model: # Network Architecture #
cmvn_file: "data/mean_std.json" ############################################
cmvn_file_type: "json" cmvn_file: "data/mean_std.json"
# encoder related cmvn_file_type: "json"
encoder: transformer # encoder related
encoder_conf: encoder: transformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: true input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
asr_weight: 0.5 asr_weight: 0.5
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
training: ###########################################
n_epoch: 120 # Training #
accum_grad: 2 ###########################################
global_grad_clip: 5.0 n_epoch: 120
optim: adam accum_grad: 2
optim_conf: global_grad_clip: 5.0
lr: 2.5 optim: adam
weight_decay: 1e-06 optim_conf:
scheduler: noam lr: 2.5
scheduler_conf: weight_decay: 1.0e-06
warmup_steps: 25000 scheduler: noam
lr_decay: 1.0 scheduler_conf:
log_interval: 50 warmup_steps: 25000
checkpoint: lr_decay: 1.0
kbest_n: 50 log_interval: 50
latest_n: 5 checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 5
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
alpha: 2.5
beta: 0.3
beam_size: 10
word_reward: 0.7
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
batch_size: 5
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
beam_size: 10
word_reward: 0.7
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
#! /usr/bin/env bash #! /usr/bin/env bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
for type in fullsentence; do for type in fullsentence; do
echo "decoding ${type}" echo "decoding ${type}"
...@@ -17,10 +18,11 @@ for type in fullsentence; do ...@@ -17,10 +18,11 @@ for type in fullsentence; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
......
...@@ -6,6 +6,7 @@ gpus=0,1,2,3 ...@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0 stage=0
stop_stage=50 stop_stage=50
conf_path=conf/transformer_mtl_noam.yaml conf_path=conf/transformer_mtl_noam.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=5 avg_num=5
data_path=./TED_EnZh # path to unzipped data data_path=./TED_EnZh # path to unzipped data
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
...@@ -32,7 +33,7 @@ fi ...@@ -32,7 +33,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
......
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train.tiny # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train.tiny
min_input_len: 5.0 # frame dev_manifest: data/manifest.dev
max_input_len: 3000.0 # frame test_manifest: data/manifest.test
min_output_len: 0.0 # tokens min_input_len: 5.0 # frame
max_output_len: 400.0 # tokens max_input_len: 3000.0 # frame
min_output_input_ratio: 0.01 min_output_len: 0.0 # tokens
max_output_input_ratio: 20.0 max_output_len: 400.0 # tokens
min_output_input_ratio: 0.01
max_output_input_ratio: 20.0
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'spm' ###########################################
spm_model_prefix: data/lang_char/bpe_unigram_8000 vocab_filepath: data/lang_char/vocab.txt
mean_std_filepath: "" unit_type: 'spm'
# augmentation_config: conf/augmentation.json spm_model_prefix: data/lang_char/bpe_unigram_8000
batch_size: 10 mean_std_filepath: ""
raw_wav: True # use raw_wav or kaldi feature # augmentation_config: conf/augmentation.json
spectrum_type: fbank #linear, mfcc, fbank batch_size: 10
feat_dim: 83 raw_wav: True # use raw_wav or kaldi feature
delta_delta: False spectrum_type: fbank #linear, mfcc, fbank
dither: 1.0 feat_dim: 83
target_sample_rate: 16000 delta_delta: False
max_freq: None dither: 1.0
n_fft: None target_sample_rate: 16000
stride_ms: 10.0 max_freq: None
window_ms: 25.0 n_fft: None
use_dB_normalization: True stride_ms: 10.0
target_dB: -20 window_ms: 25.0
random_seed: 0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True random_seed: 0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture ############################################
model: # Network Architecture #
cmvn_file: None ############################################
cmvn_file_type: "json" cmvn_file: None
# encoder related cmvn_file_type: "json"
encoder: transformer # encoder related
encoder_conf: encoder: transformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: true input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
asr_weight: 0.0 asr_weight: 0.0
ctc_weight: 0.0 ctc_weight: 0.0
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
training: ###########################################
n_epoch: 20 # Training #
accum_grad: 2 ###########################################
global_grad_clip: 5.0 n_epoch: 20
optim: adam accum_grad: 2
optim_conf: global_grad_clip: 5.0
lr: 0.004 optim: adam
weight_decay: 1e-06 optim_conf:
scheduler: warmuplr lr: 0.004
scheduler_conf: weight_decay: 1.0e-06
warmup_steps: 25000 scheduler: warmuplr
lr_decay: 1.0 scheduler_conf:
log_interval: 5 warmup_steps: 25000
checkpoint: lr_decay: 1.0
kbest_n: 50 log_interval: 5
latest_n: 5 checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 5
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
alpha: 2.5
beta: 0.3
beam_size: 10
word_reward: 0.7
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 5.0 # frame dev_manifest: data/manifest.dev
max_input_len: 3000.0 # frame test_manifest: data/manifest.test
min_output_len: 0.0 # tokens min_input_len: 5.0 # frame
max_output_len: 400.0 # tokens max_input_len: 3000.0 # frame
min_output_input_ratio: 0.01 min_output_len: 0.0 # tokens
max_output_input_ratio: 20.0 max_output_len: 400.0 # tokens
min_output_input_ratio: 0.01
max_output_input_ratio: 20.0
collator: ###########################################
vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt # Dataloader #
unit_type: 'spm' ###########################################
spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
mean_std_filepath: "" unit_type: 'spm'
# augmentation_config: conf/augmentation.json spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
batch_size: 10 mean_std_filepath: ""
raw_wav: True # use raw_wav or kaldi feature # augmentation_config: conf/augmentation.json
spectrum_type: fbank #linear, mfcc, fbank batch_size: 10
feat_dim: 83 raw_wav: True # use raw_wav or kaldi feature
delta_delta: False spectrum_type: fbank #linear, mfcc, fbank
dither: 1.0 feat_dim: 83
target_sample_rate: 16000 delta_delta: False
max_freq: None dither: 1.0
n_fft: None target_sample_rate: 16000
stride_ms: 10.0 max_freq: None
window_ms: 25.0 n_fft: None
use_dB_normalization: True stride_ms: 10.0
target_dB: -20 window_ms: 25.0
random_seed: 0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True random_seed: 0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture ############################################
model: # Network Architecture #
cmvn_file: None ############################################
cmvn_file_type: "json" cmvn_file: None
# encoder related cmvn_file_type: "json"
encoder: transformer # encoder related
encoder_conf: encoder: transformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: true input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
asr_weight: 0.5 asr_weight: 0.5
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
training: ###########################################
n_epoch: 20 # Training #
accum_grad: 2 ###########################################
global_grad_clip: 5.0 n_epoch: 20
optim: adam accum_grad: 2
optim_conf: global_grad_clip: 5.0
lr: 2.5 optim: adam
weight_decay: 1e-06 optim_conf:
scheduler: noam lr: 2.5
scheduler_conf: weight_decay: 1.0e-06
warmup_steps: 25000 scheduler: noam
lr_decay: 1.0 scheduler_conf:
log_interval: 5 warmup_steps: 25000
checkpoint: lr_decay: 1.0
kbest_n: 50 log_interval: 5
latest_n: 5 checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 5
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
alpha: 2.5
beta: 0.3
beam_size: 10
word_reward: 0.7
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
batch_size: 5
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
beam_size: 10
word_reward: 0.7
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
#! /usr/bin/env bash #! /usr/bin/env bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
for type in fullsentence; do for type in fullsentence; do
echo "decoding ${type}" echo "decoding ${type}"
...@@ -17,10 +18,11 @@ for type in fullsentence; do ...@@ -17,10 +18,11 @@ for type in fullsentence; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
......
...@@ -7,6 +7,7 @@ gpus=0,1,2,3 ...@@ -7,6 +7,7 @@ gpus=0,1,2,3
stage=1 stage=1
stop_stage=4 stop_stage=4
conf_path=conf/transformer_mtl_noam.yaml conf_path=conf/transformer_mtl_noam.yaml
decode_conf_path=conf/tuning/decode.yaml
ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model)
avg_num=5 avg_num=5
data_path=./TED_EnZh # path to unzipped data data_path=./TED_EnZh # path to unzipped data
...@@ -38,5 +39,5 @@ fi ...@@ -38,5 +39,5 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
\ No newline at end of file
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.0 # second dev_manifest: data/manifest.dev
max_input_len: 10.0 # second test_manifest: data/manifest.test
min_output_len: 0.0 # tokens
max_output_len: 150.0 # tokens
min_output_input_ratio: 0.005
max_output_input_ratio: 1000.0
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: "word" ###########################################
mean_std_filepath: "" vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/preprocess.yaml spm_model_prefix: ''
batch_size: 64 unit_type: "word"
raw_wav: True # use raw_wav or kaldi feature mean_std_filepath: ""
spectrum_type: fbank #linear, mfcc, fbank preprocess_config: conf/preprocess.yaml
feat_dim: 80 feat_dim: 80
delta_delta: False stride_ms: 10.0
dither: 1.0 window_ms: 25.0
target_sample_rate: 16000 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
max_freq: None batch_size: 64
n_fft: None maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
stride_ms: 10.0 maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
window_ms: 25.0 minibatches: 0 # for debug
use_dB_normalization: True batch_count: auto
target_dB: -20 batch_bins: 0
random_seed: 0 batch_frames_in: 0
keep_transcription_text: False batch_frames_out: 0
sortagrad: True batch_frames_inout: 0
shuffle_method: batch_shuffle num_workers: 0
num_workers: 2 subsampling_factor: 1
num_encs: 1
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: transformer # encoder related
encoder_conf: encoder: transformer
output_size: 128 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 128 # dimension of attention
linear_units: 1024 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 6 # the number of encoder blocks linear_units: 1024 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 6 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: true input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 1024 linear_units: 1024
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.5 ctc_weight: 0.5
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
training: ###########################################
n_epoch: 50 # Training #
accum_grad: 1 ###########################################
global_grad_clip: 5.0 n_epoch: 50
optim: adam accum_grad: 1
optim_conf: global_grad_clip: 5.0
lr: 0.004 optim: adam
weight_decay: 1e-06 optim_conf:
scheduler: warmuplr lr: 0.004
scheduler_conf: weight_decay: 1.0e-6
warmup_steps: 1200 scheduler: warmuplr
lr_decay: 1.0 scheduler_conf:
log_interval: 10 warmup_steps: 1200
checkpoint: lr_decay: 1.0
kbest_n: 50 log_interval: 10
latest_n: 5 checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
decode_batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
batch_size=1 batch_size=1
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
...@@ -20,9 +21,10 @@ mkdir -p ${output_dir} ...@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!" echo "Failed in ctc alignment!"
......
...@@ -7,8 +7,8 @@ stop_stage=50 ...@@ -7,8 +7,8 @@ stop_stage=50
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; . ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
chunk_mode=false chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
...@@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
...@@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
...@@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
......
...@@ -7,6 +7,7 @@ gpus=0,1,2,3 ...@@ -7,6 +7,7 @@ gpus=0,1,2,3
stage=0 stage=0
stop_stage=50 stop_stage=50
conf_path=conf/transformer.yaml conf_path=conf/transformer.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=10 avg_num=10
TIMIT_path=/path/to/TIMIT TIMIT_path=/path/to/TIMIT
...@@ -34,15 +35,15 @@ fi ...@@ -34,15 +35,15 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data # ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
# # export ckpt avg_n # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi fi
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.tiny # Data #
dev_manifest: data/manifest.tiny ###########################################
test_manifest: data/manifest.tiny train_manifest: data/manifest.tiny
min_input_len: 0.0 dev_manifest: data/manifest.tiny
max_input_len: 30.0 test_manifest: data/manifest.tiny
min_output_len: 0.0 min_input_len: 0.0
max_output_len: 400.0 max_input_len: 30.0
min_output_input_ratio: 0.05 min_output_len: 0.0
max_output_input_ratio: 10.0 max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator: ###########################################
mean_std_filepath: data/mean_std.json # Dataloader #
unit_type: char ###########################################
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
feat_dim: spm_model_prefix:
delta_delta: False spectrum_type: linear
stride_ms: 10.0 feat_dim:
window_ms: 20.0 delta_delta: False
n_fft: None stride_ms: 10.0
max_freq: None window_ms: 20.0
target_sample_rate: 16000 n_fft: None
use_dB_normalization: True max_freq: None
target_dB: -20 target_sample_rate: 16000
dither: 1.0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True dither: 1.0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
batch_size: 4 shuffle_method: batch_shuffle
num_workers: 2
batch_size: 4
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 2048 num_conv_layers: 2
use_gru: False num_rnn_layers: 3
share_rnn_weights: True rnn_layer_size: 2048
blank_id: 0 use_gru: False
share_rnn_weights: True
blank_id: 0
training: ###########################################
n_epoch: 5 # Training #
accum_grad: 1 ###########################################
lr: 1e-5 n_epoch: 5
lr_decay: 0.8 accum_grad: 1
weight_decay: 1e-06 lr: 1.0e-5
global_grad_clip: 5.0 lr_decay: 0.8
log_interval: 1 weight_decay: 1.0e-06
checkpoint: global_grad_clip: 5.0
kbest_n: 3 log_interval: 1
latest_n: 2 checkpoint:
kbest_n: 3
latest_n: 2
decoding:
batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.tiny # Data #
dev_manifest: data/manifest.tiny ###########################################
test_manifest: data/manifest.tiny train_manifest: data/manifest.tiny
min_input_len: 0.0 dev_manifest: data/manifest.tiny
max_input_len: 30.0 test_manifest: data/manifest.tiny
min_output_len: 0.0 min_input_len: 0.0
max_output_len: 400.0 max_input_len: 30.0
min_output_input_ratio: 0.05 min_output_len: 0.0
max_output_input_ratio: 10.0 max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator: ###########################################
mean_std_filepath: data/mean_std.json # Dataloader #
unit_type: char ###########################################
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
feat_dim: spm_model_prefix:
delta_delta: False spectrum_type: linear
stride_ms: 10.0 feat_dim:
window_ms: 20.0 delta_delta: False
n_fft: None stride_ms: 10.0
max_freq: None window_ms: 20.0
target_sample_rate: 16000 n_fft: None
use_dB_normalization: True max_freq: None
target_dB: -20 target_sample_rate: 16000
dither: 1.0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True dither: 1.0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 0 sortagrad: True
batch_size: 4 shuffle_method: batch_shuffle
num_workers: 0
batch_size: 4
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 4 ############################################
rnn_layer_size: 2048 num_conv_layers: 2
rnn_direction: forward num_rnn_layers: 4
num_fc_layers: 2 rnn_layer_size: 2048
fc_layers_size_list: 512, 256 rnn_direction: forward
use_gru: True num_fc_layers: 2
blank_id: 0 fc_layers_size_list: 512, 256
use_gru: True
blank_id: 0
training: ###########################################
n_epoch: 5 # Training #
accum_grad: 1 ###########################################
lr: 1e-5 n_epoch: 5
lr_decay: 1.0 accum_grad: 1
weight_decay: 1e-06 lr: 1e-5
global_grad_clip: 5.0 lr_decay: 1.0
log_interval: 1 weight_decay: 1e-06
checkpoint: global_grad_clip: 5.0
kbest_n: 3 log_interval: 1
latest_n: 2 checkpoint:
kbest_n: 3
latest_n: 2
decoding:
batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
decode_batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
decode_batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
model_type=$4
# download language model # download language model
bash local/download_lm_en.sh bash local/download_lm_en.sh
...@@ -21,6 +22,7 @@ fi ...@@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} --model_type ${model_type}
......
...@@ -6,6 +6,7 @@ gpus=0 ...@@ -6,6 +6,7 @@ gpus=0
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml conf_path=conf/deepspeech2.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=1 avg_num=1
model_type=offline model_type=offline
...@@ -32,7 +33,7 @@ fi ...@@ -32,7 +33,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
......
# https://yaml.org/type/float.html ############################################
data: # Network Architecture #
train_manifest: data/manifest.tiny ############################################
dev_manifest: data/manifest.tiny cmvn_file: "data/mean_std.json"
test_manifest: data/manifest.tiny cmvn_file_type: "json"
min_input_len: 0.5 # second # encoder related
max_input_len: 30.0 # second encoder: conformer
min_output_len: 0.0 # tokens encoder_conf:
max_output_len: 400.0 # tokens output_size: 256 # dimension of attention
min_output_input_ratio: 0.05 attention_heads: 4
max_output_input_ratio: 10.0 linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
collator: dropout_rate: 0.1
mean_std_filepath: "" positional_dropout_rate: 0.1
vocab_filepath: data/lang_char/vocab.txt attention_dropout_rate: 0.0
unit_type: 'spm' input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
spm_model_prefix: 'data/lang_char/bpe_unigram_200' normalize_before: True
augmentation_config: conf/preprocess.yaml use_cnn_module: True
batch_size: 4 cnn_module_kernel: 15
raw_wav: True # use raw_wav or kaldi feature activation_type: 'swish'
spectrum_type: fbank #linear, mfcc, fbank pos_enc_layer_type: 'rel_pos'
feat_dim: 80 selfattention_layer_type: 'rel_selfattn'
delta_delta: False causal: True
dither: 1.0 use_dynamic_chunk: True
target_sample_rate: 16000 cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
max_freq: None use_dynamic_left_chunk: false
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: True
use_cnn_module: True
cnn_module_kernel: 15
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: True
use_dynamic_chunk: True
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention # decoder related
model_conf: decoder: transformer
ctc_weight: 0.3 decoder_conf:
lsm_weight: 0.1 # label smoothing option attention_heads: 4
length_normalized_loss: false linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 5
accum_grad: 1
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1e-06
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 1
checkpoint:
kbest_n: 10
latest_n: 1
###########################################
# Data #
###########################################
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
decoding:
batch_size: 64 ###########################################
error_rate_type: wer # Dataloader #
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' ###########################################
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm mean_std_filepath: ""
alpha: 2.5 vocab_filepath: data/lang_char/vocab.txt
beta: 0.3 unit_type: 'spm'
beam_size: 10 spm_model_prefix: 'data/lang_char/bpe_unigram_200'
cutoff_prob: 1.0 preprocess_config: conf/preprocess.yaml
cutoff_top_n: 0 feat_dim: 80
num_proc_bsearch: 8 stride_ms: 10.0
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. window_ms: 25.0
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
# <0: for decoding, use full chunk. batch_size: 4
# >0: for decoding, use fixed chunk size as set. maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
# 0: used for training, it's prohibited here. maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. minibatches: 0 # for debug
simulate_streaming: False # simulate streaming inference. Defaults to False. batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
augmentation_config: conf/preprocess.yaml
num_workers: 0
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
n_epoch: 5
accum_grad: 1
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1.0e-06
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 1
checkpoint:
kbest_n: 10
latest_n: 1
# https://yaml.org/type/float.html ############################################
data: # Network Architecture #
train_manifest: data/manifest.tiny ############################################
dev_manifest: data/manifest.tiny cmvn_file: "data/mean_std.json"
test_manifest: data/manifest.tiny cmvn_file_type: "json"
min_input_len: 0.5 # second # encoder related
max_input_len: 20.0 # second encoder: transformer
min_output_len: 0.0 # tokens encoder_conf:
max_output_len: 400.0 # tokens output_size: 256 # dimension of attention
min_output_input_ratio: 0.05 attention_heads: 4
max_output_input_ratio: 10.0 linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
collator: dropout_rate: 0.1
mean_std_filepath: "" positional_dropout_rate: 0.1
vocab_filepath: data/lang_char/vocab.txt attention_dropout_rate: 0.0
unit_type: 'spm' input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
spm_model_prefix: 'data/lang_char/bpe_unigram_200' normalize_before: true
augmentation_config: conf/preprocess.yaml use_dynamic_chunk: true
batch_size: 4 use_dynamic_left_chunk: false
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
training: # https://yaml.org/type/float.html
n_epoch: 5 ###########################################
accum_grad: 1 # Data #
global_grad_clip: 5.0 ###########################################
optim: adam train_manifest: data/manifest.tiny
optim_conf: dev_manifest: data/manifest.tiny
lr: 0.002 test_manifest: data/manifest.tiny
weight_decay: 1e-06
scheduler: warmuplr ###########################################
scheduler_conf: # Dataloader #
warmup_steps: 25000 ###########################################
lr_decay: 1.0 mean_std_filepath: ""
log_interval: 1 vocab_filepath: data/lang_char/vocab.txt
checkpoint: unit_type: 'spm'
kbest_n: 10 spm_model_prefix: 'data/lang_char/bpe_unigram_200'
latest_n: 1 preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 4
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1
decoding:
batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
###########################################
# Training #
###########################################
n_epoch: 5
accum_grad: 1
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1.0e-06
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 1
checkpoint:
kbest_n: 10
latest_n: 1
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
min_input_len: 0.5 # second
max_input_len: 20.0 # second
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
###########################################
# Dataloader #
###########################################
mean_std_filepath: ""
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
############################################ ############################################
# Network Architecture # # Network Architecture #
############################################ ############################################
...@@ -83,7 +41,41 @@ model_conf: ...@@ -83,7 +41,41 @@ model_conf:
########################################### ###########################################
# training # # Data #
###########################################
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
###########################################
# Dataloader #
###########################################
mean_std_filepath: ""
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_200'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 4
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
########################################### ###########################################
n_epoch: 5 n_epoch: 5
accum_grad: 4 accum_grad: 4
...@@ -91,7 +83,7 @@ global_grad_clip: 5.0 ...@@ -91,7 +83,7 @@ global_grad_clip: 5.0
optim: adam optim: adam
optim_conf: optim_conf:
lr: 0.002 lr: 0.002
weight_decay: 1e-06 weight_decay: 1.0e-06
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
......
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
min_input_len: 0.5 # second
max_input_len: 20.0 # second
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
###########################################
# Dataloader #
###########################################
mean_std_filepath: data/mean_std.json
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
############################################ ############################################
# Network Architecture # # Network Architecture #
############################################ ############################################
...@@ -74,9 +34,41 @@ model_conf: ...@@ -74,9 +34,41 @@ model_conf:
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
###########################################
# Data #
###########################################
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
###########################################
# Dataloader #
###########################################
mean_std_filepath: data/mean_std.json
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_200'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 4
maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1
########################################### ###########################################
# training # # Training #
########################################### ###########################################
n_epoch: 5 n_epoch: 5
accum_grad: 1 accum_grad: 1
...@@ -84,7 +76,7 @@ global_grad_clip: 5.0 ...@@ -84,7 +76,7 @@ global_grad_clip: 5.0
optim: adam optim: adam
optim_conf: optim_conf:
lr: 0.002 lr: 0.002
weight_decay: 1e-06 weight_decay: 1.0e-06
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
......
decode_batch_size: 8 #64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
decode_batch_size: 8 #64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
batch_size=1 batch_size=1
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
...@@ -20,9 +21,10 @@ mkdir -p ${output_dir} ...@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!" echo "Failed in ctc alignment!"
......
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
chunk_mode=false chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
...@@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do ...@@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
...@@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do ...@@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
......
...@@ -6,6 +6,7 @@ gpus=0 ...@@ -6,6 +6,7 @@ gpus=0
stage=0 stage=0
stop_stage=50 stop_stage=50
conf_path=conf/transformer.yaml conf_path=conf/transformer.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=1 avg_num=1
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
...@@ -31,12 +32,12 @@ fi ...@@ -31,12 +32,12 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data # ctc alignment of test data
CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
......
# network architecture ############################################
model: # Network Architecture #
# encoder related ############################################
encoder: conformer cmvn_file:
encoder_conf: cmvn_file_type: "json"
output_size: 512 # dimension of attention # encoder related
attention_heads: 8 encoder: conformer
linear_units: 2048 # the number of units of position-wise feed forward encoder_conf:
num_blocks: 12 # the number of encoder blocks output_size: 512 # dimension of attention
dropout_rate: 0.1 attention_heads: 8
positional_dropout_rate: 0.1 linear_units: 2048 # the number of units of position-wise feed forward
attention_dropout_rate: 0.0 num_blocks: 12 # the number of encoder blocks
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 dropout_rate: 0.1
normalize_before: True positional_dropout_rate: 0.1
use_cnn_module: True attention_dropout_rate: 0.0
cnn_module_kernel: 15 input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
cnn_module_norm: layer_norm normalize_before: True
activation_type: swish use_cnn_module: True
pos_enc_layer_type: rel_pos cnn_module_kernel: 15
selfattention_layer_type: rel_selfattn cnn_module_norm: layer_norm
activation_type: swish
pos_enc_layer_type: rel_pos
selfattention_layer_type: rel_selfattn
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 8 attention_heads: 8
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.1 # second dev_manifest: data/manifest.dev
max_input_len: 12.0 # second test_manifest: data/manifest.test
min_output_len: 1.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'char' ###########################################
spm_model_prefix: '' vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/preprocess.yaml unit_type: 'char'
batch_size: 64 preprocess_config: conf/preprocess.yaml
raw_wav: True # use raw_wav or kaldi feature spm_model_prefix: ''
spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80
feat_dim: 80 stride_ms: 10.0
delta_delta: False window_ms: 25.0
dither: 1.0 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
target_sample_rate: 16000 batch_size: 64
max_freq: None maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
n_fft: None maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
stride_ms: 10.0 minibatches: 0 # for debug
window_ms: 25.0 batch_count: auto
use_dB_normalization: True batch_bins: 0
target_dB: -20 batch_frames_in: 0
random_seed: 0 batch_frames_out: 0
keep_transcription_text: False batch_frames_inout: 0
sortagrad: True num_workers: 0
shuffle_method: batch_shuffle subsampling_factor: 1
num_workers: 2 num_encs: 1
training: ###########################################
n_epoch: 240 # Training #
accum_grad: 16 ###########################################
global_grad_clip: 5.0 n_epoch: 240
log_interval: 100 accum_grad: 16
checkpoint: global_grad_clip: 5.0
kbest_n: 50 log_interval: 100
latest_n: 5 checkpoint:
optim: adam kbest_n: 50
optim_conf: latest_n: 5
lr: 0.001 optim: adam
weight_decay: 1e-6 optim_conf:
scheduler: warmuplr lr: 0.001
scheduler_conf: weight_decay: 1.0e-6
warmup_steps: 5000 scheduler: warmuplr
lr_decay: 1.0 scheduler_conf:
warmup_steps: 5000
lr_decay: 1.0
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
chunk_mode=false chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
...@@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do ...@@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
...@@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do ...@@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
......
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix audio_file" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
exit -1 exit -1
fi fi
...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ...@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
audio_file=$3 ckpt_prefix=$3
audio_file=$4
mkdir -p data mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
...@@ -43,10 +44,11 @@ for type in attention_rescoring; do ...@@ -43,10 +44,11 @@ for type in attention_rescoring; do
python3 -u ${BIN_DIR}/test_wav.py \ python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} \ --opts decode.decode_batch_size ${batch_size} \
--audio_file ${audio_file} --audio_file ${audio_file}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
......
...@@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7 ...@@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/conformer.yaml conf_path=conf/conformer.yaml
decode_conf_path=conf/tuning/decode.yaml
average_checkpoint=true average_checkpoint=true
avg_num=10 avg_num=10
...@@ -36,12 +36,12 @@ fi ...@@ -36,12 +36,12 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data # ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
...@@ -51,5 +51,5 @@ fi ...@@ -51,5 +51,5 @@ fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# test a single .wav file # test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi fi
...@@ -80,13 +80,13 @@ def inference(config, args): ...@@ -80,13 +80,13 @@ def inference(config, args):
def start_server(config, args): def start_server(config, args):
"""Start the ASR server""" """Start the ASR server"""
config.defrost() config.defrost()
config.data.manifest = config.data.test_manifest config.manifest = config.test_manifest
dataset = ManifestDataset.from_config(config) dataset = ManifestDataset.from_config(config)
config.collator.augmentation_config = "" config.augmentation_config = ""
config.collator.keep_transcription_text = True config.keep_transcription_text = True
config.collator.batch_size = 1 config.batch_size = 1
config.collator.num_workers = 0 config.num_workers = 0
collate_fn = SpeechCollator.from_config(config) collate_fn = SpeechCollator.from_config(config)
test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
...@@ -105,14 +105,14 @@ def start_server(config, args): ...@@ -105,14 +105,14 @@ def start_server(config, args):
paddle.to_tensor(audio), paddle.to_tensor(audio),
paddle.to_tensor(audio_len), paddle.to_tensor(audio_len),
vocab_list=test_loader.collate_fn.vocab_list, vocab_list=test_loader.collate_fn.vocab_list,
decoding_method=config.decoding.decoding_method, decoding_method=config.decode.decoding_method,
lang_model_path=config.decoding.lang_model_path, lang_model_path=config.decode.lang_model_path,
beam_alpha=config.decoding.alpha, beam_alpha=config.decode.alpha,
beam_beta=config.decoding.beta, beam_beta=config.decode.beta,
beam_size=config.decoding.beam_size, beam_size=config.decode.beam_size,
cutoff_prob=config.decoding.cutoff_prob, cutoff_prob=config.decode.cutoff_prob,
cutoff_top_n=config.decoding.cutoff_top_n, cutoff_top_n=config.decode.cutoff_top_n,
num_processes=config.decoding.num_proc_bsearch) num_processes=config.decode.num_proc_bsearch)
return result_transcript[0] return result_transcript[0]
# warming up with utterances sampled from Librispeech # warming up with utterances sampled from Librispeech
...@@ -179,12 +179,16 @@ if __name__ == "__main__": ...@@ -179,12 +179,16 @@ if __name__ == "__main__":
config = get_cfg_defaults() config = get_cfg_defaults()
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()
print(config) print(config)
args.warmup_manifest = config.data.test_manifest args.warmup_manifest = config.test_manifest
print_arguments(args, globals()) print_arguments(args, globals())
if args.dump_config: if args.dump_config:
......
...@@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments ...@@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments
def start_server(config, args): def start_server(config, args):
"""Start the ASR server""" """Start the ASR server"""
config.defrost() config.defrost()
config.data.manifest = config.data.test_manifest config.manifest = config.test_manifest
dataset = ManifestDataset.from_config(config) dataset = ManifestDataset.from_config(config)
config.collator.augmentation_config = "" config.augmentation_config = ""
config.collator.keep_transcription_text = True config.keep_transcription_text = True
config.collator.batch_size = 1 config.batch_size = 1
config.collator.num_workers = 0 config.num_workers = 0
collate_fn = SpeechCollator.from_config(config) collate_fn = SpeechCollator.from_config(config)
test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
...@@ -62,14 +62,14 @@ def start_server(config, args): ...@@ -62,14 +62,14 @@ def start_server(config, args):
paddle.to_tensor(audio), paddle.to_tensor(audio),
paddle.to_tensor(audio_len), paddle.to_tensor(audio_len),
vocab_list=test_loader.collate_fn.vocab_list, vocab_list=test_loader.collate_fn.vocab_list,
decoding_method=config.decoding.decoding_method, decoding_method=config.decode.decoding_method,
lang_model_path=config.decoding.lang_model_path, lang_model_path=config.decode.lang_model_path,
beam_alpha=config.decoding.alpha, beam_alpha=config.decode.alpha,
beam_beta=config.decoding.beta, beam_beta=config.decode.beta,
beam_size=config.decoding.beam_size, beam_size=config.decode.beam_size,
cutoff_prob=config.decoding.cutoff_prob, cutoff_prob=config.decode.cutoff_prob,
cutoff_top_n=config.decoding.cutoff_top_n, cutoff_top_n=config.decode.cutoff_top_n,
num_processes=config.decoding.num_proc_bsearch) num_processes=config.decode.num_proc_bsearch)
return result_transcript[0] return result_transcript[0]
# warming up with utterances sampled from Librispeech # warming up with utterances sampled from Librispeech
...@@ -114,12 +114,16 @@ if __name__ == "__main__": ...@@ -114,12 +114,16 @@ if __name__ == "__main__":
config = get_cfg_defaults() config = get_cfg_defaults()
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()
print(config) print(config)
args.warmup_manifest = config.data.test_manifest args.warmup_manifest = config.test_manifest
print_arguments(args, globals()) print_arguments(args, globals())
if args.dump_config: if args.dump_config:
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Evaluation for DeepSpeech2 model.""" """Evaluation for DeepSpeech2 model."""
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.training.cli import default_argument_parser
...@@ -44,6 +46,10 @@ if __name__ == "__main__": ...@@ -44,6 +46,10 @@ if __name__ == "__main__":
config = get_cfg_defaults(args.model_type) config = get_cfg_defaults(args.model_type)
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Evaluation for DeepSpeech2 model.""" """Evaluation for DeepSpeech2 model."""
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester
from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.training.cli import default_argument_parser
...@@ -49,6 +51,10 @@ if __name__ == "__main__": ...@@ -49,6 +51,10 @@ if __name__ == "__main__":
config = get_cfg_defaults(args.model_type) config = get_cfg_defaults(args.model_type)
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()
......
...@@ -18,6 +18,7 @@ from pathlib import Path ...@@ -18,6 +18,7 @@ from pathlib import Path
import paddle import paddle
import soundfile import soundfile
from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
...@@ -41,7 +42,7 @@ class DeepSpeech2Tester_hub(): ...@@ -41,7 +42,7 @@ class DeepSpeech2Tester_hub():
self.audio_file = args.audio_file self.audio_file = args.audio_file
self.collate_fn_test = SpeechCollator.from_config(config) self.collate_fn_test = SpeechCollator.from_config(config)
self._text_featurizer = TextFeaturizer( self._text_featurizer = TextFeaturizer(
unit_type=config.collator.unit_type, vocab=None) unit_type=config.unit_type, vocab=None)
def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
result_transcripts = self.model.decode( result_transcripts = self.model.decode(
...@@ -74,7 +75,7 @@ class DeepSpeech2Tester_hub(): ...@@ -74,7 +75,7 @@ class DeepSpeech2Tester_hub():
audio = paddle.unsqueeze(audio, axis=0) audio = paddle.unsqueeze(audio, axis=0)
vocab_list = collate_fn_test.vocab_list vocab_list = collate_fn_test.vocab_list
result_transcripts = self.compute_result_transcripts( result_transcripts = self.compute_result_transcripts(
audio, audio_len, vocab_list, cfg.decoding) audio, audio_len, vocab_list, cfg.decode)
logger.info("result_transcripts: " + result_transcripts[0]) logger.info("result_transcripts: " + result_transcripts[0])
def run_test(self): def run_test(self):
...@@ -110,13 +111,13 @@ class DeepSpeech2Tester_hub(): ...@@ -110,13 +111,13 @@ class DeepSpeech2Tester_hub():
def setup_model(self): def setup_model(self):
config = self.config.clone() config = self.config.clone()
with UpdateConfig(config): with UpdateConfig(config):
config.model.input_dim = self.collate_fn_test.feature_size config.input_dim = self.collate_fn_test.feature_size
config.model.output_dim = self.collate_fn_test.vocab_size config.output_dim = self.collate_fn_test.vocab_size
if self.args.model_type == 'offline': if self.args.model_type == 'offline':
model = DeepSpeech2Model.from_config(config.model) model = DeepSpeech2Model.from_config(config)
elif self.args.model_type == 'online': elif self.args.model_type == 'online':
model = DeepSpeech2ModelOnline.from_config(config.model) model = DeepSpeech2ModelOnline.from_config(config)
else: else:
raise Exception("wrong model type") raise Exception("wrong model type")
...@@ -134,8 +135,8 @@ class DeepSpeech2Tester_hub(): ...@@ -134,8 +135,8 @@ class DeepSpeech2Tester_hub():
self.checkpoint_dir = checkpoint_dir self.checkpoint_dir = checkpoint_dir
self.checkpoint = Checkpoint( self.checkpoint = Checkpoint(
kbest_n=self.config.training.checkpoint.kbest_n, kbest_n=self.config.checkpoint.kbest_n,
latest_n=self.config.training.checkpoint.latest_n) latest_n=self.config.checkpoint.latest_n)
def resume(self): def resume(self):
"""Resume from the checkpoint at checkpoints in the output """Resume from the checkpoint at checkpoints in the output
...@@ -190,6 +191,10 @@ if __name__ == "__main__": ...@@ -190,6 +191,10 @@ if __name__ == "__main__":
config = get_cfg_defaults(args.model_type) config = get_cfg_defaults(args.model_type)
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()
......
...@@ -23,17 +23,6 @@ from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline ...@@ -23,17 +23,6 @@ from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
def get_cfg_defaults(model_type='offline'): def get_cfg_defaults(model_type='offline'):
_C = CfgNode() _C = CfgNode()
_C.data = ManifestDataset.params()
_C.collator = SpeechCollator.params()
_C.training = DeepSpeech2Trainer.params()
_C.decoding = DeepSpeech2Tester.params()
if model_type == 'offline':
_C.model = DeepSpeech2Model.params()
else:
_C.model = DeepSpeech2ModelOnline.params()
"""Get a yacs CfgNode object with default values for my_project."""
# Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern
config = _C.clone() config = _C.clone()
config.set_new_allowed(True) config.set_new_allowed(True)
return config return config
...@@ -69,8 +69,8 @@ class DeepSpeech2Trainer(Trainer): ...@@ -69,8 +69,8 @@ class DeepSpeech2Trainer(Trainer):
super().__init__(config, args) super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
batch_size = self.config.collator.batch_size batch_size = self.config.batch_size
accum_grad = self.config.training.accum_grad accum_grad = self.config.accum_grad
start = time.time() start = time.time()
...@@ -133,7 +133,7 @@ class DeepSpeech2Trainer(Trainer): ...@@ -133,7 +133,7 @@ class DeepSpeech2Trainer(Trainer):
total_loss += float(loss) * num_utts total_loss += float(loss) * num_utts
valid_losses['val_loss'].append(float(loss)) valid_losses['val_loss'].append(float(loss))
if (i + 1) % self.config.training.log_interval == 0: if (i + 1) % self.config.log_interval == 0:
valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
valid_dump['val_history_loss'] = total_loss / num_seen_utts valid_dump['val_history_loss'] = total_loss / num_seen_utts
...@@ -154,16 +154,16 @@ class DeepSpeech2Trainer(Trainer): ...@@ -154,16 +154,16 @@ class DeepSpeech2Trainer(Trainer):
config = self.config.clone() config = self.config.clone()
with UpdateConfig(config): with UpdateConfig(config):
if self.train: if self.train:
config.model.input_dim = self.train_loader.collate_fn.feature_size config.input_dim = self.train_loader.collate_fn.feature_size
config.model.output_dim = self.train_loader.collate_fn.vocab_size config.output_dim = self.train_loader.collate_fn.vocab_size
else: else:
config.model.input_dim = self.test_loader.collate_fn.feature_size config.input_dim = self.test_loader.collate_fn.feature_size
config.model.output_dim = self.test_loader.collate_fn.vocab_size config.output_dim = self.test_loader.collate_fn.vocab_size
if self.args.model_type == 'offline': if self.args.model_type == 'offline':
model = DeepSpeech2Model.from_config(config.model) model = DeepSpeech2Model.from_config(config)
elif self.args.model_type == 'online': elif self.args.model_type == 'online':
model = DeepSpeech2ModelOnline.from_config(config.model) model = DeepSpeech2ModelOnline.from_config(config)
else: else:
raise Exception("wrong model type") raise Exception("wrong model type")
if self.parallel: if self.parallel:
...@@ -177,17 +177,13 @@ class DeepSpeech2Trainer(Trainer): ...@@ -177,17 +177,13 @@ class DeepSpeech2Trainer(Trainer):
if not self.train: if not self.train:
return return
grad_clip = ClipGradByGlobalNormWithLog( grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip)
config.training.global_grad_clip)
lr_scheduler = paddle.optimizer.lr.ExponentialDecay( lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
learning_rate=config.training.lr, learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
gamma=config.training.lr_decay,
verbose=True)
optimizer = paddle.optimizer.Adam( optimizer = paddle.optimizer.Adam(
learning_rate=lr_scheduler, learning_rate=lr_scheduler,
parameters=model.parameters(), parameters=model.parameters(),
weight_decay=paddle.regularizer.L2Decay( weight_decay=paddle.regularizer.L2Decay(config.weight_decay),
config.training.weight_decay),
grad_clip=grad_clip) grad_clip=grad_clip)
self.optimizer = optimizer self.optimizer = optimizer
self.lr_scheduler = lr_scheduler self.lr_scheduler = lr_scheduler
...@@ -198,66 +194,67 @@ class DeepSpeech2Trainer(Trainer): ...@@ -198,66 +194,67 @@ class DeepSpeech2Trainer(Trainer):
config.defrost() config.defrost()
if self.train: if self.train:
# train # train
config.data.manifest = config.data.train_manifest config.manifest = config.train_manifest
train_dataset = ManifestDataset.from_config(config) train_dataset = ManifestDataset.from_config(config)
if self.parallel: if self.parallel:
batch_sampler = SortagradDistributedBatchSampler( batch_sampler = SortagradDistributedBatchSampler(
train_dataset, train_dataset,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
num_replicas=None, num_replicas=None,
rank=None, rank=None,
shuffle=True, shuffle=True,
drop_last=True, drop_last=True,
sortagrad=config.collator.sortagrad, sortagrad=config.sortagrad,
shuffle_method=config.collator.shuffle_method) shuffle_method=config.shuffle_method)
else: else:
batch_sampler = SortagradBatchSampler( batch_sampler = SortagradBatchSampler(
train_dataset, train_dataset,
shuffle=True, shuffle=True,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
drop_last=True, drop_last=True,
sortagrad=config.collator.sortagrad, sortagrad=config.sortagrad,
shuffle_method=config.collator.shuffle_method) shuffle_method=config.shuffle_method)
config.collator.keep_transcription_text = False config.keep_transcription_text = False
collate_fn_train = SpeechCollator.from_config(config) collate_fn_train = SpeechCollator.from_config(config)
self.train_loader = DataLoader( self.train_loader = DataLoader(
train_dataset, train_dataset,
batch_sampler=batch_sampler, batch_sampler=batch_sampler,
collate_fn=collate_fn_train, collate_fn=collate_fn_train,
num_workers=config.collator.num_workers) num_workers=config.num_workers)
# dev # dev
config.data.manifest = config.data.dev_manifest config.manifest = config.dev_manifest
dev_dataset = ManifestDataset.from_config(config) dev_dataset = ManifestDataset.from_config(config)
config.collator.augmentation_config = "" config.augmentation_config = ""
config.collator.keep_transcription_text = False config.keep_transcription_text = False
collate_fn_dev = SpeechCollator.from_config(config) collate_fn_dev = SpeechCollator.from_config(config)
self.valid_loader = DataLoader( self.valid_loader = DataLoader(
dev_dataset, dev_dataset,
batch_size=int(config.collator.batch_size), batch_size=int(config.batch_size),
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=collate_fn_dev, collate_fn=collate_fn_dev,
num_workers=config.collator.num_workers) num_workers=config.num_workers)
logger.info("Setup train/valid Dataloader!") logger.info("Setup train/valid Dataloader!")
else: else:
# test # test
config.data.manifest = config.data.test_manifest config.manifest = config.test_manifest
test_dataset = ManifestDataset.from_config(config) test_dataset = ManifestDataset.from_config(config)
config.collator.augmentation_config = "" config.augmentation_config = ""
config.collator.keep_transcription_text = True config.keep_transcription_text = True
collate_fn_test = SpeechCollator.from_config(config) collate_fn_test = SpeechCollator.from_config(config)
decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1)
self.test_loader = DataLoader( self.test_loader = DataLoader(
test_dataset, test_dataset,
batch_size=config.decoding.batch_size, batch_size=decode_batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=collate_fn_test, collate_fn=collate_fn_test,
num_workers=config.collator.num_workers) num_workers=config.num_workers)
logger.info("Setup test Dataloader!") logger.info("Setup test Dataloader!")
...@@ -286,7 +283,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): ...@@ -286,7 +283,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def __init__(self, config, args): def __init__(self, config, args):
super().__init__(config, args) super().__init__(config, args)
self._text_featurizer = TextFeaturizer( self._text_featurizer = TextFeaturizer(
unit_type=config.collator.unit_type, vocab=None) unit_type=config.unit_type, vocab=None)
def ordid2token(self, texts, texts_len): def ordid2token(self, texts, texts_len):
""" ord() id to chr() chr """ """ ord() id to chr() chr """
...@@ -304,17 +301,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): ...@@ -304,17 +301,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
texts, texts,
texts_len, texts_len,
fout=None): fout=None):
cfg = self.config.decoding decode_cfg = self.config.decode
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
vocab_list = self.test_loader.collate_fn.vocab_list vocab_list = self.test_loader.collate_fn.vocab_list
target_transcripts = self.ordid2token(texts, texts_len) target_transcripts = self.ordid2token(texts, texts_len)
result_transcripts = self.compute_result_transcripts(audio, audio_len, result_transcripts = self.compute_result_transcripts(
vocab_list, cfg) audio, audio_len, vocab_list, decode_cfg)
for utt, target, result in zip(utts, target_transcripts, for utt, target, result in zip(utts, target_transcripts,
result_transcripts): result_transcripts):
...@@ -327,29 +324,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): ...@@ -327,29 +324,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
logger.info(f"Utt: {utt}") logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}") logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}") logger.info(f"Hyp: {result}")
logger.info("Current error rate [%s] = %f" % logger.info(
(cfg.error_rate_type, error_rate_func(target, result))) "Current error rate [%s] = %f" %
(decode_cfg.error_rate_type, error_rate_func(target, result)))
return dict( return dict(
errors_sum=errors_sum, errors_sum=errors_sum,
len_refs=len_refs, len_refs=len_refs,
num_ins=num_ins, num_ins=num_ins,
error_rate=errors_sum / len_refs, error_rate=errors_sum / len_refs,
error_rate_type=cfg.error_rate_type) error_rate_type=decode_cfg.error_rate_type)
def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): def compute_result_transcripts(self, audio, audio_len, vocab_list,
decode_cfg):
result_transcripts = self.model.decode( result_transcripts = self.model.decode(
audio, audio,
audio_len, audio_len,
vocab_list, vocab_list,
decoding_method=cfg.decoding_method, decoding_method=decode_cfg.decoding_method,
lang_model_path=cfg.lang_model_path, lang_model_path=decode_cfg.lang_model_path,
beam_alpha=cfg.alpha, beam_alpha=decode_cfg.alpha,
beam_beta=cfg.beta, beam_beta=decode_cfg.beta,
beam_size=cfg.beam_size, beam_size=decode_cfg.beam_size,
cutoff_prob=cfg.cutoff_prob, cutoff_prob=decode_cfg.cutoff_prob,
cutoff_top_n=cfg.cutoff_top_n, cutoff_top_n=decode_cfg.cutoff_top_n,
num_processes=cfg.num_proc_bsearch) num_processes=decode_cfg.num_proc_bsearch)
return result_transcripts return result_transcripts
...@@ -358,7 +357,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): ...@@ -358,7 +357,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def test(self): def test(self):
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
self.model.eval() self.model.eval()
cfg = self.config
error_rate_type = None error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
with jsonlines.open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
...@@ -412,11 +410,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): ...@@ -412,11 +410,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
if self.args.enable_auto_log is True: if self.args.enable_auto_log is True:
from paddlespeech.s2t.utils.log import Autolog from paddlespeech.s2t.utils.log import Autolog
self.autolog = Autolog( self.autolog = Autolog(
batch_size=self.config.decoding.batch_size, batch_size=self.config.decode.decode_batch_size,
model_name="deepspeech2", model_name="deepspeech2",
model_precision="fp32").getlog() model_precision="fp32").getlog()
self.model.eval() self.model.eval()
cfg = self.config
error_rate_type = None error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
with jsonlines.open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
...@@ -441,7 +438,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): ...@@ -441,7 +438,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
if self.args.enable_auto_log is True: if self.args.enable_auto_log is True:
self.autolog.report() self.autolog.report()
def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): def compute_result_transcripts(self, audio, audio_len, vocab_list,
decode_cfg):
if self.args.model_type == "online": if self.args.model_type == "online":
output_probs, output_lens = self.static_forward_online(audio, output_probs, output_lens = self.static_forward_online(audio,
audio_len) audio_len)
...@@ -454,13 +452,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): ...@@ -454,13 +452,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
self.predictor.clear_intermediate_tensor() self.predictor.clear_intermediate_tensor()
self.predictor.try_shrink_memory() self.predictor.try_shrink_memory()
self.model.decoder.init_decode(cfg.alpha, cfg.beta, cfg.lang_model_path, self.model.decoder.init_decode(decode_cfg.alpha, decode_cfg.beta,
vocab_list, cfg.decoding_method) decode_cfg.lang_model_path, vocab_list,
decode_cfg.decoding_method)
result_transcripts = self.model.decoder.decode_probs( result_transcripts = self.model.decoder.decode_probs(
output_probs, output_lens, vocab_list, cfg.decoding_method, output_probs, output_lens, vocab_list, decode_cfg.decoding_method,
cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size, decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) decode_cfg.beam_size, decode_cfg.cutoff_prob,
decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch)
#replace the <space> with ' ' #replace the <space> with ' '
result_transcripts = [ result_transcripts = [
self._text_featurizer.detokenize(sentence) self._text_featurizer.detokenize(sentence)
...@@ -531,12 +531,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): ...@@ -531,12 +531,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
num_chunk = int(num_chunk) num_chunk = int(num_chunk)
chunk_state_h_box = np.zeros( chunk_state_h_box = np.zeros(
(self.config.model.num_rnn_layers, 1, (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
self.config.model.rnn_layer_size),
dtype=x.dtype) dtype=x.dtype)
chunk_state_c_box = np.zeros( chunk_state_c_box = np.zeros(
(self.config.model.num_rnn_layers, 1, (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
self.config.model.rnn_layer_size),
dtype=x.dtype) dtype=x.dtype)
input_names = self.predictor.get_input_names() input_names = self.predictor.get_input_names()
......
...@@ -43,9 +43,9 @@ if __name__ == "__main__": ...@@ -43,9 +43,9 @@ if __name__ == "__main__":
config = get_cfg_defaults() config = get_cfg_defaults()
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_config: if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True) decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_config) decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
......
...@@ -47,9 +47,9 @@ if __name__ == "__main__": ...@@ -47,9 +47,9 @@ if __name__ == "__main__":
config = get_cfg_defaults() config = get_cfg_defaults()
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_config: if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True) decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_config) decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
......
...@@ -38,7 +38,7 @@ class U2Infer(): ...@@ -38,7 +38,7 @@ class U2Infer():
self.config = config self.config = config
self.audio_file = args.audio_file self.audio_file = args.audio_file
self.preprocess_conf = config.augmentation_config self.preprocess_conf = config.preprocess_config
self.preprocess_args = {"train": False} self.preprocess_args = {"train": False}
self.preprocessing = Transformation(self.preprocess_conf) self.preprocessing = Transformation(self.preprocess_conf)
...@@ -132,9 +132,9 @@ if __name__ == "__main__": ...@@ -132,9 +132,9 @@ if __name__ == "__main__":
config = get_cfg_defaults() config = get_cfg_defaults()
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_config: if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True) decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_config) decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
......
...@@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2 import U2Model ...@@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2 import U2Model
_C = CfgNode(new_allowed=True) _C = CfgNode(new_allowed=True)
ManifestDataset.params(_C) # ManifestDataset.params(_C)
SpeechCollator.params(_C) # SpeechCollator.params(_C)
U2Model.params(_C) # U2Model.params(_C)
U2Trainer.params(_C) # U2Trainer.params(_C)
_C.decode = U2Tester.params() # _C.decode = U2Tester.params()
def get_cfg_defaults(): def get_cfg_defaults():
......
...@@ -264,7 +264,7 @@ class U2Trainer(Trainer): ...@@ -264,7 +264,7 @@ class U2Trainer(Trainer):
batch_frames_in=config.batch_frames_in, batch_frames_in=config.batch_frames_in,
batch_frames_out=config.batch_frames_out, batch_frames_out=config.batch_frames_out,
batch_frames_inout=config.batch_frames_inout, batch_frames_inout=config.batch_frames_inout,
preprocess_conf=config.augmentation_config, preprocess_conf=config.preprocess_config,
n_iter_processes=config.num_workers, n_iter_processes=config.num_workers,
subsampling_factor=1, subsampling_factor=1,
num_encs=1) num_encs=1)
...@@ -283,18 +283,20 @@ class U2Trainer(Trainer): ...@@ -283,18 +283,20 @@ class U2Trainer(Trainer):
batch_frames_in=0, batch_frames_in=0,
batch_frames_out=0, batch_frames_out=0,
batch_frames_inout=0, batch_frames_inout=0,
preprocess_conf=config.augmentation_config, preprocess_conf=config.preprocess_config,
n_iter_processes=config.num_workers, n_iter_processes=config.num_workers,
subsampling_factor=1, subsampling_factor=1,
num_encs=1) num_encs=1)
logger.info("Setup train/valid Dataloader!") logger.info("Setup train/valid Dataloader!")
else: else:
decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1)
# test dataset, return raw text # test dataset, return raw text
self.test_loader = BatchDataLoader( self.test_loader = BatchDataLoader(
json_file=config.test_manifest, json_file=config.test_manifest,
train_mode=False, train_mode=False,
sortagrad=False, sortagrad=False,
batch_size=config.decode.decode_batch_size, batch_size=decode_batch_size,
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
...@@ -304,7 +306,7 @@ class U2Trainer(Trainer): ...@@ -304,7 +306,7 @@ class U2Trainer(Trainer):
batch_frames_in=0, batch_frames_in=0,
batch_frames_out=0, batch_frames_out=0,
batch_frames_inout=0, batch_frames_inout=0,
preprocess_conf=config.augmentation_config, preprocess_conf=config.preprocess_config,
n_iter_processes=1, n_iter_processes=1,
subsampling_factor=1, subsampling_factor=1,
num_encs=1) num_encs=1)
...@@ -313,7 +315,7 @@ class U2Trainer(Trainer): ...@@ -313,7 +315,7 @@ class U2Trainer(Trainer):
json_file=config.test_manifest, json_file=config.test_manifest,
train_mode=False, train_mode=False,
sortagrad=False, sortagrad=False,
batch_size=config.decode.decode_batch_size, batch_size=decode_batch_size,
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
...@@ -323,7 +325,7 @@ class U2Trainer(Trainer): ...@@ -323,7 +325,7 @@ class U2Trainer(Trainer):
batch_frames_in=0, batch_frames_in=0,
batch_frames_out=0, batch_frames_out=0,
batch_frames_inout=0, batch_frames_inout=0,
preprocess_conf=config.augmentation_config, preprocess_conf=config.preprocess_config,
n_iter_processes=1, n_iter_processes=1,
subsampling_factor=1, subsampling_factor=1,
num_encs=1) num_encs=1)
...@@ -557,7 +559,7 @@ class U2Tester(U2Trainer): ...@@ -557,7 +559,7 @@ class U2Tester(U2Trainer):
"ref_len": "ref_len":
len_refs, len_refs,
"decode_method": "decode_method":
self.config.decoding_method, self.config.decode.decoding_method,
}) })
f.write(data + '\n') f.write(data + '\n')
......
...@@ -44,77 +44,77 @@ class U2Trainer(Trainer): ...@@ -44,77 +44,77 @@ class U2Trainer(Trainer):
def setup_dataloader(self): def setup_dataloader(self):
config = self.config.clone() config = self.config.clone()
config.defrost() config.defrost()
config.collator.keep_transcription_text = False config.keep_transcription_text = False
# train/valid dataset, return token ids # train/valid dataset, return token ids
config.data.manifest = config.data.train_manifest config.manifest = config.train_manifest
train_dataset = ManifestDataset.from_config(config) train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest config.manifest = config.dev_manifest
dev_dataset = ManifestDataset.from_config(config) dev_dataset = ManifestDataset.from_config(config)
collate_fn_train = SpeechCollator.from_config(config) collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = "" config.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config) collate_fn_dev = SpeechCollator.from_config(config)
if self.parallel: if self.parallel:
batch_sampler = SortagradDistributedBatchSampler( batch_sampler = SortagradDistributedBatchSampler(
train_dataset, train_dataset,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
num_replicas=None, num_replicas=None,
rank=None, rank=None,
shuffle=True, shuffle=True,
drop_last=True, drop_last=True,
sortagrad=config.collator.sortagrad, sortagrad=config.sortagrad,
shuffle_method=config.collator.shuffle_method) shuffle_method=config.shuffle_method)
else: else:
batch_sampler = SortagradBatchSampler( batch_sampler = SortagradBatchSampler(
train_dataset, train_dataset,
shuffle=True, shuffle=True,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
drop_last=True, drop_last=True,
sortagrad=config.collator.sortagrad, sortagrad=config.sortagrad,
shuffle_method=config.collator.shuffle_method) shuffle_method=config.shuffle_method)
self.train_loader = DataLoader( self.train_loader = DataLoader(
train_dataset, train_dataset,
batch_sampler=batch_sampler, batch_sampler=batch_sampler,
collate_fn=collate_fn_train, collate_fn=collate_fn_train,
num_workers=config.collator.num_workers, ) num_workers=config.num_workers, )
self.valid_loader = DataLoader( self.valid_loader = DataLoader(
dev_dataset, dev_dataset,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=collate_fn_dev, collate_fn=collate_fn_dev,
num_workers=config.collator.num_workers, ) num_workers=config.num_workers, )
# test dataset, return raw text # test dataset, return raw text
config.data.manifest = config.data.test_manifest config.manifest = config.test_manifest
# filter test examples, will cause less examples, but no mismatch with training # filter test examples, will cause less examples, but no mismatch with training
# and can use large batch size , save training time, so filter test egs now. # and can use large batch size , save training time, so filter test egs now.
config.data.min_input_len = 0.0 # second config.min_input_len = 0.0 # second
config.data.max_input_len = float('inf') # second config.max_input_len = float('inf') # second
config.data.min_output_len = 0.0 # tokens config.min_output_len = 0.0 # tokens
config.data.max_output_len = float('inf') # tokens config.max_output_len = float('inf') # tokens
config.data.min_output_input_ratio = 0.00 config.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = float('inf') config.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config) test_dataset = ManifestDataset.from_config(config)
# return text ord id # return text ord id
config.collator.keep_transcription_text = True config.keep_transcription_text = True
config.collator.augmentation_config = "" config.augmentation_config = ""
self.test_loader = DataLoader( self.test_loader = DataLoader(
test_dataset, test_dataset,
batch_size=config.decoding.batch_size, batch_size=config.decode.batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=SpeechCollator.from_config(config)) collate_fn=SpeechCollator.from_config(config))
# return text token id # return text token id
config.collator.keep_transcription_text = False config.keep_transcription_text = False
self.align_loader = DataLoader( self.align_loader = DataLoader(
test_dataset, test_dataset,
batch_size=config.decoding.batch_size, batch_size=config.decode.batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=SpeechCollator.from_config(config)) collate_fn=SpeechCollator.from_config(config))
...@@ -122,7 +122,7 @@ class U2Trainer(Trainer): ...@@ -122,7 +122,7 @@ class U2Trainer(Trainer):
def setup_model(self): def setup_model(self):
config = self.config config = self.config
model_conf = config.model model_conf = config
with UpdateConfig(model_conf): with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size
...@@ -136,7 +136,7 @@ class U2Trainer(Trainer): ...@@ -136,7 +136,7 @@ class U2Trainer(Trainer):
logger.info(f"{model}") logger.info(f"{model}")
layer_tools.print_params(model, logger.info) layer_tools.print_params(model, logger.info)
train_config = config.training train_config = config
optim_type = train_config.optim optim_type = train_config.optim
optim_conf = train_config.optim_conf optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler scheduler_type = train_config.scheduler
...@@ -156,7 +156,7 @@ class U2Trainer(Trainer): ...@@ -156,7 +156,7 @@ class U2Trainer(Trainer):
config, config,
parameters, parameters,
lr_scheduler=None, ): lr_scheduler=None, ):
train_config = config.training train_config = config
optim_type = train_config.optim optim_type = train_config.optim
optim_conf = train_config.optim_conf optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler scheduler_type = train_config.scheduler
...@@ -182,7 +182,7 @@ class U2Trainer(Trainer): ...@@ -182,7 +182,7 @@ class U2Trainer(Trainer):
def setup_updater(self): def setup_updater(self):
output_dir = self.output_dir output_dir = self.output_dir
config = self.config.training config = self.config
updater = U2Updater( updater = U2Updater(
model=self.model, model=self.model,
......
...@@ -69,6 +69,10 @@ if __name__ == "__main__": ...@@ -69,6 +69,10 @@ if __name__ == "__main__":
config = CfgNode() config = CfgNode()
config.set_new_allowed(True) config.set_new_allowed(True)
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()
......
...@@ -80,7 +80,7 @@ class U2Trainer(Trainer): ...@@ -80,7 +80,7 @@ class U2Trainer(Trainer):
super().__init__(config, args) super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training train_conf = self.config
start = time.time() start = time.time()
# forward # forward
...@@ -122,7 +122,7 @@ class U2Trainer(Trainer): ...@@ -122,7 +122,7 @@ class U2Trainer(Trainer):
if (batch_index + 1) % train_conf.log_interval == 0: if (batch_index + 1) % train_conf.log_interval == 0:
msg += "train time: {:>.3f}s, ".format(iteration_time) msg += "train time: {:>.3f}s, ".format(iteration_time)
msg += "batch size: {}, ".format(self.config.collator.batch_size) msg += "batch size: {}, ".format(self.config.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad) msg += "accum: {}, ".format(train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v) msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items()) for k, v in losses_np.items())
...@@ -157,7 +157,7 @@ class U2Trainer(Trainer): ...@@ -157,7 +157,7 @@ class U2Trainer(Trainer):
if ctc_loss: if ctc_loss:
valid_losses['val_ctc_loss'].append(float(ctc_loss)) valid_losses['val_ctc_loss'].append(float(ctc_loss))
if (i + 1) % self.config.training.log_interval == 0: if (i + 1) % self.config.log_interval == 0:
valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
valid_dump['val_history_loss'] = total_loss / num_seen_utts valid_dump['val_history_loss'] = total_loss / num_seen_utts
...@@ -186,7 +186,7 @@ class U2Trainer(Trainer): ...@@ -186,7 +186,7 @@ class U2Trainer(Trainer):
self.before_train() self.before_train()
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"): with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
...@@ -235,10 +235,10 @@ class U2Trainer(Trainer): ...@@ -235,10 +235,10 @@ class U2Trainer(Trainer):
config = self.config.clone() config = self.config.clone()
# train/valid dataset, return token ids # train/valid dataset, return token ids
self.train_loader = BatchDataLoader( self.train_loader = BatchDataLoader(
json_file=config.data.train_manifest, json_file=config.train_manifest,
train_mode=True, train_mode=True,
sortagrad=False, sortagrad=False,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
...@@ -248,16 +248,16 @@ class U2Trainer(Trainer): ...@@ -248,16 +248,16 @@ class U2Trainer(Trainer):
batch_frames_in=0, batch_frames_in=0,
batch_frames_out=0, batch_frames_out=0,
batch_frames_inout=0, batch_frames_inout=0,
preprocess_conf=config.collator.augmentation_config, preprocess_conf=config.preprocess_config,
n_iter_processes=config.collator.num_workers, n_iter_processes=config.num_workers,
subsampling_factor=1, subsampling_factor=1,
num_encs=1) num_encs=1)
self.valid_loader = BatchDataLoader( self.valid_loader = BatchDataLoader(
json_file=config.data.dev_manifest, json_file=config.dev_manifest,
train_mode=False, train_mode=False,
sortagrad=False, sortagrad=False,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
...@@ -268,16 +268,18 @@ class U2Trainer(Trainer): ...@@ -268,16 +268,18 @@ class U2Trainer(Trainer):
batch_frames_out=0, batch_frames_out=0,
batch_frames_inout=0, batch_frames_inout=0,
preprocess_conf=None, preprocess_conf=None,
n_iter_processes=config.collator.num_workers, n_iter_processes=config.num_workers,
subsampling_factor=1, subsampling_factor=1,
num_encs=1) num_encs=1)
decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1)
# test dataset, return raw text # test dataset, return raw text
self.test_loader = BatchDataLoader( self.test_loader = BatchDataLoader(
json_file=config.data.test_manifest, json_file=config.test_manifest,
train_mode=False, train_mode=False,
sortagrad=False, sortagrad=False,
batch_size=config.decoding.batch_size, batch_size=decode_batch_size,
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
...@@ -293,10 +295,10 @@ class U2Trainer(Trainer): ...@@ -293,10 +295,10 @@ class U2Trainer(Trainer):
num_encs=1) num_encs=1)
self.align_loader = BatchDataLoader( self.align_loader = BatchDataLoader(
json_file=config.data.test_manifest, json_file=config.test_manifest,
train_mode=False, train_mode=False,
sortagrad=False, sortagrad=False,
batch_size=config.decoding.batch_size, batch_size=decode_batch_size,
maxlen_in=float('inf'), maxlen_in=float('inf'),
maxlen_out=float('inf'), maxlen_out=float('inf'),
minibatches=0, minibatches=0,
...@@ -316,7 +318,7 @@ class U2Trainer(Trainer): ...@@ -316,7 +318,7 @@ class U2Trainer(Trainer):
config = self.config config = self.config
# model # model
model_conf = config.model model_conf = config
with UpdateConfig(model_conf): with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.feat_dim model_conf.input_dim = self.train_loader.feat_dim
model_conf.output_dim = self.train_loader.vocab_size model_conf.output_dim = self.train_loader.vocab_size
...@@ -392,9 +394,9 @@ class U2Tester(U2Trainer): ...@@ -392,9 +394,9 @@ class U2Tester(U2Trainer):
def __init__(self, config, args): def __init__(self, config, args):
super().__init__(config, args) super().__init__(config, args)
self.text_feature = TextFeaturizer( self.text_feature = TextFeaturizer(
unit_type=self.config.collator.unit_type, unit_type=self.config.unit_type,
vocab=self.config.collator.vocab_filepath, vocab=self.config.vocab_filepath,
spm_model_prefix=self.config.collator.spm_model_prefix) spm_model_prefix=self.config.spm_model_prefix)
self.vocab_list = self.text_feature.vocab_list self.vocab_list = self.text_feature.vocab_list
def id2token(self, texts, texts_len, text_feature): def id2token(self, texts, texts_len, text_feature):
...@@ -413,10 +415,10 @@ class U2Tester(U2Trainer): ...@@ -413,10 +415,10 @@ class U2Tester(U2Trainer):
texts, texts,
texts_len, texts_len,
fout=None): fout=None):
cfg = self.config.decoding decode_cfg = self.config.decode
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
start_time = time.time() start_time = time.time()
target_transcripts = self.id2token(texts, texts_len, self.text_feature) target_transcripts = self.id2token(texts, texts_len, self.text_feature)
...@@ -424,12 +426,12 @@ class U2Tester(U2Trainer): ...@@ -424,12 +426,12 @@ class U2Tester(U2Trainer):
audio, audio,
audio_len, audio_len,
text_feature=self.text_feature, text_feature=self.text_feature,
decoding_method=cfg.decoding_method, decoding_method=decode_cfg.decoding_method,
beam_size=cfg.beam_size, beam_size=decode_cfg.beam_size,
ctc_weight=cfg.ctc_weight, ctc_weight=decode_cfg.ctc_weight,
decoding_chunk_size=cfg.decoding_chunk_size, decoding_chunk_size=decode_cfg.decoding_chunk_size,
num_decoding_left_chunks=cfg.num_decoding_left_chunks, num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks,
simulate_streaming=cfg.simulate_streaming) simulate_streaming=decode_cfg.simulate_streaming)
decode_time = time.time() - start_time decode_time = time.time() - start_time
for i, (utt, target, result, rec_tids) in enumerate( for i, (utt, target, result, rec_tids) in enumerate(
...@@ -449,15 +451,16 @@ class U2Tester(U2Trainer): ...@@ -449,15 +451,16 @@ class U2Tester(U2Trainer):
logger.info(f"Utt: {utt}") logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}") logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}") logger.info(f"Hyp: {result}")
logger.info("One example error rate [%s] = %f" % logger.info(
(cfg.error_rate_type, error_rate_func(target, result))) "One example error rate [%s] = %f" %
(decode_cfg.error_rate_type, error_rate_func(target, result)))
return dict( return dict(
errors_sum=errors_sum, errors_sum=errors_sum,
len_refs=len_refs, len_refs=len_refs,
num_ins=num_ins, # num examples num_ins=num_ins, # num examples
error_rate=errors_sum / len_refs, error_rate=errors_sum / len_refs,
error_rate_type=cfg.error_rate_type, error_rate_type=decode_cfg.error_rate_type,
num_frames=audio_len.sum().numpy().item(), num_frames=audio_len.sum().numpy().item(),
decode_time=decode_time) decode_time=decode_time)
...@@ -468,7 +471,7 @@ class U2Tester(U2Trainer): ...@@ -468,7 +471,7 @@ class U2Tester(U2Trainer):
self.model.eval() self.model.eval()
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
stride_ms = self.config.collator.stride_ms stride_ms = self.config.stride_ms
error_rate_type = None error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0 num_frames = 0.0
...@@ -519,15 +522,15 @@ class U2Tester(U2Trainer): ...@@ -519,15 +522,15 @@ class U2Tester(U2Trainer):
"ref_len": "ref_len":
len_refs, len_refs,
"decode_method": "decode_method":
self.config.decoding.decoding_method, self.config.decode.decoding_method,
}) })
f.write(data + '\n') f.write(data + '\n')
@paddle.no_grad() @paddle.no_grad()
def align(self): def align(self):
ctc_utils.ctc_align(self.config, self.model, self.align_loader, ctc_utils.ctc_align(self.config, self.model, self.align_loader,
self.config.decoding.batch_size, self.config.decode.decode_batch_size,
self.config.collator.stride_ms, self.vocab_list, self.config.stride_ms, self.vocab_list,
self.args.result_file) self.args.result_file)
def load_inferspec(self): def load_inferspec(self):
...@@ -539,7 +542,7 @@ class U2Tester(U2Trainer): ...@@ -539,7 +542,7 @@ class U2Tester(U2Trainer):
""" """
from paddlespeech.s2t.models.u2 import U2InferModel from paddlespeech.s2t.models.u2 import U2InferModel
infer_model = U2InferModel.from_pretrained(self.test_loader, infer_model = U2InferModel.from_pretrained(self.test_loader,
self.config.model.clone(), self.config.clone(),
self.args.checkpoint_path) self.args.checkpoint_path)
feat_dim = self.test_loader.feat_dim feat_dim = self.test_loader.feat_dim
input_spec = [ input_spec = [
......
...@@ -14,12 +14,14 @@ ...@@ -14,12 +14,14 @@
"""Evaluation for U2 model.""" """Evaluation for U2 model."""
import cProfile import cProfile
from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments from paddlespeech.s2t.utils.utility import print_arguments
# TODO(hui zhang): dynamic load # TODO(hui zhang): dynamic load
def main_sp(config, args): def main_sp(config, args):
...@@ -35,7 +37,7 @@ def main(config, args): ...@@ -35,7 +37,7 @@ def main(config, args):
if __name__ == "__main__": if __name__ == "__main__":
parser = default_argument_parser() parser = default_argument_parser()
# save asr result to # save asr result to
parser.add_argument( parser.add_argument(
"--result_file", type=str, help="path of save the asr result") "--result_file", type=str, help="path of save the asr result")
args = parser.parse_args() args = parser.parse_args()
...@@ -45,6 +47,10 @@ if __name__ == "__main__": ...@@ -45,6 +47,10 @@ if __name__ == "__main__":
config = get_cfg_defaults() config = get_cfg_defaults()
if args.config: if args.config:
config.merge_from_file(args.config) config.merge_from_file(args.config)
if args.decode_cfg:
decode_conf = CfgNode(new_allowed=True)
decode_conf.merge_from_file(args.decode_cfg)
config.decode = decode_conf
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()
......
...@@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2_st import U2STModel ...@@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2_st import U2STModel
_C = CfgNode() _C = CfgNode()
_C.data = ManifestDataset.params() # _C.data = ManifestDataset.params()
_C.collator = SpeechCollator.params() # _C.collator = SpeechCollator.params()
_C.model = U2STModel.params() # _C.model = U2STModel.params()
_C.training = U2STTrainer.params() # _C.training = U2STTrainer.params()
_C.decoding = U2STTester.params() # _C.decoding = U2STTester.params()
def get_cfg_defaults(): def get_cfg_defaults():
......
...@@ -78,7 +78,7 @@ class U2STTrainer(Trainer): ...@@ -78,7 +78,7 @@ class U2STTrainer(Trainer):
super().__init__(config, args) super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training train_conf = self.config
start = time.time() start = time.time()
# forward # forward
utt, audio, audio_len, text, text_len = batch_data utt, audio, audio_len, text, text_len = batch_data
...@@ -127,7 +127,7 @@ class U2STTrainer(Trainer): ...@@ -127,7 +127,7 @@ class U2STTrainer(Trainer):
if (batch_index + 1) % train_conf.log_interval == 0: if (batch_index + 1) % train_conf.log_interval == 0:
msg += "train time: {:>.3f}s, ".format(iteration_time) msg += "train time: {:>.3f}s, ".format(iteration_time)
msg += "batch size: {}, ".format(self.config.collator.batch_size) msg += "batch size: {}, ".format(self.config.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad) msg += "accum: {}, ".format(train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v) msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items()) for k, v in losses_np.items())
...@@ -168,7 +168,7 @@ class U2STTrainer(Trainer): ...@@ -168,7 +168,7 @@ class U2STTrainer(Trainer):
if ctc_loss: if ctc_loss:
valid_losses['val_ctc_loss'].append(float(ctc_loss)) valid_losses['val_ctc_loss'].append(float(ctc_loss))
if (i + 1) % self.config.training.log_interval == 0: if (i + 1) % self.config.log_interval == 0:
valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
valid_dump['val_history_st_loss'] = total_loss / num_seen_utts valid_dump['val_history_st_loss'] = total_loss / num_seen_utts
...@@ -197,7 +197,7 @@ class U2STTrainer(Trainer): ...@@ -197,7 +197,7 @@ class U2STTrainer(Trainer):
self.before_train() self.before_train()
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"): with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
...@@ -245,91 +245,93 @@ class U2STTrainer(Trainer): ...@@ -245,91 +245,93 @@ class U2STTrainer(Trainer):
def setup_dataloader(self): def setup_dataloader(self):
config = self.config.clone() config = self.config.clone()
config.defrost() config.defrost()
config.collator.keep_transcription_text = False config.keep_transcription_text = False
# train/valid dataset, return token ids # train/valid dataset, return token ids
config.data.manifest = config.data.train_manifest config.manifest = config.train_manifest
train_dataset = ManifestDataset.from_config(config) train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest config.manifest = config.dev_manifest
dev_dataset = ManifestDataset.from_config(config) dev_dataset = ManifestDataset.from_config(config)
if config.model.model_conf.asr_weight > 0.: if config.model_conf.asr_weight > 0.:
Collator = TripletSpeechCollator Collator = TripletSpeechCollator
TestCollator = SpeechCollator TestCollator = SpeechCollator
else: else:
TestCollator = Collator = SpeechCollator TestCollator = Collator = SpeechCollator
collate_fn_train = Collator.from_config(config) collate_fn_train = Collator.from_config(config)
config.collator.augmentation_config = "" config.augmentation_config = ""
collate_fn_dev = Collator.from_config(config) collate_fn_dev = Collator.from_config(config)
if self.parallel: if self.parallel:
batch_sampler = SortagradDistributedBatchSampler( batch_sampler = SortagradDistributedBatchSampler(
train_dataset, train_dataset,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
num_replicas=None, num_replicas=None,
rank=None, rank=None,
shuffle=True, shuffle=True,
drop_last=True, drop_last=True,
sortagrad=config.collator.sortagrad, sortagrad=config.sortagrad,
shuffle_method=config.collator.shuffle_method) shuffle_method=config.shuffle_method)
else: else:
batch_sampler = SortagradBatchSampler( batch_sampler = SortagradBatchSampler(
train_dataset, train_dataset,
shuffle=True, shuffle=True,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
drop_last=True, drop_last=True,
sortagrad=config.collator.sortagrad, sortagrad=config.sortagrad,
shuffle_method=config.collator.shuffle_method) shuffle_method=config.shuffle_method)
self.train_loader = DataLoader( self.train_loader = DataLoader(
train_dataset, train_dataset,
batch_sampler=batch_sampler, batch_sampler=batch_sampler,
collate_fn=collate_fn_train, collate_fn=collate_fn_train,
num_workers=config.collator.num_workers, ) num_workers=config.num_workers, )
self.valid_loader = DataLoader( self.valid_loader = DataLoader(
dev_dataset, dev_dataset,
batch_size=config.collator.batch_size, batch_size=config.batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=collate_fn_dev, collate_fn=collate_fn_dev,
num_workers=config.collator.num_workers, ) num_workers=config.num_workers, )
# test dataset, return raw text # test dataset, return raw text
config.data.manifest = config.data.test_manifest config.manifest = config.test_manifest
# filter test examples, will cause less examples, but no mismatch with training # filter test examples, will cause less examples, but no mismatch with training
# and can use large batch size , save training time, so filter test egs now. # and can use large batch size , save training time, so filter test egs now.
# config.data.min_input_len = 0.0 # second # config.min_input_len = 0.0 # second
# config.data.max_input_len = float('inf') # second # config.max_input_len = float('inf') # second
# config.data.min_output_len = 0.0 # tokens # config.min_output_len = 0.0 # tokens
# config.data.max_output_len = float('inf') # tokens # config.max_output_len = float('inf') # tokens
# config.data.min_output_input_ratio = 0.00 # config.min_output_input_ratio = 0.00
# config.data.max_output_input_ratio = float('inf') # config.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config) test_dataset = ManifestDataset.from_config(config)
# return text ord id # return text ord id
config.collator.keep_transcription_text = True config.keep_transcription_text = True
config.collator.augmentation_config = "" config.augmentation_config = ""
decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1)
self.test_loader = DataLoader( self.test_loader = DataLoader(
test_dataset, test_dataset,
batch_size=config.decoding.batch_size, batch_size=decode_batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=TestCollator.from_config(config), collate_fn=TestCollator.from_config(config),
num_workers=config.collator.num_workers, ) num_workers=config.num_workers, )
# return text token id # return text token id
config.collator.keep_transcription_text = False config.keep_transcription_text = False
self.align_loader = DataLoader( self.align_loader = DataLoader(
test_dataset, test_dataset,
batch_size=config.decoding.batch_size, batch_size=decode_batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=TestCollator.from_config(config), collate_fn=TestCollator.from_config(config),
num_workers=config.collator.num_workers, ) num_workers=config.num_workers, )
logger.info("Setup train/valid/test/align Dataloader!") logger.info("Setup train/valid/test/align Dataloader!")
def setup_model(self): def setup_model(self):
config = self.config config = self.config
model_conf = config.model model_conf = config
with UpdateConfig(model_conf): with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size
...@@ -342,7 +344,7 @@ class U2STTrainer(Trainer): ...@@ -342,7 +344,7 @@ class U2STTrainer(Trainer):
logger.info(f"{model}") logger.info(f"{model}")
layer_tools.print_params(model, logger.info) layer_tools.print_params(model, logger.info)
train_config = config.training train_config = config
optim_type = train_config.optim optim_type = train_config.optim
optim_conf = train_config.optim_conf optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler scheduler_type = train_config.scheduler
...@@ -428,7 +430,7 @@ class U2STTester(U2STTrainer): ...@@ -428,7 +430,7 @@ class U2STTester(U2STTrainer):
def translate(self, audio, audio_len): def translate(self, audio, audio_len):
""""E2E translation from extracted audio feature""" """"E2E translation from extracted audio feature"""
cfg = self.config.decoding decode_cfg = self.config.decode
text_feature = self.test_loader.collate_fn.text_feature text_feature = self.test_loader.collate_fn.text_feature
self.model.eval() self.model.eval()
...@@ -436,12 +438,12 @@ class U2STTester(U2STTrainer): ...@@ -436,12 +438,12 @@ class U2STTester(U2STTrainer):
audio, audio,
audio_len, audio_len,
text_feature=text_feature, text_feature=text_feature,
decoding_method=cfg.decoding_method, decoding_method=decode_cfg.decoding_method,
beam_size=cfg.beam_size, beam_size=decode_cfg.beam_size,
word_reward=cfg.word_reward, word_reward=decode_cfg.word_reward,
decoding_chunk_size=cfg.decoding_chunk_size, decoding_chunk_size=decode_cfg.decoding_chunk_size,
num_decoding_left_chunks=cfg.num_decoding_left_chunks, num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks,
simulate_streaming=cfg.simulate_streaming) simulate_streaming=decode_cfg.simulate_streaming)
return hyps return hyps
def compute_translation_metrics(self, def compute_translation_metrics(self,
...@@ -452,7 +454,7 @@ class U2STTester(U2STTrainer): ...@@ -452,7 +454,7 @@ class U2STTester(U2STTrainer):
texts_len, texts_len,
bleu_func, bleu_func,
fout=None): fout=None):
cfg = self.config.decoding decode_cfg = self.config.decode
len_refs, num_ins = 0, 0 len_refs, num_ins = 0, 0
start_time = time.time() start_time = time.time()
...@@ -467,12 +469,12 @@ class U2STTester(U2STTrainer): ...@@ -467,12 +469,12 @@ class U2STTester(U2STTrainer):
audio, audio,
audio_len, audio_len,
text_feature=text_feature, text_feature=text_feature,
decoding_method=cfg.decoding_method, decoding_method=decode_cfg.decoding_method,
beam_size=cfg.beam_size, beam_size=decode_cfg.beam_size,
word_reward=cfg.word_reward, word_reward=decode_cfg.word_reward,
decoding_chunk_size=cfg.decoding_chunk_size, decoding_chunk_size=decode_cfg.decoding_chunk_size,
num_decoding_left_chunks=cfg.num_decoding_left_chunks, num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks,
simulate_streaming=cfg.simulate_streaming) simulate_streaming=decode_cfg.simulate_streaming)
decode_time = time.time() - start_time decode_time = time.time() - start_time
for utt, target, result in zip(utts, refs, hyps): for utt, target, result in zip(utts, refs, hyps):
...@@ -502,8 +504,8 @@ class U2STTester(U2STTrainer): ...@@ -502,8 +504,8 @@ class U2STTester(U2STTrainer):
self.model.eval() self.model.eval()
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
cfg = self.config.decoding decode_cfg = self.config.decode
bleu_func = bleu_score.char_bleu if cfg.error_rate_type == 'char-bleu' else bleu_score.bleu bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu
stride_ms = self.test_loader.collate_fn.stride_ms stride_ms = self.test_loader.collate_fn.stride_ms
hyps, refs = [], [] hyps, refs = [], []
...@@ -549,15 +551,15 @@ class U2STTester(U2STTrainer): ...@@ -549,15 +551,15 @@ class U2STTester(U2STTrainer):
"num_examples": "num_examples":
num_ins, num_ins,
"decode_method": "decode_method":
self.config.decoding.decoding_method, self.config.decode.decoding_method,
}) })
f.write(data + '\n') f.write(data + '\n')
@paddle.no_grad() @paddle.no_grad()
def align(self): def align(self):
ctc_utils.ctc_align(self.config, self.model, self.align_loader, ctc_utils.ctc_align(self.config, self.model, self.align_loader,
self.config.decoding.batch_size, self.config.decode.decode_batch_size,
self.config.collator.stride_ms, self.vocab_list, self.config.stride_ms, self.vocab_list,
self.args.result_file) self.args.result_file)
def load_inferspec(self): def load_inferspec(self):
...@@ -569,7 +571,7 @@ class U2STTester(U2STTrainer): ...@@ -569,7 +571,7 @@ class U2STTester(U2STTrainer):
""" """
from paddlespeech.s2t.models.u2 import U2InferModel from paddlespeech.s2t.models.u2 import U2InferModel
infer_model = U2InferModel.from_pretrained(self.test_loader, infer_model = U2InferModel.from_pretrained(self.test_loader,
self.config.model.clone(), self.config.clone(),
self.args.checkpoint_path) self.args.checkpoint_path)
feat_dim = self.test_loader.collate_fn.feature_size feat_dim = self.test_loader.collate_fn.feature_size
input_spec = [ input_spec = [
......
...@@ -256,45 +256,43 @@ class SpeechCollator(SpeechCollatorBase): ...@@ -256,45 +256,43 @@ class SpeechCollator(SpeechCollatorBase):
Returns: Returns:
SpeechCollator: collator object. SpeechCollator: collator object.
""" """
assert 'augmentation_config' in config.collator assert 'augmentation_config' in config
assert 'keep_transcription_text' in config.collator assert 'keep_transcription_text' in config
assert 'mean_std_filepath' in config.collator assert 'mean_std_filepath' in config
assert 'vocab_filepath' in config.collator assert 'vocab_filepath' in config
assert 'spectrum_type' in config.collator assert 'spectrum_type' in config
assert 'n_fft' in config.collator assert 'n_fft' in config
assert config.collator assert config
if isinstance(config.collator.augmentation_config, (str, bytes)): if isinstance(config.augmentation_config, (str, bytes)):
if config.collator.augmentation_config: if config.augmentation_config:
aug_file = io.open( aug_file = io.open(
config.collator.augmentation_config, config.augmentation_config, mode='r', encoding='utf8')
mode='r',
encoding='utf8')
else: else:
aug_file = io.StringIO(initial_value='{}', newline='') aug_file = io.StringIO(initial_value='{}', newline='')
else: else:
aug_file = config.collator.augmentation_config aug_file = config.augmentation_config
assert isinstance(aug_file, io.StringIO) assert isinstance(aug_file, io.StringIO)
speech_collator = cls( speech_collator = cls(
aug_file=aug_file, aug_file=aug_file,
random_seed=0, random_seed=0,
mean_std_filepath=config.collator.mean_std_filepath, mean_std_filepath=config.mean_std_filepath,
unit_type=config.collator.unit_type, unit_type=config.unit_type,
vocab_filepath=config.collator.vocab_filepath, vocab_filepath=config.vocab_filepath,
spm_model_prefix=config.collator.spm_model_prefix, spm_model_prefix=config.spm_model_prefix,
spectrum_type=config.collator.spectrum_type, spectrum_type=config.spectrum_type,
feat_dim=config.collator.feat_dim, feat_dim=config.feat_dim,
delta_delta=config.collator.delta_delta, delta_delta=config.delta_delta,
stride_ms=config.collator.stride_ms, stride_ms=config.stride_ms,
window_ms=config.collator.window_ms, window_ms=config.window_ms,
n_fft=config.collator.n_fft, n_fft=config.n_fft,
max_freq=config.collator.max_freq, max_freq=config.max_freq,
target_sample_rate=config.collator.target_sample_rate, target_sample_rate=config.target_sample_rate,
use_dB_normalization=config.collator.use_dB_normalization, use_dB_normalization=config.use_dB_normalization,
target_dB=config.collator.target_dB, target_dB=config.target_dB,
dither=config.collator.dither, dither=config.dither,
keep_transcription_text=config.collator.keep_transcription_text) keep_transcription_text=config.keep_transcription_text)
return speech_collator return speech_collator
......
...@@ -54,17 +54,17 @@ class ManifestDataset(Dataset): ...@@ -54,17 +54,17 @@ class ManifestDataset(Dataset):
Returns: Returns:
ManifestDataset: dataet object. ManifestDataset: dataet object.
""" """
assert 'manifest' in config.data assert 'manifest' in config
assert config.data.manifest assert config.manifest
dataset = cls( dataset = cls(
manifest_path=config.data.manifest, manifest_path=config.manifest,
max_input_len=config.data.max_input_len, max_input_len=config.max_input_len,
min_input_len=config.data.min_input_len, min_input_len=config.min_input_len,
max_output_len=config.data.max_output_len, max_output_len=config.max_output_len,
min_output_len=config.data.min_output_len, min_output_len=config.min_output_len,
max_output_input_ratio=config.data.max_output_input_ratio, max_output_input_ratio=config.max_output_input_ratio,
min_output_input_ratio=config.data.min_output_input_ratio, ) min_output_input_ratio=config.min_output_input_ratio, )
return dataset return dataset
def __init__(self, def __init__(self,
......
...@@ -221,12 +221,12 @@ class DeepSpeech2Model(nn.Layer): ...@@ -221,12 +221,12 @@ class DeepSpeech2Model(nn.Layer):
model = cls( model = cls(
feat_size=dataloader.collate_fn.feature_size, feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size, dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.model.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.model.rnn_layer_size, rnn_size=config.rnn_layer_size,
use_gru=config.model.use_gru, use_gru=config.use_gru,
share_rnn_weights=config.model.share_rnn_weights, share_rnn_weights=config.share_rnn_weights,
blank_id=config.model.blank_id, blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
infos = Checkpoint().load_parameters( infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path) model, checkpoint_path=checkpoint_path)
...@@ -240,7 +240,7 @@ class DeepSpeech2Model(nn.Layer): ...@@ -240,7 +240,7 @@ class DeepSpeech2Model(nn.Layer):
Parameters Parameters
config: yacs.config.CfgNode config: yacs.config.CfgNode
config.model config
Returns Returns
------- -------
DeepSpeech2Model DeepSpeech2Model
......
...@@ -353,14 +353,14 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -353,14 +353,14 @@ class DeepSpeech2ModelOnline(nn.Layer):
model = cls( model = cls(
feat_size=dataloader.collate_fn.feature_size, feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size, dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.model.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.model.rnn_layer_size, rnn_size=config.rnn_layer_size,
rnn_direction=config.model.rnn_direction, rnn_direction=config.rnn_direction,
num_fc_layers=config.model.num_fc_layers, num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.model.fc_layers_size_list, fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.model.use_gru, use_gru=config.use_gru,
blank_id=config.model.blank_id, blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
infos = Checkpoint().load_parameters( infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path) model, checkpoint_path=checkpoint_path)
...@@ -374,7 +374,7 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -374,7 +374,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
Parameters Parameters
config: yacs.config.CfgNode config: yacs.config.CfgNode
config.model config
Returns Returns
------- -------
DeepSpeech2ModelOnline DeepSpeech2ModelOnline
......
...@@ -101,7 +101,7 @@ def default_argument_parser(parser=None): ...@@ -101,7 +101,7 @@ def default_argument_parser(parser=None):
title='Test Options', description=None) title='Test Options', description=None)
test_group.add_argument( test_group.add_argument(
"--decode_config", "--decode_cfg",
metavar="DECODE_CONFIG_FILE", metavar="DECODE_CONFIG_FILE",
help="decode config file.") help="decode config file.")
......
...@@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml ...@@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml
fp_item_list=(fp32) fp_item_list=(fp32)
bs_item=(16) bs_item=(16)
config_path=conf/benchmark/conformer.yaml config_path=conf/benchmark/conformer.yaml
decode_config_path=conf/tuning/decode.yaml
seed=0 seed=0
output=exp/conformer output=exp/conformer
profiler_options=None profiler_options=None
...@@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do ...@@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do
echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer" echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer"
run_mode=mp run_mode=mp
ngpu=8 ngpu=8
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
sleep 60 sleep 60
log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8
echo "index is speed, 1gpus, begin, ${log_name}" echo "index is speed, 1gpus, begin, ${log_name}"
run_mode=sp run_mode=sp
ngpu=1 ngpu=1
CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min)
sleep 60 sleep 60
done done
done done
......
...@@ -5,13 +5,14 @@ function _set_params(){ ...@@ -5,13 +5,14 @@ function _set_params(){
run_mode=${1:-"sp"} # 单卡sp|多卡mp run_mode=${1:-"sp"} # 单卡sp|多卡mp
config_path=${2:-"conf/conformer.yaml"} config_path=${2:-"conf/conformer.yaml"}
output=${3:-"exp/conformer"} decode_config_path=${3:-"conf/tuning/decode.yaml"}
seed=${4:-"0"} output=${4:-"exp/conformer"}
ngpu=${5:-"1"} seed=${5:-"0"}
profiler_options=${6:-"None"} ngpu=${6:-"1"}
batch_size=${7:-"32"} profiler_options=${7:-"None"}
fp_item=${8:-"fp32"} batch_size=${8:-"32"}
model_item=${9:-"conformer"} fp_item=${9:-"fp32"}
model_item=${10:-"conformer"}
benchmark_max_step=0 benchmark_max_step=0
run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数
# 添加日志解析需要的参数 # 添加日志解析需要的参数
...@@ -35,6 +36,7 @@ function _train(){ ...@@ -35,6 +36,7 @@ function _train(){
echo "Train on ${num_gpu_devices} GPUs" echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
train_cmd="--config=${config_path} \ train_cmd="--config=${config_path} \
--decode_cfg=${decode_config_path} \
--output=${output} \ --output=${output} \
--seed=${seed} \ --seed=${seed} \
--ngpu=${ngpu} \ --ngpu=${ngpu} \
...@@ -68,7 +70,7 @@ function _train(){ ...@@ -68,7 +70,7 @@ function _train(){
} }
source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开 source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开
_set_params $@ #_set_params $@
# _train # 如果只想产出训练log,不解析,可取消注释 #_train # 如果只想产出训练log,不解析,可取消注释
_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开
...@@ -21,13 +21,13 @@ null:null ...@@ -21,13 +21,13 @@ null:null
null:null null:null
## ##
===========================eval_params=========================== ===========================eval_params===========================
eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --result_file tests/4.rsl --model_type offline
null:null null:null
## ##
===========================infer_params=========================== ===========================infer_params===========================
null:null null:null
null:null null:null
norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --export_path exp/deepspeech_tiny/checkpoints/4.jit
quant_export:null quant_export:null
fpgm_export:null fpgm_export:null
distill_export:null distill_export:null
......
...@@ -21,7 +21,7 @@ null:null ...@@ -21,7 +21,7 @@ null:null
null:null null:null
## ##
===========================eval_params=========================== ===========================eval_params===========================
eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline
null:null null:null
## ##
===========================infer_params=========================== ===========================infer_params===========================
......
bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer
cd ../../examples/tiny/s0 cd ../../../examples/tiny/asr0
source path.sh source path.sh
bash ../../../tests/chains/test.sh ../../../tests/chains/ds2_params_lite_train_infer.txt lite_train_infer bash ../../../tests/chains/ds2/test.sh ../../../tests/chains/ds2/ds2_params_lite_train_infer.txt lite_train_infer
cd ../../../tests/chains cd ../../../tests/chains
...@@ -34,7 +34,7 @@ MODE=$2 ...@@ -34,7 +34,7 @@ MODE=$2
if [ ${MODE} = "lite_train_infer" ];then if [ ${MODE} = "lite_train_infer" ];then
# pretrain lite train data # pretrain lite train data
curPath=$(readlink -f "$(dirname "$0")") curPath=$(readlink -f "$(dirname "$0")")
cd ${curPath}/../../examples/tiny/s0 cd ${curPath}/../../../examples/tiny/asr0
source path.sh source path.sh
# download audio data # download audio data
bash ./local/data.sh || exit -1 bash ./local/data.sh || exit -1
...@@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then ...@@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then
elif [ ${MODE} = "whole_train_infer" ];then elif [ ${MODE} = "whole_train_infer" ];then
curPath=$(readlink -f "$(dirname "$0")") curPath=$(readlink -f "$(dirname "$0")")
cd ${curPath}/../../examples/aishell/s0 cd ${curPath}/../../../examples/aishell/asr0
source path.sh source path.sh
# download audio data # download audio data
bash ./local/data.sh || exit -1 bash ./local/data.sh || exit -1
...@@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then ...@@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then
cd ${curPath} cd ${curPath}
elif [ ${MODE} = "whole_infer" ];then elif [ ${MODE} = "whole_infer" ];then
curPath=$(readlink -f "$(dirname "$0")") curPath=$(readlink -f "$(dirname "$0")")
cd ${curPath}/../../examples/aishell/s0 cd ${curPath}/../../../examples/aishell/asr0
source path.sh source path.sh
# download audio data # download audio data
bash ./local/data.sh || exit -1 bash ./local/data.sh || exit -1
...@@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then ...@@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then
cd ${curPath} cd ${curPath}
else else
curPath=$(readlink -f "$(dirname "$0")") curPath=$(readlink -f "$(dirname "$0")")
cd ${curPath}/../../examples/aishell/s0 cd ${curPath}/../../../examples/aishell/asr0
source path.sh source path.sh
# download audio data # download audio data
bash ./local/data.sh || exit -1 bash ./local/data.sh || exit -1
......
...@@ -324,6 +324,7 @@ else ...@@ -324,6 +324,7 @@ else
gsu=${gpu//,/ } gsu=${gpu//,/ }
nump=`echo $gsu | wc -w` nump=`echo $gsu | wc -w`
cmd="${python} ${run_train} --ngpu=$nump" cmd="${python} ${run_train} --ngpu=$nump"
export CUDA_VISIBLE_DEVICES=${gpu}
else # train with multi-machine else # train with multi-machine
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
fi fi
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册