Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
c907a8de
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c907a8de
编写于
12月 31, 2021
作者:
H
huangyuxin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
change all recipes
上级
5d6494de
变更
122
隐藏空白更改
内联
并排
Showing
122 changed file
with
2427 addition
and
2359 deletion
+2427
-2359
examples/aishell/asr0/conf/deepspeech2.yaml
examples/aishell/asr0/conf/deepspeech2.yaml
+60
-64
examples/aishell/asr0/conf/deepspeech2_online.yaml
examples/aishell/asr0/conf/deepspeech2_online.yaml
+62
-64
examples/aishell/asr0/conf/tuning/chunk_decode.yaml
examples/aishell/asr0/conf/tuning/chunk_decode.yaml
+10
-0
examples/aishell/asr0/conf/tuning/decode.yaml
examples/aishell/asr0/conf/tuning/decode.yaml
+10
-0
examples/aishell/asr0/local/test.sh
examples/aishell/asr0/local/test.sh
+6
-4
examples/aishell/asr0/local/test_export.sh
examples/aishell/asr0/local/test_export.sh
+6
-4
examples/aishell/asr0/local/test_hub_ori
examples/aishell/asr0/local/test_hub_ori
+47
-0
examples/aishell/asr0/local/test_wav.sh
examples/aishell/asr0/local/test_wav.sh
+7
-5
examples/aishell/asr0/run.sh
examples/aishell/asr0/run.sh
+4
-3
examples/aishell/asr1/conf/chunk_conformer.yaml
examples/aishell/asr1/conf/chunk_conformer.yaml
+4
-3
examples/aishell/asr1/conf/conformer.yaml
examples/aishell/asr1/conf/conformer.yaml
+3
-2
examples/aishell/asr1/conf/transformer.yaml
examples/aishell/asr1/conf/transformer.yaml
+4
-3
examples/aishell/asr1/local/align.sh
examples/aishell/asr1/local/align.sh
+1
-1
examples/aishell/asr1/local/test.sh
examples/aishell/asr1/local/test.sh
+3
-3
examples/aishell/asr1/local/test_wav.sh
examples/aishell/asr1/local/test_wav.sh
+1
-1
examples/callcenter/asr1/conf/chunk_conformer.yaml
examples/callcenter/asr1/conf/chunk_conformer.yaml
+91
-113
examples/callcenter/asr1/conf/conformer.yaml
examples/callcenter/asr1/conf/conformer.yaml
+84
-109
examples/callcenter/asr1/conf/preprocess.yaml
examples/callcenter/asr1/conf/preprocess.yaml
+1
-1
examples/callcenter/asr1/conf/tuning/chunk_decode.yaml
examples/callcenter/asr1/conf/tuning/chunk_decode.yaml
+11
-0
examples/callcenter/asr1/conf/tuning/decode.yaml
examples/callcenter/asr1/conf/tuning/decode.yaml
+13
-0
examples/callcenter/asr1/local/align.sh
examples/callcenter/asr1/local/align.sh
+6
-4
examples/callcenter/asr1/local/test.sh
examples/callcenter/asr1/local/test.sh
+11
-7
examples/callcenter/asr1/run.sh
examples/callcenter/asr1/run.sh
+3
-2
examples/librispeech/asr0/conf/deepspeech2.yaml
examples/librispeech/asr0/conf/deepspeech2.yaml
+60
-63
examples/librispeech/asr0/conf/deepspeech2_online.yaml
examples/librispeech/asr0/conf/deepspeech2_online.yaml
+62
-65
examples/librispeech/asr0/conf/tuning/chunk_decode.yaml
examples/librispeech/asr0/conf/tuning/chunk_decode.yaml
+10
-0
examples/librispeech/asr0/conf/tuning/decode.yaml
examples/librispeech/asr0/conf/tuning/decode.yaml
+10
-0
examples/librispeech/asr0/local/test.sh
examples/librispeech/asr0/local/test.sh
+6
-4
examples/librispeech/asr0/local/test_wav.sh
examples/librispeech/asr0/local/test_wav.sh
+7
-5
examples/librispeech/asr0/run.sh
examples/librispeech/asr0/run.sh
+3
-2
examples/librispeech/asr1/conf/chunk_conformer.yaml
examples/librispeech/asr1/conf/chunk_conformer.yaml
+4
-4
examples/librispeech/asr1/conf/chunk_transformer.yaml
examples/librispeech/asr1/conf/chunk_transformer.yaml
+2
-3
examples/librispeech/asr1/conf/conformer.yaml
examples/librispeech/asr1/conf/conformer.yaml
+2
-3
examples/librispeech/asr1/conf/transformer.yaml
examples/librispeech/asr1/conf/transformer.yaml
+2
-3
examples/librispeech/asr1/local/align.sh
examples/librispeech/asr1/local/align.sh
+1
-1
examples/librispeech/asr1/local/test.sh
examples/librispeech/asr1/local/test.sh
+3
-3
examples/librispeech/asr1/local/test_wav.sh
examples/librispeech/asr1/local/test_wav.sh
+1
-1
examples/librispeech/asr2/conf/decode/decode_base.yaml
examples/librispeech/asr2/conf/decode/decode_base.yaml
+11
-0
examples/librispeech/asr2/conf/transformer.yaml
examples/librispeech/asr2/conf/transformer.yaml
+70
-81
examples/librispeech/asr2/local/align.sh
examples/librispeech/asr2/local/align.sh
+7
-5
examples/librispeech/asr2/local/test.sh
examples/librispeech/asr2/local/test.sh
+6
-4
examples/librispeech/asr2/run.sh
examples/librispeech/asr2/run.sh
+5
-3
examples/other/1xt2x/aishell/conf/deepspeech2.yaml
examples/other/1xt2x/aishell/conf/deepspeech2.yaml
+60
-62
examples/other/1xt2x/aishell/conf/tuning/decode.yaml
examples/other/1xt2x/aishell/conf/tuning/decode.yaml
+10
-0
examples/other/1xt2x/aishell/local/test.sh
examples/other/1xt2x/aishell/local/test.sh
+6
-4
examples/other/1xt2x/aishell/run.sh
examples/other/1xt2x/aishell/run.sh
+2
-1
examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
+60
-63
examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
+10
-0
examples/other/1xt2x/baidu_en8k/local/test.sh
examples/other/1xt2x/baidu_en8k/local/test.sh
+6
-4
examples/other/1xt2x/baidu_en8k/run.sh
examples/other/1xt2x/baidu_en8k/run.sh
+2
-1
examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
+60
-63
examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
+10
-0
examples/other/1xt2x/librispeech/local/test.sh
examples/other/1xt2x/librispeech/local/test.sh
+6
-4
examples/other/1xt2x/librispeech/run.sh
examples/other/1xt2x/librispeech/run.sh
+2
-1
examples/other/1xt2x/src_deepspeech2x/bin/test.py
examples/other/1xt2x/src_deepspeech2x/bin/test.py
+5
-0
examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
...es/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
+6
-6
examples/other/1xt2x/src_deepspeech2x/test_model.py
examples/other/1xt2x/src_deepspeech2x/test_model.py
+29
-61
examples/ted_en_zh/st0/conf/transformer.yaml
examples/ted_en_zh/st0/conf/transformer.yaml
+89
-102
examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
+89
-101
examples/ted_en_zh/st0/conf/tuning/decode.yaml
examples/ted_en_zh/st0/conf/tuning/decode.yaml
+11
-0
examples/ted_en_zh/st0/local/test.sh
examples/ted_en_zh/st0/local/test.sh
+7
-5
examples/ted_en_zh/st0/run.sh
examples/ted_en_zh/st0/run.sh
+2
-1
examples/ted_en_zh/st1/conf/transformer.yaml
examples/ted_en_zh/st1/conf/transformer.yaml
+89
-102
examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
+89
-102
examples/ted_en_zh/st1/conf/tuning/decode.yaml
examples/ted_en_zh/st1/conf/tuning/decode.yaml
+12
-0
examples/ted_en_zh/st1/local/test.sh
examples/ted_en_zh/st1/local/test.sh
+7
-5
examples/ted_en_zh/st1/run.sh
examples/ted_en_zh/st1/run.sh
+2
-1
examples/timit/asr1/conf/transformer.yaml
examples/timit/asr1/conf/transformer.yaml
+80
-101
examples/timit/asr1/conf/tuning/decode.yaml
examples/timit/asr1/conf/tuning/decode.yaml
+11
-0
examples/timit/asr1/local/align.sh
examples/timit/asr1/local/align.sh
+6
-4
examples/timit/asr1/local/test.sh
examples/timit/asr1/local/test.sh
+13
-9
examples/timit/asr1/run.sh
examples/timit/asr1/run.sh
+7
-6
examples/tiny/asr0/conf/deepspeech2.yaml
examples/tiny/asr0/conf/deepspeech2.yaml
+59
-62
examples/tiny/asr0/conf/deepspeech2_online.yaml
examples/tiny/asr0/conf/deepspeech2_online.yaml
+61
-65
examples/tiny/asr0/conf/tuning/chunk_decode.yaml
examples/tiny/asr0/conf/tuning/chunk_decode.yaml
+10
-0
examples/tiny/asr0/conf/tuning/decode.yaml
examples/tiny/asr0/conf/tuning/decode.yaml
+10
-0
examples/tiny/asr0/local/test.sh
examples/tiny/asr0/local/test.sh
+6
-4
examples/tiny/asr0/run.sh
examples/tiny/asr0/run.sh
+2
-1
examples/tiny/asr1/conf/chunk_confermer.yaml
examples/tiny/asr1/conf/chunk_confermer.yaml
+92
-114
examples/tiny/asr1/conf/chunk_transformer.yaml
examples/tiny/asr1/conf/chunk_transformer.yaml
+84
-106
examples/tiny/asr1/conf/conformer.yaml
examples/tiny/asr1/conf/conformer.yaml
+36
-44
examples/tiny/asr1/conf/transformer.yaml
examples/tiny/asr1/conf/transformer.yaml
+34
-42
examples/tiny/asr1/conf/tuning/chunk_decode.yaml
examples/tiny/asr1/conf/tuning/chunk_decode.yaml
+11
-0
examples/tiny/asr1/conf/tuning/decode.yaml
examples/tiny/asr1/conf/tuning/decode.yaml
+11
-0
examples/tiny/asr1/local/align.sh
examples/tiny/asr1/local/align.sh
+6
-4
examples/tiny/asr1/local/test.sh
examples/tiny/asr1/local/test.sh
+10
-7
examples/tiny/asr1/run.sh
examples/tiny/asr1/run.sh
+3
-2
examples/wenetspeech/asr1/conf/conformer.yaml
examples/wenetspeech/asr1/conf/conformer.yaml
+85
-104
examples/wenetspeech/asr1/conf/tuning/decode.yaml
examples/wenetspeech/asr1/conf/tuning/decode.yaml
+11
-0
examples/wenetspeech/asr1/local/test.sh
examples/wenetspeech/asr1/local/test.sh
+10
-7
examples/wenetspeech/asr1/local/test_wav.sh
examples/wenetspeech/asr1/local/test_wav.sh
+8
-6
examples/wenetspeech/asr1/run.sh
examples/wenetspeech/asr1/run.sh
+4
-4
paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py
paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py
+18
-14
paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py
paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py
+18
-14
paddlespeech/s2t/exps/deepspeech2/bin/test.py
paddlespeech/s2t/exps/deepspeech2/bin/test.py
+6
-0
paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
+6
-0
paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
+13
-8
paddlespeech/s2t/exps/deepspeech2/config.py
paddlespeech/s2t/exps/deepspeech2/config.py
+0
-11
paddlespeech/s2t/exps/deepspeech2/model.py
paddlespeech/s2t/exps/deepspeech2/model.py
+65
-67
paddlespeech/s2t/exps/u2/bin/alignment.py
paddlespeech/s2t/exps/u2/bin/alignment.py
+2
-2
paddlespeech/s2t/exps/u2/bin/test.py
paddlespeech/s2t/exps/u2/bin/test.py
+2
-2
paddlespeech/s2t/exps/u2/bin/test_wav.py
paddlespeech/s2t/exps/u2/bin/test_wav.py
+3
-3
paddlespeech/s2t/exps/u2/config.py
paddlespeech/s2t/exps/u2/config.py
+5
-5
paddlespeech/s2t/exps/u2/model.py
paddlespeech/s2t/exps/u2/model.py
+9
-7
paddlespeech/s2t/exps/u2/trainer.py
paddlespeech/s2t/exps/u2/trainer.py
+29
-29
paddlespeech/s2t/exps/u2_kaldi/bin/test.py
paddlespeech/s2t/exps/u2_kaldi/bin/test.py
+4
-0
paddlespeech/s2t/exps/u2_kaldi/model.py
paddlespeech/s2t/exps/u2_kaldi/model.py
+39
-36
paddlespeech/s2t/exps/u2_st/bin/test.py
paddlespeech/s2t/exps/u2_st/bin/test.py
+8
-2
paddlespeech/s2t/exps/u2_st/config.py
paddlespeech/s2t/exps/u2_st/config.py
+5
-5
paddlespeech/s2t/exps/u2_st/model.py
paddlespeech/s2t/exps/u2_st/model.py
+56
-54
paddlespeech/s2t/io/collator.py
paddlespeech/s2t/io/collator.py
+28
-30
paddlespeech/s2t/io/dataset.py
paddlespeech/s2t/io/dataset.py
+9
-9
paddlespeech/s2t/models/ds2/deepspeech2.py
paddlespeech/s2t/models/ds2/deepspeech2.py
+7
-7
paddlespeech/s2t/models/ds2_online/deepspeech2.py
paddlespeech/s2t/models/ds2_online/deepspeech2.py
+9
-9
paddlespeech/s2t/training/cli.py
paddlespeech/s2t/training/cli.py
+1
-1
tests/benchmark/conformer/run.sh
tests/benchmark/conformer/run.sh
+3
-2
tests/benchmark/conformer/run_benchmark.sh
tests/benchmark/conformer/run_benchmark.sh
+11
-9
tests/chains/ds2/ds2_params_lite_train_infer.txt
tests/chains/ds2/ds2_params_lite_train_infer.txt
+2
-2
tests/chains/ds2/ds2_params_whole_train_infer.txt
tests/chains/ds2/ds2_params_whole_train_infer.txt
+1
-1
tests/chains/ds2/lite_train_infer.sh
tests/chains/ds2/lite_train_infer.sh
+2
-2
tests/chains/ds2/prepare.sh
tests/chains/ds2/prepare.sh
+4
-4
tests/chains/ds2/test.sh
tests/chains/ds2/test.sh
+1
-0
未找到文件。
examples/aishell/asr0/conf/deepspeech2.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
min_input_len
:
0.0
dev_manifest
:
data/manifest.dev
max_input_len
:
27.0
# second
test_manifest
:
data/manifest.test
min_output_len
:
0.0
min_input_len
:
0.0
max_output_len
:
.inf
max_input_len
:
27.0
# second
min_output_input_ratio
:
0.00
min_output_len
:
0.0
max_output_input_ratio
:
.inf
max_output_len
:
.inf
min_output_input_ratio
:
0.00
max_output_input_ratio
:
.inf
collator
:
###########################################
batch_size
:
64
# one gpu
# Dataloader #
mean_std_filepath
:
data/mean_std.json
###########################################
unit_type
:
char
batch_size
:
64
# one gpu
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
data/mean_std.json
augmentation_config
:
conf/augmentation.json
unit_type
:
char
random_seed
:
0
vocab_filepath
:
data/lang_char/vocab.txt
spm_model_prefix
:
augmentation_config
:
conf/augmentation.json
spectrum_type
:
linear
random_seed
:
0
feat_dim
:
spm_model_prefix
:
delta_delta
:
False
spectrum_type
:
linear
stride_ms
:
10.0
feat_dim
:
window_ms
:
20.0
delta_delta
:
False
n_fft
:
None
stride_ms
:
10.0
max_freq
:
None
window_ms
:
20.0
target_sample_rate
:
16000
n_fft
:
None
use_dB_normalization
:
True
max_freq
:
None
target_dB
:
-20
target_sample_rate
:
16000
dither
:
1.0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
dither
:
1.0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
model
:
############################################
num_conv_layers
:
2
# Network Architecture #
num_rnn_layers
:
3
############################################
rnn_layer_size
:
1024
num_conv_layers
:
2
use_gru
:
True
num_rnn_layers
:
3
share_rnn_weights
:
False
rnn_layer_size
:
1024
blank_id
:
0
use_gru
:
True
ctc_grad_norm_type
:
instance
share_rnn_weights
:
False
blank_id
:
0
ctc_grad_norm_type
:
instance
training
:
###########################################
n_epoch
:
80
# Training #
accum_grad
:
1
###########################################
lr
:
2e-3
n_epoch
:
80
lr_decay
:
0.83
accum_grad
:
1
weight_decay
:
1e-06
lr
:
2e-3
global_grad_clip
:
3.0
lr_decay
:
0.83
log_interval
:
100
weight_decay
:
1e-06
checkpoint
:
global_grad_clip
:
3.0
kbest_n
:
50
log_interval
:
100
latest_n
:
5
checkpoint
:
kbest_n
:
50
decoding
:
latest_n
:
5
batch_size
:
128
error_rate_type
:
cer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha
:
1.9
beta
:
5.0
beam_size
:
300
cutoff_prob
:
0.99
cutoff_top_n
:
40
num_proc_bsearch
:
10
examples/aishell/asr0/conf/deepspeech2_online.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
min_input_len
:
0.0
dev_manifest
:
data/manifest.dev
max_input_len
:
27.0
# second
test_manifest
:
data/manifest.test
min_output_len
:
0.0
min_input_len
:
0.0
max_output_len
:
.inf
max_input_len
:
27.0
# second
min_output_input_ratio
:
0.00
min_output_len
:
0.0
max_output_input_ratio
:
.inf
max_output_len
:
.inf
min_output_input_ratio
:
0.00
max_output_input_ratio
:
.inf
collator
:
###########################################
batch_size
:
64
# one gpu
# Dataloader #
mean_std_filepath
:
data/mean_std.json
###########################################
unit_type
:
char
batch_size
:
64
# one gpu
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
data/mean_std.json
augmentation_config
:
conf/augmentation.json
unit_type
:
char
random_seed
:
0
vocab_filepath
:
data/lang_char/vocab.txt
spm_model_prefix
:
augmentation_config
:
conf/augmentation.json
spectrum_type
:
linear
#linear, mfcc, fbank
random_seed
:
0
feat_dim
:
spm_model_prefix
:
delta_delta
:
False
spectrum_type
:
linear
#linear, mfcc, fbank
stride_ms
:
10.0
feat_dim
:
window_ms
:
20.0
delta_delta
:
False
n_fft
:
None
stride_ms
:
10.0
max_freq
:
None
window_ms
:
20.0
target_sample_rate
:
16000
n_fft
:
None
use_dB_normalization
:
True
max_freq
:
None
target_dB
:
-20
target_sample_rate
:
16000
dither
:
1.0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
dither
:
1.0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
0
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
0
model
:
############################################
num_conv_layers
:
2
# Network Architecture #
num_rnn_layers
:
5
############################################
rnn_layer_size
:
1024
num_conv_layers
:
2
rnn_direction
:
forward
# [forward, bidirect]
num_rnn_layers
:
5
num_fc_layers
:
0
rnn_layer_size
:
1024
fc_layers_size_list
:
-1,
rnn_direction
:
forward
# [forward, bidirect]
use_gru
:
False
num_fc_layers
:
0
blank_id
:
0
fc_layers_size_list
:
-1,
use_gru
:
False
blank_id
:
0
training
:
###########################################
n_epoch
:
65
# Training #
accum_grad
:
1
###########################################
lr
:
5e-4
n_epoch
:
65
lr_decay
:
0.93
accum_grad
:
1
weight_decay
:
1e-06
lr
:
5e-4
global_grad_clip
:
3.0
lr_decay
:
0.93
log_interval
:
100
weight_decay
:
1e-06
checkpoint
:
global_grad_clip
:
3.0
kbest_n
:
50
log_interval
:
100
latest_n
:
5
checkpoint
:
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
32
error_rate_type
:
cer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha
:
2.2
#1.9
beta
:
4.3
beam_size
:
300
cutoff_prob
:
0.99
cutoff_top_n
:
40
num_proc_bsearch
:
10
examples/aishell/asr0/conf/tuning/chunk_decode.yaml
0 → 100644
浏览文件 @
c907a8de
chunk_batch_size
:
32
error_rate_type
:
cer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha
:
2.2
#1.9
beta
:
4.3
beam_size
:
300
cutoff_prob
:
0.99
cutoff_top_n
:
40
num_proc_bsearch
:
10
examples/aishell/asr0/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
128
error_rate_type
:
cer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha
:
1.9
beta
:
5.0
beam_size
:
300
cutoff_prob
:
0.99
cutoff_top_n
:
40
num_proc_bsearch
:
10
examples/aishell/asr0/local/test.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix model_type"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
model_type
=
$3
ckpt_prefix
=
$3
model_type
=
$4
# download language model
# download language model
bash
local
/download_lm_ch.sh
bash
local
/download_lm_ch.sh
...
@@ -21,6 +22,7 @@ fi
...
@@ -21,6 +22,7 @@ fi
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.rsl
\
--result_file
${
ckpt_prefix
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--model_type
${
model_type
}
--model_type
${
model_type
}
...
...
examples/aishell/asr0/local/test_export.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix model_type"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
jit_model_export_path
=
$2
decode_config_path
=
$2
model_type
=
$3
jit_model_export_path
=
$3
model_type
=
$4
# download language model
# download language model
bash
local
/download_lm_ch.sh
>
/dev/null 2>&1
bash
local
/download_lm_ch.sh
>
/dev/null 2>&1
...
@@ -21,6 +22,7 @@ fi
...
@@ -21,6 +22,7 @@ fi
python3
-u
${
BIN_DIR
}
/test_export.py
\
python3
-u
${
BIN_DIR
}
/test_export.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
jit_model_export_path
}
.rsl
\
--result_file
${
jit_model_export_path
}
.rsl
\
--export_path
${
jit_model_export_path
}
\
--export_path
${
jit_model_export_path
}
\
--model_type
${
model_type
}
--model_type
${
model_type
}
...
...
examples/aishell/asr0/local/test_hub_ori
0 → 100755
浏览文件 @
c907a8de
#!/bin/bash
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type audio_file"
exit
-1
fi
ngpu
=
$(
echo
$CUDA_VISIBLE_DEVICES
|
awk
-F
","
'{print NF}'
)
echo
"using
$ngpu
gpus..."
config_path
=
$1
ckpt_prefix
=
$2
model_type
=
$3
audio_file
=
$4
mkdir
-p
data
wget
-nc
https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav
-P
data/
if
[
$?
-ne
0
]
;
then
exit
1
fi
if
[
!
-f
${
audio_file
}
]
;
then
echo
"Plase input the right audio_file path"
exit
1
fi
# download language model
bash
local
/download_lm_ch.sh
if
[
$?
-ne
0
]
;
then
exit
1
fi
python3
-u
${
BIN_DIR
}
/test_hub.py
\
--nproc
${
ngpu
}
\
--config
${
config_path
}
\
--result_file
${
ckpt_prefix
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--model_type
${
model_type
}
\
--audio_file
${
audio_file
}
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
exit
1
fi
exit
0
examples/aishell/asr0/local/test_wav.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
4
]
;
then
if
[
$#
!=
5
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type audio_file"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix model_type audio_file"
exit
-1
exit
-1
fi
fi
...
@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
model_type
=
$3
ckpt_prefix
=
$3
audio_file
=
$4
model_type
=
$4
audio_file
=
$5
mkdir
-p
data
mkdir
-p
data
wget
-nc
https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav
-P
data/
wget
-nc
https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav
-P
data/
...
@@ -33,6 +34,7 @@ fi
...
@@ -33,6 +34,7 @@ fi
python3
-u
${
BIN_DIR
}
/test_wav.py
\
python3
-u
${
BIN_DIR
}
/test_wav.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.rsl
\
--result_file
${
ckpt_prefix
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--model_type
${
model_type
}
\
--model_type
${
model_type
}
\
...
...
examples/aishell/asr0/run.sh
浏览文件 @
c907a8de
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
conf_path
=
conf/deepspeech2.yaml
#conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml
conf_path
=
conf/deepspeech2.yaml
#conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
1
avg_num
=
1
model_type
=
offline
# offline or online
model_type
=
offline
# offline or online
audio_file
=
data/demo_01_03.wav
audio_file
=
data/demo_01_03.wav
...
@@ -34,7 +35,7 @@ fi
...
@@ -34,7 +35,7 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
...
@@ -44,11 +45,11 @@ fi
...
@@ -44,11 +45,11 @@ fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
# test export ckpt avg_n
# test export ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/test_export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
${
model_type
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test_export.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
${
model_type
}
||
exit
-1
fi
fi
# Optionally, you can add LM and test it with runtime.
# Optionally, you can add LM and test it with runtime.
if
[
${
stage
}
-le
6
]
&&
[
${
stop_stage
}
-ge
6
]
;
then
if
[
${
stage
}
-le
6
]
&&
[
${
stop_stage
}
-ge
6
]
;
then
# test a single .wav file
# test a single .wav file
CUDA_VISIBLE_DEVICES
=
0 ./local/test_wav.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
${
audio_file
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test_wav.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
${
audio_file
}
||
exit
-1
fi
fi
examples/aishell/asr1/conf/chunk_conformer.yaml
浏览文件 @
c907a8de
...
@@ -54,8 +54,9 @@ test_manifest: data/manifest.test
...
@@ -54,8 +54,9 @@ test_manifest: data/manifest.test
###########################################
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
vocab_filepath
:
data/lang_char/vocab.txt
spm_model_prefix
:
'
'
unit_type
:
'
char'
unit_type
:
'
char'
augmentation
_config
:
conf/preprocess.yaml
preprocess
_config
:
conf/preprocess.yaml
feat_dim
:
80
feat_dim
:
80
stride_ms
:
10.0
stride_ms
:
10.0
window_ms
:
25.0
window_ms
:
25.0
...
@@ -74,7 +75,7 @@ subsampling_factor: 1
...
@@ -74,7 +75,7 @@ subsampling_factor: 1
num_encs
:
1
num_encs
:
1
###########################################
###########################################
#
t
raining #
#
T
raining #
###########################################
###########################################
n_epoch
:
240
n_epoch
:
240
accum_grad
:
2
accum_grad
:
2
...
@@ -82,7 +83,7 @@ global_grad_clip: 5.0
...
@@ -82,7 +83,7 @@ global_grad_clip: 5.0
optim
:
adam
optim
:
adam
optim_conf
:
optim_conf
:
lr
:
0.002
lr
:
0.002
weight_decay
:
1e-6
weight_decay
:
1
.0
e-6
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
...
...
examples/aishell/asr1/conf/conformer.yaml
浏览文件 @
c907a8de
...
@@ -49,8 +49,9 @@ test_manifest: data/manifest.test
...
@@ -49,8 +49,9 @@ test_manifest: data/manifest.test
# Dataloader #
# Dataloader #
###########################################
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
vocab_filepath
:
data/lang_char/vocab.txt
spm_model_prefix
:
'
'
unit_type
:
'
char'
unit_type
:
'
char'
augmentation
_config
:
conf/preprocess.yaml
preprocess
_config
:
conf/preprocess.yaml
feat_dim
:
80
feat_dim
:
80
stride_ms
:
10.0
stride_ms
:
10.0
window_ms
:
25.0
window_ms
:
25.0
...
@@ -69,7 +70,7 @@ subsampling_factor: 1
...
@@ -69,7 +70,7 @@ subsampling_factor: 1
num_encs
:
1
num_encs
:
1
###########################################
###########################################
#
training
#
#
Training
#
###########################################
###########################################
n_epoch
:
240
n_epoch
:
240
accum_grad
:
2
accum_grad
:
2
...
...
examples/aishell/asr1/conf/transformer.yaml
浏览文件 @
c907a8de
...
@@ -46,6 +46,7 @@ test_manifest: data/manifest.test
...
@@ -46,6 +46,7 @@ test_manifest: data/manifest.test
###########################################
###########################################
unit_type
:
'
char'
unit_type
:
'
char'
vocab_filepath
:
data/lang_char/vocab.txt
vocab_filepath
:
data/lang_char/vocab.txt
spm_model_prefix
:
'
'
feat_dim
:
80
feat_dim
:
80
stride_ms
:
10.0
stride_ms
:
10.0
window_ms
:
25.0
window_ms
:
25.0
...
@@ -59,13 +60,13 @@ batch_bins: 0
...
@@ -59,13 +60,13 @@ batch_bins: 0
batch_frames_in
:
0
batch_frames_in
:
0
batch_frames_out
:
0
batch_frames_out
:
0
batch_frames_inout
:
0
batch_frames_inout
:
0
augmentation
_config
:
conf/preprocess.yaml
preprocess
_config
:
conf/preprocess.yaml
num_workers
:
0
num_workers
:
0
subsampling_factor
:
1
subsampling_factor
:
1
num_encs
:
1
num_encs
:
1
###########################################
###########################################
#
t
raining #
#
T
raining #
###########################################
###########################################
n_epoch
:
240
n_epoch
:
240
accum_grad
:
2
accum_grad
:
2
...
@@ -73,7 +74,7 @@ global_grad_clip: 5.0
...
@@ -73,7 +74,7 @@ global_grad_clip: 5.0
optim
:
adam
optim
:
adam
optim_conf
:
optim_conf
:
lr
:
0.002
lr
:
0.002
weight_decay
:
1e-6
weight_decay
:
1
.0
e-6
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
...
...
examples/aishell/asr1/local/align.sh
浏览文件 @
c907a8de
...
@@ -21,7 +21,7 @@ mkdir -p ${output_dir}
...
@@ -21,7 +21,7 @@ mkdir -p ${output_dir}
python3
-u
${
BIN_DIR
}
/alignment.py
\
python3
-u
${
BIN_DIR
}
/alignment.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_c
onfi
g
${
decode_config_path
}
\
--decode_c
f
g
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decode.decode_batch_size
${
batch_size
}
--opts
decode.decode_batch_size
${
batch_size
}
...
...
examples/aishell/asr1/local/test.sh
浏览文件 @
c907a8de
...
@@ -30,14 +30,14 @@ for type in attention ctc_greedy_search; do
...
@@ -30,14 +30,14 @@ for type in attention ctc_greedy_search; do
# stream decoding only support batchsize=1
# stream decoding only support batchsize=1
batch_size
=
1
batch_size
=
1
else
else
batch_size
=
1
batch_size
=
64
fi
fi
output_dir
=
${
ckpt_prefix
}
output_dir
=
${
ckpt_prefix
}
mkdir
-p
${
output_dir
}
mkdir
-p
${
output_dir
}
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_c
onfi
g
${
decode_config_path
}
\
--decode_c
f
g
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decode.decoding_method
${
type
}
\
--opts
decode.decoding_method
${
type
}
\
...
@@ -57,7 +57,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
...
@@ -57,7 +57,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_c
onfi
g
${
decode_config_path
}
\
--decode_c
f
g
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decode.decoding_method
${
type
}
\
--opts
decode.decoding_method
${
type
}
\
...
...
examples/aishell/asr1/local/test_wav.sh
浏览文件 @
c907a8de
...
@@ -43,7 +43,7 @@ for type in attention_rescoring; do
...
@@ -43,7 +43,7 @@ for type in attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test_wav.py
\
python3
-u
${
BIN_DIR
}
/test_wav.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_c
onfi
g
${
decode_config_path
}
\
--decode_c
f
g
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decode.decoding_method
${
type
}
\
--opts
decode.decoding_method
${
type
}
\
...
...
examples/callcenter/asr1/conf/chunk_conformer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
min_input_len
:
0.5
dev_manifest
:
data/manifest.dev
max_input_len
:
20.0
# second
test_manifest
:
data/manifest.test
min_output_len
:
0.0
max_output_len
:
400.0
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
vocab_filepath
:
data/lang_char/vocab.txt
unit_type
:
'
char'
spm_model_prefix
:
'
'
augmentation_config
:
conf/preprocess.yaml
batch_size
:
32
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
delta_delta
:
False
dither
:
1.0
target_sample_rate
:
8000
max_freq
:
None
n_fft
:
None
stride_ms
:
10.0
window_ms
:
25.0
use_dB_normalization
:
True
target_dB
:
-20
random_seed
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
# network architecture
model
:
cmvn_file
:
cmvn_file_type
:
"
json"
# encoder related
encoder
:
conformer
encoder_conf
:
output_size
:
256
# dimension of attention
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
num_blocks
:
12
# the number of encoder blocks
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
attention_dropout_rate
:
0.0
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before
:
True
use_cnn_module
:
True
cnn_module_kernel
:
15
activation_type
:
'
swish'
pos_enc_layer_type
:
'
rel_pos'
selfattention_layer_type
:
'
rel_selfattn'
causal
:
true
use_dynamic_chunk
:
true
cnn_module_norm
:
'
layer_norm'
# using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk
:
false
# decoder related
decoder
:
transformer
decoder_conf
:
attention_heads
:
4
linear_units
:
2048
num_blocks
:
6
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
model_conf
:
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
###########################################
# Dataloader #
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
unit_type
:
'
char'
spm_model_prefix
:
'
'
preprocess_config
:
conf/preprocess.yaml
batch_size
:
32
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
delta_delta
:
False
dither
:
1.0
target_sample_rate
:
8000
max_freq
:
None
n_fft
:
None
stride_ms
:
10.0
window_ms
:
25.0
use_dB_normalization
:
True
target_dB
:
-20
random_seed
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
training
:
n_epoch
:
240
accum_grad
:
4
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.001
weight_decay
:
1e-6
scheduler
:
warmuplr
scheduler_conf
:
warmup_steps
:
25000
lr_decay
:
1.0
log_interval
:
100
checkpoint
:
kbest_n
:
50
latest_n
:
5
############################################
# Network Architecture #
############################################
cmvn_file
:
cmvn_file_type
:
"
json"
# encoder related
encoder
:
conformer
encoder_conf
:
output_size
:
256
# dimension of attention
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
num_blocks
:
12
# the number of encoder blocks
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
attention_dropout_rate
:
0.0
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before
:
True
use_cnn_module
:
True
cnn_module_kernel
:
15
activation_type
:
'
swish'
pos_enc_layer_type
:
'
rel_pos'
selfattention_layer_type
:
'
rel_selfattn'
causal
:
true
use_dynamic_chunk
:
true
cnn_module_norm
:
'
layer_norm'
# using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk
:
false
decoding
:
# decoder related
batch_size
:
128
decoder
:
transformer
error_rate_type
:
cer
decoder_conf
:
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
attention_heads
:
4
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
linear_units
:
2048
alpha
:
2.5
num_blocks
:
6
beta
:
0.3
dropout_rate
:
0.1
beam_size
:
10
positional_dropout_rate
:
0.1
cutoff_prob
:
1.0
self_attention_dropout_rate
:
0.0
cutoff_top_n
:
0
src_attention_dropout_rate
:
0.0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
true
# simulate streaming inference. Defaults to False.
# hybrid CTC/attention
model_conf
:
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
###########################################
# Training #
###########################################
n_epoch
:
240
accum_grad
:
4
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.001
weight_decay
:
1.0e-6
scheduler
:
warmuplr
scheduler_conf
:
warmup_steps
:
25000
lr_decay
:
1.0
log_interval
:
100
checkpoint
:
kbest_n
:
50
latest_n
:
5
examples/callcenter/asr1/conf/conformer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
min_input_len
:
0.5
dev_manifest
:
data/manifest.dev
max_input_len
:
20.0
# second
test_manifest
:
data/manifest.test
min_output_len
:
0.0
max_output_len
:
400.0
min_output_input_ratio
:
0.0
max_output_input_ratio
:
.inf
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
'
char'
###########################################
spm_model_prefix
:
'
'
vocab_filepath
:
data/lang_char/vocab.txt
augmentation_config
:
conf/preprocess.yaml
unit_type
:
'
char'
batch_size
:
32
spm_model_prefix
:
'
'
raw_wav
:
True
# use raw_wav or kaldi feature
preprocess_config
:
conf/preprocess.yaml
spectrum_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
8
0
stride_ms
:
10.
0
delta_delta
:
False
window_ms
:
25.0
dither
:
1.0
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
target_sample_rate
:
8000
batch_size
:
64
max_freq
:
None
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
n_fft
:
None
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
stride_ms
:
10.0
minibatches
:
0
# for debug
window_ms
:
25.0
batch_count
:
auto
use_dB_normalization
:
True
batch_bins
:
0
target_dB
:
-2
0
batch_frames_in
:
0
random_seed
:
0
batch_frames_out
:
0
keep_transcription_text
:
False
batch_frames_inout
:
0
sortagrad
:
True
num_workers
:
0
shuffle_method
:
batch_shuffle
subsampling_factor
:
1
num_workers
:
2
num_encs
:
1
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
# encoder related
cmvn_file_type
:
"
json"
encoder
:
conformer
# encoder related
encoder_conf
:
encoder
:
conformer
output_size
:
256
# dimension of attention
encoder_conf
:
attention_heads
:
4
output_size
:
256
# dimension of attention
linear_units
:
2048
# the number of units of position-wise feed forward
attention_heads
:
4
num_blocks
:
12
# the number of encoder blocks
linear_units
:
2048
# the number of units of position-wise feed forward
dropout_rate
:
0.1
num_blocks
:
12
# the number of encoder blocks
positional_dropout_rate
:
0.1
dropout_rate
:
0.1
attention_dropout_rate
:
0.0
positional_dropout_rate
:
0.1
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
attention_dropout_rate
:
0.0
normalize_before
:
True
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
use_cnn_module
:
True
normalize_before
:
True
cnn_module_kernel
:
15
use_cnn_module
:
True
activation_type
:
'
swish'
cnn_module_kernel
:
15
pos_enc_layer_type
:
'
rel_pos'
activation_type
:
'
swish'
selfattention_layer_type
:
'
rel_selfattn'
pos_enc_layer_type
:
'
rel_pos'
selfattention_layer_type
:
'
rel_selfattn'
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
dropout_rate
:
0.1
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
training
:
n_epoch
:
100
# 50 will be lowest
accum_grad
:
4
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.002
weight_decay
:
1e-6
scheduler
:
warmuplr
scheduler_conf
:
warmup_steps
:
25000
lr_decay
:
1.0
log_interval
:
100
checkpoint
:
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
128
error_rate_type
:
cer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
###########################################
# Training #
###########################################
n_epoch
:
100
# 50 will be lowest
accum_grad
:
4
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.002
weight_decay
:
1.0e-6
scheduler
:
warmuplr
scheduler_conf
:
warmup_steps
:
25000
lr_decay
:
1.0
log_interval
:
100
checkpoint
:
kbest_n
:
50
latest_n
:
5
examples/callcenter/asr1/conf/preprocess.yaml
浏览文件 @
c907a8de
process
:
process
:
# extract kaldi fbank from PCM
# extract kaldi fbank from PCM
-
type
:
fbank_kaldi
-
type
:
fbank_kaldi
fs
:
16
000
fs
:
8
000
n_mels
:
80
n_mels
:
80
n_shift
:
160
n_shift
:
160
win_length
:
400
win_length
:
400
...
...
examples/callcenter/asr1/conf/tuning/chunk_decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
128
error_rate_type
:
cer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
true
# simulate streaming inference. Defaults to False.
\ No newline at end of file
examples/callcenter/asr1/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
128
error_rate_type
:
cer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/callcenter/asr1/local/align.sh
浏览文件 @
c907a8de
#! /usr/bin/env bash
#! /usr/bin/env bash
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
ckpt_name
=
$(
basename
${
ckpt_prefxi
}
)
ckpt_name
=
$(
basename
${
ckpt_prefxi
}
)
...
@@ -25,9 +26,10 @@ mkdir -p ${output_dir}
...
@@ -25,9 +26,10 @@ mkdir -p ${output_dir}
python3
-u
${
BIN_DIR
}
/alignment.py
\
python3
-u
${
BIN_DIR
}
/alignment.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in ctc alignment!"
echo
"Failed in ctc alignment!"
...
...
examples/callcenter/asr1/local/test.sh
浏览文件 @
c907a8de
#! /usr/bin/env bash
#! /usr/bin/env bash
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
ckpt_name
=
$(
basename
${
ckpt_prefxi
}
)
ckpt_name
=
$(
basename
${
ckpt_prefxi
}
)
...
@@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do
...
@@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
@@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
...
@@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
...
examples/callcenter/asr1/run.sh
浏览文件 @
c907a8de
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
conf_path
=
conf/conformer.yaml
conf_path
=
conf/conformer.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
20
avg_num
=
20
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
...
@@ -31,12 +32,12 @@ fi
...
@@ -31,12 +32,12 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
# ctc alignment of test data
# ctc alignment of test data
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
...
...
examples/librispeech/asr0/conf/deepspeech2.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev-clean
###########################################
test_manifest
:
data/manifest.test-clean
train_manifest
:
data/manifest.train
min_input_len
:
0.0
dev_manifest
:
data/manifest.dev-clean
max_input_len
:
30.0
# second
test_manifest
:
data/manifest.test-clean
min_output_len
:
0.0
min_input_len
:
0.0
max_output_len
:
.inf
max_input_len
:
30.0
# second
min_output_input_ratio
:
0.00
min_output_len
:
0.0
max_output_input_ratio
:
.inf
max_output_len
:
.inf
min_output_input_ratio
:
0.00
max_output_input_ratio
:
.inf
collator
:
###########################################
batch_size
:
20
# Dataloader #
mean_std_filepath
:
data/mean_std.json
###########################################
unit_type
:
char
batch_size
:
20
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
data/mean_std.json
augmentation_config
:
conf/augmentation.json
unit_type
:
char
random_seed
:
0
vocab_filepath
:
data/lang_char/vocab.txt
spm_model_prefix
:
augmentation_config
:
conf/augmentation.json
spectrum_type
:
linear
random_seed
:
0
target_sample_rate
:
16000
spm_model_prefix
:
max_freq
:
None
spectrum_type
:
linear
n_fft
:
None
feat_dim
:
stride_ms
:
10.0
target_sample_rate
:
16000
window_ms
:
20.0
max_freq
:
None
delta_delta
:
False
n_fft
:
None
dither
:
1.0
stride_ms
:
10.0
use_dB_normalization
:
True
window_ms
:
20.0
target_dB
:
-20
delta_delta
:
False
random_seed
:
0
dither
:
1.0
keep_transcription_text
:
False
use_dB_normalization
:
True
sortagrad
:
True
target_dB
:
-20
shuffle_method
:
batch_shuffle
random_seed
:
0
num_workers
:
2
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
model
:
############################################
num_conv_layers
:
2
# Network Architecture #
num_rnn_layers
:
3
############################################
rnn_layer_size
:
2048
num_conv_layers
:
2
use_gru
:
False
num_rnn_layers
:
3
share_rnn_weights
:
True
rnn_layer_size
:
2048
blank_id
:
0
use_gru
:
False
share_rnn_weights
:
True
blank_id
:
0
training
:
###########################################
n_epoch
:
50
# Training #
accum_grad
:
1
###########################################
lr
:
1e-3
n_epoch
:
50
lr_decay
:
0.83
accum_grad
:
1
weight_decay
:
1e-06
lr
:
1e-3
global_grad_clip
:
5.0
lr_decay
:
0.83
log_interval
:
100
weight_decay
:
1e-06
checkpoint
:
global_grad_clip
:
5.0
kbest_n
:
50
log_interval
:
100
latest_n
:
5
checkpoint
:
kbest_n
:
50
decoding
:
latest_n
:
5
batch_size
:
128
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
1.9
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
examples/librispeech/asr0/conf/deepspeech2_online.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev-clean
###########################################
test_manifest
:
data/manifest.test-clean
train_manifest
:
data/manifest.train
min_input_len
:
0.0
dev_manifest
:
data/manifest.dev-clean
max_input_len
:
30.0
# second
test_manifest
:
data/manifest.test-clean
min_output_len
:
0.0
min_input_len
:
0.0
max_output_len
:
.inf
max_input_len
:
30.0
# second
min_output_input_ratio
:
0.00
min_output_len
:
0.0
max_output_input_ratio
:
.inf
max_output_len
:
.inf
min_output_input_ratio
:
0.00
max_output_input_ratio
:
.inf
collator
:
###########################################
batch_size
:
15
# Dataloader #
mean_std_filepath
:
data/mean_std.json
###########################################
unit_type
:
char
batch_size
:
15
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
data/mean_std.json
augmentation_config
:
conf/augmentation.json
unit_type
:
char
random_seed
:
0
vocab_filepath
:
data/lang_char/vocab.txt
spm_model_prefix
:
augmentation_config
:
conf/augmentation.json
spectrum_type
:
linear
random_seed
:
0
target_sample_rate
:
16000
spm_model_prefix
:
max_freq
:
None
spectrum_type
:
linear
n_fft
:
None
feat_dim
:
stride_ms
:
10.0
target_sample_rate
:
16000
window_ms
:
20.0
max_freq
:
None
delta_delta
:
False
n_fft
:
None
dither
:
1.0
stride_ms
:
10.0
use_dB_normalization
:
True
window_ms
:
20.0
target_dB
:
-20
delta_delta
:
False
random_seed
:
0
dither
:
1.0
keep_transcription_text
:
False
use_dB_normalization
:
True
sortagrad
:
True
target_dB
:
-20
shuffle_method
:
batch_shuffle
random_seed
:
0
num_workers
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
0
model
:
############################################
num_conv_layers
:
2
# Network Architecture #
num_rnn_layers
:
3
############################################
rnn_layer_size
:
2048
num_conv_layers
:
2
rnn_direction
:
forward
num_rnn_layers
:
3
num_fc_layers
:
2
rnn_layer_size
:
2048
fc_layers_size_list
:
512,
256
rnn_direction
:
forward
use_gru
:
False
num_fc_layers
:
2
blank_id
:
0
fc_layers_size_list
:
512,
256
use_gru
:
False
blank_id
:
0
training
:
###########################################
n_epoch
:
50
# Training #
accum_grad
:
4
###########################################
lr
:
1e-3
n_epoch
:
50
lr_decay
:
0.83
accum_grad
:
4
weight_decay
:
1e-06
lr
:
1e-3
global_grad_clip
:
5.0
lr_decay
:
0.83
log_interval
:
100
weight_decay
:
1e-06
checkpoint
:
global_grad_clip
:
5.0
kbest_n
:
50
log_interval
:
100
latest_n
:
5
checkpoint
:
kbest_n
:
50
decoding
:
latest_n
:
5
batch_size
:
128
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
1.9
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
examples/librispeech/asr0/conf/tuning/chunk_decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
128
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
1.9
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
\ No newline at end of file
examples/librispeech/asr0/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
128
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
1.9
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
\ No newline at end of file
examples/librispeech/asr0/local/test.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix model_type"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
model_type
=
$3
ckpt_prefix
=
$3
model_type
=
$4
# download language model
# download language model
bash
local
/download_lm_en.sh
bash
local
/download_lm_en.sh
...
@@ -21,6 +22,7 @@ fi
...
@@ -21,6 +22,7 @@ fi
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.rsl
\
--result_file
${
ckpt_prefix
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--model_type
${
model_type
}
--model_type
${
model_type
}
...
...
examples/librispeech/asr0/local/test_wav.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
4
]
;
then
if
[
$#
!=
5
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type audio_file"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix model_type audio_file"
exit
-1
exit
-1
fi
fi
...
@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
model_type
=
$3
ckpt_prefix
=
$3
audio_file
=
$4
model_type
=
$4
audio_file
=
$5
mkdir
-p
data
mkdir
-p
data
wget
-nc
https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav
-P
data/
wget
-nc
https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav
-P
data/
...
@@ -33,6 +34,7 @@ fi
...
@@ -33,6 +34,7 @@ fi
python3
-u
${
BIN_DIR
}
/test_wav.py
\
python3
-u
${
BIN_DIR
}
/test_wav.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.rsl
\
--result_file
${
ckpt_prefix
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--model_type
${
model_type
}
\
--model_type
${
model_type
}
\
...
...
examples/librispeech/asr0/run.sh
浏览文件 @
c907a8de
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
conf_path
=
conf/deepspeech2.yaml
conf_path
=
conf/deepspeech2.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
30
avg_num
=
30
model_type
=
offline
model_type
=
offline
audio_file
=
data/demo_002_en.wav
audio_file
=
data/demo_002_en.wav
...
@@ -33,7 +34,7 @@ fi
...
@@ -33,7 +34,7 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
...
@@ -43,5 +44,5 @@ fi
...
@@ -43,5 +44,5 @@ fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
# test a single .wav file
# test a single .wav file
CUDA_VISIBLE_DEVICES
=
0 ./local/test_wav.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
${
audio_file
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test_wav.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
${
audio_file
}
||
exit
-1
fi
fi
examples/librispeech/asr1/conf/chunk_conformer.yaml
浏览文件 @
c907a8de
...
@@ -57,7 +57,7 @@ vocab_filepath: data/lang_char/vocab.txt
...
@@ -57,7 +57,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type
:
'
spm'
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
mean_std_filepath
:
"
"
mean_std_filepath
:
"
"
augmentation
_config
:
conf/preprocess.yaml
preprocess
_config
:
conf/preprocess.yaml
feat_dim
:
80
feat_dim
:
80
stride_ms
:
10.0
stride_ms
:
10.0
window_ms
:
25.0
window_ms
:
25.0
...
@@ -70,8 +70,7 @@ batch_count: auto
...
@@ -70,8 +70,7 @@ batch_count: auto
batch_bins
:
0
batch_bins
:
0
batch_frames_in
:
0
batch_frames_in
:
0
batch_frames_out
:
0
batch_frames_out
:
0
batch_frames_inout
:
0
batch_frames_inout
:
0
augmentation_config
:
conf/preprocess.yaml
num_workers
:
0
num_workers
:
0
subsampling_factor
:
1
subsampling_factor
:
1
num_encs
:
1
num_encs
:
1
...
@@ -85,10 +84,11 @@ global_grad_clip: 5.0
...
@@ -85,10 +84,11 @@ global_grad_clip: 5.0
optim
:
adam
optim
:
adam
optim_conf
:
optim_conf
:
lr
:
0.001
lr
:
0.001
weight_decay
:
1e-06
weight_decay
:
1
.0
e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
lr_decay
:
1.0
log_interval
:
100
log_interval
:
100
checkpoint
:
checkpoint
:
kbest_n
:
50
kbest_n
:
50
...
...
examples/librispeech/asr1/conf/chunk_transformer.yaml
浏览文件 @
c907a8de
...
@@ -50,7 +50,7 @@ vocab_filepath: data/lang_char/vocab.txt
...
@@ -50,7 +50,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type
:
'
spm'
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
mean_std_filepath
:
"
"
mean_std_filepath
:
"
"
augmentation
_config
:
conf/preprocess.yaml
preprocess
_config
:
conf/preprocess.yaml
feat_dim
:
80
feat_dim
:
80
stride_ms
:
10.0
stride_ms
:
10.0
window_ms
:
25.0
window_ms
:
25.0
...
@@ -64,7 +64,6 @@ batch_bins: 0
...
@@ -64,7 +64,6 @@ batch_bins: 0
batch_frames_in
:
0
batch_frames_in
:
0
batch_frames_out
:
0
batch_frames_out
:
0
batch_frames_inout
:
0
batch_frames_inout
:
0
augmentation_config
:
conf/preprocess.yaml
num_workers
:
0
num_workers
:
0
subsampling_factor
:
1
subsampling_factor
:
1
num_encs
:
1
num_encs
:
1
...
@@ -79,7 +78,7 @@ global_grad_clip: 5.0
...
@@ -79,7 +78,7 @@ global_grad_clip: 5.0
optim
:
adam
optim
:
adam
optim_conf
:
optim_conf
:
lr
:
0.001
lr
:
0.001
weight_decay
:
1e-06
weight_decay
:
1
.0
e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
...
...
examples/librispeech/asr1/conf/conformer.yaml
浏览文件 @
c907a8de
...
@@ -55,7 +55,7 @@ vocab_filepath: data/lang_char/vocab.txt
...
@@ -55,7 +55,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type
:
'
spm'
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
mean_std_filepath
:
"
"
mean_std_filepath
:
"
"
augmentation
_config
:
conf/preprocess.yaml
preprocess
_config
:
conf/preprocess.yaml
feat_dim
:
80
feat_dim
:
80
stride_ms
:
10.0
stride_ms
:
10.0
window_ms
:
25.0
window_ms
:
25.0
...
@@ -69,7 +69,6 @@ batch_bins: 0
...
@@ -69,7 +69,6 @@ batch_bins: 0
batch_frames_in
:
0
batch_frames_in
:
0
batch_frames_out
:
0
batch_frames_out
:
0
batch_frames_inout
:
0
batch_frames_inout
:
0
augmentation_config
:
conf/preprocess.yaml
num_workers
:
0
num_workers
:
0
subsampling_factor
:
1
subsampling_factor
:
1
num_encs
:
1
num_encs
:
1
...
@@ -84,7 +83,7 @@ global_grad_clip: 3.0
...
@@ -84,7 +83,7 @@ global_grad_clip: 3.0
optim
:
adam
optim
:
adam
optim_conf
:
optim_conf
:
lr
:
0.004
lr
:
0.004
weight_decay
:
1e-06
weight_decay
:
1
.0
e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
...
...
examples/librispeech/asr1/conf/transformer.yaml
浏览文件 @
c907a8de
...
@@ -49,7 +49,7 @@ vocab_filepath: data/lang_char/vocab.txt
...
@@ -49,7 +49,7 @@ vocab_filepath: data/lang_char/vocab.txt
unit_type
:
'
spm'
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
mean_std_filepath
:
"
"
mean_std_filepath
:
"
"
augmentation
_config
:
conf/preprocess.yaml
preprocess
_config
:
conf/preprocess.yaml
feat_dim
:
80
feat_dim
:
80
stride_ms
:
10.0
stride_ms
:
10.0
window_ms
:
25.0
window_ms
:
25.0
...
@@ -63,7 +63,6 @@ batch_bins: 0
...
@@ -63,7 +63,6 @@ batch_bins: 0
batch_frames_in
:
0
batch_frames_in
:
0
batch_frames_out
:
0
batch_frames_out
:
0
batch_frames_inout
:
0
batch_frames_inout
:
0
augmentation_config
:
conf/preprocess.yaml
num_workers
:
0
num_workers
:
0
subsampling_factor
:
1
subsampling_factor
:
1
num_encs
:
1
num_encs
:
1
...
@@ -78,7 +77,7 @@ global_grad_clip: 5.0
...
@@ -78,7 +77,7 @@ global_grad_clip: 5.0
optim
:
adam
optim
:
adam
optim_conf
:
optim_conf
:
lr
:
0.004
lr
:
0.004
weight_decay
:
1e-06
weight_decay
:
1
.0
e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
...
...
examples/librispeech/asr1/local/align.sh
浏览文件 @
c907a8de
...
@@ -21,7 +21,7 @@ mkdir -p ${output_dir}
...
@@ -21,7 +21,7 @@ mkdir -p ${output_dir}
python3
-u
${
BIN_DIR
}
/alignment.py
\
python3
-u
${
BIN_DIR
}
/alignment.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_c
onfi
g
${
decode_config_path
}
\
--decode_c
f
g
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decode.decode_batch_size
${
batch_size
}
--opts
decode.decode_batch_size
${
batch_size
}
...
...
examples/librispeech/asr1/local/test.sh
浏览文件 @
c907a8de
...
@@ -53,7 +53,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
...
@@ -53,7 +53,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_c
onfi
g
${
decode_config_path
}
\
--decode_c
f
g
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decode.decoding_method
${
type
}
\
--opts
decode.decoding_method
${
type
}
\
...
@@ -78,7 +78,7 @@ for type in ctc_greedy_search; do
...
@@ -78,7 +78,7 @@ for type in ctc_greedy_search; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_c
onfi
g
${
decode_config_path
}
\
--decode_c
f
g
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decode.decoding_method
${
type
}
\
--opts
decode.decoding_method
${
type
}
\
...
@@ -99,7 +99,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
...
@@ -99,7 +99,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_c
onfi
g
${
decode_config_path
}
\
--decode_c
f
g
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decode.decoding_method
${
type
}
\
--opts
decode.decoding_method
${
type
}
\
...
...
examples/librispeech/asr1/local/test_wav.sh
浏览文件 @
c907a8de
...
@@ -50,7 +50,7 @@ for type in attention_rescoring; do
...
@@ -50,7 +50,7 @@ for type in attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test_wav.py
\
python3
-u
${
BIN_DIR
}
/test_wav.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_c
onfi
g
${
decode_config_path
}
\
--decode_c
f
g
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decode.decoding_method
${
type
}
\
--opts
decode.decoding_method
${
type
}
\
...
...
examples/librispeech/asr2/conf/decode/decode_base.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
1
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/librispeech/asr2/conf/transformer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
# encoder related
cmvn_file_type
:
"
json"
encoder
:
transformer
# encoder related
encoder_conf
:
encoder
:
transformer
output_size
:
256
# dimension of attention
encoder_conf
:
attention_heads
:
4
output_size
:
256
# dimension of attention
linear_units
:
2048
# the number of units of position-wise feed forward
attention_heads
:
4
num_blocks
:
12
# the number of encoder blocks
linear_units
:
2048
# the number of units of position-wise feed forward
dropout_rate
:
0.1
num_blocks
:
12
# the number of encoder blocks
positional_dropout_rate
:
0.1
dropout_rate
:
0.1
attention_dropout_rate
:
0.0
positional_dropout_rate
:
0.1
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
attention_dropout_rate
:
0.0
normalize_before
:
true
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before
:
true
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
dropout_rate
:
0.1
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test-clean
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev
test_manifest
:
data/manifest.test-clean
collator
:
###########################################
vocab_filepath
:
data/lang_char/train_960_unigram5000_units.txt
# Dataloader #
unit_type
:
spm
###########################################
spm_model_prefix
:
data/lang_char/train_960_unigram5000
vocab_filepath
:
data/lang_char/train_960_unigram5000_units.txt
feat_dim
:
83
unit_type
:
spm
stride_ms
:
10.0
spm_model_prefix
:
data/lang_char/train_960_unigram5000
window_ms
:
25.0
feat_dim
:
83
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
stride_ms
:
10.0
batch_size
:
30
window_ms
:
25.0
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_size
:
30
minibatches
:
0
# for debug
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
batch_count
:
auto
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_bins
:
0
minibatches
:
0
# for debug
batch_frames_in
:
0
batch_count
:
auto
batch_frames_out
:
0
batch_bins
:
0
batch_frames_inout
:
0
batch_frames_in
:
0
augmentation_config
:
conf/preprocess.yaml
batch_frames_out
:
0
num_workers
:
0
batch_frames_inout
:
0
subsampling_factor
:
1
preprocess_config
:
conf/preprocess.yaml
num_encs
:
1
num_workers
:
0
subsampling_factor
:
1
num_encs
:
1
training
:
###########################################
n_epoch
:
120
# Training #
accum_grad
:
2
###########################################
log_interval
:
100
n_epoch
:
120
checkpoint
:
accum_grad
:
2
kbest_n
:
50
log_interval
:
1
latest_n
:
5
checkpoint
:
kbest_n
:
50
latest_n
:
5
optim
:
adam
optim
:
adam
optim_conf
:
optim_conf
:
...
@@ -79,23 +86,5 @@ scheduler_conf:
...
@@ -79,23 +86,5 @@ scheduler_conf:
warmup_steps
:
25000
warmup_steps
:
25000
lr_decay
:
1.0
lr_decay
:
1.0
decoding
:
batch_size
:
1
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/librispeech/asr2/local/align.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path dict_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path d
ecode_config_path d
ict_path ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
dict_path
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
dict_path
=
$3
ckpt_prefix
=
$4
batch_size
=
1
batch_size
=
1
output_dir
=
${
ckpt_prefix
}
output_dir
=
${
ckpt_prefix
}
...
@@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \
...
@@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \
--dict-path
${
dict_path
}
\
--dict-path
${
dict_path
}
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result-file
${
output_dir
}
/
${
type
}
.align
\
--result-file
${
output_dir
}
/
${
type
}
.align
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in ctc alignment!"
echo
"Failed in ctc alignment!"
...
...
examples/librispeech/asr2/local/test.sh
浏览文件 @
c907a8de
...
@@ -19,8 +19,9 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe}
...
@@ -19,8 +19,9 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe}
bpemodel
=
${
bpeprefix
}
.model
bpemodel
=
${
bpeprefix
}
.model
config_path
=
conf/transformer.yaml
config_path
=
conf/transformer.yaml
decode_config_path
=
conf/decode/decode_base.yaml
dict
=
data/lang_char/
${
train_set
}
_
${
bpemode
}${
nbpe
}
_units.txt
dict
=
data/lang_char/
${
train_set
}
_
${
bpemode
}${
nbpe
}
_units.txt
ckpt_prefix
=
ckpt_prefix
=
exp/transformer/checkpoints/init
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
...
@@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco
...
@@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--dict-path
${
dict
}
\
--dict-path
${
dict
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--result-file
${
decode_dir
}
/data.JOB.json
\
--result-file
${
decode_dir
}
/data.JOB.json
\
--opts
decod
ing
.decoding_method
${
dmethd
}
\
--opts
decod
e
.decoding_method
${
dmethd
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
\
--opts
decod
e.decode_
batch_size
${
batch_size
}
\
--opts
data.
test_manifest
${
feat_recog_dir
}
/split
${
nj
}
/JOB/manifest.
${
rtask
}
--opts
test_manifest
${
feat_recog_dir
}
/split
${
nj
}
/JOB/manifest.
${
rtask
}
score_sclite.sh
--bpe
${
nbpe
}
--bpemodel
${
bpemodel
}
--wer
false
${
decode_dir
}
${
dict
}
score_sclite.sh
--bpe
${
nbpe
}
--bpemodel
${
bpemodel
}
--wer
false
${
decode_dir
}
${
dict
}
...
...
examples/librispeech/asr2/run.sh
浏览文件 @
c907a8de
...
@@ -9,12 +9,14 @@ gpus=0,1,2,3,4,5,6,7
...
@@ -9,12 +9,14 @@ gpus=0,1,2,3,4,5,6,7
stage
=
0
stage
=
0
stop_stage
=
50
stop_stage
=
50
conf_path
=
conf/transformer.yaml
conf_path
=
conf/transformer.yaml
dict_path
=
lang_char/train_960_unigram5000_units.txt
decode_conf_path
=
conf/decode/decode_base.yaml
dict_path
=
data/lang_char/train_960_unigram5000_units.txt
avg_num
=
10
avg_num
=
10
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
avg_ckpt
=
avg_
${
avg_num
}
avg_ckpt
=
avg_
${
avg_num
}
avg_ckpt
=
init
ckpt
=
$(
basename
${
conf_path
}
|
awk
-F
'.'
'{print $1}'
)
ckpt
=
$(
basename
${
conf_path
}
|
awk
-F
'.'
'{print $1}'
)
echo
"checkpoint name
${
ckpt
}
"
echo
"checkpoint name
${
ckpt
}
"
...
@@ -35,7 +37,7 @@ fi
...
@@ -35,7 +37,7 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# attetion resocre decoder
# attetion resocre decoder
./local/test.sh
${
conf_path
}
${
dict_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
./local/test.sh
${
conf_path
}
${
d
ecode_conf_path
}
${
d
ict_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
...
@@ -45,7 +47,7 @@ fi
...
@@ -45,7 +47,7 @@ fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
# ctc alignment of test data
# ctc alignment of test data
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
${
dict_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
${
d
ecode_conf_path
}
${
d
ict_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
6
]
&&
[
${
stop_stage
}
-ge
6
]
;
then
if
[
${
stage
}
-le
6
]
&&
[
${
stop_stage
}
-ge
6
]
;
then
...
...
examples/other/1xt2x/aishell/conf/deepspeech2.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
min_input_len
:
0.0
dev_manifest
:
data/manifest.dev
max_input_len
:
27.0
# second
test_manifest
:
data/manifest.test
min_output_len
:
0.0
min_input_len
:
0.0
max_output_len
:
.inf
max_input_len
:
27.0
# second
min_output_input_ratio
:
0.00
min_output_len
:
0.0
max_output_input_ratio
:
.inf
max_output_len
:
.inf
min_output_input_ratio
:
0.00
max_output_input_ratio
:
.inf
collator
:
###########################################
batch_size
:
64
# one gpu
# Dataloader #
mean_std_filepath
:
data/mean_std.npz
###########################################
unit_type
:
char
batch_size
:
64
# one gpu
vocab_filepath
:
data/vocab.txt
mean_std_filepath
:
data/mean_std.npz
augmentation_config
:
conf/augmentation.json
unit_type
:
char
random_seed
:
0
vocab_filepath
:
data/vocab.txt
spm_model_prefix
:
augmentation_config
:
conf/augmentation.json
spectrum_type
:
linear
random_seed
:
0
feat_dim
:
spm_model_prefix
:
delta_delta
:
False
spectrum_type
:
linear
stride_ms
:
10.0
feat_dim
:
window_ms
:
20.0
delta_delta
:
False
n_fft
:
None
stride_ms
:
10.0
max_freq
:
None
window_ms
:
20.0
target_sample_rate
:
16000
n_fft
:
None
use_dB_normalization
:
True
max_freq
:
None
target_dB
:
-20
target_sample_rate
:
16000
dither
:
1.0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
dither
:
1.0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
model
:
############################################
num_conv_layers
:
2
# Network Architecture #
num_rnn_layers
:
3
############################################
rnn_layer_size
:
1024
num_conv_layers
:
2
use_gru
:
True
num_rnn_layers
:
3
share_rnn_weights
:
False
rnn_layer_size
:
1024
blank_id
:
4333
use_gru
:
True
share_rnn_weights
:
False
blank_id
:
4333
training
:
###########################################
n_epoch
:
80
# Training #
accum_grad
:
1
###########################################
lr
:
2e-3
n_epoch
:
80
lr_decay
:
0.83
accum_grad
:
1
weight_decay
:
1e-06
lr
:
2e-3
global_grad_clip
:
3.0
lr_decay
:
0.83
log_interval
:
100
weight_decay
:
1e-06
checkpoint
:
global_grad_clip
:
3.0
kbest_n
:
50
log_interval
:
100
latest_n
:
5
checkpoint
:
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
32
error_rate_type
:
cer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha
:
2.6
beta
:
5.0
beam_size
:
300
cutoff_prob
:
0.99
cutoff_top_n
:
40
num_proc_bsearch
:
8
examples/other/1xt2x/aishell/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
32
error_rate_type
:
cer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha
:
2.6
beta
:
5.0
beam_size
:
300
cutoff_prob
:
0.99
cutoff_top_n
:
40
num_proc_bsearch
:
8
\ No newline at end of file
examples/other/1xt2x/aishell/local/test.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix model_type"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
model_type
=
$3
ckpt_prefix
=
$3
model_type
=
$4
# download language model
# download language model
bash
local
/download_lm_ch.sh
bash
local
/download_lm_ch.sh
...
@@ -21,6 +22,7 @@ fi
...
@@ -21,6 +22,7 @@ fi
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.rsl
\
--result_file
${
ckpt_prefix
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--model_type
${
model_type
}
--model_type
${
model_type
}
...
...
examples/other/1xt2x/aishell/run.sh
浏览文件 @
c907a8de
...
@@ -5,6 +5,7 @@ source path.sh
...
@@ -5,6 +5,7 @@ source path.sh
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
conf_path
=
conf/deepspeech2.yaml
conf_path
=
conf/deepspeech2.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
1
avg_num
=
1
model_type
=
offline
model_type
=
offline
gpus
=
2
gpus
=
2
...
@@ -23,6 +24,6 @@ fi
...
@@ -23,6 +24,6 @@ fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
v18_ckpt
}
${
model_type
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
v18_ckpt
}
${
model_type
}
||
exit
-1
fi
fi
examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test-clean
train_manifest
:
data/manifest.train
min_input_len
:
0.0
dev_manifest
:
data/manifest.dev
max_input_len
:
.inf
# second
test_manifest
:
data/manifest.test-clean
min_output_len
:
0.0
min_input_len
:
0.0
max_output_len
:
.inf
max_input_len
:
.inf
# second
min_output_input_ratio
:
0.00
min_output_len
:
0.0
max_output_input_ratio
:
.inf
max_output_len
:
.inf
min_output_input_ratio
:
0.00
max_output_input_ratio
:
.inf
collator
:
###########################################
batch_size
:
64
# one gpu
# Dataloader #
mean_std_filepath
:
data/mean_std.npz
###########################################
unit_type
:
char
batch_size
:
64
# one gpu
vocab_filepath
:
data/vocab.txt
mean_std_filepath
:
data/mean_std.npz
augmentation_config
:
conf/augmentation.json
unit_type
:
char
random_seed
:
0
vocab_filepath
:
data/vocab.txt
spm_model_prefix
:
augmentation_config
:
conf/augmentation.json
spectrum_type
:
linear
random_seed
:
0
feat_dim
:
spm_model_prefix
:
delta_delta
:
False
spectrum_type
:
linear
stride_ms
:
10.0
feat_dim
:
window_ms
:
20.0
delta_delta
:
False
n_fft
:
None
stride_ms
:
10.0
max_freq
:
None
window_ms
:
20.0
target_sample_rate
:
16000
n_fft
:
None
use_dB_normalization
:
True
max_freq
:
None
target_dB
:
-20
target_sample_rate
:
16000
dither
:
1.0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
dither
:
1.0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
model
:
############################################
num_conv_layers
:
2
# Network Architecture #
num_rnn_layers
:
3
############################################
rnn_layer_size
:
1024
num_conv_layers
:
2
use_gru
:
True
num_rnn_layers
:
3
share_rnn_weights
:
False
rnn_layer_size
:
1024
blank_id
:
28
use_gru
:
True
share_rnn_weights
:
False
blank_id
:
28
###########################################
# Training #
###########################################
n_epoch
:
80
accum_grad
:
1
lr
:
2e-3
lr_decay
:
0.83
weight_decay
:
1e-06
global_grad_clip
:
3.0
log_interval
:
100
checkpoint
:
kbest_n
:
50
latest_n
:
5
training
:
n_epoch
:
80
accum_grad
:
1
lr
:
2e-3
lr_decay
:
0.83
weight_decay
:
1e-06
global_grad_clip
:
3.0
log_interval
:
100
checkpoint
:
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
32
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
1.4
beta
:
0.35
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
32
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
1.4
beta
:
0.35
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
\ No newline at end of file
examples/other/1xt2x/baidu_en8k/local/test.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix model_type"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
model_type
=
$3
ckpt_prefix
=
$3
model_type
=
$4
# download language model
# download language model
bash
local
/download_lm_en.sh
bash
local
/download_lm_en.sh
...
@@ -21,6 +22,7 @@ fi
...
@@ -21,6 +22,7 @@ fi
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.rsl
\
--result_file
${
ckpt_prefix
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--model_type
${
model_type
}
--model_type
${
model_type
}
...
...
examples/other/1xt2x/baidu_en8k/run.sh
浏览文件 @
c907a8de
...
@@ -5,6 +5,7 @@ source path.sh
...
@@ -5,6 +5,7 @@ source path.sh
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
conf_path
=
conf/deepspeech2.yaml
conf_path
=
conf/deepspeech2.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
1
avg_num
=
1
model_type
=
offline
model_type
=
offline
gpus
=
0
gpus
=
0
...
@@ -23,6 +24,6 @@ fi
...
@@ -23,6 +24,6 @@ fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
v18_ckpt
}
${
model_type
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
v18_ckpt
}
${
model_type
}
||
exit
-1
fi
fi
examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test-clean
train_manifest
:
data/manifest.train
min_input_len
:
0.0
dev_manifest
:
data/manifest.dev
max_input_len
:
1000.0
# second
test_manifest
:
data/manifest.test-clean
min_output_len
:
0.0
min_input_len
:
0.0
max_output_len
:
.inf
max_input_len
:
1000.0
# second
min_output_input_ratio
:
0.00
min_output_len
:
0.0
max_output_input_ratio
:
.inf
max_output_len
:
.inf
min_output_input_ratio
:
0.00
max_output_input_ratio
:
.inf
collator
:
###########################################
batch_size
:
64
# one gpu
# Dataloader #
mean_std_filepath
:
data/mean_std.npz
###########################################
unit_type
:
char
batch_size
:
64
# one gpu
vocab_filepath
:
data/vocab.txt
mean_std_filepath
:
data/mean_std.npz
augmentation_config
:
conf/augmentation.json
unit_type
:
char
random_seed
:
0
vocab_filepath
:
data/vocab.txt
spm_model_prefix
:
augmentation_config
:
conf/augmentation.json
spectrum_type
:
linear
random_seed
:
0
feat_dim
:
spm_model_prefix
:
delta_delta
:
False
spectrum_type
:
linear
stride_ms
:
10.0
feat_dim
:
window_ms
:
20.0
delta_delta
:
False
n_fft
:
None
stride_ms
:
10.0
max_freq
:
None
window_ms
:
20.0
target_sample_rate
:
16000
n_fft
:
None
use_dB_normalization
:
True
max_freq
:
None
target_dB
:
-20
target_sample_rate
:
16000
dither
:
1.0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
dither
:
1.0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
model
:
############################################
num_conv_layers
:
2
# Network Architecture #
num_rnn_layers
:
3
############################################
rnn_layer_size
:
2048
num_conv_layers
:
2
use_gru
:
False
num_rnn_layers
:
3
share_rnn_weights
:
True
rnn_layer_size
:
2048
blank_id
:
28
use_gru
:
False
share_rnn_weights
:
True
blank_id
:
28
###########################################
# Training #
###########################################
n_epoch
:
80
accum_grad
:
1
lr
:
2e-3
lr_decay
:
0.83
weight_decay
:
1e-06
global_grad_clip
:
3.0
log_interval
:
100
checkpoint
:
kbest_n
:
50
latest_n
:
5
training
:
n_epoch
:
80
accum_grad
:
1
lr
:
2e-3
lr_decay
:
0.83
weight_decay
:
1e-06
global_grad_clip
:
3.0
log_interval
:
100
checkpoint
:
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
32
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
32
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
\ No newline at end of file
examples/other/1xt2x/librispeech/local/test.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix model_type"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
model_type
=
$3
ckpt_prefix
=
$3
model_type
=
$4
# download language model
# download language model
bash
local
/download_lm_en.sh
bash
local
/download_lm_en.sh
...
@@ -21,6 +22,7 @@ fi
...
@@ -21,6 +22,7 @@ fi
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.rsl
\
--result_file
${
ckpt_prefix
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--model_type
${
model_type
}
--model_type
${
model_type
}
...
...
examples/other/1xt2x/librispeech/run.sh
浏览文件 @
c907a8de
...
@@ -5,6 +5,7 @@ source path.sh
...
@@ -5,6 +5,7 @@ source path.sh
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
conf_path
=
conf/deepspeech2.yaml
conf_path
=
conf/deepspeech2.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
1
avg_num
=
1
model_type
=
offline
model_type
=
offline
gpus
=
1
gpus
=
1
...
@@ -23,5 +24,5 @@ fi
...
@@ -23,5 +24,5 @@ fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
v18_ckpt
}
${
model_type
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
v18_ckpt
}
${
model_type
}
||
exit
-1
fi
fi
examples/other/1xt2x/src_deepspeech2x/bin/test.py
浏览文件 @
c907a8de
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
# limitations under the License.
# limitations under the License.
"""Evaluation for DeepSpeech2 model."""
"""Evaluation for DeepSpeech2 model."""
from
src_deepspeech2x.test_model
import
DeepSpeech2Tester
as
Tester
from
src_deepspeech2x.test_model
import
DeepSpeech2Tester
as
Tester
from
yacs.config
import
CfgNode
from
paddlespeech.s2t.exps.deepspeech2.config
import
get_cfg_defaults
from
paddlespeech.s2t.exps.deepspeech2.config
import
get_cfg_defaults
from
paddlespeech.s2t.training.cli
import
default_argument_parser
from
paddlespeech.s2t.training.cli
import
default_argument_parser
...
@@ -44,6 +45,10 @@ if __name__ == "__main__":
...
@@ -44,6 +45,10 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
(
args
.
model_type
)
config
=
get_cfg_defaults
(
args
.
model_type
)
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_cfg
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_cfg
)
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
...
...
examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
浏览文件 @
c907a8de
...
@@ -233,11 +233,11 @@ class DeepSpeech2Model(nn.Layer):
...
@@ -233,11 +233,11 @@ class DeepSpeech2Model(nn.Layer):
"""
"""
model
=
cls
(
feat_size
=
dataloader
.
collate_fn
.
feature_size
,
model
=
cls
(
feat_size
=
dataloader
.
collate_fn
.
feature_size
,
dict_size
=
len
(
dataloader
.
collate_fn
.
vocab_list
),
dict_size
=
len
(
dataloader
.
collate_fn
.
vocab_list
),
num_conv_layers
=
config
.
model
.
num_conv_layers
,
num_conv_layers
=
config
.
num_conv_layers
,
num_rnn_layers
=
config
.
model
.
num_rnn_layers
,
num_rnn_layers
=
config
.
num_rnn_layers
,
rnn_size
=
config
.
model
.
rnn_layer_size
,
rnn_size
=
config
.
rnn_layer_size
,
use_gru
=
config
.
model
.
use_gru
,
use_gru
=
config
.
use_gru
,
share_rnn_weights
=
config
.
model
.
share_rnn_weights
)
share_rnn_weights
=
config
.
share_rnn_weights
)
infos
=
Checkpoint
().
load_parameters
(
infos
=
Checkpoint
().
load_parameters
(
model
,
checkpoint_path
=
checkpoint_path
)
model
,
checkpoint_path
=
checkpoint_path
)
logger
.
info
(
f
"checkpoint info:
{
infos
}
"
)
logger
.
info
(
f
"checkpoint info:
{
infos
}
"
)
...
@@ -250,7 +250,7 @@ class DeepSpeech2Model(nn.Layer):
...
@@ -250,7 +250,7 @@ class DeepSpeech2Model(nn.Layer):
Parameters
Parameters
config: yacs.config.CfgNode
config: yacs.config.CfgNode
config
.model
config
Returns
Returns
-------
-------
DeepSpeech2Model
DeepSpeech2Model
...
...
examples/other/1xt2x/src_deepspeech2x/test_model.py
浏览文件 @
c907a8de
...
@@ -64,7 +64,7 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -64,7 +64,7 @@ class DeepSpeech2Trainer(Trainer):
super
().
__init__
(
config
,
args
)
super
().
__init__
(
config
,
args
)
def
train_batch
(
self
,
batch_index
,
batch_data
,
msg
):
def
train_batch
(
self
,
batch_index
,
batch_data
,
msg
):
train_conf
=
self
.
config
.
training
train_conf
=
self
.
config
start
=
time
.
time
()
start
=
time
.
time
()
# forward
# forward
...
@@ -98,7 +98,7 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -98,7 +98,7 @@ class DeepSpeech2Trainer(Trainer):
iteration_time
=
time
.
time
()
-
start
iteration_time
=
time
.
time
()
-
start
msg
+=
"train time: {:>.3f}s, "
.
format
(
iteration_time
)
msg
+=
"train time: {:>.3f}s, "
.
format
(
iteration_time
)
msg
+=
"batch size: {}, "
.
format
(
self
.
config
.
collator
.
batch_size
)
msg
+=
"batch size: {}, "
.
format
(
self
.
config
.
batch_size
)
msg
+=
"accum: {}, "
.
format
(
train_conf
.
accum_grad
)
msg
+=
"accum: {}, "
.
format
(
train_conf
.
accum_grad
)
msg
+=
', '
.
join
(
'{}: {:>.6f}'
.
format
(
k
,
v
)
msg
+=
', '
.
join
(
'{}: {:>.6f}'
.
format
(
k
,
v
)
for
k
,
v
in
losses_np
.
items
())
for
k
,
v
in
losses_np
.
items
())
...
@@ -126,7 +126,7 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -126,7 +126,7 @@ class DeepSpeech2Trainer(Trainer):
total_loss
+=
float
(
loss
)
*
num_utts
total_loss
+=
float
(
loss
)
*
num_utts
valid_losses
[
'val_loss'
].
append
(
float
(
loss
))
valid_losses
[
'val_loss'
].
append
(
float
(
loss
))
if
(
i
+
1
)
%
self
.
config
.
training
.
log_interval
==
0
:
if
(
i
+
1
)
%
self
.
config
.
log_interval
==
0
:
valid_dump
=
{
k
:
np
.
mean
(
v
)
for
k
,
v
in
valid_losses
.
items
()}
valid_dump
=
{
k
:
np
.
mean
(
v
)
for
k
,
v
in
valid_losses
.
items
()}
valid_dump
[
'val_history_loss'
]
=
total_loss
/
num_seen_utts
valid_dump
[
'val_history_loss'
]
=
total_loss
/
num_seen_utts
...
@@ -146,15 +146,15 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -146,15 +146,15 @@ class DeepSpeech2Trainer(Trainer):
def
setup_model
(
self
):
def
setup_model
(
self
):
config
=
self
.
config
.
clone
()
config
=
self
.
config
.
clone
()
config
.
defrost
()
config
.
defrost
()
config
.
model
.
feat_size
=
self
.
train_loader
.
collate_fn
.
feature_size
config
.
feat_size
=
self
.
train_loader
.
collate_fn
.
feature_size
#config.
model.
dict_size = self.train_loader.collate_fn.vocab_size
#config.dict_size = self.train_loader.collate_fn.vocab_size
config
.
model
.
dict_size
=
len
(
self
.
train_loader
.
collate_fn
.
vocab_list
)
config
.
dict_size
=
len
(
self
.
train_loader
.
collate_fn
.
vocab_list
)
config
.
freeze
()
config
.
freeze
()
if
self
.
args
.
model_type
==
'offline'
:
if
self
.
args
.
model_type
==
'offline'
:
model
=
DeepSpeech2Model
.
from_config
(
config
.
model
)
model
=
DeepSpeech2Model
.
from_config
(
config
)
elif
self
.
args
.
model_type
==
'online'
:
elif
self
.
args
.
model_type
==
'online'
:
model
=
DeepSpeech2ModelOnline
.
from_config
(
config
.
model
)
model
=
DeepSpeech2ModelOnline
.
from_config
(
config
)
else
:
else
:
raise
Exception
(
"wrong model type"
)
raise
Exception
(
"wrong model type"
)
if
self
.
parallel
:
if
self
.
parallel
:
...
@@ -163,17 +163,13 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -163,17 +163,13 @@ class DeepSpeech2Trainer(Trainer):
logger
.
info
(
f
"
{
model
}
"
)
logger
.
info
(
f
"
{
model
}
"
)
layer_tools
.
print_params
(
model
,
logger
.
info
)
layer_tools
.
print_params
(
model
,
logger
.
info
)
grad_clip
=
ClipGradByGlobalNormWithLog
(
grad_clip
=
ClipGradByGlobalNormWithLog
(
config
.
global_grad_clip
)
config
.
training
.
global_grad_clip
)
lr_scheduler
=
paddle
.
optimizer
.
lr
.
ExponentialDecay
(
lr_scheduler
=
paddle
.
optimizer
.
lr
.
ExponentialDecay
(
learning_rate
=
config
.
training
.
lr
,
learning_rate
=
config
.
lr
,
gamma
=
config
.
lr_decay
,
verbose
=
True
)
gamma
=
config
.
training
.
lr_decay
,
verbose
=
True
)
optimizer
=
paddle
.
optimizer
.
Adam
(
optimizer
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
lr_scheduler
,
learning_rate
=
lr_scheduler
,
parameters
=
model
.
parameters
(),
parameters
=
model
.
parameters
(),
weight_decay
=
paddle
.
regularizer
.
L2Decay
(
weight_decay
=
paddle
.
regularizer
.
L2Decay
(
config
.
weight_decay
),
config
.
training
.
weight_decay
),
grad_clip
=
grad_clip
)
grad_clip
=
grad_clip
)
self
.
model
=
model
self
.
model
=
model
...
@@ -184,59 +180,59 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -184,59 +180,59 @@ class DeepSpeech2Trainer(Trainer):
def
setup_dataloader
(
self
):
def
setup_dataloader
(
self
):
config
=
self
.
config
.
clone
()
config
=
self
.
config
.
clone
()
config
.
defrost
()
config
.
defrost
()
config
.
collator
.
keep_transcription_text
=
False
config
.
keep_transcription_text
=
False
config
.
data
.
manifest
=
config
.
data
.
train_manifest
config
.
manifest
=
config
.
train_manifest
train_dataset
=
ManifestDataset
.
from_config
(
config
)
train_dataset
=
ManifestDataset
.
from_config
(
config
)
config
.
data
.
manifest
=
config
.
data
.
dev_manifest
config
.
manifest
=
config
.
dev_manifest
dev_dataset
=
ManifestDataset
.
from_config
(
config
)
dev_dataset
=
ManifestDataset
.
from_config
(
config
)
config
.
data
.
manifest
=
config
.
data
.
test_manifest
config
.
manifest
=
config
.
test_manifest
test_dataset
=
ManifestDataset
.
from_config
(
config
)
test_dataset
=
ManifestDataset
.
from_config
(
config
)
if
self
.
parallel
:
if
self
.
parallel
:
batch_sampler
=
SortagradDistributedBatchSampler
(
batch_sampler
=
SortagradDistributedBatchSampler
(
train_dataset
,
train_dataset
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
num_replicas
=
None
,
num_replicas
=
None
,
rank
=
None
,
rank
=
None
,
shuffle
=
True
,
shuffle
=
True
,
drop_last
=
True
,
drop_last
=
True
,
sortagrad
=
config
.
collator
.
sortagrad
,
sortagrad
=
config
.
sortagrad
,
shuffle_method
=
config
.
collator
.
shuffle_method
)
shuffle_method
=
config
.
shuffle_method
)
else
:
else
:
batch_sampler
=
SortagradBatchSampler
(
batch_sampler
=
SortagradBatchSampler
(
train_dataset
,
train_dataset
,
shuffle
=
True
,
shuffle
=
True
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
drop_last
=
True
,
drop_last
=
True
,
sortagrad
=
config
.
collator
.
sortagrad
,
sortagrad
=
config
.
sortagrad
,
shuffle_method
=
config
.
collator
.
shuffle_method
)
shuffle_method
=
config
.
shuffle_method
)
collate_fn_train
=
SpeechCollator
.
from_config
(
config
)
collate_fn_train
=
SpeechCollator
.
from_config
(
config
)
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
collate_fn_dev
=
SpeechCollator
.
from_config
(
config
)
collate_fn_dev
=
SpeechCollator
.
from_config
(
config
)
config
.
collator
.
keep_transcription_text
=
True
config
.
keep_transcription_text
=
True
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
collate_fn_test
=
SpeechCollator
.
from_config
(
config
)
collate_fn_test
=
SpeechCollator
.
from_config
(
config
)
self
.
train_loader
=
DataLoader
(
self
.
train_loader
=
DataLoader
(
train_dataset
,
train_dataset
,
batch_sampler
=
batch_sampler
,
batch_sampler
=
batch_sampler
,
collate_fn
=
collate_fn_train
,
collate_fn
=
collate_fn_train
,
num_workers
=
config
.
collator
.
num_workers
)
num_workers
=
config
.
num_workers
)
self
.
valid_loader
=
DataLoader
(
self
.
valid_loader
=
DataLoader
(
dev_dataset
,
dev_dataset
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
collate_fn_dev
)
collate_fn
=
collate_fn_dev
)
self
.
test_loader
=
DataLoader
(
self
.
test_loader
=
DataLoader
(
test_dataset
,
test_dataset
,
batch_size
=
config
.
decod
ing
.
batch_size
,
batch_size
=
config
.
decod
e
.
decode_
batch_size
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
collate_fn_test
)
collate_fn
=
collate_fn_test
)
...
@@ -274,7 +270,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -274,7 +270,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def
__init__
(
self
,
config
,
args
):
def
__init__
(
self
,
config
,
args
):
self
.
_text_featurizer
=
TextFeaturizer
(
self
.
_text_featurizer
=
TextFeaturizer
(
unit_type
=
config
.
collator
.
unit_type
,
vocab_filepath
=
None
)
unit_type
=
config
.
unit_type
,
vocab
=
None
)
super
().
__init__
(
config
,
args
)
super
().
__init__
(
config
,
args
)
def
ordid2token
(
self
,
texts
,
texts_len
):
def
ordid2token
(
self
,
texts
,
texts_len
):
...
@@ -293,7 +289,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -293,7 +289,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
texts
,
texts
,
texts_len
,
texts_len
,
fout
=
None
):
fout
=
None
):
cfg
=
self
.
config
.
decod
ing
cfg
=
self
.
config
.
decod
e
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_func
=
error_rate
.
char_errors
if
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
word_errors
errors_func
=
error_rate
.
char_errors
if
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
word_errors
error_rate_func
=
error_rate
.
cer
if
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
wer
error_rate_func
=
error_rate
.
cer
if
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
wer
...
@@ -399,31 +395,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -399,31 +395,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
self
.
export
()
self
.
export
()
except
KeyboardInterrupt
:
except
KeyboardInterrupt
:
exit
(
-
1
)
exit
(
-
1
)
def
setup
(
self
):
"""Setup the experiment.
"""
paddle
.
set_device
(
'gpu'
if
self
.
args
.
ngpu
>
0
else
'cpu'
)
self
.
setup_output_dir
()
self
.
setup_checkpointer
()
self
.
setup_dataloader
()
self
.
setup_model
()
self
.
iteration
=
0
self
.
epoch
=
0
def
setup_output_dir
(
self
):
"""Create a directory used for output.
"""
# output dir
if
self
.
args
.
output
:
output_dir
=
Path
(
self
.
args
.
output
).
expanduser
()
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
else
:
output_dir
=
Path
(
self
.
args
.
checkpoint_path
).
expanduser
().
parent
.
parent
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
self
.
output_dir
=
output_dir
examples/ted_en_zh/st0/conf/transformer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train.tiny
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train.tiny
min_input_len
:
0.05
# second
dev_manifest
:
data/manifest.dev
max_input_len
:
30.0
# second
test_manifest
:
data/manifest.test
min_output_len
:
0.0
# tokens
min_input_len
:
0.05
# second
max_output_len
:
400.0
# tokens
max_input_len
:
30.0
# second
min_output_input_ratio
:
0.01
min_output_len
:
0.0
# tokens
max_output_input_ratio
:
20.0
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.01
max_output_input_ratio
:
20.0
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
'
spm'
###########################################
spm_model_prefix
:
data/lang_char/bpe_unigram_8000
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
"
"
unit_type
:
'
spm'
# augmentation_config: conf/augmentation.json
spm_model_prefix
:
data/lang_char/bpe_unigram_8000
batch_size
:
10
mean_std_filepath
:
"
"
raw_wav
:
True
# use raw_wav or kaldi feature
# augmentation_config: conf/augmentation.json
spectrum_type
:
fbank
#linear, mfcc, fbank
batch_size
:
10
feat_dim
:
80
raw_wav
:
True
# use raw_wav or kaldi feature
delta_delta
:
False
spectrum_type
:
fbank
#linear, mfcc, fbank
dither
:
1.0
feat_dim
:
80
target_sample_rate
:
16000
delta_delta
:
False
max_freq
:
None
dither
:
1.0
n_fft
:
None
target_sample_rate
:
16000
stride_ms
:
10.0
max_freq
:
None
window_ms
:
25.0
n_fft
:
None
use_dB_normalization
:
True
stride_ms
:
10.0
target_dB
:
-20
window_ms
:
25.0
random_seed
:
0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
random_seed
:
0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
"
data/mean_std.json"
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
"
data/mean_std.json"
# encoder related
cmvn_file_type
:
"
json"
encoder
:
transformer
# encoder related
encoder_conf
:
encoder
:
transformer
output_size
:
256
# dimension of attention
encoder_conf
:
attention_heads
:
4
output_size
:
256
# dimension of attention
linear_units
:
2048
# the number of units of position-wise feed forward
attention_heads
:
4
num_blocks
:
12
# the number of encoder blocks
linear_units
:
2048
# the number of units of position-wise feed forward
dropout_rate
:
0.1
num_blocks
:
12
# the number of encoder blocks
positional_dropout_rate
:
0.1
dropout_rate
:
0.1
attention_dropout_rate
:
0.0
positional_dropout_rate
:
0.1
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
attention_dropout_rate
:
0.0
normalize_before
:
true
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before
:
true
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
dropout_rate
:
0.1
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
asr_weight
:
0.0
asr_weight
:
0.0
ctc_weight
:
0.0
ctc_weight
:
0.0
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
###########################################
training
:
# Training #
n_epoch
:
120
###########################################
accum_grad
:
2
n_epoch
:
120
global_grad_clip
:
5.0
accum_grad
:
2
optim
:
adam
global_grad_clip
:
5.0
optim_conf
:
optim
:
adam
lr
:
0.004
optim_conf
:
weight_decay
:
1e-06
lr
:
0.004
scheduler
:
warmuplr
weight_decay
:
1.0e-06
scheduler_conf
:
scheduler
:
warmuplr
warmup_steps
:
25000
scheduler_conf
:
lr_decay
:
1.0
warmup_steps
:
25000
log_interval
:
5
lr_decay
:
1.0
checkpoint
:
log_interval
:
5
kbest_n
:
50
checkpoint
:
latest_n
:
5
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
5
error_rate_type
:
char-bleu
decoding_method
:
fullsentence
# 'fullsentence', 'simultaneous'
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
min_input_len
:
0.05
# second
dev_manifest
:
data/manifest.dev
max_input_len
:
30.0
# second
test_manifest
:
data/manifest.test
min_output_len
:
0.0
# tokens
min_input_len
:
0.05
# second
max_output_len
:
400.0
# tokens
max_input_len
:
30.0
# second
min_output_input_ratio
:
0.01
min_output_len
:
0.0
# tokens
max_output_input_ratio
:
20.0
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.01
max_output_input_ratio
:
20.0
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
'
spm'
###########################################
spm_model_prefix
:
data/lang_char/bpe_unigram_8000
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
"
"
unit_type
:
'
spm'
# augmentation_config: conf/augmentation.json
spm_model_prefix
:
data/lang_char/bpe_unigram_8000
batch_size
:
10
mean_std_filepath
:
"
"
raw_wav
:
True
# use raw_wav or kaldi feature
# augmentation_config: conf/augmentation.json
spectrum_type
:
fbank
#linear, mfcc, fbank
batch_size
:
10
feat_dim
:
80
raw_wav
:
True
# use raw_wav or kaldi feature
delta_delta
:
False
spectrum_type
:
fbank
#linear, mfcc, fbank
dither
:
1.0
feat_dim
:
80
target_sample_rate
:
16000
delta_delta
:
False
max_freq
:
None
dither
:
1.0
n_fft
:
None
target_sample_rate
:
16000
stride_ms
:
10.0
max_freq
:
None
window_ms
:
25.0
n_fft
:
None
use_dB_normalization
:
True
stride_ms
:
10.0
target_dB
:
-20
window_ms
:
25.0
random_seed
:
0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
random_seed
:
0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
"
data/mean_std.json"
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
"
data/mean_std.json"
# encoder related
cmvn_file_type
:
"
json"
encoder
:
transformer
# encoder related
encoder_conf
:
encoder
:
transformer
output_size
:
256
# dimension of attention
encoder_conf
:
attention_heads
:
4
output_size
:
256
# dimension of attention
linear_units
:
2048
# the number of units of position-wise feed forward
attention_heads
:
4
num_blocks
:
12
# the number of encoder blocks
linear_units
:
2048
# the number of units of position-wise feed forward
dropout_rate
:
0.1
num_blocks
:
12
# the number of encoder blocks
positional_dropout_rate
:
0.1
dropout_rate
:
0.1
attention_dropout_rate
:
0.0
positional_dropout_rate
:
0.1
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
attention_dropout_rate
:
0.0
normalize_before
:
true
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before
:
true
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
dropout_rate
:
0.1
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
asr_weight
:
0.5
asr_weight
:
0.5
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
training
:
###########################################
n_epoch
:
120
# Training #
accum_grad
:
2
###########################################
global_grad_clip
:
5.0
n_epoch
:
120
optim
:
adam
accum_grad
:
2
optim_conf
:
global_grad_clip
:
5.0
lr
:
2.5
optim
:
adam
weight_decay
:
1e-06
optim_conf
:
scheduler
:
noam
lr
:
2.5
scheduler_conf
:
weight_decay
:
1.0e-06
warmup_steps
:
25000
scheduler
:
noam
lr_decay
:
1.0
scheduler_conf
:
log_interval
:
50
warmup_steps
:
25000
checkpoint
:
lr_decay
:
1.0
kbest_n
:
50
log_interval
:
50
latest_n
:
5
checkpoint
:
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
5
error_rate_type
:
char-bleu
decoding_method
:
fullsentence
# 'fullsentence', 'simultaneous'
alpha
:
2.5
beta
:
0.3
beam_size
:
10
word_reward
:
0.7
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/ted_en_zh/st0/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
batch_size
:
5
error_rate_type
:
char-bleu
decoding_method
:
fullsentence
# 'fullsentence', 'simultaneous'
beam_size
:
10
word_reward
:
0.7
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
\ No newline at end of file
examples/ted_en_zh/st0/local/test.sh
浏览文件 @
c907a8de
#! /usr/bin/env bash
#! /usr/bin/env bash
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
for
type
in
fullsentence
;
do
for
type
in
fullsentence
;
do
echo
"decoding
${
type
}
"
echo
"decoding
${
type
}
"
...
@@ -17,10 +18,11 @@ for type in fullsentence; do
...
@@ -17,10 +18,11 @@ for type in fullsentence; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
...
examples/ted_en_zh/st0/run.sh
浏览文件 @
c907a8de
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage
=
0
stage
=
0
stop_stage
=
50
stop_stage
=
50
conf_path
=
conf/transformer_mtl_noam.yaml
conf_path
=
conf/transformer_mtl_noam.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
5
avg_num
=
5
data_path
=
./TED_EnZh
# path to unzipped data
data_path
=
./TED_EnZh
# path to unzipped data
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
...
@@ -32,7 +33,7 @@ fi
...
@@ -32,7 +33,7 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
51
]
&&
[
${
stop_stage
}
-ge
51
]
;
then
if
[
${
stage
}
-le
51
]
&&
[
${
stop_stage
}
-ge
51
]
;
then
...
...
examples/ted_en_zh/st1/conf/transformer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train.tiny
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train.tiny
min_input_len
:
5.0
# frame
dev_manifest
:
data/manifest.dev
max_input_len
:
3000.0
# frame
test_manifest
:
data/manifest.test
min_output_len
:
0.0
# tokens
min_input_len
:
5.0
# frame
max_output_len
:
400.0
# tokens
max_input_len
:
3000.0
# frame
min_output_input_ratio
:
0.01
min_output_len
:
0.0
# tokens
max_output_input_ratio
:
20.0
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.01
max_output_input_ratio
:
20.0
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
'
spm'
###########################################
spm_model_prefix
:
data/lang_char/bpe_unigram_8000
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
"
"
unit_type
:
'
spm'
# augmentation_config: conf/augmentation.json
spm_model_prefix
:
data/lang_char/bpe_unigram_8000
batch_size
:
10
mean_std_filepath
:
"
"
raw_wav
:
True
# use raw_wav or kaldi feature
# augmentation_config: conf/augmentation.json
spectrum_type
:
fbank
#linear, mfcc, fbank
batch_size
:
10
feat_dim
:
83
raw_wav
:
True
# use raw_wav or kaldi feature
delta_delta
:
False
spectrum_type
:
fbank
#linear, mfcc, fbank
dither
:
1.0
feat_dim
:
83
target_sample_rate
:
16000
delta_delta
:
False
max_freq
:
None
dither
:
1.0
n_fft
:
None
target_sample_rate
:
16000
stride_ms
:
10.0
max_freq
:
None
window_ms
:
25.0
n_fft
:
None
use_dB_normalization
:
True
stride_ms
:
10.0
target_dB
:
-20
window_ms
:
25.0
random_seed
:
0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
random_seed
:
0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
None
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
None
# encoder related
cmvn_file_type
:
"
json"
encoder
:
transformer
# encoder related
encoder_conf
:
encoder
:
transformer
output_size
:
256
# dimension of attention
encoder_conf
:
attention_heads
:
4
output_size
:
256
# dimension of attention
linear_units
:
2048
# the number of units of position-wise feed forward
attention_heads
:
4
num_blocks
:
12
# the number of encoder blocks
linear_units
:
2048
# the number of units of position-wise feed forward
dropout_rate
:
0.1
num_blocks
:
12
# the number of encoder blocks
positional_dropout_rate
:
0.1
dropout_rate
:
0.1
attention_dropout_rate
:
0.0
positional_dropout_rate
:
0.1
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
attention_dropout_rate
:
0.0
normalize_before
:
true
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before
:
true
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
dropout_rate
:
0.1
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
asr_weight
:
0.0
asr_weight
:
0.0
ctc_weight
:
0.0
ctc_weight
:
0.0
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
training
:
###########################################
n_epoch
:
20
# Training #
accum_grad
:
2
###########################################
global_grad_clip
:
5.0
n_epoch
:
20
optim
:
adam
accum_grad
:
2
optim_conf
:
global_grad_clip
:
5.0
lr
:
0.004
optim
:
adam
weight_decay
:
1e-06
optim_conf
:
scheduler
:
warmuplr
lr
:
0.004
scheduler_conf
:
weight_decay
:
1.0e-06
warmup_steps
:
25000
scheduler
:
warmuplr
lr_decay
:
1.0
scheduler_conf
:
log_interval
:
5
warmup_steps
:
25000
checkpoint
:
lr_decay
:
1.0
kbest_n
:
50
log_interval
:
5
latest_n
:
5
checkpoint
:
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
5
error_rate_type
:
char-bleu
decoding_method
:
fullsentence
# 'fullsentence', 'simultaneous'
alpha
:
2.5
beta
:
0.3
beam_size
:
10
word_reward
:
0.7
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
min_input_len
:
5.0
# frame
dev_manifest
:
data/manifest.dev
max_input_len
:
3000.0
# frame
test_manifest
:
data/manifest.test
min_output_len
:
0.0
# tokens
min_input_len
:
5.0
# frame
max_output_len
:
400.0
# tokens
max_input_len
:
3000.0
# frame
min_output_input_ratio
:
0.01
min_output_len
:
0.0
# tokens
max_output_input_ratio
:
20.0
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.01
max_output_input_ratio
:
20.0
collator
:
###########################################
vocab_filepath
:
data/lang_char/ted_en_zh_bpe8000.txt
# Dataloader #
unit_type
:
'
spm'
###########################################
spm_model_prefix
:
data/lang_char/ted_en_zh_bpe8000
vocab_filepath
:
data/lang_char/ted_en_zh_bpe8000.txt
mean_std_filepath
:
"
"
unit_type
:
'
spm'
# augmentation_config: conf/augmentation.json
spm_model_prefix
:
data/lang_char/ted_en_zh_bpe8000
batch_size
:
10
mean_std_filepath
:
"
"
raw_wav
:
True
# use raw_wav or kaldi feature
# augmentation_config: conf/augmentation.json
spectrum_type
:
fbank
#linear, mfcc, fbank
batch_size
:
10
feat_dim
:
83
raw_wav
:
True
# use raw_wav or kaldi feature
delta_delta
:
False
spectrum_type
:
fbank
#linear, mfcc, fbank
dither
:
1.0
feat_dim
:
83
target_sample_rate
:
16000
delta_delta
:
False
max_freq
:
None
dither
:
1.0
n_fft
:
None
target_sample_rate
:
16000
stride_ms
:
10.0
max_freq
:
None
window_ms
:
25.0
n_fft
:
None
use_dB_normalization
:
True
stride_ms
:
10.0
target_dB
:
-20
window_ms
:
25.0
random_seed
:
0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
random_seed
:
0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
None
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
None
# encoder related
cmvn_file_type
:
"
json"
encoder
:
transformer
# encoder related
encoder_conf
:
encoder
:
transformer
output_size
:
256
# dimension of attention
encoder_conf
:
attention_heads
:
4
output_size
:
256
# dimension of attention
linear_units
:
2048
# the number of units of position-wise feed forward
attention_heads
:
4
num_blocks
:
12
# the number of encoder blocks
linear_units
:
2048
# the number of units of position-wise feed forward
dropout_rate
:
0.1
num_blocks
:
12
# the number of encoder blocks
positional_dropout_rate
:
0.1
dropout_rate
:
0.1
attention_dropout_rate
:
0.0
positional_dropout_rate
:
0.1
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
attention_dropout_rate
:
0.0
normalize_before
:
true
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before
:
true
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
dropout_rate
:
0.1
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
asr_weight
:
0.5
asr_weight
:
0.5
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
training
:
###########################################
n_epoch
:
20
# Training #
accum_grad
:
2
###########################################
global_grad_clip
:
5.0
n_epoch
:
20
optim
:
adam
accum_grad
:
2
optim_conf
:
global_grad_clip
:
5.0
lr
:
2.5
optim
:
adam
weight_decay
:
1e-06
optim_conf
:
scheduler
:
noam
lr
:
2.5
scheduler_conf
:
weight_decay
:
1.0e-06
warmup_steps
:
25000
scheduler
:
noam
lr_decay
:
1.0
scheduler_conf
:
log_interval
:
5
warmup_steps
:
25000
checkpoint
:
lr_decay
:
1.0
kbest_n
:
50
log_interval
:
5
latest_n
:
5
checkpoint
:
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
5
error_rate_type
:
char-bleu
decoding_method
:
fullsentence
# 'fullsentence', 'simultaneous'
alpha
:
2.5
beta
:
0.3
beam_size
:
10
word_reward
:
0.7
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
\ No newline at end of file
examples/ted_en_zh/st1/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
batch_size
:
5
error_rate_type
:
char-bleu
decoding_method
:
fullsentence
# 'fullsentence', 'simultaneous'
beam_size
:
10
word_reward
:
0.7
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
\ No newline at end of file
examples/ted_en_zh/st1/local/test.sh
浏览文件 @
c907a8de
#! /usr/bin/env bash
#! /usr/bin/env bash
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
for
type
in
fullsentence
;
do
for
type
in
fullsentence
;
do
echo
"decoding
${
type
}
"
echo
"decoding
${
type
}
"
...
@@ -17,10 +18,11 @@ for type in fullsentence; do
...
@@ -17,10 +18,11 @@ for type in fullsentence; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
...
examples/ted_en_zh/st1/run.sh
浏览文件 @
c907a8de
...
@@ -7,6 +7,7 @@ gpus=0,1,2,3
...
@@ -7,6 +7,7 @@ gpus=0,1,2,3
stage
=
1
stage
=
1
stop_stage
=
4
stop_stage
=
4
conf_path
=
conf/transformer_mtl_noam.yaml
conf_path
=
conf/transformer_mtl_noam.yaml
decode_conf_path
=
conf/tuning/decode.yaml
ckpt_path
=
# paddle.98 # (finetune from FAT-ST pretrained model)
ckpt_path
=
# paddle.98 # (finetune from FAT-ST pretrained model)
avg_num
=
5
avg_num
=
5
data_path
=
./TED_EnZh
# path to unzipped data
data_path
=
./TED_EnZh
# path to unzipped data
...
@@ -38,5 +39,5 @@ fi
...
@@ -38,5 +39,5 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
\ No newline at end of file
examples/timit/asr1/conf/transformer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
min_input_len
:
0.0
# second
dev_manifest
:
data/manifest.dev
max_input_len
:
10.0
# second
test_manifest
:
data/manifest.test
min_output_len
:
0.0
# tokens
max_output_len
:
150.0
# tokens
min_output_input_ratio
:
0.005
max_output_input_ratio
:
1000.0
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
"
word"
###########################################
mean_std_filepath
:
"
"
vocab_filepath
:
data/lang_char/vocab.txt
augmentation_config
:
conf/preprocess.yaml
spm_model_prefix
:
'
'
batch_size
:
64
unit_type
:
"
word"
raw_wav
:
True
# use raw_wav or kaldi feature
mean_std_filepath
:
"
"
spectrum_type
:
fbank
#linear, mfcc, fbank
preprocess_config
:
conf/preprocess.yaml
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
stride_ms
:
10.0
dither
:
1.0
window_ms
:
25.0
target_sample_rate
:
16000
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
max_freq
:
None
batch_size
:
64
n_fft
:
None
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
stride_ms
:
10.0
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
window_ms
:
25.0
minibatches
:
0
# for debug
use_dB_normalization
:
True
batch_count
:
auto
target_dB
:
-20
batch_bins
:
0
random_seed
:
0
batch_frames_in
:
0
keep_transcription_text
:
False
batch_frames_out
:
0
sortagrad
:
True
batch_frames_inout
:
0
shuffle_method
:
batch_shuffle
num_workers
:
0
num_workers
:
2
subsampling_factor
:
1
num_encs
:
1
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
# encoder related
cmvn_file_type
:
"
json"
encoder
:
transformer
# encoder related
encoder_conf
:
encoder
:
transformer
output_size
:
128
# dimension of attention
encoder_conf
:
attention_heads
:
4
output_size
:
128
# dimension of attention
linear_units
:
1024
# the number of units of position-wise feed forward
attention_heads
:
4
num_blocks
:
6
# the number of encoder blocks
linear_units
:
1024
# the number of units of position-wise feed forward
dropout_rate
:
0.1
num_blocks
:
6
# the number of encoder blocks
positional_dropout_rate
:
0.1
dropout_rate
:
0.1
attention_dropout_rate
:
0.0
positional_dropout_rate
:
0.1
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
attention_dropout_rate
:
0.0
normalize_before
:
true
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before
:
true
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
1024
linear_units
:
1024
num_blocks
:
6
num_blocks
:
6
dropout_rate
:
0.1
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.5
ctc_weight
:
0.5
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
training
:
###########################################
n_epoch
:
50
# Training #
accum_grad
:
1
###########################################
global_grad_clip
:
5.0
n_epoch
:
50
optim
:
adam
accum_grad
:
1
optim_conf
:
global_grad_clip
:
5.0
lr
:
0.004
optim
:
adam
weight_decay
:
1e-06
optim_conf
:
scheduler
:
warmuplr
lr
:
0.004
scheduler_conf
:
weight_decay
:
1.0e-6
warmup_steps
:
1200
scheduler
:
warmuplr
lr_decay
:
1.0
scheduler_conf
:
log_interval
:
10
warmup_steps
:
1200
checkpoint
:
lr_decay
:
1.0
kbest_n
:
50
log_interval
:
10
latest_n
:
5
checkpoint
:
kbest_n
:
50
latest_n
:
5
decoding
:
batch_size
:
64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/timit/asr1/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/timit/asr1/local/align.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
batch_size
=
1
batch_size
=
1
output_dir
=
${
ckpt_prefix
}
output_dir
=
${
ckpt_prefix
}
...
@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
...
@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
python3
-u
${
BIN_DIR
}
/alignment.py
\
python3
-u
${
BIN_DIR
}
/alignment.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in ctc alignment!"
echo
"Failed in ctc alignment!"
...
...
examples/timit/asr1/local/test.sh
浏览文件 @
c907a8de
...
@@ -7,8 +7,8 @@ stop_stage=50
...
@@ -7,8 +7,8 @@ stop_stage=50
.
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
.
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
chunk_mode
=
false
chunk_mode
=
false
if
[[
${
config_path
}
=
~ ^.
*
chunk_.
*
yaml
$
]]
;
then
if
[[
${
config_path
}
=
~ ^.
*
chunk_.
*
yaml
$
]]
;
then
...
@@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
...
@@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
@@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
@@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
@@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
...
examples/timit/asr1/run.sh
浏览文件 @
c907a8de
...
@@ -7,6 +7,7 @@ gpus=0,1,2,3
...
@@ -7,6 +7,7 @@ gpus=0,1,2,3
stage
=
0
stage
=
0
stop_stage
=
50
stop_stage
=
50
conf_path
=
conf/transformer.yaml
conf_path
=
conf/transformer.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
10
avg_num
=
10
TIMIT_path
=
/path/to/TIMIT
TIMIT_path
=
/path/to/TIMIT
...
@@ -34,15 +35,15 @@ fi
...
@@ -34,15 +35,15 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
# ctc alignment of test data
# ctc alignment of test data
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5
]; then
if
[
${
stage
}
-le
51
]
&&
[
${
stop_stage
}
-ge
51
]
;
then
#
# export ckpt avg_n
# export ckpt avg_n
#
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
CUDA_VISIBLE_DEVICES
=
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
#
fi
fi
examples/tiny/asr0/conf/deepspeech2.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.tiny
# Data #
dev_manifest
:
data/manifest.tiny
###########################################
test_manifest
:
data/manifest.tiny
train_manifest
:
data/manifest.tiny
min_input_len
:
0.0
dev_manifest
:
data/manifest.tiny
max_input_len
:
30.0
test_manifest
:
data/manifest.tiny
min_output_len
:
0.0
min_input_len
:
0.0
max_output_len
:
400.0
max_input_len
:
30.0
min_output_input_ratio
:
0.05
min_output_len
:
0.0
max_output_input_ratio
:
10.0
max_output_len
:
400.0
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
###########################################
mean_std_filepath
:
data/mean_std.json
# Dataloader #
unit_type
:
char
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
data/mean_std.json
augmentation_config
:
conf/augmentation.json
unit_type
:
char
random_seed
:
0
vocab_filepath
:
data/lang_char/vocab.txt
spm_model_prefix
:
augmentation_config
:
conf/augmentation.json
spectrum_type
:
linear
random_seed
:
0
feat_dim
:
spm_model_prefix
:
delta_delta
:
False
spectrum_type
:
linear
stride_ms
:
10.0
feat_dim
:
window_ms
:
20.0
delta_delta
:
False
n_fft
:
None
stride_ms
:
10.0
max_freq
:
None
window_ms
:
20.0
target_sample_rate
:
16000
n_fft
:
None
use_dB_normalization
:
True
max_freq
:
None
target_dB
:
-20
target_sample_rate
:
16000
dither
:
1.0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
dither
:
1.0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
batch_size
:
4
shuffle_method
:
batch_shuffle
num_workers
:
2
batch_size
:
4
model
:
############################################
num_conv_layers
:
2
# Network Architecture #
num_rnn_layers
:
3
############################################
rnn_layer_size
:
2048
num_conv_layers
:
2
use_gru
:
False
num_rnn_layers
:
3
share_rnn_weights
:
True
rnn_layer_size
:
2048
blank_id
:
0
use_gru
:
False
share_rnn_weights
:
True
blank_id
:
0
training
:
###########################################
n_epoch
:
5
# Training #
accum_grad
:
1
###########################################
lr
:
1e-5
n_epoch
:
5
lr_decay
:
0.8
accum_grad
:
1
weight_decay
:
1e-06
lr
:
1e-5
global_grad_clip
:
5.0
lr_decay
:
0.8
log_interval
:
1
weight_decay
:
1e-06
checkpoint
:
global_grad_clip
:
5.0
kbest_n
:
3
log_interval
:
1
latest_n
:
2
checkpoint
:
kbest_n
:
3
latest_n
:
2
decoding
:
batch_size
:
128
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
examples/tiny/asr0/conf/deepspeech2_online.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.tiny
# Data #
dev_manifest
:
data/manifest.tiny
###########################################
test_manifest
:
data/manifest.tiny
train_manifest
:
data/manifest.tiny
min_input_len
:
0.0
dev_manifest
:
data/manifest.tiny
max_input_len
:
30.0
test_manifest
:
data/manifest.tiny
min_output_len
:
0.0
min_input_len
:
0.0
max_output_len
:
400.0
max_input_len
:
30.0
min_output_input_ratio
:
0.05
min_output_len
:
0.0
max_output_input_ratio
:
10.0
max_output_len
:
400.0
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
###########################################
mean_std_filepath
:
data/mean_std.json
# Dataloader #
unit_type
:
char
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
data/mean_std.json
augmentation_config
:
conf/augmentation.json
unit_type
:
char
random_seed
:
0
vocab_filepath
:
data/lang_char/vocab.txt
spm_model_prefix
:
augmentation_config
:
conf/augmentation.json
spectrum_type
:
linear
random_seed
:
0
feat_dim
:
spm_model_prefix
:
delta_delta
:
False
spectrum_type
:
linear
stride_ms
:
10.0
feat_dim
:
window_ms
:
20.0
delta_delta
:
False
n_fft
:
None
stride_ms
:
10.0
max_freq
:
None
window_ms
:
20.0
target_sample_rate
:
16000
n_fft
:
None
use_dB_normalization
:
True
max_freq
:
None
target_dB
:
-20
target_sample_rate
:
16000
dither
:
1.0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
dither
:
1.0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
0
sortagrad
:
True
batch_size
:
4
shuffle_method
:
batch_shuffle
num_workers
:
0
batch_size
:
4
model
:
############################################
num_conv_layers
:
2
# Network Architecture #
num_rnn_layers
:
4
############################################
rnn_layer_size
:
2048
num_conv_layers
:
2
rnn_direction
:
forward
num_rnn_layers
:
4
num_fc_layers
:
2
rnn_layer_size
:
2048
fc_layers_size_list
:
512,
256
rnn_direction
:
forward
use_gru
:
True
num_fc_layers
:
2
blank_id
:
0
fc_layers_size_list
:
512,
256
use_gru
:
True
blank_id
:
0
training
:
###########################################
n_epoch
:
5
# Training #
accum_grad
:
1
###########################################
lr
:
1e-5
n_epoch
:
5
lr_decay
:
1.0
accum_grad
:
1
weight_decay
:
1e-06
lr
:
1e-5
global_grad_clip
:
5.0
lr_decay
:
1.0
log_interval
:
1
weight_decay
:
1e-06
checkpoint
:
global_grad_clip
:
5.0
kbest_n
:
3
log_interval
:
1
latest_n
:
2
checkpoint
:
kbest_n
:
3
latest_n
:
2
decoding
:
batch_size
:
128
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
examples/tiny/asr0/conf/tuning/chunk_decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
128
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
examples/tiny/asr0/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
128
error_rate_type
:
wer
decoding_method
:
ctc_beam_search
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
500
cutoff_prob
:
1.0
cutoff_top_n
:
40
num_proc_bsearch
:
8
examples/tiny/asr0/local/test.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix model_type"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix model_type"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
model_type
=
$3
ckpt_prefix
=
$3
model_type
=
$4
# download language model
# download language model
bash
local
/download_lm_en.sh
bash
local
/download_lm_en.sh
...
@@ -21,6 +22,7 @@ fi
...
@@ -21,6 +22,7 @@ fi
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.rsl
\
--result_file
${
ckpt_prefix
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--model_type
${
model_type
}
--model_type
${
model_type
}
...
...
examples/tiny/asr0/run.sh
浏览文件 @
c907a8de
...
@@ -6,6 +6,7 @@ gpus=0
...
@@ -6,6 +6,7 @@ gpus=0
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
conf_path
=
conf/deepspeech2.yaml
conf_path
=
conf/deepspeech2.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
1
avg_num
=
1
model_type
=
offline
model_type
=
offline
...
@@ -32,7 +33,7 @@ fi
...
@@ -32,7 +33,7 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
model_type
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
...
...
examples/tiny/asr1/conf/chunk_confermer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
############################################
data
:
# Network Architecture #
train_manifest
:
data/manifest.tiny
############################################
dev_manifest
:
data/manifest.tiny
cmvn_file
:
"
data/mean_std.json"
test_manifest
:
data/manifest.tiny
cmvn_file_type
:
"
json"
min_input_len
:
0.5
# second
# encoder related
max_input_len
:
30.0
# second
encoder
:
conformer
min_output_len
:
0.0
# tokens
encoder_conf
:
max_output_len
:
400.0
# tokens
output_size
:
256
# dimension of attention
min_output_input_ratio
:
0.05
attention_heads
:
4
max_output_input_ratio
:
10.0
linear_units
:
2048
# the number of units of position-wise feed forward
num_blocks
:
12
# the number of encoder blocks
collator
:
dropout_rate
:
0.1
mean_std_filepath
:
"
"
positional_dropout_rate
:
0.1
vocab_filepath
:
data/lang_char/vocab.txt
attention_dropout_rate
:
0.0
unit_type
:
'
spm'
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
normalize_before
:
True
augmentation_config
:
conf/preprocess.yaml
use_cnn_module
:
True
batch_size
:
4
cnn_module_kernel
:
15
raw_wav
:
True
# use raw_wav or kaldi feature
activation_type
:
'
swish'
spectrum_type
:
fbank
#linear, mfcc, fbank
pos_enc_layer_type
:
'
rel_pos'
feat_dim
:
80
selfattention_layer_type
:
'
rel_selfattn'
delta_delta
:
False
causal
:
True
dither
:
1.0
use_dynamic_chunk
:
True
target_sample_rate
:
16000
cnn_module_norm
:
'
layer_norm'
# using nn.LayerNorm makes model converge faster
max_freq
:
None
use_dynamic_left_chunk
:
false
n_fft
:
None
stride_ms
:
10.0
window_ms
:
25.0
use_dB_normalization
:
True
target_dB
:
-20
random_seed
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
# network architecture
model
:
cmvn_file
:
"
data/mean_std.json"
cmvn_file_type
:
"
json"
# encoder related
encoder
:
conformer
encoder_conf
:
output_size
:
256
# dimension of attention
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
num_blocks
:
12
# the number of encoder blocks
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
attention_dropout_rate
:
0.0
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before
:
True
use_cnn_module
:
True
cnn_module_kernel
:
15
activation_type
:
'
swish'
pos_enc_layer_type
:
'
rel_pos'
selfattention_layer_type
:
'
rel_selfattn'
causal
:
True
use_dynamic_chunk
:
True
cnn_module_norm
:
'
layer_norm'
# using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk
:
false
# decoder related
decoder
:
transformer
decoder_conf
:
attention_heads
:
4
linear_units
:
2048
num_blocks
:
6
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# decoder related
model_conf
:
decoder
:
transformer
ctc_weight
:
0.3
decoder_conf
:
lsm_weight
:
0.1
# label smoothing option
attention_heads
:
4
length_normalized_loss
:
false
linear_units
:
2048
num_blocks
:
6
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
model_conf
:
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
training
:
n_epoch
:
5
accum_grad
:
1
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.001
weight_decay
:
1e-06
scheduler
:
warmuplr
scheduler_conf
:
warmup_steps
:
25000
lr_decay
:
1.0
log_interval
:
1
checkpoint
:
kbest_n
:
10
latest_n
:
1
###########################################
# Data #
###########################################
train_manifest
:
data/manifest.tiny
dev_manifest
:
data/manifest.tiny
test_manifest
:
data/manifest.tiny
decoding
:
batch_size
:
64
###########################################
error_rate_type
:
wer
# Dataloader #
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
###########################################
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
mean_std_filepath
:
"
"
alpha
:
2.5
vocab_filepath
:
data/lang_char/vocab.txt
beta
:
0.3
unit_type
:
'
spm'
beam_size
:
10
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
cutoff_prob
:
1.0
preprocess_config
:
conf/preprocess.yaml
cutoff_top_n
:
0
feat_dim
:
80
num_proc_bsearch
:
8
stride_ms
:
10.0
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
window_ms
:
25.0
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
# <0: for decoding, use full chunk.
batch_size
:
4
# >0: for decoding, use fixed chunk size as set.
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
# 0: used for training, it's prohibited here.
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
minibatches
:
0
# for debug
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
batch_count
:
auto
batch_bins
:
0
batch_frames_in
:
0
batch_frames_out
:
0
batch_frames_inout
:
0
augmentation_config
:
conf/preprocess.yaml
num_workers
:
0
subsampling_factor
:
1
num_encs
:
1
###########################################
# Training #
###########################################
n_epoch
:
5
accum_grad
:
1
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.001
weight_decay
:
1.0e-06
scheduler
:
warmuplr
scheduler_conf
:
warmup_steps
:
25000
lr_decay
:
1.0
log_interval
:
1
checkpoint
:
kbest_n
:
10
latest_n
:
1
examples/tiny/asr1/conf/chunk_transformer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
############################################
data
:
# Network Architecture #
train_manifest
:
data/manifest.tiny
############################################
dev_manifest
:
data/manifest.tiny
cmvn_file
:
"
data/mean_std.json"
test_manifest
:
data/manifest.tiny
cmvn_file_type
:
"
json"
min_input_len
:
0.5
# second
# encoder related
max_input_len
:
20.0
# second
encoder
:
transformer
min_output_len
:
0.0
# tokens
encoder_conf
:
max_output_len
:
400.0
# tokens
output_size
:
256
# dimension of attention
min_output_input_ratio
:
0.05
attention_heads
:
4
max_output_input_ratio
:
10.0
linear_units
:
2048
# the number of units of position-wise feed forward
num_blocks
:
12
# the number of encoder blocks
collator
:
dropout_rate
:
0.1
mean_std_filepath
:
"
"
positional_dropout_rate
:
0.1
vocab_filepath
:
data/lang_char/vocab.txt
attention_dropout_rate
:
0.0
unit_type
:
'
spm'
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
normalize_before
:
true
augmentation_config
:
conf/preprocess.yaml
use_dynamic_chunk
:
true
batch_size
:
4
use_dynamic_left_chunk
:
false
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
delta_delta
:
False
dither
:
1.0
target_sample_rate
:
16000
max_freq
:
None
n_fft
:
None
stride_ms
:
10.0
window_ms
:
25.0
use_dB_normalization
:
True
target_dB
:
-20
random_seed
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
# network architecture
model
:
cmvn_file
:
"
data/mean_std.json"
cmvn_file_type
:
"
json"
# encoder related
encoder
:
transformer
encoder_conf
:
output_size
:
256
# dimension of attention
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
num_blocks
:
12
# the number of encoder blocks
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
attention_dropout_rate
:
0.0
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before
:
true
use_dynamic_chunk
:
true
use_dynamic_left_chunk
:
false
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
dropout_rate
:
0.1
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
training
:
# https://yaml.org/type/float.html
n_epoch
:
5
###########################################
accum_grad
:
1
# Data #
global_grad_clip
:
5.0
###########################################
optim
:
adam
train_manifest
:
data/manifest.tiny
optim_conf
:
dev_manifest
:
data/manifest.tiny
lr
:
0.002
test_manifest
:
data/manifest.tiny
weight_decay
:
1e-06
scheduler
:
warmuplr
###########################################
scheduler_conf
:
# Dataloader #
warmup_steps
:
25000
###########################################
lr_decay
:
1.0
mean_std_filepath
:
"
"
log_interval
:
1
vocab_filepath
:
data/lang_char/vocab.txt
checkpoint
:
unit_type
:
'
spm'
kbest_n
:
10
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
latest_n
:
1
preprocess_config
:
conf/preprocess.yaml
feat_dim
:
80
stride_ms
:
10.0
window_ms
:
25.0
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size
:
4
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
minibatches
:
0
# for debug
batch_count
:
auto
batch_bins
:
0
batch_frames_in
:
0
batch_frames_out
:
0
batch_frames_inout
:
0
num_workers
:
0
subsampling_factor
:
1
num_encs
:
1
decoding
:
batch_size
:
64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
###########################################
# Training #
###########################################
n_epoch
:
5
accum_grad
:
1
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.002
weight_decay
:
1.0e-06
scheduler
:
warmuplr
scheduler_conf
:
warmup_steps
:
25000
lr_decay
:
1.0
log_interval
:
1
checkpoint
:
kbest_n
:
10
latest_n
:
1
examples/tiny/asr1/conf/conformer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest
:
data/manifest.tiny
dev_manifest
:
data/manifest.tiny
test_manifest
:
data/manifest.tiny
min_input_len
:
0.5
# second
max_input_len
:
20.0
# second
min_output_len
:
0.0
# tokens
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
###########################################
# Dataloader #
###########################################
mean_std_filepath
:
"
"
vocab_filepath
:
data/lang_char/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
augmentation_config
:
conf/preprocess.yaml
batch_size
:
4
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
delta_delta
:
False
dither
:
1.0
target_sample_rate
:
16000
max_freq
:
None
n_fft
:
None
stride_ms
:
10.0
window_ms
:
25.0
use_dB_normalization
:
True
target_dB
:
-20
random_seed
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
############################################
############################################
# Network Architecture #
# Network Architecture #
############################################
############################################
...
@@ -83,7 +41,41 @@ model_conf:
...
@@ -83,7 +41,41 @@ model_conf:
###########################################
###########################################
# training #
# Data #
###########################################
train_manifest
:
data/manifest.tiny
dev_manifest
:
data/manifest.tiny
test_manifest
:
data/manifest.tiny
###########################################
# Dataloader #
###########################################
mean_std_filepath
:
"
"
vocab_filepath
:
data/lang_char/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
preprocess_config
:
conf/preprocess.yaml
feat_dim
:
80
stride_ms
:
10.0
window_ms
:
25.0
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size
:
4
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
minibatches
:
0
# for debug
batch_count
:
auto
batch_bins
:
0
batch_frames_in
:
0
batch_frames_out
:
0
batch_frames_inout
:
0
num_workers
:
0
subsampling_factor
:
1
num_encs
:
1
###########################################
# Training #
###########################################
###########################################
n_epoch
:
5
n_epoch
:
5
accum_grad
:
4
accum_grad
:
4
...
@@ -91,7 +83,7 @@ global_grad_clip: 5.0
...
@@ -91,7 +83,7 @@ global_grad_clip: 5.0
optim
:
adam
optim
:
adam
optim_conf
:
optim_conf
:
lr
:
0.002
lr
:
0.002
weight_decay
:
1e-06
weight_decay
:
1
.0
e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
...
...
examples/tiny/asr1/conf/transformer.yaml
浏览文件 @
c907a8de
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest
:
data/manifest.tiny
dev_manifest
:
data/manifest.tiny
test_manifest
:
data/manifest.tiny
min_input_len
:
0.5
# second
max_input_len
:
20.0
# second
min_output_len
:
0.0
# tokens
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
###########################################
# Dataloader #
###########################################
mean_std_filepath
:
data/mean_std.json
vocab_filepath
:
data/lang_char/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
augmentation_config
:
conf/preprocess.yaml
batch_size
:
4
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
delta_delta
:
False
dither
:
1.0
target_sample_rate
:
16000
max_freq
:
None
n_fft
:
None
stride_ms
:
10.0
window_ms
:
25.0
use_dB_normalization
:
True
target_dB
:
-20
random_seed
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
############################################
############################################
# Network Architecture #
# Network Architecture #
############################################
############################################
...
@@ -74,9 +34,41 @@ model_conf:
...
@@ -74,9 +34,41 @@ model_conf:
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
###########################################
# Data #
###########################################
train_manifest
:
data/manifest.tiny
dev_manifest
:
data/manifest.tiny
test_manifest
:
data/manifest.tiny
###########################################
# Dataloader #
###########################################
mean_std_filepath
:
data/mean_std.json
vocab_filepath
:
data/lang_char/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
preprocess_config
:
conf/preprocess.yaml
feat_dim
:
80
stride_ms
:
10.0
window_ms
:
25.0
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size
:
4
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
minibatches
:
0
# for debug
batch_count
:
auto
batch_bins
:
0
batch_frames_in
:
0
batch_frames_out
:
0
batch_frames_inout
:
0
num_workers
:
0
subsampling_factor
:
1
num_encs
:
1
###########################################
###########################################
#
t
raining #
#
T
raining #
###########################################
###########################################
n_epoch
:
5
n_epoch
:
5
accum_grad
:
1
accum_grad
:
1
...
@@ -84,7 +76,7 @@ global_grad_clip: 5.0
...
@@ -84,7 +76,7 @@ global_grad_clip: 5.0
optim
:
adam
optim
:
adam
optim_conf
:
optim_conf
:
lr
:
0.002
lr
:
0.002
weight_decay
:
1e-06
weight_decay
:
1
.0
e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
...
...
examples/tiny/asr1/conf/tuning/chunk_decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
8
#64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
\ No newline at end of file
examples/tiny/asr1/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
8
#64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/tiny/asr1/local/align.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
batch_size
=
1
batch_size
=
1
output_dir
=
${
ckpt_prefix
}
output_dir
=
${
ckpt_prefix
}
...
@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
...
@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
python3
-u
${
BIN_DIR
}
/alignment.py
\
python3
-u
${
BIN_DIR
}
/alignment.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in ctc alignment!"
echo
"Failed in ctc alignment!"
...
...
examples/tiny/asr1/local/test.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
chunk_mode
=
false
chunk_mode
=
false
if
[[
${
config_path
}
=
~ ^.
*
chunk_.
*
yaml
$
]]
;
then
if
[[
${
config_path
}
=
~ ^.
*
chunk_.
*
yaml
$
]]
;
then
...
@@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do
...
@@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
@@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
...
@@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
...
examples/tiny/asr1/run.sh
浏览文件 @
c907a8de
...
@@ -6,6 +6,7 @@ gpus=0
...
@@ -6,6 +6,7 @@ gpus=0
stage
=
0
stage
=
0
stop_stage
=
50
stop_stage
=
50
conf_path
=
conf/transformer.yaml
conf_path
=
conf/transformer.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
1
avg_num
=
1
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
...
@@ -31,12 +32,12 @@ fi
...
@@ -31,12 +32,12 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
# ctc alignment of test data
# ctc alignment of test data
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/align.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
51
]
&&
[
${
stop_stage
}
-ge
51
]
;
then
if
[
${
stage
}
-le
51
]
&&
[
${
stop_stage
}
-ge
51
]
;
then
...
...
examples/wenetspeech/asr1/conf/conformer.yaml
浏览文件 @
c907a8de
# network architecture
############################################
model
:
# Network Architecture #
# encoder related
############################################
encoder
:
conformer
cmvn_file
:
encoder_conf
:
cmvn_file_type
:
"
json"
output_size
:
512
# dimension of attention
# encoder related
attention_heads
:
8
encoder
:
conformer
linear_units
:
2048
# the number of units of position-wise feed forward
encoder_conf
:
num_blocks
:
12
# the number of encoder blocks
output_size
:
512
# dimension of attention
dropout_rate
:
0.1
attention_heads
:
8
positional_dropout_rate
:
0.1
linear_units
:
2048
# the number of units of position-wise feed forward
attention_dropout_rate
:
0.0
num_blocks
:
12
# the number of encoder blocks
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
dropout_rate
:
0.1
normalize_before
:
True
positional_dropout_rate
:
0.1
use_cnn_module
:
True
attention_dropout_rate
:
0.0
cnn_module_kernel
:
15
input_layer
:
conv2d
# encoder input type, you can choose conv2d, conv2d6 and conv2d8
cnn_module_norm
:
layer_norm
normalize_before
:
True
activation_type
:
swish
use_cnn_module
:
True
pos_enc_layer_type
:
rel_pos
cnn_module_kernel
:
15
selfattention_layer_type
:
rel_selfattn
cnn_module_norm
:
layer_norm
activation_type
:
swish
pos_enc_layer_type
:
rel_pos
selfattention_layer_type
:
rel_selfattn
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
8
attention_heads
:
8
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
dropout_rate
:
0.1
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
min_input_len
:
0.1
# second
dev_manifest
:
data/manifest.dev
max_input_len
:
12.0
# second
test_manifest
:
data/manifest.test
min_output_len
:
1.0
max_output_len
:
400.0
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
'
char'
###########################################
spm_model_prefix
:
'
'
vocab_filepath
:
data/lang_char/vocab.txt
augmentation_config
:
conf/preprocess.yaml
unit_type
:
'
char'
batch_size
:
64
preprocess_config
:
conf/preprocess.yaml
raw_wav
:
True
# use raw_wav or kaldi feature
spm_model_prefix
:
'
'
spectrum_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
8
0
stride_ms
:
10.
0
delta_delta
:
False
window_ms
:
25.0
dither
:
1.0
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
target_sample_rate
:
16000
batch_size
:
64
max_freq
:
None
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
n_fft
:
None
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
stride_ms
:
10.0
minibatches
:
0
# for debug
window_ms
:
25.0
batch_count
:
auto
use_dB_normalization
:
True
batch_bins
:
0
target_dB
:
-2
0
batch_frames_in
:
0
random_seed
:
0
batch_frames_out
:
0
keep_transcription_text
:
False
batch_frames_inout
:
0
sortagrad
:
True
num_workers
:
0
shuffle_method
:
batch_shuffle
subsampling_factor
:
1
num_workers
:
2
num_encs
:
1
training
:
###########################################
n_epoch
:
240
# Training #
accum_grad
:
16
###########################################
global_grad_clip
:
5.0
n_epoch
:
240
log_interval
:
100
accum_grad
:
16
checkpoint
:
global_grad_clip
:
5.0
kbest_n
:
50
log_interval
:
100
latest_n
:
5
checkpoint
:
optim
:
adam
kbest_n
:
50
optim_conf
:
latest_n
:
5
lr
:
0.001
optim
:
adam
weight_decay
:
1e-6
optim_conf
:
scheduler
:
warmuplr
lr
:
0.001
scheduler_conf
:
weight_decay
:
1.0e-6
warmup_steps
:
5000
scheduler
:
warmuplr
lr_decay
:
1.0
scheduler_conf
:
warmup_steps
:
5000
lr_decay
:
1.0
decoding
:
batch_size
:
128
error_rate_type
:
cer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
\ No newline at end of file
examples/wenetspeech/asr1/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
c907a8de
decode_batch_size
:
128
error_rate_type
:
cer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
\ No newline at end of file
examples/wenetspeech/asr1/local/test.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
chunk_mode
=
false
chunk_mode
=
false
if
[[
${
config_path
}
=
~ ^.
*
chunk_.
*
yaml
$
]]
;
then
if
[[
${
config_path
}
=
~ ^.
*
chunk_.
*
yaml
$
]]
;
then
...
@@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do
...
@@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
@@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
...
@@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
...
examples/wenetspeech/asr1/local/test_wav.sh
浏览文件 @
c907a8de
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix audio_file"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix audio_file"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
audio_file
=
$3
ckpt_prefix
=
$3
audio_file
=
$4
mkdir
-p
data
mkdir
-p
data
wget
-nc
https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav
-P
data/
wget
-nc
https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav
-P
data/
...
@@ -43,10 +44,11 @@ for type in attention_rescoring; do
...
@@ -43,10 +44,11 @@ for type in attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test_wav.py
\
python3
-u
${
BIN_DIR
}
/test_wav.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_cfg
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
\
--opts
decod
e.decode_
batch_size
${
batch_size
}
\
--audio_file
${
audio_file
}
--audio_file
${
audio_file
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
...
...
examples/wenetspeech/asr1/run.sh
浏览文件 @
c907a8de
...
@@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7
...
@@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
conf_path
=
conf/conformer.yaml
conf_path
=
conf/conformer.yaml
decode_conf_path
=
conf/tuning/decode.yaml
average_checkpoint
=
true
average_checkpoint
=
true
avg_num
=
10
avg_num
=
10
...
@@ -36,12 +36,12 @@ fi
...
@@ -36,12 +36,12 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
# ctc alignment of test data
# ctc alignment of test data
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
...
@@ -51,5 +51,5 @@ fi
...
@@ -51,5 +51,5 @@ fi
if
[
${
stage
}
-le
7
]
&&
[
${
stop_stage
}
-ge
7
]
;
then
if
[
${
stage
}
-le
7
]
&&
[
${
stop_stage
}
-ge
7
]
;
then
# test a single .wav file
# test a single .wav file
CUDA_VISIBLE_DEVICES
=
0 ./local/test_wav.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
audio_file
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test_wav.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
audio_file
}
||
exit
-1
fi
fi
paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py
浏览文件 @
c907a8de
...
@@ -80,13 +80,13 @@ def inference(config, args):
...
@@ -80,13 +80,13 @@ def inference(config, args):
def
start_server
(
config
,
args
):
def
start_server
(
config
,
args
):
"""Start the ASR server"""
"""Start the ASR server"""
config
.
defrost
()
config
.
defrost
()
config
.
data
.
manifest
=
config
.
data
.
test_manifest
config
.
manifest
=
config
.
test_manifest
dataset
=
ManifestDataset
.
from_config
(
config
)
dataset
=
ManifestDataset
.
from_config
(
config
)
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
config
.
collator
.
keep_transcription_text
=
True
config
.
keep_transcription_text
=
True
config
.
collator
.
batch_size
=
1
config
.
batch_size
=
1
config
.
collator
.
num_workers
=
0
config
.
num_workers
=
0
collate_fn
=
SpeechCollator
.
from_config
(
config
)
collate_fn
=
SpeechCollator
.
from_config
(
config
)
test_loader
=
DataLoader
(
dataset
,
collate_fn
=
collate_fn
,
num_workers
=
0
)
test_loader
=
DataLoader
(
dataset
,
collate_fn
=
collate_fn
,
num_workers
=
0
)
...
@@ -105,14 +105,14 @@ def start_server(config, args):
...
@@ -105,14 +105,14 @@ def start_server(config, args):
paddle
.
to_tensor
(
audio
),
paddle
.
to_tensor
(
audio
),
paddle
.
to_tensor
(
audio_len
),
paddle
.
to_tensor
(
audio_len
),
vocab_list
=
test_loader
.
collate_fn
.
vocab_list
,
vocab_list
=
test_loader
.
collate_fn
.
vocab_list
,
decoding_method
=
config
.
decod
ing
.
decoding_method
,
decoding_method
=
config
.
decod
e
.
decoding_method
,
lang_model_path
=
config
.
decod
ing
.
lang_model_path
,
lang_model_path
=
config
.
decod
e
.
lang_model_path
,
beam_alpha
=
config
.
decod
ing
.
alpha
,
beam_alpha
=
config
.
decod
e
.
alpha
,
beam_beta
=
config
.
decod
ing
.
beta
,
beam_beta
=
config
.
decod
e
.
beta
,
beam_size
=
config
.
decod
ing
.
beam_size
,
beam_size
=
config
.
decod
e
.
beam_size
,
cutoff_prob
=
config
.
decod
ing
.
cutoff_prob
,
cutoff_prob
=
config
.
decod
e
.
cutoff_prob
,
cutoff_top_n
=
config
.
decod
ing
.
cutoff_top_n
,
cutoff_top_n
=
config
.
decod
e
.
cutoff_top_n
,
num_processes
=
config
.
decod
ing
.
num_proc_bsearch
)
num_processes
=
config
.
decod
e
.
num_proc_bsearch
)
return
result_transcript
[
0
]
return
result_transcript
[
0
]
# warming up with utterances sampled from Librispeech
# warming up with utterances sampled from Librispeech
...
@@ -179,12 +179,16 @@ if __name__ == "__main__":
...
@@ -179,12 +179,16 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
()
config
=
get_cfg_defaults
()
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_cfg
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_cfg
)
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
print
(
config
)
print
(
config
)
args
.
warmup_manifest
=
config
.
data
.
test_manifest
args
.
warmup_manifest
=
config
.
test_manifest
print_arguments
(
args
,
globals
())
print_arguments
(
args
,
globals
())
if
args
.
dump_config
:
if
args
.
dump_config
:
...
...
paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py
浏览文件 @
c907a8de
...
@@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments
...
@@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments
def
start_server
(
config
,
args
):
def
start_server
(
config
,
args
):
"""Start the ASR server"""
"""Start the ASR server"""
config
.
defrost
()
config
.
defrost
()
config
.
data
.
manifest
=
config
.
data
.
test_manifest
config
.
manifest
=
config
.
test_manifest
dataset
=
ManifestDataset
.
from_config
(
config
)
dataset
=
ManifestDataset
.
from_config
(
config
)
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
config
.
collator
.
keep_transcription_text
=
True
config
.
keep_transcription_text
=
True
config
.
collator
.
batch_size
=
1
config
.
batch_size
=
1
config
.
collator
.
num_workers
=
0
config
.
num_workers
=
0
collate_fn
=
SpeechCollator
.
from_config
(
config
)
collate_fn
=
SpeechCollator
.
from_config
(
config
)
test_loader
=
DataLoader
(
dataset
,
collate_fn
=
collate_fn
,
num_workers
=
0
)
test_loader
=
DataLoader
(
dataset
,
collate_fn
=
collate_fn
,
num_workers
=
0
)
...
@@ -62,14 +62,14 @@ def start_server(config, args):
...
@@ -62,14 +62,14 @@ def start_server(config, args):
paddle
.
to_tensor
(
audio
),
paddle
.
to_tensor
(
audio
),
paddle
.
to_tensor
(
audio_len
),
paddle
.
to_tensor
(
audio_len
),
vocab_list
=
test_loader
.
collate_fn
.
vocab_list
,
vocab_list
=
test_loader
.
collate_fn
.
vocab_list
,
decoding_method
=
config
.
decod
ing
.
decoding_method
,
decoding_method
=
config
.
decod
e
.
decoding_method
,
lang_model_path
=
config
.
decod
ing
.
lang_model_path
,
lang_model_path
=
config
.
decod
e
.
lang_model_path
,
beam_alpha
=
config
.
decod
ing
.
alpha
,
beam_alpha
=
config
.
decod
e
.
alpha
,
beam_beta
=
config
.
decod
ing
.
beta
,
beam_beta
=
config
.
decod
e
.
beta
,
beam_size
=
config
.
decod
ing
.
beam_size
,
beam_size
=
config
.
decod
e
.
beam_size
,
cutoff_prob
=
config
.
decod
ing
.
cutoff_prob
,
cutoff_prob
=
config
.
decod
e
.
cutoff_prob
,
cutoff_top_n
=
config
.
decod
ing
.
cutoff_top_n
,
cutoff_top_n
=
config
.
decod
e
.
cutoff_top_n
,
num_processes
=
config
.
decod
ing
.
num_proc_bsearch
)
num_processes
=
config
.
decod
e
.
num_proc_bsearch
)
return
result_transcript
[
0
]
return
result_transcript
[
0
]
# warming up with utterrances sampled from Librispeech
# warming up with utterrances sampled from Librispeech
...
@@ -114,12 +114,16 @@ if __name__ == "__main__":
...
@@ -114,12 +114,16 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
()
config
=
get_cfg_defaults
()
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_cfg
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_cfg
)
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
print
(
config
)
print
(
config
)
args
.
warmup_manifest
=
config
.
data
.
test_manifest
args
.
warmup_manifest
=
config
.
test_manifest
print_arguments
(
args
,
globals
())
print_arguments
(
args
,
globals
())
if
args
.
dump_config
:
if
args
.
dump_config
:
...
...
paddlespeech/s2t/exps/deepspeech2/bin/test.py
浏览文件 @
c907a8de
...
@@ -12,6 +12,8 @@
...
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""Evaluation for DeepSpeech2 model."""
"""Evaluation for DeepSpeech2 model."""
from
yacs.config
import
CfgNode
from
paddlespeech.s2t.exps.deepspeech2.config
import
get_cfg_defaults
from
paddlespeech.s2t.exps.deepspeech2.config
import
get_cfg_defaults
from
paddlespeech.s2t.exps.deepspeech2.model
import
DeepSpeech2Tester
as
Tester
from
paddlespeech.s2t.exps.deepspeech2.model
import
DeepSpeech2Tester
as
Tester
from
paddlespeech.s2t.training.cli
import
default_argument_parser
from
paddlespeech.s2t.training.cli
import
default_argument_parser
...
@@ -44,6 +46,10 @@ if __name__ == "__main__":
...
@@ -44,6 +46,10 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
(
args
.
model_type
)
config
=
get_cfg_defaults
(
args
.
model_type
)
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_cfg
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_cfg
)
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
...
...
paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
浏览文件 @
c907a8de
...
@@ -12,6 +12,8 @@
...
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""Evaluation for DeepSpeech2 model."""
"""Evaluation for DeepSpeech2 model."""
from
yacs.config
import
CfgNode
from
paddlespeech.s2t.exps.deepspeech2.config
import
get_cfg_defaults
from
paddlespeech.s2t.exps.deepspeech2.config
import
get_cfg_defaults
from
paddlespeech.s2t.exps.deepspeech2.model
import
DeepSpeech2ExportTester
as
ExportTester
from
paddlespeech.s2t.exps.deepspeech2.model
import
DeepSpeech2ExportTester
as
ExportTester
from
paddlespeech.s2t.training.cli
import
default_argument_parser
from
paddlespeech.s2t.training.cli
import
default_argument_parser
...
@@ -49,6 +51,10 @@ if __name__ == "__main__":
...
@@ -49,6 +51,10 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
(
args
.
model_type
)
config
=
get_cfg_defaults
(
args
.
model_type
)
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_cfg
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_cfg
)
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
...
...
paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
浏览文件 @
c907a8de
...
@@ -18,6 +18,7 @@ from pathlib import Path
...
@@ -18,6 +18,7 @@ from pathlib import Path
import
paddle
import
paddle
import
soundfile
import
soundfile
from
yacs.config
import
CfgNode
from
paddlespeech.s2t.exps.deepspeech2.config
import
get_cfg_defaults
from
paddlespeech.s2t.exps.deepspeech2.config
import
get_cfg_defaults
from
paddlespeech.s2t.frontend.featurizer.text_featurizer
import
TextFeaturizer
from
paddlespeech.s2t.frontend.featurizer.text_featurizer
import
TextFeaturizer
...
@@ -41,7 +42,7 @@ class DeepSpeech2Tester_hub():
...
@@ -41,7 +42,7 @@ class DeepSpeech2Tester_hub():
self
.
audio_file
=
args
.
audio_file
self
.
audio_file
=
args
.
audio_file
self
.
collate_fn_test
=
SpeechCollator
.
from_config
(
config
)
self
.
collate_fn_test
=
SpeechCollator
.
from_config
(
config
)
self
.
_text_featurizer
=
TextFeaturizer
(
self
.
_text_featurizer
=
TextFeaturizer
(
unit_type
=
config
.
collator
.
unit_type
,
vocab
=
None
)
unit_type
=
config
.
unit_type
,
vocab
=
None
)
def
compute_result_transcripts
(
self
,
audio
,
audio_len
,
vocab_list
,
cfg
):
def
compute_result_transcripts
(
self
,
audio
,
audio_len
,
vocab_list
,
cfg
):
result_transcripts
=
self
.
model
.
decode
(
result_transcripts
=
self
.
model
.
decode
(
...
@@ -74,7 +75,7 @@ class DeepSpeech2Tester_hub():
...
@@ -74,7 +75,7 @@ class DeepSpeech2Tester_hub():
audio
=
paddle
.
unsqueeze
(
audio
,
axis
=
0
)
audio
=
paddle
.
unsqueeze
(
audio
,
axis
=
0
)
vocab_list
=
collate_fn_test
.
vocab_list
vocab_list
=
collate_fn_test
.
vocab_list
result_transcripts
=
self
.
compute_result_transcripts
(
result_transcripts
=
self
.
compute_result_transcripts
(
audio
,
audio_len
,
vocab_list
,
cfg
.
decod
ing
)
audio
,
audio_len
,
vocab_list
,
cfg
.
decod
e
)
logger
.
info
(
"result_transcripts: "
+
result_transcripts
[
0
])
logger
.
info
(
"result_transcripts: "
+
result_transcripts
[
0
])
def
run_test
(
self
):
def
run_test
(
self
):
...
@@ -110,13 +111,13 @@ class DeepSpeech2Tester_hub():
...
@@ -110,13 +111,13 @@ class DeepSpeech2Tester_hub():
def
setup_model
(
self
):
def
setup_model
(
self
):
config
=
self
.
config
.
clone
()
config
=
self
.
config
.
clone
()
with
UpdateConfig
(
config
):
with
UpdateConfig
(
config
):
config
.
model
.
input_dim
=
self
.
collate_fn_test
.
feature_size
config
.
input_dim
=
self
.
collate_fn_test
.
feature_size
config
.
model
.
output_dim
=
self
.
collate_fn_test
.
vocab_size
config
.
output_dim
=
self
.
collate_fn_test
.
vocab_size
if
self
.
args
.
model_type
==
'offline'
:
if
self
.
args
.
model_type
==
'offline'
:
model
=
DeepSpeech2Model
.
from_config
(
config
.
model
)
model
=
DeepSpeech2Model
.
from_config
(
config
)
elif
self
.
args
.
model_type
==
'online'
:
elif
self
.
args
.
model_type
==
'online'
:
model
=
DeepSpeech2ModelOnline
.
from_config
(
config
.
model
)
model
=
DeepSpeech2ModelOnline
.
from_config
(
config
)
else
:
else
:
raise
Exception
(
"wrong model type"
)
raise
Exception
(
"wrong model type"
)
...
@@ -134,8 +135,8 @@ class DeepSpeech2Tester_hub():
...
@@ -134,8 +135,8 @@ class DeepSpeech2Tester_hub():
self
.
checkpoint_dir
=
checkpoint_dir
self
.
checkpoint_dir
=
checkpoint_dir
self
.
checkpoint
=
Checkpoint
(
self
.
checkpoint
=
Checkpoint
(
kbest_n
=
self
.
config
.
training
.
checkpoint
.
kbest_n
,
kbest_n
=
self
.
config
.
checkpoint
.
kbest_n
,
latest_n
=
self
.
config
.
training
.
checkpoint
.
latest_n
)
latest_n
=
self
.
config
.
checkpoint
.
latest_n
)
def
resume
(
self
):
def
resume
(
self
):
"""Resume from the checkpoint at checkpoints in the output
"""Resume from the checkpoint at checkpoints in the output
...
@@ -190,6 +191,10 @@ if __name__ == "__main__":
...
@@ -190,6 +191,10 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
(
args
.
model_type
)
config
=
get_cfg_defaults
(
args
.
model_type
)
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_cfg
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_cfg
)
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
...
...
paddlespeech/s2t/exps/deepspeech2/config.py
浏览文件 @
c907a8de
...
@@ -23,17 +23,6 @@ from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
...
@@ -23,17 +23,6 @@ from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
def
get_cfg_defaults
(
model_type
=
'offline'
):
def
get_cfg_defaults
(
model_type
=
'offline'
):
_C
=
CfgNode
()
_C
=
CfgNode
()
_C
.
data
=
ManifestDataset
.
params
()
_C
.
collator
=
SpeechCollator
.
params
()
_C
.
training
=
DeepSpeech2Trainer
.
params
()
_C
.
decoding
=
DeepSpeech2Tester
.
params
()
if
model_type
==
'offline'
:
_C
.
model
=
DeepSpeech2Model
.
params
()
else
:
_C
.
model
=
DeepSpeech2ModelOnline
.
params
()
"""Get a yacs CfgNode object with default values for my_project."""
# Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern
config
=
_C
.
clone
()
config
=
_C
.
clone
()
config
.
set_new_allowed
(
True
)
config
.
set_new_allowed
(
True
)
return
config
return
config
paddlespeech/s2t/exps/deepspeech2/model.py
浏览文件 @
c907a8de
...
@@ -69,8 +69,8 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -69,8 +69,8 @@ class DeepSpeech2Trainer(Trainer):
super
().
__init__
(
config
,
args
)
super
().
__init__
(
config
,
args
)
def
train_batch
(
self
,
batch_index
,
batch_data
,
msg
):
def
train_batch
(
self
,
batch_index
,
batch_data
,
msg
):
batch_size
=
self
.
config
.
collator
.
batch_size
batch_size
=
self
.
config
.
batch_size
accum_grad
=
self
.
config
.
training
.
accum_grad
accum_grad
=
self
.
config
.
accum_grad
start
=
time
.
time
()
start
=
time
.
time
()
...
@@ -133,7 +133,7 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -133,7 +133,7 @@ class DeepSpeech2Trainer(Trainer):
total_loss
+=
float
(
loss
)
*
num_utts
total_loss
+=
float
(
loss
)
*
num_utts
valid_losses
[
'val_loss'
].
append
(
float
(
loss
))
valid_losses
[
'val_loss'
].
append
(
float
(
loss
))
if
(
i
+
1
)
%
self
.
config
.
training
.
log_interval
==
0
:
if
(
i
+
1
)
%
self
.
config
.
log_interval
==
0
:
valid_dump
=
{
k
:
np
.
mean
(
v
)
for
k
,
v
in
valid_losses
.
items
()}
valid_dump
=
{
k
:
np
.
mean
(
v
)
for
k
,
v
in
valid_losses
.
items
()}
valid_dump
[
'val_history_loss'
]
=
total_loss
/
num_seen_utts
valid_dump
[
'val_history_loss'
]
=
total_loss
/
num_seen_utts
...
@@ -154,16 +154,16 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -154,16 +154,16 @@ class DeepSpeech2Trainer(Trainer):
config
=
self
.
config
.
clone
()
config
=
self
.
config
.
clone
()
with
UpdateConfig
(
config
):
with
UpdateConfig
(
config
):
if
self
.
train
:
if
self
.
train
:
config
.
model
.
input_dim
=
self
.
train_loader
.
collate_fn
.
feature_size
config
.
input_dim
=
self
.
train_loader
.
collate_fn
.
feature_size
config
.
model
.
output_dim
=
self
.
train_loader
.
collate_fn
.
vocab_size
config
.
output_dim
=
self
.
train_loader
.
collate_fn
.
vocab_size
else
:
else
:
config
.
model
.
input_dim
=
self
.
test_loader
.
collate_fn
.
feature_size
config
.
input_dim
=
self
.
test_loader
.
collate_fn
.
feature_size
config
.
model
.
output_dim
=
self
.
test_loader
.
collate_fn
.
vocab_size
config
.
output_dim
=
self
.
test_loader
.
collate_fn
.
vocab_size
if
self
.
args
.
model_type
==
'offline'
:
if
self
.
args
.
model_type
==
'offline'
:
model
=
DeepSpeech2Model
.
from_config
(
config
.
model
)
model
=
DeepSpeech2Model
.
from_config
(
config
)
elif
self
.
args
.
model_type
==
'online'
:
elif
self
.
args
.
model_type
==
'online'
:
model
=
DeepSpeech2ModelOnline
.
from_config
(
config
.
model
)
model
=
DeepSpeech2ModelOnline
.
from_config
(
config
)
else
:
else
:
raise
Exception
(
"wrong model type"
)
raise
Exception
(
"wrong model type"
)
if
self
.
parallel
:
if
self
.
parallel
:
...
@@ -177,17 +177,13 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -177,17 +177,13 @@ class DeepSpeech2Trainer(Trainer):
if
not
self
.
train
:
if
not
self
.
train
:
return
return
grad_clip
=
ClipGradByGlobalNormWithLog
(
grad_clip
=
ClipGradByGlobalNormWithLog
(
config
.
global_grad_clip
)
config
.
training
.
global_grad_clip
)
lr_scheduler
=
paddle
.
optimizer
.
lr
.
ExponentialDecay
(
lr_scheduler
=
paddle
.
optimizer
.
lr
.
ExponentialDecay
(
learning_rate
=
config
.
training
.
lr
,
learning_rate
=
config
.
lr
,
gamma
=
config
.
lr_decay
,
verbose
=
True
)
gamma
=
config
.
training
.
lr_decay
,
verbose
=
True
)
optimizer
=
paddle
.
optimizer
.
Adam
(
optimizer
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
lr_scheduler
,
learning_rate
=
lr_scheduler
,
parameters
=
model
.
parameters
(),
parameters
=
model
.
parameters
(),
weight_decay
=
paddle
.
regularizer
.
L2Decay
(
weight_decay
=
paddle
.
regularizer
.
L2Decay
(
config
.
weight_decay
),
config
.
training
.
weight_decay
),
grad_clip
=
grad_clip
)
grad_clip
=
grad_clip
)
self
.
optimizer
=
optimizer
self
.
optimizer
=
optimizer
self
.
lr_scheduler
=
lr_scheduler
self
.
lr_scheduler
=
lr_scheduler
...
@@ -198,66 +194,67 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -198,66 +194,67 @@ class DeepSpeech2Trainer(Trainer):
config
.
defrost
()
config
.
defrost
()
if
self
.
train
:
if
self
.
train
:
# train
# train
config
.
data
.
manifest
=
config
.
data
.
train_manifest
config
.
manifest
=
config
.
train_manifest
train_dataset
=
ManifestDataset
.
from_config
(
config
)
train_dataset
=
ManifestDataset
.
from_config
(
config
)
if
self
.
parallel
:
if
self
.
parallel
:
batch_sampler
=
SortagradDistributedBatchSampler
(
batch_sampler
=
SortagradDistributedBatchSampler
(
train_dataset
,
train_dataset
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
num_replicas
=
None
,
num_replicas
=
None
,
rank
=
None
,
rank
=
None
,
shuffle
=
True
,
shuffle
=
True
,
drop_last
=
True
,
drop_last
=
True
,
sortagrad
=
config
.
collator
.
sortagrad
,
sortagrad
=
config
.
sortagrad
,
shuffle_method
=
config
.
collator
.
shuffle_method
)
shuffle_method
=
config
.
shuffle_method
)
else
:
else
:
batch_sampler
=
SortagradBatchSampler
(
batch_sampler
=
SortagradBatchSampler
(
train_dataset
,
train_dataset
,
shuffle
=
True
,
shuffle
=
True
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
drop_last
=
True
,
drop_last
=
True
,
sortagrad
=
config
.
collator
.
sortagrad
,
sortagrad
=
config
.
sortagrad
,
shuffle_method
=
config
.
collator
.
shuffle_method
)
shuffle_method
=
config
.
shuffle_method
)
config
.
collator
.
keep_transcription_text
=
False
config
.
keep_transcription_text
=
False
collate_fn_train
=
SpeechCollator
.
from_config
(
config
)
collate_fn_train
=
SpeechCollator
.
from_config
(
config
)
self
.
train_loader
=
DataLoader
(
self
.
train_loader
=
DataLoader
(
train_dataset
,
train_dataset
,
batch_sampler
=
batch_sampler
,
batch_sampler
=
batch_sampler
,
collate_fn
=
collate_fn_train
,
collate_fn
=
collate_fn_train
,
num_workers
=
config
.
collator
.
num_workers
)
num_workers
=
config
.
num_workers
)
# dev
# dev
config
.
data
.
manifest
=
config
.
data
.
dev_manifest
config
.
manifest
=
config
.
dev_manifest
dev_dataset
=
ManifestDataset
.
from_config
(
config
)
dev_dataset
=
ManifestDataset
.
from_config
(
config
)
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
config
.
collator
.
keep_transcription_text
=
False
config
.
keep_transcription_text
=
False
collate_fn_dev
=
SpeechCollator
.
from_config
(
config
)
collate_fn_dev
=
SpeechCollator
.
from_config
(
config
)
self
.
valid_loader
=
DataLoader
(
self
.
valid_loader
=
DataLoader
(
dev_dataset
,
dev_dataset
,
batch_size
=
int
(
config
.
collator
.
batch_size
),
batch_size
=
int
(
config
.
batch_size
),
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
collate_fn_dev
,
collate_fn
=
collate_fn_dev
,
num_workers
=
config
.
collator
.
num_workers
)
num_workers
=
config
.
num_workers
)
logger
.
info
(
"Setup train/valid Dataloader!"
)
logger
.
info
(
"Setup train/valid Dataloader!"
)
else
:
else
:
# test
# test
config
.
data
.
manifest
=
config
.
data
.
test_manifest
config
.
manifest
=
config
.
test_manifest
test_dataset
=
ManifestDataset
.
from_config
(
config
)
test_dataset
=
ManifestDataset
.
from_config
(
config
)
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
config
.
collator
.
keep_transcription_text
=
True
config
.
keep_transcription_text
=
True
collate_fn_test
=
SpeechCollator
.
from_config
(
config
)
collate_fn_test
=
SpeechCollator
.
from_config
(
config
)
decode_batch_size
=
config
.
get
(
'decode'
,
dict
()).
get
(
'decode_batch_size'
,
1
)
self
.
test_loader
=
DataLoader
(
self
.
test_loader
=
DataLoader
(
test_dataset
,
test_dataset
,
batch_size
=
config
.
decoding
.
batch_size
,
batch_size
=
decode_
batch_size
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
collate_fn_test
,
collate_fn
=
collate_fn_test
,
num_workers
=
config
.
collator
.
num_workers
)
num_workers
=
config
.
num_workers
)
logger
.
info
(
"Setup test Dataloader!"
)
logger
.
info
(
"Setup test Dataloader!"
)
...
@@ -286,7 +283,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -286,7 +283,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def
__init__
(
self
,
config
,
args
):
def
__init__
(
self
,
config
,
args
):
super
().
__init__
(
config
,
args
)
super
().
__init__
(
config
,
args
)
self
.
_text_featurizer
=
TextFeaturizer
(
self
.
_text_featurizer
=
TextFeaturizer
(
unit_type
=
config
.
collator
.
unit_type
,
vocab
=
None
)
unit_type
=
config
.
unit_type
,
vocab
=
None
)
def
ordid2token
(
self
,
texts
,
texts_len
):
def
ordid2token
(
self
,
texts
,
texts_len
):
""" ord() id to chr() chr """
""" ord() id to chr() chr """
...
@@ -304,17 +301,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -304,17 +301,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
texts
,
texts
,
texts_len
,
texts_len
,
fout
=
None
):
fout
=
None
):
cfg
=
self
.
config
.
decoding
decode_cfg
=
self
.
config
.
decode
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_func
=
error_rate
.
char_errors
if
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
word_errors
errors_func
=
error_rate
.
char_errors
if
decode_
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
word_errors
error_rate_func
=
error_rate
.
cer
if
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
wer
error_rate_func
=
error_rate
.
cer
if
decode_
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
wer
vocab_list
=
self
.
test_loader
.
collate_fn
.
vocab_list
vocab_list
=
self
.
test_loader
.
collate_fn
.
vocab_list
target_transcripts
=
self
.
ordid2token
(
texts
,
texts_len
)
target_transcripts
=
self
.
ordid2token
(
texts
,
texts_len
)
result_transcripts
=
self
.
compute_result_transcripts
(
audio
,
audio_len
,
result_transcripts
=
self
.
compute_result_transcripts
(
vocab_list
,
cfg
)
audio
,
audio_len
,
vocab_list
,
decode_
cfg
)
for
utt
,
target
,
result
in
zip
(
utts
,
target_transcripts
,
for
utt
,
target
,
result
in
zip
(
utts
,
target_transcripts
,
result_transcripts
):
result_transcripts
):
...
@@ -327,29 +324,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -327,29 +324,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
logger
.
info
(
f
"Utt:
{
utt
}
"
)
logger
.
info
(
f
"Utt:
{
utt
}
"
)
logger
.
info
(
f
"Ref:
{
target
}
"
)
logger
.
info
(
f
"Ref:
{
target
}
"
)
logger
.
info
(
f
"Hyp:
{
result
}
"
)
logger
.
info
(
f
"Hyp:
{
result
}
"
)
logger
.
info
(
"Current error rate [%s] = %f"
%
logger
.
info
(
(
cfg
.
error_rate_type
,
error_rate_func
(
target
,
result
)))
"Current error rate [%s] = %f"
%
(
decode_cfg
.
error_rate_type
,
error_rate_func
(
target
,
result
)))
return
dict
(
return
dict
(
errors_sum
=
errors_sum
,
errors_sum
=
errors_sum
,
len_refs
=
len_refs
,
len_refs
=
len_refs
,
num_ins
=
num_ins
,
num_ins
=
num_ins
,
error_rate
=
errors_sum
/
len_refs
,
error_rate
=
errors_sum
/
len_refs
,
error_rate_type
=
cfg
.
error_rate_type
)
error_rate_type
=
decode_
cfg
.
error_rate_type
)
def
compute_result_transcripts
(
self
,
audio
,
audio_len
,
vocab_list
,
cfg
):
def
compute_result_transcripts
(
self
,
audio
,
audio_len
,
vocab_list
,
decode_cfg
):
result_transcripts
=
self
.
model
.
decode
(
result_transcripts
=
self
.
model
.
decode
(
audio
,
audio
,
audio_len
,
audio_len
,
vocab_list
,
vocab_list
,
decoding_method
=
cfg
.
decoding_method
,
decoding_method
=
decode_
cfg
.
decoding_method
,
lang_model_path
=
cfg
.
lang_model_path
,
lang_model_path
=
decode_
cfg
.
lang_model_path
,
beam_alpha
=
cfg
.
alpha
,
beam_alpha
=
decode_
cfg
.
alpha
,
beam_beta
=
cfg
.
beta
,
beam_beta
=
decode_
cfg
.
beta
,
beam_size
=
cfg
.
beam_size
,
beam_size
=
decode_
cfg
.
beam_size
,
cutoff_prob
=
cfg
.
cutoff_prob
,
cutoff_prob
=
decode_
cfg
.
cutoff_prob
,
cutoff_top_n
=
cfg
.
cutoff_top_n
,
cutoff_top_n
=
decode_
cfg
.
cutoff_top_n
,
num_processes
=
cfg
.
num_proc_bsearch
)
num_processes
=
decode_
cfg
.
num_proc_bsearch
)
return
result_transcripts
return
result_transcripts
...
@@ -358,7 +357,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -358,7 +357,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def
test
(
self
):
def
test
(
self
):
logger
.
info
(
f
"Test Total Examples:
{
len
(
self
.
test_loader
.
dataset
)
}
"
)
logger
.
info
(
f
"Test Total Examples:
{
len
(
self
.
test_loader
.
dataset
)
}
"
)
self
.
model
.
eval
()
self
.
model
.
eval
()
cfg
=
self
.
config
error_rate_type
=
None
error_rate_type
=
None
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
with
jsonlines
.
open
(
self
.
args
.
result_file
,
'w'
)
as
fout
:
with
jsonlines
.
open
(
self
.
args
.
result_file
,
'w'
)
as
fout
:
...
@@ -412,11 +410,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
...
@@ -412,11 +410,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
if
self
.
args
.
enable_auto_log
is
True
:
if
self
.
args
.
enable_auto_log
is
True
:
from
paddlespeech.s2t.utils.log
import
Autolog
from
paddlespeech.s2t.utils.log
import
Autolog
self
.
autolog
=
Autolog
(
self
.
autolog
=
Autolog
(
batch_size
=
self
.
config
.
decod
ing
.
batch_size
,
batch_size
=
self
.
config
.
decod
e
.
decode_
batch_size
,
model_name
=
"deepspeech2"
,
model_name
=
"deepspeech2"
,
model_precision
=
"fp32"
).
getlog
()
model_precision
=
"fp32"
).
getlog
()
self
.
model
.
eval
()
self
.
model
.
eval
()
cfg
=
self
.
config
error_rate_type
=
None
error_rate_type
=
None
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
with
jsonlines
.
open
(
self
.
args
.
result_file
,
'w'
)
as
fout
:
with
jsonlines
.
open
(
self
.
args
.
result_file
,
'w'
)
as
fout
:
...
@@ -441,7 +438,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
...
@@ -441,7 +438,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
if
self
.
args
.
enable_auto_log
is
True
:
if
self
.
args
.
enable_auto_log
is
True
:
self
.
autolog
.
report
()
self
.
autolog
.
report
()
def
compute_result_transcripts
(
self
,
audio
,
audio_len
,
vocab_list
,
cfg
):
def
compute_result_transcripts
(
self
,
audio
,
audio_len
,
vocab_list
,
decode_cfg
):
if
self
.
args
.
model_type
==
"online"
:
if
self
.
args
.
model_type
==
"online"
:
output_probs
,
output_lens
=
self
.
static_forward_online
(
audio
,
output_probs
,
output_lens
=
self
.
static_forward_online
(
audio
,
audio_len
)
audio_len
)
...
@@ -454,13 +452,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
...
@@ -454,13 +452,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
self
.
predictor
.
clear_intermediate_tensor
()
self
.
predictor
.
clear_intermediate_tensor
()
self
.
predictor
.
try_shrink_memory
()
self
.
predictor
.
try_shrink_memory
()
self
.
model
.
decoder
.
init_decode
(
cfg
.
alpha
,
cfg
.
beta
,
cfg
.
lang_model_path
,
self
.
model
.
decoder
.
init_decode
(
decode_cfg
.
alpha
,
decode_cfg
.
beta
,
vocab_list
,
cfg
.
decoding_method
)
decode_cfg
.
lang_model_path
,
vocab_list
,
decode_cfg
.
decoding_method
)
result_transcripts
=
self
.
model
.
decoder
.
decode_probs
(
result_transcripts
=
self
.
model
.
decoder
.
decode_probs
(
output_probs
,
output_lens
,
vocab_list
,
cfg
.
decoding_method
,
output_probs
,
output_lens
,
vocab_list
,
decode_cfg
.
decoding_method
,
cfg
.
lang_model_path
,
cfg
.
alpha
,
cfg
.
beta
,
cfg
.
beam_size
,
decode_cfg
.
lang_model_path
,
decode_cfg
.
alpha
,
decode_cfg
.
beta
,
cfg
.
cutoff_prob
,
cfg
.
cutoff_top_n
,
cfg
.
num_proc_bsearch
)
decode_cfg
.
beam_size
,
decode_cfg
.
cutoff_prob
,
decode_cfg
.
cutoff_top_n
,
decode_cfg
.
num_proc_bsearch
)
#replace the <space> with ' '
#replace the <space> with ' '
result_transcripts
=
[
result_transcripts
=
[
self
.
_text_featurizer
.
detokenize
(
sentence
)
self
.
_text_featurizer
.
detokenize
(
sentence
)
...
@@ -531,12 +531,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
...
@@ -531,12 +531,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
num_chunk
=
int
(
num_chunk
)
num_chunk
=
int
(
num_chunk
)
chunk_state_h_box
=
np
.
zeros
(
chunk_state_h_box
=
np
.
zeros
(
(
self
.
config
.
model
.
num_rnn_layers
,
1
,
(
self
.
config
.
num_rnn_layers
,
1
,
self
.
config
.
rnn_layer_size
),
self
.
config
.
model
.
rnn_layer_size
),
dtype
=
x
.
dtype
)
dtype
=
x
.
dtype
)
chunk_state_c_box
=
np
.
zeros
(
chunk_state_c_box
=
np
.
zeros
(
(
self
.
config
.
model
.
num_rnn_layers
,
1
,
(
self
.
config
.
num_rnn_layers
,
1
,
self
.
config
.
rnn_layer_size
),
self
.
config
.
model
.
rnn_layer_size
),
dtype
=
x
.
dtype
)
dtype
=
x
.
dtype
)
input_names
=
self
.
predictor
.
get_input_names
()
input_names
=
self
.
predictor
.
get_input_names
()
...
...
paddlespeech/s2t/exps/u2/bin/alignment.py
浏览文件 @
c907a8de
...
@@ -43,9 +43,9 @@ if __name__ == "__main__":
...
@@ -43,9 +43,9 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
()
config
=
get_cfg_defaults
()
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_c
onfi
g
:
if
args
.
decode_c
f
g
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_c
onfi
g
)
decode_confs
.
merge_from_file
(
args
.
decode_c
f
g
)
config
.
decode
=
decode_confs
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
...
...
paddlespeech/s2t/exps/u2/bin/test.py
浏览文件 @
c907a8de
...
@@ -47,9 +47,9 @@ if __name__ == "__main__":
...
@@ -47,9 +47,9 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
()
config
=
get_cfg_defaults
()
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_c
onfi
g
:
if
args
.
decode_c
f
g
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_c
onfi
g
)
decode_confs
.
merge_from_file
(
args
.
decode_c
f
g
)
config
.
decode
=
decode_confs
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
...
...
paddlespeech/s2t/exps/u2/bin/test_wav.py
浏览文件 @
c907a8de
...
@@ -38,7 +38,7 @@ class U2Infer():
...
@@ -38,7 +38,7 @@ class U2Infer():
self
.
config
=
config
self
.
config
=
config
self
.
audio_file
=
args
.
audio_file
self
.
audio_file
=
args
.
audio_file
self
.
preprocess_conf
=
config
.
augmentation
_config
self
.
preprocess_conf
=
config
.
preprocess
_config
self
.
preprocess_args
=
{
"train"
:
False
}
self
.
preprocess_args
=
{
"train"
:
False
}
self
.
preprocessing
=
Transformation
(
self
.
preprocess_conf
)
self
.
preprocessing
=
Transformation
(
self
.
preprocess_conf
)
...
@@ -132,9 +132,9 @@ if __name__ == "__main__":
...
@@ -132,9 +132,9 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
()
config
=
get_cfg_defaults
()
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_c
onfi
g
:
if
args
.
decode_c
f
g
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_c
onfi
g
)
decode_confs
.
merge_from_file
(
args
.
decode_c
f
g
)
config
.
decode
=
decode_confs
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
...
...
paddlespeech/s2t/exps/u2/config.py
浏览文件 @
c907a8de
...
@@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2 import U2Model
...
@@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2 import U2Model
_C
=
CfgNode
(
new_allowed
=
True
)
_C
=
CfgNode
(
new_allowed
=
True
)
ManifestDataset
.
params
(
_C
)
#
ManifestDataset.params(_C)
SpeechCollator
.
params
(
_C
)
#
SpeechCollator.params(_C)
U2Model
.
params
(
_C
)
#
U2Model.params(_C)
U2Trainer
.
params
(
_C
)
#
U2Trainer.params(_C)
_C
.
decode
=
U2Tester
.
params
()
#
_C.decode = U2Tester.params()
def
get_cfg_defaults
():
def
get_cfg_defaults
():
...
...
paddlespeech/s2t/exps/u2/model.py
浏览文件 @
c907a8de
...
@@ -264,7 +264,7 @@ class U2Trainer(Trainer):
...
@@ -264,7 +264,7 @@ class U2Trainer(Trainer):
batch_frames_in
=
config
.
batch_frames_in
,
batch_frames_in
=
config
.
batch_frames_in
,
batch_frames_out
=
config
.
batch_frames_out
,
batch_frames_out
=
config
.
batch_frames_out
,
batch_frames_inout
=
config
.
batch_frames_inout
,
batch_frames_inout
=
config
.
batch_frames_inout
,
preprocess_conf
=
config
.
augmentation
_config
,
preprocess_conf
=
config
.
preprocess
_config
,
n_iter_processes
=
config
.
num_workers
,
n_iter_processes
=
config
.
num_workers
,
subsampling_factor
=
1
,
subsampling_factor
=
1
,
num_encs
=
1
)
num_encs
=
1
)
...
@@ -283,18 +283,20 @@ class U2Trainer(Trainer):
...
@@ -283,18 +283,20 @@ class U2Trainer(Trainer):
batch_frames_in
=
0
,
batch_frames_in
=
0
,
batch_frames_out
=
0
,
batch_frames_out
=
0
,
batch_frames_inout
=
0
,
batch_frames_inout
=
0
,
preprocess_conf
=
config
.
augmentation
_config
,
preprocess_conf
=
config
.
preprocess
_config
,
n_iter_processes
=
config
.
num_workers
,
n_iter_processes
=
config
.
num_workers
,
subsampling_factor
=
1
,
subsampling_factor
=
1
,
num_encs
=
1
)
num_encs
=
1
)
logger
.
info
(
"Setup train/valid Dataloader!"
)
logger
.
info
(
"Setup train/valid Dataloader!"
)
else
:
else
:
decode_batch_size
=
config
.
get
(
'decode'
,
dict
()).
get
(
'decode_batch_size'
,
1
)
# test dataset, return raw text
# test dataset, return raw text
self
.
test_loader
=
BatchDataLoader
(
self
.
test_loader
=
BatchDataLoader
(
json_file
=
config
.
test_manifest
,
json_file
=
config
.
test_manifest
,
train_mode
=
False
,
train_mode
=
False
,
sortagrad
=
False
,
sortagrad
=
False
,
batch_size
=
config
.
decode
.
decode_batch_size
,
batch_size
=
decode_batch_size
,
maxlen_in
=
float
(
'inf'
),
maxlen_in
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
minibatches
=
0
,
minibatches
=
0
,
...
@@ -304,7 +306,7 @@ class U2Trainer(Trainer):
...
@@ -304,7 +306,7 @@ class U2Trainer(Trainer):
batch_frames_in
=
0
,
batch_frames_in
=
0
,
batch_frames_out
=
0
,
batch_frames_out
=
0
,
batch_frames_inout
=
0
,
batch_frames_inout
=
0
,
preprocess_conf
=
config
.
augmentation
_config
,
preprocess_conf
=
config
.
preprocess
_config
,
n_iter_processes
=
1
,
n_iter_processes
=
1
,
subsampling_factor
=
1
,
subsampling_factor
=
1
,
num_encs
=
1
)
num_encs
=
1
)
...
@@ -313,7 +315,7 @@ class U2Trainer(Trainer):
...
@@ -313,7 +315,7 @@ class U2Trainer(Trainer):
json_file
=
config
.
test_manifest
,
json_file
=
config
.
test_manifest
,
train_mode
=
False
,
train_mode
=
False
,
sortagrad
=
False
,
sortagrad
=
False
,
batch_size
=
config
.
decode
.
decode_batch_size
,
batch_size
=
decode_batch_size
,
maxlen_in
=
float
(
'inf'
),
maxlen_in
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
minibatches
=
0
,
minibatches
=
0
,
...
@@ -323,7 +325,7 @@ class U2Trainer(Trainer):
...
@@ -323,7 +325,7 @@ class U2Trainer(Trainer):
batch_frames_in
=
0
,
batch_frames_in
=
0
,
batch_frames_out
=
0
,
batch_frames_out
=
0
,
batch_frames_inout
=
0
,
batch_frames_inout
=
0
,
preprocess_conf
=
config
.
augmentation
_config
,
preprocess_conf
=
config
.
preprocess
_config
,
n_iter_processes
=
1
,
n_iter_processes
=
1
,
subsampling_factor
=
1
,
subsampling_factor
=
1
,
num_encs
=
1
)
num_encs
=
1
)
...
@@ -557,7 +559,7 @@ class U2Tester(U2Trainer):
...
@@ -557,7 +559,7 @@ class U2Tester(U2Trainer):
"ref_len"
:
"ref_len"
:
len_refs
,
len_refs
,
"decode_method"
:
"decode_method"
:
self
.
config
.
decoding_method
,
self
.
config
.
decod
e
.
decod
ing_method
,
})
})
f
.
write
(
data
+
'
\n
'
)
f
.
write
(
data
+
'
\n
'
)
...
...
paddlespeech/s2t/exps/u2/trainer.py
浏览文件 @
c907a8de
...
@@ -44,77 +44,77 @@ class U2Trainer(Trainer):
...
@@ -44,77 +44,77 @@ class U2Trainer(Trainer):
def
setup_dataloader
(
self
):
def
setup_dataloader
(
self
):
config
=
self
.
config
.
clone
()
config
=
self
.
config
.
clone
()
config
.
defrost
()
config
.
defrost
()
config
.
collator
.
keep_transcription_text
=
False
config
.
keep_transcription_text
=
False
# train/valid dataset, return token ids
# train/valid dataset, return token ids
config
.
data
.
manifest
=
config
.
data
.
train_manifest
config
.
manifest
=
config
.
train_manifest
train_dataset
=
ManifestDataset
.
from_config
(
config
)
train_dataset
=
ManifestDataset
.
from_config
(
config
)
config
.
data
.
manifest
=
config
.
data
.
dev_manifest
config
.
manifest
=
config
.
dev_manifest
dev_dataset
=
ManifestDataset
.
from_config
(
config
)
dev_dataset
=
ManifestDataset
.
from_config
(
config
)
collate_fn_train
=
SpeechCollator
.
from_config
(
config
)
collate_fn_train
=
SpeechCollator
.
from_config
(
config
)
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
collate_fn_dev
=
SpeechCollator
.
from_config
(
config
)
collate_fn_dev
=
SpeechCollator
.
from_config
(
config
)
if
self
.
parallel
:
if
self
.
parallel
:
batch_sampler
=
SortagradDistributedBatchSampler
(
batch_sampler
=
SortagradDistributedBatchSampler
(
train_dataset
,
train_dataset
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
num_replicas
=
None
,
num_replicas
=
None
,
rank
=
None
,
rank
=
None
,
shuffle
=
True
,
shuffle
=
True
,
drop_last
=
True
,
drop_last
=
True
,
sortagrad
=
config
.
collator
.
sortagrad
,
sortagrad
=
config
.
sortagrad
,
shuffle_method
=
config
.
collator
.
shuffle_method
)
shuffle_method
=
config
.
shuffle_method
)
else
:
else
:
batch_sampler
=
SortagradBatchSampler
(
batch_sampler
=
SortagradBatchSampler
(
train_dataset
,
train_dataset
,
shuffle
=
True
,
shuffle
=
True
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
drop_last
=
True
,
drop_last
=
True
,
sortagrad
=
config
.
collator
.
sortagrad
,
sortagrad
=
config
.
sortagrad
,
shuffle_method
=
config
.
collator
.
shuffle_method
)
shuffle_method
=
config
.
shuffle_method
)
self
.
train_loader
=
DataLoader
(
self
.
train_loader
=
DataLoader
(
train_dataset
,
train_dataset
,
batch_sampler
=
batch_sampler
,
batch_sampler
=
batch_sampler
,
collate_fn
=
collate_fn_train
,
collate_fn
=
collate_fn_train
,
num_workers
=
config
.
collator
.
num_workers
,
)
num_workers
=
config
.
num_workers
,
)
self
.
valid_loader
=
DataLoader
(
self
.
valid_loader
=
DataLoader
(
dev_dataset
,
dev_dataset
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
collate_fn_dev
,
collate_fn
=
collate_fn_dev
,
num_workers
=
config
.
collator
.
num_workers
,
)
num_workers
=
config
.
num_workers
,
)
# test dataset, return raw text
# test dataset, return raw text
config
.
data
.
manifest
=
config
.
data
.
test_manifest
config
.
manifest
=
config
.
test_manifest
# filter test examples, will cause less examples, but no mismatch with training
# filter test examples, will cause less examples, but no mismatch with training
# and can use large batch size , save training time, so filter test egs now.
# and can use large batch size , save training time, so filter test egs now.
config
.
data
.
min_input_len
=
0.0
# second
config
.
min_input_len
=
0.0
# second
config
.
data
.
max_input_len
=
float
(
'inf'
)
# second
config
.
max_input_len
=
float
(
'inf'
)
# second
config
.
data
.
min_output_len
=
0.0
# tokens
config
.
min_output_len
=
0.0
# tokens
config
.
data
.
max_output_len
=
float
(
'inf'
)
# tokens
config
.
max_output_len
=
float
(
'inf'
)
# tokens
config
.
data
.
min_output_input_ratio
=
0.00
config
.
min_output_input_ratio
=
0.00
config
.
data
.
max_output_input_ratio
=
float
(
'inf'
)
config
.
max_output_input_ratio
=
float
(
'inf'
)
test_dataset
=
ManifestDataset
.
from_config
(
config
)
test_dataset
=
ManifestDataset
.
from_config
(
config
)
# return text ord id
# return text ord id
config
.
collator
.
keep_transcription_text
=
True
config
.
keep_transcription_text
=
True
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
self
.
test_loader
=
DataLoader
(
self
.
test_loader
=
DataLoader
(
test_dataset
,
test_dataset
,
batch_size
=
config
.
decod
ing
.
batch_size
,
batch_size
=
config
.
decod
e
.
batch_size
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
SpeechCollator
.
from_config
(
config
))
collate_fn
=
SpeechCollator
.
from_config
(
config
))
# return text token id
# return text token id
config
.
collator
.
keep_transcription_text
=
False
config
.
keep_transcription_text
=
False
self
.
align_loader
=
DataLoader
(
self
.
align_loader
=
DataLoader
(
test_dataset
,
test_dataset
,
batch_size
=
config
.
decod
ing
.
batch_size
,
batch_size
=
config
.
decod
e
.
batch_size
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
SpeechCollator
.
from_config
(
config
))
collate_fn
=
SpeechCollator
.
from_config
(
config
))
...
@@ -122,7 +122,7 @@ class U2Trainer(Trainer):
...
@@ -122,7 +122,7 @@ class U2Trainer(Trainer):
def
setup_model
(
self
):
def
setup_model
(
self
):
config
=
self
.
config
config
=
self
.
config
model_conf
=
config
.
model
model_conf
=
config
with
UpdateConfig
(
model_conf
):
with
UpdateConfig
(
model_conf
):
model_conf
.
input_dim
=
self
.
train_loader
.
collate_fn
.
feature_size
model_conf
.
input_dim
=
self
.
train_loader
.
collate_fn
.
feature_size
model_conf
.
output_dim
=
self
.
train_loader
.
collate_fn
.
vocab_size
model_conf
.
output_dim
=
self
.
train_loader
.
collate_fn
.
vocab_size
...
@@ -136,7 +136,7 @@ class U2Trainer(Trainer):
...
@@ -136,7 +136,7 @@ class U2Trainer(Trainer):
logger
.
info
(
f
"
{
model
}
"
)
logger
.
info
(
f
"
{
model
}
"
)
layer_tools
.
print_params
(
model
,
logger
.
info
)
layer_tools
.
print_params
(
model
,
logger
.
info
)
train_config
=
config
.
training
train_config
=
config
optim_type
=
train_config
.
optim
optim_type
=
train_config
.
optim
optim_conf
=
train_config
.
optim_conf
optim_conf
=
train_config
.
optim_conf
scheduler_type
=
train_config
.
scheduler
scheduler_type
=
train_config
.
scheduler
...
@@ -156,7 +156,7 @@ class U2Trainer(Trainer):
...
@@ -156,7 +156,7 @@ class U2Trainer(Trainer):
config
,
config
,
parameters
,
parameters
,
lr_scheduler
=
None
,
):
lr_scheduler
=
None
,
):
train_config
=
config
.
training
train_config
=
config
optim_type
=
train_config
.
optim
optim_type
=
train_config
.
optim
optim_conf
=
train_config
.
optim_conf
optim_conf
=
train_config
.
optim_conf
scheduler_type
=
train_config
.
scheduler
scheduler_type
=
train_config
.
scheduler
...
@@ -182,7 +182,7 @@ class U2Trainer(Trainer):
...
@@ -182,7 +182,7 @@ class U2Trainer(Trainer):
def
setup_updater
(
self
):
def
setup_updater
(
self
):
output_dir
=
self
.
output_dir
output_dir
=
self
.
output_dir
config
=
self
.
config
.
training
config
=
self
.
config
updater
=
U2Updater
(
updater
=
U2Updater
(
model
=
self
.
model
,
model
=
self
.
model
,
...
...
paddlespeech/s2t/exps/u2_kaldi/bin/test.py
浏览文件 @
c907a8de
...
@@ -69,6 +69,10 @@ if __name__ == "__main__":
...
@@ -69,6 +69,10 @@ if __name__ == "__main__":
config
=
CfgNode
()
config
=
CfgNode
()
config
.
set_new_allowed
(
True
)
config
.
set_new_allowed
(
True
)
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_cfg
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_cfg
)
config
.
decode
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
...
...
paddlespeech/s2t/exps/u2_kaldi/model.py
浏览文件 @
c907a8de
...
@@ -80,7 +80,7 @@ class U2Trainer(Trainer):
...
@@ -80,7 +80,7 @@ class U2Trainer(Trainer):
super
().
__init__
(
config
,
args
)
super
().
__init__
(
config
,
args
)
def
train_batch
(
self
,
batch_index
,
batch_data
,
msg
):
def
train_batch
(
self
,
batch_index
,
batch_data
,
msg
):
train_conf
=
self
.
config
.
training
train_conf
=
self
.
config
start
=
time
.
time
()
start
=
time
.
time
()
# forward
# forward
...
@@ -122,7 +122,7 @@ class U2Trainer(Trainer):
...
@@ -122,7 +122,7 @@ class U2Trainer(Trainer):
if
(
batch_index
+
1
)
%
train_conf
.
log_interval
==
0
:
if
(
batch_index
+
1
)
%
train_conf
.
log_interval
==
0
:
msg
+=
"train time: {:>.3f}s, "
.
format
(
iteration_time
)
msg
+=
"train time: {:>.3f}s, "
.
format
(
iteration_time
)
msg
+=
"batch size: {}, "
.
format
(
self
.
config
.
collator
.
batch_size
)
msg
+=
"batch size: {}, "
.
format
(
self
.
config
.
batch_size
)
msg
+=
"accum: {}, "
.
format
(
train_conf
.
accum_grad
)
msg
+=
"accum: {}, "
.
format
(
train_conf
.
accum_grad
)
msg
+=
', '
.
join
(
'{}: {:>.6f}'
.
format
(
k
,
v
)
msg
+=
', '
.
join
(
'{}: {:>.6f}'
.
format
(
k
,
v
)
for
k
,
v
in
losses_np
.
items
())
for
k
,
v
in
losses_np
.
items
())
...
@@ -157,7 +157,7 @@ class U2Trainer(Trainer):
...
@@ -157,7 +157,7 @@ class U2Trainer(Trainer):
if
ctc_loss
:
if
ctc_loss
:
valid_losses
[
'val_ctc_loss'
].
append
(
float
(
ctc_loss
))
valid_losses
[
'val_ctc_loss'
].
append
(
float
(
ctc_loss
))
if
(
i
+
1
)
%
self
.
config
.
training
.
log_interval
==
0
:
if
(
i
+
1
)
%
self
.
config
.
log_interval
==
0
:
valid_dump
=
{
k
:
np
.
mean
(
v
)
for
k
,
v
in
valid_losses
.
items
()}
valid_dump
=
{
k
:
np
.
mean
(
v
)
for
k
,
v
in
valid_losses
.
items
()}
valid_dump
[
'val_history_loss'
]
=
total_loss
/
num_seen_utts
valid_dump
[
'val_history_loss'
]
=
total_loss
/
num_seen_utts
...
@@ -186,7 +186,7 @@ class U2Trainer(Trainer):
...
@@ -186,7 +186,7 @@ class U2Trainer(Trainer):
self
.
before_train
()
self
.
before_train
()
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
while
self
.
epoch
<
self
.
config
.
n_epoch
:
with
Timer
(
"Epoch-Train Time Cost: {}"
):
with
Timer
(
"Epoch-Train Time Cost: {}"
):
self
.
model
.
train
()
self
.
model
.
train
()
try
:
try
:
...
@@ -235,10 +235,10 @@ class U2Trainer(Trainer):
...
@@ -235,10 +235,10 @@ class U2Trainer(Trainer):
config
=
self
.
config
.
clone
()
config
=
self
.
config
.
clone
()
# train/valid dataset, return token ids
# train/valid dataset, return token ids
self
.
train_loader
=
BatchDataLoader
(
self
.
train_loader
=
BatchDataLoader
(
json_file
=
config
.
data
.
train_manifest
,
json_file
=
config
.
train_manifest
,
train_mode
=
True
,
train_mode
=
True
,
sortagrad
=
False
,
sortagrad
=
False
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
maxlen_in
=
float
(
'inf'
),
maxlen_in
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
minibatches
=
0
,
minibatches
=
0
,
...
@@ -248,16 +248,16 @@ class U2Trainer(Trainer):
...
@@ -248,16 +248,16 @@ class U2Trainer(Trainer):
batch_frames_in
=
0
,
batch_frames_in
=
0
,
batch_frames_out
=
0
,
batch_frames_out
=
0
,
batch_frames_inout
=
0
,
batch_frames_inout
=
0
,
preprocess_conf
=
config
.
collator
.
augmentation
_config
,
preprocess_conf
=
config
.
preprocess
_config
,
n_iter_processes
=
config
.
collator
.
num_workers
,
n_iter_processes
=
config
.
num_workers
,
subsampling_factor
=
1
,
subsampling_factor
=
1
,
num_encs
=
1
)
num_encs
=
1
)
self
.
valid_loader
=
BatchDataLoader
(
self
.
valid_loader
=
BatchDataLoader
(
json_file
=
config
.
d
ata
.
d
ev_manifest
,
json_file
=
config
.
dev_manifest
,
train_mode
=
False
,
train_mode
=
False
,
sortagrad
=
False
,
sortagrad
=
False
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
maxlen_in
=
float
(
'inf'
),
maxlen_in
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
minibatches
=
0
,
minibatches
=
0
,
...
@@ -268,16 +268,18 @@ class U2Trainer(Trainer):
...
@@ -268,16 +268,18 @@ class U2Trainer(Trainer):
batch_frames_out
=
0
,
batch_frames_out
=
0
,
batch_frames_inout
=
0
,
batch_frames_inout
=
0
,
preprocess_conf
=
None
,
preprocess_conf
=
None
,
n_iter_processes
=
config
.
collator
.
num_workers
,
n_iter_processes
=
config
.
num_workers
,
subsampling_factor
=
1
,
subsampling_factor
=
1
,
num_encs
=
1
)
num_encs
=
1
)
decode_batch_size
=
config
.
get
(
'decode'
,
dict
()).
get
(
'decode_batch_size'
,
1
)
# test dataset, return raw text
# test dataset, return raw text
self
.
test_loader
=
BatchDataLoader
(
self
.
test_loader
=
BatchDataLoader
(
json_file
=
config
.
data
.
test_manifest
,
json_file
=
config
.
test_manifest
,
train_mode
=
False
,
train_mode
=
False
,
sortagrad
=
False
,
sortagrad
=
False
,
batch_size
=
config
.
decoding
.
batch_size
,
batch_size
=
decode_
batch_size
,
maxlen_in
=
float
(
'inf'
),
maxlen_in
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
minibatches
=
0
,
minibatches
=
0
,
...
@@ -293,10 +295,10 @@ class U2Trainer(Trainer):
...
@@ -293,10 +295,10 @@ class U2Trainer(Trainer):
num_encs
=
1
)
num_encs
=
1
)
self
.
align_loader
=
BatchDataLoader
(
self
.
align_loader
=
BatchDataLoader
(
json_file
=
config
.
data
.
test_manifest
,
json_file
=
config
.
test_manifest
,
train_mode
=
False
,
train_mode
=
False
,
sortagrad
=
False
,
sortagrad
=
False
,
batch_size
=
config
.
decoding
.
batch_size
,
batch_size
=
decode_
batch_size
,
maxlen_in
=
float
(
'inf'
),
maxlen_in
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
maxlen_out
=
float
(
'inf'
),
minibatches
=
0
,
minibatches
=
0
,
...
@@ -316,7 +318,7 @@ class U2Trainer(Trainer):
...
@@ -316,7 +318,7 @@ class U2Trainer(Trainer):
config
=
self
.
config
config
=
self
.
config
# model
# model
model_conf
=
config
.
model
model_conf
=
config
with
UpdateConfig
(
model_conf
):
with
UpdateConfig
(
model_conf
):
model_conf
.
input_dim
=
self
.
train_loader
.
feat_dim
model_conf
.
input_dim
=
self
.
train_loader
.
feat_dim
model_conf
.
output_dim
=
self
.
train_loader
.
vocab_size
model_conf
.
output_dim
=
self
.
train_loader
.
vocab_size
...
@@ -392,9 +394,9 @@ class U2Tester(U2Trainer):
...
@@ -392,9 +394,9 @@ class U2Tester(U2Trainer):
def
__init__
(
self
,
config
,
args
):
def
__init__
(
self
,
config
,
args
):
super
().
__init__
(
config
,
args
)
super
().
__init__
(
config
,
args
)
self
.
text_feature
=
TextFeaturizer
(
self
.
text_feature
=
TextFeaturizer
(
unit_type
=
self
.
config
.
collator
.
unit_type
,
unit_type
=
self
.
config
.
unit_type
,
vocab
=
self
.
config
.
collator
.
vocab_filepath
,
vocab
=
self
.
config
.
vocab_filepath
,
spm_model_prefix
=
self
.
config
.
collator
.
spm_model_prefix
)
spm_model_prefix
=
self
.
config
.
spm_model_prefix
)
self
.
vocab_list
=
self
.
text_feature
.
vocab_list
self
.
vocab_list
=
self
.
text_feature
.
vocab_list
def
id2token
(
self
,
texts
,
texts_len
,
text_feature
):
def
id2token
(
self
,
texts
,
texts_len
,
text_feature
):
...
@@ -413,10 +415,10 @@ class U2Tester(U2Trainer):
...
@@ -413,10 +415,10 @@ class U2Tester(U2Trainer):
texts
,
texts
,
texts_len
,
texts_len
,
fout
=
None
):
fout
=
None
):
cfg
=
self
.
config
.
decoding
decode_cfg
=
self
.
config
.
decode
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_func
=
error_rate
.
char_errors
if
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
word_errors
errors_func
=
error_rate
.
char_errors
if
decode_
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
word_errors
error_rate_func
=
error_rate
.
cer
if
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
wer
error_rate_func
=
error_rate
.
cer
if
decode_
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
wer
start_time
=
time
.
time
()
start_time
=
time
.
time
()
target_transcripts
=
self
.
id2token
(
texts
,
texts_len
,
self
.
text_feature
)
target_transcripts
=
self
.
id2token
(
texts
,
texts_len
,
self
.
text_feature
)
...
@@ -424,12 +426,12 @@ class U2Tester(U2Trainer):
...
@@ -424,12 +426,12 @@ class U2Tester(U2Trainer):
audio
,
audio
,
audio_len
,
audio_len
,
text_feature
=
self
.
text_feature
,
text_feature
=
self
.
text_feature
,
decoding_method
=
cfg
.
decoding_method
,
decoding_method
=
decode_
cfg
.
decoding_method
,
beam_size
=
cfg
.
beam_size
,
beam_size
=
decode_
cfg
.
beam_size
,
ctc_weight
=
cfg
.
ctc_weight
,
ctc_weight
=
decode_
cfg
.
ctc_weight
,
decoding_chunk_size
=
cfg
.
decoding_chunk_size
,
decoding_chunk_size
=
decode_
cfg
.
decoding_chunk_size
,
num_decoding_left_chunks
=
cfg
.
num_decoding_left_chunks
,
num_decoding_left_chunks
=
decode_
cfg
.
num_decoding_left_chunks
,
simulate_streaming
=
cfg
.
simulate_streaming
)
simulate_streaming
=
decode_
cfg
.
simulate_streaming
)
decode_time
=
time
.
time
()
-
start_time
decode_time
=
time
.
time
()
-
start_time
for
i
,
(
utt
,
target
,
result
,
rec_tids
)
in
enumerate
(
for
i
,
(
utt
,
target
,
result
,
rec_tids
)
in
enumerate
(
...
@@ -449,15 +451,16 @@ class U2Tester(U2Trainer):
...
@@ -449,15 +451,16 @@ class U2Tester(U2Trainer):
logger
.
info
(
f
"Utt:
{
utt
}
"
)
logger
.
info
(
f
"Utt:
{
utt
}
"
)
logger
.
info
(
f
"Ref:
{
target
}
"
)
logger
.
info
(
f
"Ref:
{
target
}
"
)
logger
.
info
(
f
"Hyp:
{
result
}
"
)
logger
.
info
(
f
"Hyp:
{
result
}
"
)
logger
.
info
(
"One example error rate [%s] = %f"
%
logger
.
info
(
(
cfg
.
error_rate_type
,
error_rate_func
(
target
,
result
)))
"One example error rate [%s] = %f"
%
(
decode_cfg
.
error_rate_type
,
error_rate_func
(
target
,
result
)))
return
dict
(
return
dict
(
errors_sum
=
errors_sum
,
errors_sum
=
errors_sum
,
len_refs
=
len_refs
,
len_refs
=
len_refs
,
num_ins
=
num_ins
,
# num examples
num_ins
=
num_ins
,
# num examples
error_rate
=
errors_sum
/
len_refs
,
error_rate
=
errors_sum
/
len_refs
,
error_rate_type
=
cfg
.
error_rate_type
,
error_rate_type
=
decode_
cfg
.
error_rate_type
,
num_frames
=
audio_len
.
sum
().
numpy
().
item
(),
num_frames
=
audio_len
.
sum
().
numpy
().
item
(),
decode_time
=
decode_time
)
decode_time
=
decode_time
)
...
@@ -468,7 +471,7 @@ class U2Tester(U2Trainer):
...
@@ -468,7 +471,7 @@ class U2Tester(U2Trainer):
self
.
model
.
eval
()
self
.
model
.
eval
()
logger
.
info
(
f
"Test Total Examples:
{
len
(
self
.
test_loader
.
dataset
)
}
"
)
logger
.
info
(
f
"Test Total Examples:
{
len
(
self
.
test_loader
.
dataset
)
}
"
)
stride_ms
=
self
.
config
.
collator
.
stride_ms
stride_ms
=
self
.
config
.
stride_ms
error_rate_type
=
None
error_rate_type
=
None
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
num_frames
=
0.0
num_frames
=
0.0
...
@@ -519,15 +522,15 @@ class U2Tester(U2Trainer):
...
@@ -519,15 +522,15 @@ class U2Tester(U2Trainer):
"ref_len"
:
"ref_len"
:
len_refs
,
len_refs
,
"decode_method"
:
"decode_method"
:
self
.
config
.
decod
ing
.
decoding_method
,
self
.
config
.
decod
e
.
decoding_method
,
})
})
f
.
write
(
data
+
'
\n
'
)
f
.
write
(
data
+
'
\n
'
)
@
paddle
.
no_grad
()
@
paddle
.
no_grad
()
def
align
(
self
):
def
align
(
self
):
ctc_utils
.
ctc_align
(
self
.
config
,
self
.
model
,
self
.
align_loader
,
ctc_utils
.
ctc_align
(
self
.
config
,
self
.
model
,
self
.
align_loader
,
self
.
config
.
decod
ing
.
batch_size
,
self
.
config
.
decod
e
.
decode_
batch_size
,
self
.
config
.
collator
.
stride_ms
,
self
.
vocab_list
,
self
.
config
.
stride_ms
,
self
.
vocab_list
,
self
.
args
.
result_file
)
self
.
args
.
result_file
)
def
load_inferspec
(
self
):
def
load_inferspec
(
self
):
...
@@ -539,7 +542,7 @@ class U2Tester(U2Trainer):
...
@@ -539,7 +542,7 @@ class U2Tester(U2Trainer):
"""
"""
from
paddlespeech.s2t.models.u2
import
U2InferModel
from
paddlespeech.s2t.models.u2
import
U2InferModel
infer_model
=
U2InferModel
.
from_pretrained
(
self
.
test_loader
,
infer_model
=
U2InferModel
.
from_pretrained
(
self
.
test_loader
,
self
.
config
.
model
.
clone
(),
self
.
config
.
clone
(),
self
.
args
.
checkpoint_path
)
self
.
args
.
checkpoint_path
)
feat_dim
=
self
.
test_loader
.
feat_dim
feat_dim
=
self
.
test_loader
.
feat_dim
input_spec
=
[
input_spec
=
[
...
...
paddlespeech/s2t/exps/u2_st/bin/test.py
浏览文件 @
c907a8de
...
@@ -14,12 +14,14 @@
...
@@ -14,12 +14,14 @@
"""Evaluation for U2 model."""
"""Evaluation for U2 model."""
import
cProfile
import
cProfile
from
yacs.config
import
CfgNode
from
paddlespeech.s2t.exps.u2_st.config
import
get_cfg_defaults
from
paddlespeech.s2t.exps.u2_st.config
import
get_cfg_defaults
from
paddlespeech.s2t.exps.u2_st.model
import
U2STTester
as
Tester
from
paddlespeech.s2t.exps.u2_st.model
import
U2STTester
as
Tester
from
paddlespeech.s2t.training.cli
import
default_argument_parser
from
paddlespeech.s2t.training.cli
import
default_argument_parser
from
paddlespeech.s2t.utils.utility
import
print_arguments
from
paddlespeech.s2t.utils.utility
import
print_arguments
# TODO(hui zhang): dynamic load
# TODO(hui zhang): dynamic load
def
main_sp
(
config
,
args
):
def
main_sp
(
config
,
args
):
...
@@ -35,7 +37,7 @@ def main(config, args):
...
@@ -35,7 +37,7 @@ def main(config, args):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
parser
=
default_argument_parser
()
parser
=
default_argument_parser
()
# save asr result to
# save asr result to
parser
.
add_argument
(
parser
.
add_argument
(
"--result_file"
,
type
=
str
,
help
=
"path of save the asr result"
)
"--result_file"
,
type
=
str
,
help
=
"path of save the asr result"
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
...
@@ -45,6 +47,10 @@ if __name__ == "__main__":
...
@@ -45,6 +47,10 @@ if __name__ == "__main__":
config
=
get_cfg_defaults
()
config
=
get_cfg_defaults
()
if
args
.
config
:
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
config
.
merge_from_file
(
args
.
config
)
if
args
.
decode_cfg
:
decode_conf
=
CfgNode
(
new_allowed
=
True
)
decode_conf
.
merge_from_file
(
args
.
decode_cfg
)
config
.
decode
=
decode_conf
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
...
...
paddlespeech/s2t/exps/u2_st/config.py
浏览文件 @
c907a8de
...
@@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2_st import U2STModel
...
@@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2_st import U2STModel
_C
=
CfgNode
()
_C
=
CfgNode
()
_C
.
data
=
ManifestDataset
.
params
()
#
_C.data = ManifestDataset.params()
_C
.
collator
=
SpeechCollator
.
params
()
#
_C.collator = SpeechCollator.params()
_C
.
model
=
U2STModel
.
params
()
#
_C.model = U2STModel.params()
_C
.
training
=
U2STTrainer
.
params
()
#
_C.training = U2STTrainer.params()
_C
.
decoding
=
U2STTester
.
params
()
#
_C.decoding = U2STTester.params()
def
get_cfg_defaults
():
def
get_cfg_defaults
():
...
...
paddlespeech/s2t/exps/u2_st/model.py
浏览文件 @
c907a8de
...
@@ -78,7 +78,7 @@ class U2STTrainer(Trainer):
...
@@ -78,7 +78,7 @@ class U2STTrainer(Trainer):
super
().
__init__
(
config
,
args
)
super
().
__init__
(
config
,
args
)
def
train_batch
(
self
,
batch_index
,
batch_data
,
msg
):
def
train_batch
(
self
,
batch_index
,
batch_data
,
msg
):
train_conf
=
self
.
config
.
training
train_conf
=
self
.
config
start
=
time
.
time
()
start
=
time
.
time
()
# forward
# forward
utt
,
audio
,
audio_len
,
text
,
text_len
=
batch_data
utt
,
audio
,
audio_len
,
text
,
text_len
=
batch_data
...
@@ -127,7 +127,7 @@ class U2STTrainer(Trainer):
...
@@ -127,7 +127,7 @@ class U2STTrainer(Trainer):
if
(
batch_index
+
1
)
%
train_conf
.
log_interval
==
0
:
if
(
batch_index
+
1
)
%
train_conf
.
log_interval
==
0
:
msg
+=
"train time: {:>.3f}s, "
.
format
(
iteration_time
)
msg
+=
"train time: {:>.3f}s, "
.
format
(
iteration_time
)
msg
+=
"batch size: {}, "
.
format
(
self
.
config
.
collator
.
batch_size
)
msg
+=
"batch size: {}, "
.
format
(
self
.
config
.
batch_size
)
msg
+=
"accum: {}, "
.
format
(
train_conf
.
accum_grad
)
msg
+=
"accum: {}, "
.
format
(
train_conf
.
accum_grad
)
msg
+=
', '
.
join
(
'{}: {:>.6f}'
.
format
(
k
,
v
)
msg
+=
', '
.
join
(
'{}: {:>.6f}'
.
format
(
k
,
v
)
for
k
,
v
in
losses_np
.
items
())
for
k
,
v
in
losses_np
.
items
())
...
@@ -168,7 +168,7 @@ class U2STTrainer(Trainer):
...
@@ -168,7 +168,7 @@ class U2STTrainer(Trainer):
if
ctc_loss
:
if
ctc_loss
:
valid_losses
[
'val_ctc_loss'
].
append
(
float
(
ctc_loss
))
valid_losses
[
'val_ctc_loss'
].
append
(
float
(
ctc_loss
))
if
(
i
+
1
)
%
self
.
config
.
training
.
log_interval
==
0
:
if
(
i
+
1
)
%
self
.
config
.
log_interval
==
0
:
valid_dump
=
{
k
:
np
.
mean
(
v
)
for
k
,
v
in
valid_losses
.
items
()}
valid_dump
=
{
k
:
np
.
mean
(
v
)
for
k
,
v
in
valid_losses
.
items
()}
valid_dump
[
'val_history_st_loss'
]
=
total_loss
/
num_seen_utts
valid_dump
[
'val_history_st_loss'
]
=
total_loss
/
num_seen_utts
...
@@ -197,7 +197,7 @@ class U2STTrainer(Trainer):
...
@@ -197,7 +197,7 @@ class U2STTrainer(Trainer):
self
.
before_train
()
self
.
before_train
()
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
while
self
.
epoch
<
self
.
config
.
n_epoch
:
with
Timer
(
"Epoch-Train Time Cost: {}"
):
with
Timer
(
"Epoch-Train Time Cost: {}"
):
self
.
model
.
train
()
self
.
model
.
train
()
try
:
try
:
...
@@ -245,91 +245,93 @@ class U2STTrainer(Trainer):
...
@@ -245,91 +245,93 @@ class U2STTrainer(Trainer):
def
setup_dataloader
(
self
):
def
setup_dataloader
(
self
):
config
=
self
.
config
.
clone
()
config
=
self
.
config
.
clone
()
config
.
defrost
()
config
.
defrost
()
config
.
collator
.
keep_transcription_text
=
False
config
.
keep_transcription_text
=
False
# train/valid dataset, return token ids
# train/valid dataset, return token ids
config
.
data
.
manifest
=
config
.
data
.
train_manifest
config
.
manifest
=
config
.
train_manifest
train_dataset
=
ManifestDataset
.
from_config
(
config
)
train_dataset
=
ManifestDataset
.
from_config
(
config
)
config
.
data
.
manifest
=
config
.
data
.
dev_manifest
config
.
manifest
=
config
.
dev_manifest
dev_dataset
=
ManifestDataset
.
from_config
(
config
)
dev_dataset
=
ManifestDataset
.
from_config
(
config
)
if
config
.
model
.
model
_conf
.
asr_weight
>
0.
:
if
config
.
model_conf
.
asr_weight
>
0.
:
Collator
=
TripletSpeechCollator
Collator
=
TripletSpeechCollator
TestCollator
=
SpeechCollator
TestCollator
=
SpeechCollator
else
:
else
:
TestCollator
=
Collator
=
SpeechCollator
TestCollator
=
Collator
=
SpeechCollator
collate_fn_train
=
Collator
.
from_config
(
config
)
collate_fn_train
=
Collator
.
from_config
(
config
)
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
collate_fn_dev
=
Collator
.
from_config
(
config
)
collate_fn_dev
=
Collator
.
from_config
(
config
)
if
self
.
parallel
:
if
self
.
parallel
:
batch_sampler
=
SortagradDistributedBatchSampler
(
batch_sampler
=
SortagradDistributedBatchSampler
(
train_dataset
,
train_dataset
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
num_replicas
=
None
,
num_replicas
=
None
,
rank
=
None
,
rank
=
None
,
shuffle
=
True
,
shuffle
=
True
,
drop_last
=
True
,
drop_last
=
True
,
sortagrad
=
config
.
collator
.
sortagrad
,
sortagrad
=
config
.
sortagrad
,
shuffle_method
=
config
.
collator
.
shuffle_method
)
shuffle_method
=
config
.
shuffle_method
)
else
:
else
:
batch_sampler
=
SortagradBatchSampler
(
batch_sampler
=
SortagradBatchSampler
(
train_dataset
,
train_dataset
,
shuffle
=
True
,
shuffle
=
True
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
drop_last
=
True
,
drop_last
=
True
,
sortagrad
=
config
.
collator
.
sortagrad
,
sortagrad
=
config
.
sortagrad
,
shuffle_method
=
config
.
collator
.
shuffle_method
)
shuffle_method
=
config
.
shuffle_method
)
self
.
train_loader
=
DataLoader
(
self
.
train_loader
=
DataLoader
(
train_dataset
,
train_dataset
,
batch_sampler
=
batch_sampler
,
batch_sampler
=
batch_sampler
,
collate_fn
=
collate_fn_train
,
collate_fn
=
collate_fn_train
,
num_workers
=
config
.
collator
.
num_workers
,
)
num_workers
=
config
.
num_workers
,
)
self
.
valid_loader
=
DataLoader
(
self
.
valid_loader
=
DataLoader
(
dev_dataset
,
dev_dataset
,
batch_size
=
config
.
collator
.
batch_size
,
batch_size
=
config
.
batch_size
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
collate_fn_dev
,
collate_fn
=
collate_fn_dev
,
num_workers
=
config
.
collator
.
num_workers
,
)
num_workers
=
config
.
num_workers
,
)
# test dataset, return raw text
# test dataset, return raw text
config
.
data
.
manifest
=
config
.
data
.
test_manifest
config
.
manifest
=
config
.
test_manifest
# filter test examples, will cause less examples, but no mismatch with training
# filter test examples, will cause less examples, but no mismatch with training
# and can use large batch size , save training time, so filter test egs now.
# and can use large batch size , save training time, so filter test egs now.
# config.
data.
min_input_len = 0.0 # second
# config.min_input_len = 0.0 # second
# config.
data.
max_input_len = float('inf') # second
# config.max_input_len = float('inf') # second
# config.
data.
min_output_len = 0.0 # tokens
# config.min_output_len = 0.0 # tokens
# config.
data.
max_output_len = float('inf') # tokens
# config.max_output_len = float('inf') # tokens
# config.
data.
min_output_input_ratio = 0.00
# config.min_output_input_ratio = 0.00
# config.
data.
max_output_input_ratio = float('inf')
# config.max_output_input_ratio = float('inf')
test_dataset
=
ManifestDataset
.
from_config
(
config
)
test_dataset
=
ManifestDataset
.
from_config
(
config
)
# return text ord id
# return text ord id
config
.
collator
.
keep_transcription_text
=
True
config
.
keep_transcription_text
=
True
config
.
collator
.
augmentation_config
=
""
config
.
augmentation_config
=
""
decode_batch_size
=
config
.
get
(
'decode'
,
dict
()).
get
(
'decode_batch_size'
,
1
)
self
.
test_loader
=
DataLoader
(
self
.
test_loader
=
DataLoader
(
test_dataset
,
test_dataset
,
batch_size
=
config
.
decoding
.
batch_size
,
batch_size
=
decode_
batch_size
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
TestCollator
.
from_config
(
config
),
collate_fn
=
TestCollator
.
from_config
(
config
),
num_workers
=
config
.
collator
.
num_workers
,
)
num_workers
=
config
.
num_workers
,
)
# return text token id
# return text token id
config
.
collator
.
keep_transcription_text
=
False
config
.
keep_transcription_text
=
False
self
.
align_loader
=
DataLoader
(
self
.
align_loader
=
DataLoader
(
test_dataset
,
test_dataset
,
batch_size
=
config
.
decoding
.
batch_size
,
batch_size
=
decode_
batch_size
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
collate_fn
=
TestCollator
.
from_config
(
config
),
collate_fn
=
TestCollator
.
from_config
(
config
),
num_workers
=
config
.
collator
.
num_workers
,
)
num_workers
=
config
.
num_workers
,
)
logger
.
info
(
"Setup train/valid/test/align Dataloader!"
)
logger
.
info
(
"Setup train/valid/test/align Dataloader!"
)
def
setup_model
(
self
):
def
setup_model
(
self
):
config
=
self
.
config
config
=
self
.
config
model_conf
=
config
.
model
model_conf
=
config
with
UpdateConfig
(
model_conf
):
with
UpdateConfig
(
model_conf
):
model_conf
.
input_dim
=
self
.
train_loader
.
collate_fn
.
feature_size
model_conf
.
input_dim
=
self
.
train_loader
.
collate_fn
.
feature_size
model_conf
.
output_dim
=
self
.
train_loader
.
collate_fn
.
vocab_size
model_conf
.
output_dim
=
self
.
train_loader
.
collate_fn
.
vocab_size
...
@@ -342,7 +344,7 @@ class U2STTrainer(Trainer):
...
@@ -342,7 +344,7 @@ class U2STTrainer(Trainer):
logger
.
info
(
f
"
{
model
}
"
)
logger
.
info
(
f
"
{
model
}
"
)
layer_tools
.
print_params
(
model
,
logger
.
info
)
layer_tools
.
print_params
(
model
,
logger
.
info
)
train_config
=
config
.
training
train_config
=
config
optim_type
=
train_config
.
optim
optim_type
=
train_config
.
optim
optim_conf
=
train_config
.
optim_conf
optim_conf
=
train_config
.
optim_conf
scheduler_type
=
train_config
.
scheduler
scheduler_type
=
train_config
.
scheduler
...
@@ -428,7 +430,7 @@ class U2STTester(U2STTrainer):
...
@@ -428,7 +430,7 @@ class U2STTester(U2STTrainer):
def
translate
(
self
,
audio
,
audio_len
):
def
translate
(
self
,
audio
,
audio_len
):
""""E2E translation from extracted audio feature"""
""""E2E translation from extracted audio feature"""
cfg
=
self
.
config
.
decoding
decode_cfg
=
self
.
config
.
decode
text_feature
=
self
.
test_loader
.
collate_fn
.
text_feature
text_feature
=
self
.
test_loader
.
collate_fn
.
text_feature
self
.
model
.
eval
()
self
.
model
.
eval
()
...
@@ -436,12 +438,12 @@ class U2STTester(U2STTrainer):
...
@@ -436,12 +438,12 @@ class U2STTester(U2STTrainer):
audio
,
audio
,
audio_len
,
audio_len
,
text_feature
=
text_feature
,
text_feature
=
text_feature
,
decoding_method
=
cfg
.
decoding_method
,
decoding_method
=
decode_
cfg
.
decoding_method
,
beam_size
=
cfg
.
beam_size
,
beam_size
=
decode_
cfg
.
beam_size
,
word_reward
=
cfg
.
word_reward
,
word_reward
=
decode_
cfg
.
word_reward
,
decoding_chunk_size
=
cfg
.
decoding_chunk_size
,
decoding_chunk_size
=
decode_
cfg
.
decoding_chunk_size
,
num_decoding_left_chunks
=
cfg
.
num_decoding_left_chunks
,
num_decoding_left_chunks
=
decode_
cfg
.
num_decoding_left_chunks
,
simulate_streaming
=
cfg
.
simulate_streaming
)
simulate_streaming
=
decode_
cfg
.
simulate_streaming
)
return
hyps
return
hyps
def
compute_translation_metrics
(
self
,
def
compute_translation_metrics
(
self
,
...
@@ -452,7 +454,7 @@ class U2STTester(U2STTrainer):
...
@@ -452,7 +454,7 @@ class U2STTester(U2STTrainer):
texts_len
,
texts_len
,
bleu_func
,
bleu_func
,
fout
=
None
):
fout
=
None
):
cfg
=
self
.
config
.
decoding
decode_cfg
=
self
.
config
.
decode
len_refs
,
num_ins
=
0
,
0
len_refs
,
num_ins
=
0
,
0
start_time
=
time
.
time
()
start_time
=
time
.
time
()
...
@@ -467,12 +469,12 @@ class U2STTester(U2STTrainer):
...
@@ -467,12 +469,12 @@ class U2STTester(U2STTrainer):
audio
,
audio
,
audio_len
,
audio_len
,
text_feature
=
text_feature
,
text_feature
=
text_feature
,
decoding_method
=
cfg
.
decoding_method
,
decoding_method
=
decode_
cfg
.
decoding_method
,
beam_size
=
cfg
.
beam_size
,
beam_size
=
decode_
cfg
.
beam_size
,
word_reward
=
cfg
.
word_reward
,
word_reward
=
decode_
cfg
.
word_reward
,
decoding_chunk_size
=
cfg
.
decoding_chunk_size
,
decoding_chunk_size
=
decode_
cfg
.
decoding_chunk_size
,
num_decoding_left_chunks
=
cfg
.
num_decoding_left_chunks
,
num_decoding_left_chunks
=
decode_
cfg
.
num_decoding_left_chunks
,
simulate_streaming
=
cfg
.
simulate_streaming
)
simulate_streaming
=
decode_
cfg
.
simulate_streaming
)
decode_time
=
time
.
time
()
-
start_time
decode_time
=
time
.
time
()
-
start_time
for
utt
,
target
,
result
in
zip
(
utts
,
refs
,
hyps
):
for
utt
,
target
,
result
in
zip
(
utts
,
refs
,
hyps
):
...
@@ -502,8 +504,8 @@ class U2STTester(U2STTrainer):
...
@@ -502,8 +504,8 @@ class U2STTester(U2STTrainer):
self
.
model
.
eval
()
self
.
model
.
eval
()
logger
.
info
(
f
"Test Total Examples:
{
len
(
self
.
test_loader
.
dataset
)
}
"
)
logger
.
info
(
f
"Test Total Examples:
{
len
(
self
.
test_loader
.
dataset
)
}
"
)
cfg
=
self
.
config
.
decoding
decode_cfg
=
self
.
config
.
decode
bleu_func
=
bleu_score
.
char_bleu
if
cfg
.
error_rate_type
==
'char-bleu'
else
bleu_score
.
bleu
bleu_func
=
bleu_score
.
char_bleu
if
decode_
cfg
.
error_rate_type
==
'char-bleu'
else
bleu_score
.
bleu
stride_ms
=
self
.
test_loader
.
collate_fn
.
stride_ms
stride_ms
=
self
.
test_loader
.
collate_fn
.
stride_ms
hyps
,
refs
=
[],
[]
hyps
,
refs
=
[],
[]
...
@@ -549,15 +551,15 @@ class U2STTester(U2STTrainer):
...
@@ -549,15 +551,15 @@ class U2STTester(U2STTrainer):
"num_examples"
:
"num_examples"
:
num_ins
,
num_ins
,
"decode_method"
:
"decode_method"
:
self
.
config
.
decod
ing
.
decoding_method
,
self
.
config
.
decod
e
.
decoding_method
,
})
})
f
.
write
(
data
+
'
\n
'
)
f
.
write
(
data
+
'
\n
'
)
@
paddle
.
no_grad
()
@
paddle
.
no_grad
()
def
align
(
self
):
def
align
(
self
):
ctc_utils
.
ctc_align
(
self
.
config
,
self
.
model
,
self
.
align_loader
,
ctc_utils
.
ctc_align
(
self
.
config
,
self
.
model
,
self
.
align_loader
,
self
.
config
.
decod
ing
.
batch_size
,
self
.
config
.
decod
e
.
decode_
batch_size
,
self
.
config
.
collator
.
stride_ms
,
self
.
vocab_list
,
self
.
config
.
stride_ms
,
self
.
vocab_list
,
self
.
args
.
result_file
)
self
.
args
.
result_file
)
def
load_inferspec
(
self
):
def
load_inferspec
(
self
):
...
@@ -569,7 +571,7 @@ class U2STTester(U2STTrainer):
...
@@ -569,7 +571,7 @@ class U2STTester(U2STTrainer):
"""
"""
from
paddlespeech.s2t.models.u2
import
U2InferModel
from
paddlespeech.s2t.models.u2
import
U2InferModel
infer_model
=
U2InferModel
.
from_pretrained
(
self
.
test_loader
,
infer_model
=
U2InferModel
.
from_pretrained
(
self
.
test_loader
,
self
.
config
.
model
.
clone
(),
self
.
config
.
clone
(),
self
.
args
.
checkpoint_path
)
self
.
args
.
checkpoint_path
)
feat_dim
=
self
.
test_loader
.
collate_fn
.
feature_size
feat_dim
=
self
.
test_loader
.
collate_fn
.
feature_size
input_spec
=
[
input_spec
=
[
...
...
paddlespeech/s2t/io/collator.py
浏览文件 @
c907a8de
...
@@ -256,45 +256,43 @@ class SpeechCollator(SpeechCollatorBase):
...
@@ -256,45 +256,43 @@ class SpeechCollator(SpeechCollatorBase):
Returns:
Returns:
SpeechCollator: collator object.
SpeechCollator: collator object.
"""
"""
assert
'augmentation_config'
in
config
.
collator
assert
'augmentation_config'
in
config
assert
'keep_transcription_text'
in
config
.
collator
assert
'keep_transcription_text'
in
config
assert
'mean_std_filepath'
in
config
.
collator
assert
'mean_std_filepath'
in
config
assert
'vocab_filepath'
in
config
.
collator
assert
'vocab_filepath'
in
config
assert
'spectrum_type'
in
config
.
collator
assert
'spectrum_type'
in
config
assert
'n_fft'
in
config
.
collator
assert
'n_fft'
in
config
assert
config
.
collator
assert
config
if
isinstance
(
config
.
collator
.
augmentation_config
,
(
str
,
bytes
)):
if
isinstance
(
config
.
augmentation_config
,
(
str
,
bytes
)):
if
config
.
collator
.
augmentation_config
:
if
config
.
augmentation_config
:
aug_file
=
io
.
open
(
aug_file
=
io
.
open
(
config
.
collator
.
augmentation_config
,
config
.
augmentation_config
,
mode
=
'r'
,
encoding
=
'utf8'
)
mode
=
'r'
,
encoding
=
'utf8'
)
else
:
else
:
aug_file
=
io
.
StringIO
(
initial_value
=
'{}'
,
newline
=
''
)
aug_file
=
io
.
StringIO
(
initial_value
=
'{}'
,
newline
=
''
)
else
:
else
:
aug_file
=
config
.
collator
.
augmentation_config
aug_file
=
config
.
augmentation_config
assert
isinstance
(
aug_file
,
io
.
StringIO
)
assert
isinstance
(
aug_file
,
io
.
StringIO
)
speech_collator
=
cls
(
speech_collator
=
cls
(
aug_file
=
aug_file
,
aug_file
=
aug_file
,
random_seed
=
0
,
random_seed
=
0
,
mean_std_filepath
=
config
.
collator
.
mean_std_filepath
,
mean_std_filepath
=
config
.
mean_std_filepath
,
unit_type
=
config
.
collator
.
unit_type
,
unit_type
=
config
.
unit_type
,
vocab_filepath
=
config
.
collator
.
vocab_filepath
,
vocab_filepath
=
config
.
vocab_filepath
,
spm_model_prefix
=
config
.
collator
.
spm_model_prefix
,
spm_model_prefix
=
config
.
spm_model_prefix
,
spectrum_type
=
config
.
collator
.
spectrum_type
,
spectrum_type
=
config
.
spectrum_type
,
feat_dim
=
config
.
collator
.
feat_dim
,
feat_dim
=
config
.
feat_dim
,
delta_delta
=
config
.
collator
.
delta_delta
,
delta_delta
=
config
.
delta_delta
,
stride_ms
=
config
.
collator
.
stride_ms
,
stride_ms
=
config
.
stride_ms
,
window_ms
=
config
.
collator
.
window_ms
,
window_ms
=
config
.
window_ms
,
n_fft
=
config
.
collator
.
n_fft
,
n_fft
=
config
.
n_fft
,
max_freq
=
config
.
collator
.
max_freq
,
max_freq
=
config
.
max_freq
,
target_sample_rate
=
config
.
collator
.
target_sample_rate
,
target_sample_rate
=
config
.
target_sample_rate
,
use_dB_normalization
=
config
.
collator
.
use_dB_normalization
,
use_dB_normalization
=
config
.
use_dB_normalization
,
target_dB
=
config
.
collator
.
target_dB
,
target_dB
=
config
.
target_dB
,
dither
=
config
.
collator
.
dither
,
dither
=
config
.
dither
,
keep_transcription_text
=
config
.
collator
.
keep_transcription_text
)
keep_transcription_text
=
config
.
keep_transcription_text
)
return
speech_collator
return
speech_collator
...
...
paddlespeech/s2t/io/dataset.py
浏览文件 @
c907a8de
...
@@ -54,17 +54,17 @@ class ManifestDataset(Dataset):
...
@@ -54,17 +54,17 @@ class ManifestDataset(Dataset):
Returns:
Returns:
ManifestDataset: dataet object.
ManifestDataset: dataet object.
"""
"""
assert
'manifest'
in
config
.
data
assert
'manifest'
in
config
assert
config
.
data
.
manifest
assert
config
.
manifest
dataset
=
cls
(
dataset
=
cls
(
manifest_path
=
config
.
data
.
manifest
,
manifest_path
=
config
.
manifest
,
max_input_len
=
config
.
data
.
max_input_len
,
max_input_len
=
config
.
max_input_len
,
min_input_len
=
config
.
data
.
min_input_len
,
min_input_len
=
config
.
min_input_len
,
max_output_len
=
config
.
data
.
max_output_len
,
max_output_len
=
config
.
max_output_len
,
min_output_len
=
config
.
data
.
min_output_len
,
min_output_len
=
config
.
min_output_len
,
max_output_input_ratio
=
config
.
data
.
max_output_input_ratio
,
max_output_input_ratio
=
config
.
max_output_input_ratio
,
min_output_input_ratio
=
config
.
data
.
min_output_input_ratio
,
)
min_output_input_ratio
=
config
.
min_output_input_ratio
,
)
return
dataset
return
dataset
def
__init__
(
self
,
def
__init__
(
self
,
...
...
paddlespeech/s2t/models/ds2/deepspeech2.py
浏览文件 @
c907a8de
...
@@ -221,12 +221,12 @@ class DeepSpeech2Model(nn.Layer):
...
@@ -221,12 +221,12 @@ class DeepSpeech2Model(nn.Layer):
model
=
cls
(
model
=
cls
(
feat_size
=
dataloader
.
collate_fn
.
feature_size
,
feat_size
=
dataloader
.
collate_fn
.
feature_size
,
dict_size
=
dataloader
.
collate_fn
.
vocab_size
,
dict_size
=
dataloader
.
collate_fn
.
vocab_size
,
num_conv_layers
=
config
.
model
.
num_conv_layers
,
num_conv_layers
=
config
.
num_conv_layers
,
num_rnn_layers
=
config
.
model
.
num_rnn_layers
,
num_rnn_layers
=
config
.
num_rnn_layers
,
rnn_size
=
config
.
model
.
rnn_layer_size
,
rnn_size
=
config
.
rnn_layer_size
,
use_gru
=
config
.
model
.
use_gru
,
use_gru
=
config
.
use_gru
,
share_rnn_weights
=
config
.
model
.
share_rnn_weights
,
share_rnn_weights
=
config
.
share_rnn_weights
,
blank_id
=
config
.
model
.
blank_id
,
blank_id
=
config
.
blank_id
,
ctc_grad_norm_type
=
config
.
get
(
'ctc_grad_norm_type'
,
None
),
)
ctc_grad_norm_type
=
config
.
get
(
'ctc_grad_norm_type'
,
None
),
)
infos
=
Checkpoint
().
load_parameters
(
infos
=
Checkpoint
().
load_parameters
(
model
,
checkpoint_path
=
checkpoint_path
)
model
,
checkpoint_path
=
checkpoint_path
)
...
@@ -240,7 +240,7 @@ class DeepSpeech2Model(nn.Layer):
...
@@ -240,7 +240,7 @@ class DeepSpeech2Model(nn.Layer):
Parameters
Parameters
config: yacs.config.CfgNode
config: yacs.config.CfgNode
config
.model
config
Returns
Returns
-------
-------
DeepSpeech2Model
DeepSpeech2Model
...
...
paddlespeech/s2t/models/ds2_online/deepspeech2.py
浏览文件 @
c907a8de
...
@@ -353,14 +353,14 @@ class DeepSpeech2ModelOnline(nn.Layer):
...
@@ -353,14 +353,14 @@ class DeepSpeech2ModelOnline(nn.Layer):
model
=
cls
(
model
=
cls
(
feat_size
=
dataloader
.
collate_fn
.
feature_size
,
feat_size
=
dataloader
.
collate_fn
.
feature_size
,
dict_size
=
dataloader
.
collate_fn
.
vocab_size
,
dict_size
=
dataloader
.
collate_fn
.
vocab_size
,
num_conv_layers
=
config
.
model
.
num_conv_layers
,
num_conv_layers
=
config
.
num_conv_layers
,
num_rnn_layers
=
config
.
model
.
num_rnn_layers
,
num_rnn_layers
=
config
.
num_rnn_layers
,
rnn_size
=
config
.
model
.
rnn_layer_size
,
rnn_size
=
config
.
rnn_layer_size
,
rnn_direction
=
config
.
model
.
rnn_direction
,
rnn_direction
=
config
.
rnn_direction
,
num_fc_layers
=
config
.
model
.
num_fc_layers
,
num_fc_layers
=
config
.
num_fc_layers
,
fc_layers_size_list
=
config
.
model
.
fc_layers_size_list
,
fc_layers_size_list
=
config
.
fc_layers_size_list
,
use_gru
=
config
.
model
.
use_gru
,
use_gru
=
config
.
use_gru
,
blank_id
=
config
.
model
.
blank_id
,
blank_id
=
config
.
blank_id
,
ctc_grad_norm_type
=
config
.
get
(
'ctc_grad_norm_type'
,
None
),
)
ctc_grad_norm_type
=
config
.
get
(
'ctc_grad_norm_type'
,
None
),
)
infos
=
Checkpoint
().
load_parameters
(
infos
=
Checkpoint
().
load_parameters
(
model
,
checkpoint_path
=
checkpoint_path
)
model
,
checkpoint_path
=
checkpoint_path
)
...
@@ -374,7 +374,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
...
@@ -374,7 +374,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
Parameters
Parameters
config: yacs.config.CfgNode
config: yacs.config.CfgNode
config
.model
config
Returns
Returns
-------
-------
DeepSpeech2ModelOnline
DeepSpeech2ModelOnline
...
...
paddlespeech/s2t/training/cli.py
浏览文件 @
c907a8de
...
@@ -101,7 +101,7 @@ def default_argument_parser(parser=None):
...
@@ -101,7 +101,7 @@ def default_argument_parser(parser=None):
title
=
'Test Options'
,
description
=
None
)
title
=
'Test Options'
,
description
=
None
)
test_group
.
add_argument
(
test_group
.
add_argument
(
"--decode_c
onfi
g"
,
"--decode_c
f
g"
,
metavar
=
"DECODE_CONFIG_FILE"
,
metavar
=
"DECODE_CONFIG_FILE"
,
help
=
"decode config file."
)
help
=
"decode config file."
)
...
...
tests/benchmark/conformer/run.sh
浏览文件 @
c907a8de
...
@@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml
...
@@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml
fp_item_list
=(
fp32
)
fp_item_list
=(
fp32
)
bs_item
=(
16
)
bs_item
=(
16
)
config_path
=
conf/benchmark/conformer.yaml
config_path
=
conf/benchmark/conformer.yaml
decode_config_path
=
conf/tuning/decode.yaml
seed
=
0
seed
=
0
output
=
exp/conformer
output
=
exp/conformer
profiler_options
=
None
profiler_options
=
None
...
@@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do
...
@@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do
echo
"index is speed, 8gpus, run_mode is multi_process, begin, conformer"
echo
"index is speed, 8gpus, run_mode is multi_process, begin, conformer"
run_mode
=
mp
run_mode
=
mp
ngpu
=
8
ngpu
=
8
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7 bash
${
CUR_DIR
}
/run_benchmark.sh
${
run_mode
}
${
config_path
}
${
output
}
${
seed
}
${
ngpu
}
${
profiler_options
}
${
bs_item
}
${
fp_item
}
${
model_item
}
|
tee
${
log_path
}
/
${
log_name
}
_speed_8gpus8p 2>&1
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7 bash
${
CUR_DIR
}
/run_benchmark.sh
${
run_mode
}
${
config_path
}
${
decode_config_path
}
${
output
}
${
seed
}
${
ngpu
}
${
profiler_options
}
${
bs_item
}
${
fp_item
}
${
model_item
}
|
tee
${
log_path
}
/
${
log_name
}
_speed_8gpus8p 2>&1
sleep
60
sleep
60
log_name
=
speech_
${
model_item
}
_bs
${
bs_item
}
_
${
fp_item
}
# 如:clas_MobileNetv1_mp_bs32_fp32_8
log_name
=
speech_
${
model_item
}
_bs
${
bs_item
}
_
${
fp_item
}
# 如:clas_MobileNetv1_mp_bs32_fp32_8
echo
"index is speed, 1gpus, begin,
${
log_name
}
"
echo
"index is speed, 1gpus, begin,
${
log_name
}
"
run_mode
=
sp
run_mode
=
sp
ngpu
=
1
ngpu
=
1
CUDA_VISIBLE_DEVICES
=
0 bash
${
CUR_DIR
}
/run_benchmark.sh
${
run_mode
}
${
config_path
}
${
output
}
${
seed
}
${
ngpu
}
${
profiler_options
}
${
bs_item
}
${
fp_item
}
${
model_item
}
|
tee
${
log_path
}
/
${
log_name
}
_speed_1gpus 2>&1
# (5min)
CUDA_VISIBLE_DEVICES
=
0 bash
${
CUR_DIR
}
/run_benchmark.sh
${
run_mode
}
${
config_path
}
${
decode_config_path
}
${
output
}
${
seed
}
${
ngpu
}
${
profiler_options
}
${
bs_item
}
${
fp_item
}
${
model_item
}
|
tee
${
log_path
}
/
${
log_name
}
_speed_1gpus 2>&1
# (5min)
sleep
60
sleep
60
done
done
done
done
...
...
tests/benchmark/conformer/run_benchmark.sh
浏览文件 @
c907a8de
...
@@ -5,13 +5,14 @@ function _set_params(){
...
@@ -5,13 +5,14 @@ function _set_params(){
run_mode
=
${
1
:-
"sp"
}
# 单卡sp|多卡mp
run_mode
=
${
1
:-
"sp"
}
# 单卡sp|多卡mp
config_path
=
${
2
:-
"conf/conformer.yaml"
}
config_path
=
${
2
:-
"conf/conformer.yaml"
}
output
=
${
3
:-
"exp/conformer"
}
decode_config_path
=
${
3
:-
"conf/tuning/decode.yaml"
}
seed
=
${
4
:-
"0"
}
output
=
${
4
:-
"exp/conformer"
}
ngpu
=
${
5
:-
"1"
}
seed
=
${
5
:-
"0"
}
profiler_options
=
${
6
:-
"None"
}
ngpu
=
${
6
:-
"1"
}
batch_size
=
${
7
:-
"32"
}
profiler_options
=
${
7
:-
"None"
}
fp_item
=
${
8
:-
"fp32"
}
batch_size
=
${
8
:-
"32"
}
model_item
=
${
9
:-
"conformer"
}
fp_item
=
${
9
:-
"fp32"
}
model_item
=
${
10
:-
"conformer"
}
benchmark_max_step
=
0
benchmark_max_step
=
0
run_log_path
=
${
TRAIN_LOG_DIR
:-
$(
pwd
)
}
# TRAIN_LOG_DIR 后续QA设置该参数
run_log_path
=
${
TRAIN_LOG_DIR
:-
$(
pwd
)
}
# TRAIN_LOG_DIR 后续QA设置该参数
# 添加日志解析需要的参数
# 添加日志解析需要的参数
...
@@ -35,6 +36,7 @@ function _train(){
...
@@ -35,6 +36,7 @@ function _train(){
echo
"Train on
${
num_gpu_devices
}
GPUs"
echo
"Train on
${
num_gpu_devices
}
GPUs"
echo
"current CUDA_VISIBLE_DEVICES=
$CUDA_VISIBLE_DEVICES
, gpus=
$num_gpu_devices
, batch_size=
$batch_size
"
echo
"current CUDA_VISIBLE_DEVICES=
$CUDA_VISIBLE_DEVICES
, gpus=
$num_gpu_devices
, batch_size=
$batch_size
"
train_cmd
=
"--config=
${
config_path
}
\
train_cmd
=
"--config=
${
config_path
}
\
--decode_cfg=
${
decode_config_path
}
\
--output=
${
output
}
\
--output=
${
output
}
\
--seed=
${
seed
}
\
--seed=
${
seed
}
\
--ngpu=
${
ngpu
}
\
--ngpu=
${
ngpu
}
\
...
@@ -68,7 +70,7 @@ function _train(){
...
@@ -68,7 +70,7 @@ function _train(){
}
}
source
${
BENCHMARK_ROOT
}
/scripts/run_model.sh
# 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开
source
${
BENCHMARK_ROOT
}
/scripts/run_model.sh
# 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开
_set_params
$@
#
_set_params $@
#
_train # 如果只想产出训练log,不解析,可取消注释
#_train # 如果只想产出训练log,不解析,可取消注释
_run
# 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开
_run
# 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开
tests/chains/ds2/ds2_params_lite_train_infer.txt
浏览文件 @
c907a8de
...
@@ -21,13 +21,13 @@ null:null
...
@@ -21,13 +21,13 @@ null:null
null:null
null:null
##
##
===========================eval_params===========================
===========================eval_params===========================
eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --
checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9
.rsl --model_type offline
eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --
decode_cfg conf/tuning/decode.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --result_file tests/4
.rsl --model_type offline
null:null
null:null
##
##
===========================infer_params===========================
===========================infer_params===========================
null:null
null:null
null:null
null:null
norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/
9 --export_path exp/deepspeech_tiny/checkpoints/9
.jit
norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/
4 --export_path exp/deepspeech_tiny/checkpoints/4
.jit
quant_export:null
quant_export:null
fpgm_export:null
fpgm_export:null
distill_export:null
distill_export:null
...
...
tests/chains/ds2/ds2_params_whole_train_infer.txt
浏览文件 @
c907a8de
...
@@ -21,7 +21,7 @@ null:null
...
@@ -21,7 +21,7 @@ null:null
null:null
null:null
##
##
===========================eval_params===========================
===========================eval_params===========================
eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline
eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --
decode_cfg conf/tuning/decode.yaml --
result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline
null:null
null:null
##
##
===========================infer_params===========================
===========================infer_params===========================
...
...
tests/chains/ds2/lite_train_infer.sh
浏览文件 @
c907a8de
bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer
bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer
cd
../../
examples/tiny/s
0
cd
../../
../examples/tiny/asr
0
source
path.sh
source
path.sh
bash ../../../tests/chains/
test.sh ../../../tests/chains
/ds2_params_lite_train_infer.txt lite_train_infer
bash ../../../tests/chains/
ds2/test.sh ../../../tests/chains/ds2
/ds2_params_lite_train_infer.txt lite_train_infer
cd
../../../tests/chains
cd
../../../tests/chains
tests/chains/ds2/prepare.sh
浏览文件 @
c907a8de
...
@@ -34,7 +34,7 @@ MODE=$2
...
@@ -34,7 +34,7 @@ MODE=$2
if
[
${
MODE
}
=
"lite_train_infer"
]
;
then
if
[
${
MODE
}
=
"lite_train_infer"
]
;
then
# pretrain lite train data
# pretrain lite train data
curPath
=
$(
readlink
-f
"
$(
dirname
"
$0
"
)
"
)
curPath
=
$(
readlink
-f
"
$(
dirname
"
$0
"
)
"
)
cd
${
curPath
}
/../../
examples/tiny/s
0
cd
${
curPath
}
/../../
../examples/tiny/asr
0
source
path.sh
source
path.sh
# download audio data
# download audio data
bash ./local/data.sh
||
exit
-1
bash ./local/data.sh
||
exit
-1
...
@@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then
...
@@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then
elif
[
${
MODE
}
=
"whole_train_infer"
]
;
then
elif
[
${
MODE
}
=
"whole_train_infer"
]
;
then
curPath
=
$(
readlink
-f
"
$(
dirname
"
$0
"
)
"
)
curPath
=
$(
readlink
-f
"
$(
dirname
"
$0
"
)
"
)
cd
${
curPath
}
/../../
examples/aishell/s
0
cd
${
curPath
}
/../../
../examples/aishell/asr
0
source
path.sh
source
path.sh
# download audio data
# download audio data
bash ./local/data.sh
||
exit
-1
bash ./local/data.sh
||
exit
-1
...
@@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then
...
@@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then
cd
${
curPath
}
cd
${
curPath
}
elif
[
${
MODE
}
=
"whole_infer"
]
;
then
elif
[
${
MODE
}
=
"whole_infer"
]
;
then
curPath
=
$(
readlink
-f
"
$(
dirname
"
$0
"
)
"
)
curPath
=
$(
readlink
-f
"
$(
dirname
"
$0
"
)
"
)
cd
${
curPath
}
/../../
examples/aishell/s
0
cd
${
curPath
}
/../../
../examples/aishell/asr
0
source
path.sh
source
path.sh
# download audio data
# download audio data
bash ./local/data.sh
||
exit
-1
bash ./local/data.sh
||
exit
-1
...
@@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then
...
@@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then
cd
${
curPath
}
cd
${
curPath
}
else
else
curPath
=
$(
readlink
-f
"
$(
dirname
"
$0
"
)
"
)
curPath
=
$(
readlink
-f
"
$(
dirname
"
$0
"
)
"
)
cd
${
curPath
}
/../../
examples/aishell/s
0
cd
${
curPath
}
/../../
../examples/aishell/asr
0
source
path.sh
source
path.sh
# download audio data
# download audio data
bash ./local/data.sh
||
exit
-1
bash ./local/data.sh
||
exit
-1
...
...
tests/chains/ds2/test.sh
浏览文件 @
c907a8de
...
@@ -324,6 +324,7 @@ else
...
@@ -324,6 +324,7 @@ else
gsu
=
${
gpu
//,/
}
gsu
=
${
gpu
//,/
}
nump
=
`
echo
$gsu
|
wc
-w
`
nump
=
`
echo
$gsu
|
wc
-w
`
cmd
=
"
${
python
}
${
run_train
}
--ngpu=
$nump
"
cmd
=
"
${
python
}
${
run_train
}
--ngpu=
$nump
"
export
CUDA_VISIBLE_DEVICES
=
${
gpu
}
else
# train with multi-machine
else
# train with multi-machine
cmd
=
"
${
python
}
-m paddle.distributed.launch --ips=
${
ips
}
--gpus=
${
gpu
}
${
run_train
}
${
set_save_model
}
${
set_pretrain
}
${
set_epoch
}
${
set_autocast
}
${
set_batchsize
}
${
set_train_params1
}
"
cmd
=
"
${
python
}
-m paddle.distributed.launch --ips=
${
ips
}
--gpus=
${
gpu
}
${
run_train
}
${
set_save_model
}
${
set_pretrain
}
${
set_epoch
}
${
set_autocast
}
${
set_batchsize
}
${
set_train_params1
}
"
fi
fi
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录