Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
41eeed04
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
41eeed04
编写于
12月 29, 2021
作者:
H
huangyuxin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add librispeech asr1
上级
fb6d1e2c
变更
20
显示空白变更内容
内联
并排
Showing
20 changed file
with
590 addition
and
627 deletion
+590
-627
examples/aishell/asr1/local/align.sh
examples/aishell/asr1/local/align.sh
+1
-1
examples/aishell/asr1/local/test.sh
examples/aishell/asr1/local/test.sh
+5
-5
examples/aishell/asr1/local/test_wav.sh
examples/aishell/asr1/local/test_wav.sh
+2
-2
examples/csmsc/voc5/README.md
examples/csmsc/voc5/README.md
+2
-2
examples/librispeech/asr1/conf/chunk_conformer.yaml
examples/librispeech/asr1/conf/chunk_conformer.yaml
+90
-94
examples/librispeech/asr1/conf/chunk_transformer.yaml
examples/librispeech/asr1/conf/chunk_transformer.yaml
+83
-96
examples/librispeech/asr1/conf/conformer.yaml
examples/librispeech/asr1/conf/conformer.yaml
+87
-94
examples/librispeech/asr1/conf/transformer.yaml
examples/librispeech/asr1/conf/transformer.yaml
+81
-102
examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
+11
-0
examples/librispeech/asr1/conf/tuning/decode.yaml
examples/librispeech/asr1/conf/tuning/decode.yaml
+11
-0
examples/librispeech/asr1/local/align.sh
examples/librispeech/asr1/local/align.sh
+6
-4
examples/librispeech/asr1/local/test.sh
examples/librispeech/asr1/local/test.sh
+13
-9
examples/librispeech/asr1/local/test_wav.sh
examples/librispeech/asr1/local/test_wav.sh
+8
-6
examples/librispeech/asr1/run.sh
examples/librispeech/asr1/run.sh
+4
-3
examples/tiny/asr1/conf/conformer.yaml
examples/tiny/asr1/conf/conformer.yaml
+93
-104
examples/tiny/asr1/conf/transformer.yaml
examples/tiny/asr1/conf/transformer.yaml
+88
-100
paddlespeech/s2t/exps/u2/bin/alignment.py
paddlespeech/s2t/exps/u2/bin/alignment.py
+1
-1
paddlespeech/s2t/exps/u2/bin/test.py
paddlespeech/s2t/exps/u2/bin/test.py
+1
-1
paddlespeech/s2t/exps/u2/bin/test_wav.py
paddlespeech/s2t/exps/u2/bin/test_wav.py
+2
-2
paddlespeech/s2t/exps/u2/config.py
paddlespeech/s2t/exps/u2/config.py
+1
-1
未找到文件。
examples/aishell/asr1/local/align.sh
浏览文件 @
41eeed04
...
@@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/alignment.py \
...
@@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/alignment.py \
--decode_config
${
decode_config_path
}
\
--decode_config
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decode_batch_size
${
batch_size
}
--opts
decod
e
.decode_batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in ctc alignment!"
echo
"Failed in ctc alignment!"
...
...
examples/aishell/asr1/local/test.sh
浏览文件 @
41eeed04
...
@@ -30,7 +30,7 @@ for type in attention ctc_greedy_search; do
...
@@ -30,7 +30,7 @@ for type in attention ctc_greedy_search; do
# stream decoding only support batchsize=1
# stream decoding only support batchsize=1
batch_size
=
1
batch_size
=
1
else
else
batch_size
=
64
batch_size
=
1
fi
fi
output_dir
=
${
ckpt_prefix
}
output_dir
=
${
ckpt_prefix
}
mkdir
-p
${
output_dir
}
mkdir
-p
${
output_dir
}
...
@@ -40,8 +40,8 @@ for type in attention ctc_greedy_search; do
...
@@ -40,8 +40,8 @@ for type in attention ctc_greedy_search; do
--decode_config
${
decode_config_path
}
\
--decode_config
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing
.decode_batch_size
${
batch_size
}
--opts
decod
e
.decode_batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
@@ -60,8 +60,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do
...
@@ -60,8 +60,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do
--decode_config
${
decode_config_path
}
\
--decode_config
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
...
examples/aishell/asr1/local/test_wav.sh
浏览文件 @
41eeed04
...
@@ -46,8 +46,8 @@ for type in attention_rescoring; do
...
@@ -46,8 +46,8 @@ for type in attention_rescoring; do
--decode_config
${
decode_config_path
}
\
--decode_config
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing
.decode_batch_size
${
batch_size
}
\
--opts
decod
e
.decode_batch_size
${
batch_size
}
\
--audio_file
${
audio_file
}
--audio_file
${
audio_file
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
...
...
examples/csmsc/voc5/README.md
浏览文件 @
41eeed04
...
@@ -125,8 +125,8 @@ HiFiGAN checkpoint contains files listed below.
...
@@ -125,8 +125,8 @@ HiFiGAN checkpoint contains files listed below.
```
text
```
text
hifigan_csmsc_ckpt_0.1.1
hifigan_csmsc_ckpt_0.1.1
├── default.yaml # default config used to train hifigan
├── default.yaml # default config used to train hifigan
├── feats_stats.npy #
statistics used to normalize spectrogram when training
hifigan
├── feats_stats.npy #
generator parameters of
hifigan
└── snapshot_iter_2500000.pdz #
generator parameters of
hifigan
└── snapshot_iter_2500000.pdz #
statistics used to normalize spectrogram when training
hifigan
```
```
## Acknowledgement
## Acknowledgement
...
...
examples/librispeech/asr1/conf/chunk_conformer.yaml
浏览文件 @
41eeed04
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
# encoder related
cmvn_file_type
:
"
json"
encoder
:
conformer
# encoder related
encoder_conf
:
encoder
:
conformer
encoder_conf
:
output_size
:
256
# dimension of attention
output_size
:
256
# dimension of attention
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
linear_units
:
2048
# the number of units of position-wise feed forward
...
@@ -24,9 +25,9 @@ model:
...
@@ -24,9 +25,9 @@ model:
cnn_module_norm
:
'
layer_norm'
# using nn.LayerNorm makes model converge faster
cnn_module_norm
:
'
layer_norm'
# using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk
:
false
use_dynamic_left_chunk
:
false
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
...
@@ -35,69 +36,64 @@ model:
...
@@ -35,69 +36,64 @@ model:
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev
test_manifest
:
data/manifest.test
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
'
spm'
###########################################
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
"
"
unit_type
:
'
spm'
augmentation_config
:
conf/preprocess.yaml
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
feat_dim
:
80
mean_std_filepath
:
"
"
stride_ms
:
10.0
augmentation_config
:
conf/preprocess.yaml
window_ms
:
25.0
feat_dim
:
80
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
stride_ms
:
10.0
batch_size
:
16
window_ms
:
25.0
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_size
:
16
minibatches
:
0
# for debug
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
batch_count
:
auto
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_bins
:
0
minibatches
:
0
# for debug
batch_frames_in
:
0
batch_count
:
auto
batch_frames_out
:
0
batch_bins
:
0
batch_frames_inout
:
0
batch_frames_in
:
0
augmentation_config
:
conf/preprocess.yaml
batch_frames_out
:
0
num_workers
:
0
batch_frames_inout
:
0
subsampling_factor
:
1
augmentation_config
:
conf/preprocess.yaml
num_encs
:
1
num_workers
:
0
subsampling_factor
:
1
num_encs
:
1
training
:
###########################################
n_epoch
:
120
# Training #
accum_grad
:
8
###########################################
global_grad_clip
:
5.0
n_epoch
:
120
optim
:
adam
accum_grad
:
8
optim_conf
:
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.001
lr
:
0.001
weight_decay
:
1e-06
weight_decay
:
1e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
log_interval
:
100
log_interval
:
100
checkpoint
:
checkpoint
:
kbest_n
:
50
kbest_n
:
50
latest_n
:
5
latest_n
:
5
decoding
:
batch_size
:
128
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
true
# simulate streaming inference. Defaults to False.
examples/librispeech/asr1/conf/chunk_transformer.yaml
浏览文件 @
41eeed04
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
# encoder related
cmvn_file_type
:
"
json"
encoder
:
transformer
# encoder related
encoder_conf
:
encoder
:
transformer
encoder_conf
:
output_size
:
256
# dimension of attention
output_size
:
256
# dimension of attention
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
linear_units
:
2048
# the number of units of position-wise feed forward
...
@@ -17,9 +18,9 @@ model:
...
@@ -17,9 +18,9 @@ model:
use_dynamic_chunk
:
true
use_dynamic_chunk
:
true
use_dynamic_left_chunk
:
false
use_dynamic_left_chunk
:
false
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
...
@@ -28,76 +29,62 @@ model:
...
@@ -28,76 +29,62 @@ model:
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev
test_manifest
:
data/manifest.test
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
'
spm'
###########################################
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
"
"
unit_type
:
'
spm'
augmentation_config
:
conf/preprocess.yaml
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
feat_dim
:
80
mean_std_filepath
:
"
"
stride_ms
:
10.0
augmentation_config
:
conf/preprocess.yaml
window_ms
:
25.0
feat_dim
:
80
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
stride_ms
:
10.0
batch_size
:
64
window_ms
:
25.0
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_size
:
64
minibatches
:
0
# for debug
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
batch_count
:
auto
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_bins
:
0
minibatches
:
0
# for debug
batch_frames_in
:
0
batch_count
:
auto
batch_frames_out
:
0
batch_bins
:
0
batch_frames_inout
:
0
batch_frames_in
:
0
augmentation_config
:
conf/preprocess.yaml
batch_frames_out
:
0
num_workers
:
0
batch_frames_inout
:
0
subsampling_factor
:
1
augmentation_config
:
conf/preprocess.yaml
num_encs
:
1
num_workers
:
0
subsampling_factor
:
1
num_encs
:
1
training
:
###########################################
n_epoch
:
120
# Training #
accum_grad
:
1
###########################################
global_grad_clip
:
5.0
n_epoch
:
120
optim
:
adam
accum_grad
:
1
optim_conf
:
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.001
lr
:
0.001
weight_decay
:
1e-06
weight_decay
:
1e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
lr_decay
:
1.0
lr_decay
:
1.0
log_interval
:
100
log_interval
:
100
checkpoint
:
checkpoint
:
kbest_n
:
50
kbest_n
:
50
latest_n
:
5
latest_n
:
5
\ No newline at end of file
decoding
:
batch_size
:
64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
true
# simulate streaming inference. Defaults to False.
\ No newline at end of file
examples/librispeech/asr1/conf/conformer.yaml
浏览文件 @
41eeed04
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
# encoder related
cmvn_file_type
:
"
json"
encoder
:
conformer
# encoder related
encoder_conf
:
encoder
:
conformer
encoder_conf
:
output_size
:
256
# dimension of attention
output_size
:
256
# dimension of attention
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
linear_units
:
2048
# the number of units of position-wise feed forward
...
@@ -20,9 +21,9 @@ model:
...
@@ -20,9 +21,9 @@ model:
pos_enc_layer_type
:
'
rel_pos'
pos_enc_layer_type
:
'
rel_pos'
selfattention_layer_type
:
'
rel_selfattn'
selfattention_layer_type
:
'
rel_selfattn'
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
...
@@ -31,74 +32,66 @@ model:
...
@@ -31,74 +32,66 @@ model:
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
ctc_grad_norm_type
:
null
ctc_grad_norm_type
:
null
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test-clean
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev
test_manifest
:
data/manifest.test-clean
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
'
spm'
###########################################
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
"
"
unit_type
:
'
spm'
augmentation_config
:
conf/preprocess.yaml
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
feat_dim
:
80
mean_std_filepath
:
"
"
stride_ms
:
10.0
augmentation_config
:
conf/preprocess.yaml
window_ms
:
25.0
feat_dim
:
80
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
stride_ms
:
10.0
batch_size
:
16
window_ms
:
25.0
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_size
:
16
minibatches
:
0
# for debug
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
batch_count
:
auto
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_bins
:
0
minibatches
:
0
# for debug
batch_frames_in
:
0
batch_count
:
auto
batch_frames_out
:
0
batch_bins
:
0
batch_frames_inout
:
0
batch_frames_in
:
0
augmentation_config
:
conf/preprocess.yaml
batch_frames_out
:
0
num_workers
:
0
batch_frames_inout
:
0
subsampling_factor
:
1
augmentation_config
:
conf/preprocess.yaml
num_encs
:
1
num_workers
:
0
subsampling_factor
:
1
num_encs
:
1
training
:
###########################################
n_epoch
:
70
# Training #
accum_grad
:
8
###########################################
global_grad_clip
:
3.0
n_epoch
:
70
optim
:
adam
accum_grad
:
8
optim_conf
:
global_grad_clip
:
3.0
optim
:
adam
optim_conf
:
lr
:
0.004
lr
:
0.004
weight_decay
:
1e-06
weight_decay
:
1e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
lr_decay
:
1.0
lr_decay
:
1.0
log_interval
:
100
log_interval
:
100
checkpoint
:
checkpoint
:
kbest_n
:
50
kbest_n
:
50
latest_n
:
5
latest_n
:
5
decoding
:
batch_size
:
64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/librispeech/asr1/conf/transformer.yaml
浏览文件 @
41eeed04
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
# encoder related
cmvn_file_type
:
"
json"
encoder
:
transformer
# encoder related
encoder_conf
:
encoder
:
transformer
encoder_conf
:
output_size
:
256
# dimension of attention
output_size
:
256
# dimension of attention
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
linear_units
:
2048
# the number of units of position-wise feed forward
...
@@ -15,9 +16,9 @@ model:
...
@@ -15,9 +16,9 @@ model:
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before
:
true
normalize_before
:
true
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
...
@@ -26,85 +27,63 @@ model:
...
@@ -26,85 +27,63 @@ model:
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.train
# Data #
dev_manifest
:
data/manifest.dev
###########################################
test_manifest
:
data/manifest.test-clean
train_manifest
:
data/manifest.train
min_input_len
:
0.5
# second
dev_manifest
:
data/manifest.dev
max_input_len
:
30.0
# second
test_manifest
:
data/manifest.test-clean
min_output_len
:
0.0
# tokens
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.05
max_output_input_ratio
:
100.0
collator
:
###########################################
vocab_filepath
:
data/lang_char/vocab.txt
# Dataloader #
unit_type
:
'
spm'
###########################################
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
vocab_filepath
:
data/lang_char/vocab.txt
mean_std_filepath
:
"
"
unit_type
:
'
spm'
augmentation_config
:
conf/preprocess.yaml
spm_model_prefix
:
'
data/lang_char/bpe_unigram_5000'
feat_dim
:
80
mean_std_filepath
:
"
"
stride_ms
:
10.0
augmentation_config
:
conf/preprocess.yaml
window_ms
:
25.0
feat_dim
:
80
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
stride_ms
:
10.0
batch_size
:
32
window_ms
:
25.0
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
sortagrad
:
0
# Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_size
:
32
minibatches
:
0
# for debug
maxlen_in
:
512
# if input length > maxlen-in, batchsize is automatically reduced
batch_count
:
auto
maxlen_out
:
150
# if output length > maxlen-out, batchsize is automatically reduced
batch_bins
:
0
minibatches
:
0
# for debug
batch_frames_in
:
0
batch_count
:
auto
batch_frames_out
:
0
batch_bins
:
0
batch_frames_inout
:
0
batch_frames_in
:
0
augmentation_config
:
conf/preprocess.yaml
batch_frames_out
:
0
num_workers
:
0
batch_frames_inout
:
0
subsampling_factor
:
1
augmentation_config
:
conf/preprocess.yaml
num_encs
:
1
num_workers
:
0
subsampling_factor
:
1
num_encs
:
1
training
:
###########################################
n_epoch
:
120
# Training #
accum_grad
:
4
###########################################
global_grad_clip
:
5.0
n_epoch
:
120
optim
:
adam
accum_grad
:
4
optim_conf
:
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.004
lr
:
0.004
weight_decay
:
1e-06
weight_decay
:
1e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
lr_decay
:
1.0
lr_decay
:
1.0
log_interval
:
100
log_interval
:
100
checkpoint
:
checkpoint
:
kbest_n
:
50
kbest_n
:
50
latest_n
:
5
latest_n
:
5
decoding
:
batch_size
:
64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
0 → 100644
浏览文件 @
41eeed04
decode_batch_size
:
128
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
true
# simulate streaming inference. Defaults to False.
examples/librispeech/asr1/conf/tuning/decode.yaml
0 → 100644
浏览文件 @
41eeed04
decode_batch_size
:
64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size
:
10
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/librispeech/asr1/local/align.sh
浏览文件 @
41eeed04
#!/bin/bash
#!/bin/bash
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
batch_size
=
1
batch_size
=
1
output_dir
=
${
ckpt_prefix
}
output_dir
=
${
ckpt_prefix
}
...
@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
...
@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
python3
-u
${
BIN_DIR
}
/alignment.py
\
python3
-u
${
BIN_DIR
}
/alignment.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_config
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in ctc alignment!"
echo
"Failed in ctc alignment!"
...
...
examples/librispeech/asr1/local/test.sh
浏览文件 @
41eeed04
...
@@ -15,8 +15,8 @@ recog_set="test-clean"
...
@@ -15,8 +15,8 @@ recog_set="test-clean"
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
if
[
$#
!=
2
]
;
then
if
[
$#
!=
3
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix"
exit
-1
exit
-1
fi
fi
...
@@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
ckpt_prefix
=
$3
chunk_mode
=
false
chunk_mode
=
false
if
[[
${
config_path
}
=
~ ^.
*
chunk_.
*
yaml
$
]]
;
then
if
[[
${
config_path
}
=
~ ^.
*
chunk_.
*
yaml
$
]]
;
then
...
@@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
...
@@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_config
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
@@ -76,10 +78,11 @@ for type in ctc_greedy_search; do
...
@@ -76,10 +78,11 @@ for type in ctc_greedy_search; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_config
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
@@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
...
@@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test.py
\
python3
-u
${
BIN_DIR
}
/test.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_config
${
decode_config_path
}
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--result_file
${
ckpt_prefix
}
.
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
--opts
decod
e.decode_
batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
echo
"Failed in evaluation!"
echo
"Failed in evaluation!"
...
...
examples/librispeech/asr1/local/test_wav.sh
浏览文件 @
41eeed04
#!/bin/bash
#!/bin/bash
if
[
$#
!=
3
]
;
then
if
[
$#
!=
4
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix audio_file"
echo
"usage:
${
0
}
config_path
decode_config_path
ckpt_path_prefix audio_file"
exit
-1
exit
-1
fi
fi
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
...
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo
"using
$ngpu
gpus..."
echo
"using
$ngpu
gpus..."
config_path
=
$1
config_path
=
$1
ckpt_prefix
=
$2
decode_config_path
=
$2
audio_file
=
$3
ckpt_prefix
=
$3
audio_file
=
$4
mkdir
-p
data
mkdir
-p
data
wget
-nc
https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav
-P
data/
wget
-nc
https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav
-P
data/
...
@@ -49,10 +50,11 @@ for type in attention_rescoring; do
...
@@ -49,10 +50,11 @@ for type in attention_rescoring; do
python3
-u
${
BIN_DIR
}
/test_wav.py
\
python3
-u
${
BIN_DIR
}
/test_wav.py
\
--ngpu
${
ngpu
}
\
--ngpu
${
ngpu
}
\
--config
${
config_path
}
\
--config
${
config_path
}
\
--decode_config
${
decode_config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--result_file
${
output_dir
}
/
${
type
}
.rsl
\
--checkpoint_path
${
ckpt_prefix
}
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decod
ing
.decoding_method
${
type
}
\
--opts
decod
e
.decoding_method
${
type
}
\
--opts
decod
ing.
batch_size
${
batch_size
}
\
--opts
decod
e.decode_
batch_size
${
batch_size
}
\
--audio_file
${
audio_file
}
--audio_file
${
audio_file
}
#score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
#score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
...
...
examples/librispeech/asr1/run.sh
浏览文件 @
41eeed04
...
@@ -8,6 +8,7 @@ gpus=0,1,2,3
...
@@ -8,6 +8,7 @@ gpus=0,1,2,3
stage
=
0
stage
=
0
stop_stage
=
50
stop_stage
=
50
conf_path
=
conf/transformer.yaml
conf_path
=
conf/transformer.yaml
decode_conf_path
=
conf/tuning/decode.yaml
avg_num
=
30
avg_num
=
30
audio_file
=
data/demo_002_en.wav
audio_file
=
data/demo_002_en.wav
...
@@ -34,17 +35,17 @@ fi
...
@@ -34,17 +35,17 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
# ctc alignment of test data
# ctc alignment of test data
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
# test a single .wav file
# test a single .wav file
CUDA_VISIBLE_DEVICES
=
0 ./local/test_wav.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
audio_file
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test_wav.sh
${
conf_path
}
${
decode_conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
${
audio_file
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
51
]
&&
[
${
stop_stage
}
-ge
51
]
;
then
if
[
${
stage
}
-le
51
]
&&
[
${
stop_stage
}
-ge
51
]
;
then
...
...
examples/tiny/asr1/conf/conformer.yaml
浏览文件 @
41eeed04
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.tiny
# Data #
dev_manifest
:
data/manifest.tiny
###########################################
test_manifest
:
data/manifest.tiny
train_manifest
:
data/manifest.tiny
min_input_len
:
0.5
# second
dev_manifest
:
data/manifest.tiny
max_input_len
:
20.0
# second
test_manifest
:
data/manifest.tiny
min_output_len
:
0.0
# tokens
min_input_len
:
0.5
# second
max_output_len
:
400.0
# tokens
max_input_len
:
20.0
# second
min_output_input_ratio
:
0.05
min_output_len
:
0.0
# tokens
max_output_input_ratio
:
10.0
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
mean_std_filepath
:
"
"
vocab_filepath
:
data/lang_char/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
augmentation_config
:
conf/preprocess.yaml
batch_size
:
4
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
delta_delta
:
False
dither
:
1.0
target_sample_rate
:
16000
max_freq
:
None
n_fft
:
None
stride_ms
:
10.0
window_ms
:
25.0
use_dB_normalization
:
True
target_dB
:
-20
random_seed
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
###########################################
# Dataloader #
###########################################
mean_std_filepath
:
"
"
vocab_filepath
:
data/lang_char/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
augmentation_config
:
conf/preprocess.yaml
batch_size
:
4
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
delta_delta
:
False
dither
:
1.0
target_sample_rate
:
16000
max_freq
:
None
n_fft
:
None
stride_ms
:
10.0
window_ms
:
25.0
use_dB_normalization
:
True
target_dB
:
-20
random_seed
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
# network architecture
model
:
############################################
cmvn_file
:
"
data/mean_std.json"
# Network Architecture #
cmvn_file_type
:
"
json"
############################################
# encoder related
cmvn_file
:
"
data/mean_std.json"
encoder
:
conformer
cmvn_file_type
:
"
json"
encoder_conf
:
# encoder related
encoder
:
conformer
encoder_conf
:
output_size
:
256
# dimension of attention
output_size
:
256
# dimension of attention
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
linear_units
:
2048
# the number of units of position-wise feed forward
...
@@ -58,9 +64,9 @@ model:
...
@@ -58,9 +64,9 @@ model:
pos_enc_layer_type
:
'
rel_pos'
pos_enc_layer_type
:
'
rel_pos'
selfattention_layer_type
:
'
rel_selfattn'
selfattention_layer_type
:
'
rel_selfattn'
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
...
@@ -69,48 +75,31 @@ model:
...
@@ -69,48 +75,31 @@ model:
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
training
:
###########################################
n_epoch
:
5
# training #
accum_grad
:
4
###########################################
global_grad_clip
:
5.0
n_epoch
:
5
optim
:
adam
accum_grad
:
4
optim_conf
:
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.002
lr
:
0.002
weight_decay
:
1e-06
weight_decay
:
1e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
lr_decay
:
1.0
lr_decay
:
1.0
log_interval
:
1
log_interval
:
1
checkpoint
:
checkpoint
:
kbest_n
:
10
kbest_n
:
10
latest_n
:
1
latest_n
:
1
decoding
:
batch_size
:
64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
examples/tiny/asr1/conf/transformer.yaml
浏览文件 @
41eeed04
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
data
:
###########################################
train_manifest
:
data/manifest.tiny
# Data #
dev_manifest
:
data/manifest.tiny
###########################################
test_manifest
:
data/manifest.tiny
train_manifest
:
data/manifest.tiny
min_input_len
:
0.5
# second
dev_manifest
:
data/manifest.tiny
max_input_len
:
20.0
# second
test_manifest
:
data/manifest.tiny
min_output_len
:
0.0
# tokens
min_input_len
:
0.5
# second
max_output_len
:
400.0
# tokens
max_input_len
:
20.0
# second
min_output_input_ratio
:
0.05
min_output_len
:
0.0
# tokens
max_output_input_ratio
:
10.0
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
###########################################
mean_std_filepath
:
data/mean_std.json
# Dataloader #
vocab_filepath
:
data/lang_char/vocab.txt
###########################################
unit_type
:
'
spm'
mean_std_filepath
:
data/mean_std.json
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
vocab_filepath
:
data/lang_char/vocab.txt
augmentation_config
:
conf/preprocess.yaml
unit_type
:
'
spm'
batch_size
:
4
spm_model_prefix
:
'
data/lang_char/bpe_unigram_200'
raw_wav
:
True
# use raw_wav or kaldi feature
augmentation_config
:
conf/preprocess.yaml
spectrum_type
:
fbank
#linear, mfcc, fbank
batch_size
:
4
feat_dim
:
80
raw_wav
:
True
# use raw_wav or kaldi feature
delta_delta
:
False
spectrum_type
:
fbank
#linear, mfcc, fbank
dither
:
1.0
feat_dim
:
80
target_sample_rate
:
16000
delta_delta
:
False
max_freq
:
None
dither
:
1.0
n_fft
:
None
target_sample_rate
:
16000
stride_ms
:
10.0
max_freq
:
None
window_ms
:
25.0
n_fft
:
None
use_dB_normalization
:
True
stride_ms
:
10.0
target_dB
:
-20
window_ms
:
25.0
random_seed
:
0
use_dB_normalization
:
True
keep_transcription_text
:
False
target_dB
:
-20
sortagrad
:
True
random_seed
:
0
shuffle_method
:
batch_shuffle
keep_transcription_text
:
False
num_workers
:
2
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
2
# network architecture
############################################
model
:
# Network Architecture #
cmvn_file
:
############################################
cmvn_file_type
:
"
json"
cmvn_file
:
# encoder related
cmvn_file_type
:
"
json"
encoder
:
transformer
# encoder related
encoder_conf
:
encoder
:
transformer
encoder_conf
:
output_size
:
256
# dimension of attention
output_size
:
256
# dimension of attention
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
linear_units
:
2048
# the number of units of position-wise feed forward
...
@@ -52,9 +57,9 @@ model:
...
@@ -52,9 +57,9 @@ model:
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before
:
true
normalize_before
:
true
# decoder related
# decoder related
decoder
:
transformer
decoder
:
transformer
decoder_conf
:
decoder_conf
:
attention_heads
:
4
attention_heads
:
4
linear_units
:
2048
linear_units
:
2048
num_blocks
:
6
num_blocks
:
6
...
@@ -63,48 +68,31 @@ model:
...
@@ -63,48 +68,31 @@ model:
self_attention_dropout_rate
:
0.0
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
# hybrid CTC/attention
model_conf
:
model_conf
:
ctc_weight
:
0.3
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
length_normalized_loss
:
false
training
:
###########################################
n_epoch
:
5
# training #
accum_grad
:
1
###########################################
global_grad_clip
:
5.0
n_epoch
:
5
optim
:
adam
accum_grad
:
1
optim_conf
:
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.002
lr
:
0.002
weight_decay
:
1e-06
weight_decay
:
1e-06
scheduler
:
warmuplr
scheduler
:
warmuplr
scheduler_conf
:
scheduler_conf
:
warmup_steps
:
25000
warmup_steps
:
25000
lr_decay
:
1.0
lr_decay
:
1.0
log_interval
:
1
log_interval
:
1
checkpoint
:
checkpoint
:
kbest_n
:
2
kbest_n
:
2
latest_n
:
1
latest_n
:
1
decoding
:
batch_size
:
8
#64
error_rate_type
:
wer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
False
# simulate streaming inference. Defaults to False.
paddlespeech/s2t/exps/u2/bin/alignment.py
浏览文件 @
41eeed04
...
@@ -46,7 +46,7 @@ if __name__ == "__main__":
...
@@ -46,7 +46,7 @@ if __name__ == "__main__":
if
args
.
decode_config
:
if
args
.
decode_config
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_config
)
decode_confs
.
merge_from_file
(
args
.
decode_config
)
config
.
decod
ing
=
decode_confs
config
.
decod
e
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
...
...
paddlespeech/s2t/exps/u2/bin/test.py
浏览文件 @
41eeed04
...
@@ -50,7 +50,7 @@ if __name__ == "__main__":
...
@@ -50,7 +50,7 @@ if __name__ == "__main__":
if
args
.
decode_config
:
if
args
.
decode_config
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_config
)
decode_confs
.
merge_from_file
(
args
.
decode_config
)
config
.
decod
ing
=
decode_confs
config
.
decod
e
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
...
...
paddlespeech/s2t/exps/u2/bin/test_wav.py
浏览文件 @
41eeed04
...
@@ -81,7 +81,7 @@ class U2Infer():
...
@@ -81,7 +81,7 @@ class U2Infer():
ilen
=
paddle
.
to_tensor
(
feat
.
shape
[
0
])
ilen
=
paddle
.
to_tensor
(
feat
.
shape
[
0
])
xs
=
paddle
.
to_tensor
(
feat
,
dtype
=
'float32'
).
unsqueeze
(
axis
=
0
)
xs
=
paddle
.
to_tensor
(
feat
,
dtype
=
'float32'
).
unsqueeze
(
axis
=
0
)
decode_config
=
self
.
config
.
decod
ing
decode_config
=
self
.
config
.
decod
e
result_transcripts
=
self
.
model
.
decode
(
result_transcripts
=
self
.
model
.
decode
(
xs
,
xs
,
ilen
,
ilen
,
...
@@ -135,7 +135,7 @@ if __name__ == "__main__":
...
@@ -135,7 +135,7 @@ if __name__ == "__main__":
if
args
.
decode_config
:
if
args
.
decode_config
:
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
=
CfgNode
(
new_allowed
=
True
)
decode_confs
.
merge_from_file
(
args
.
decode_config
)
decode_confs
.
merge_from_file
(
args
.
decode_config
)
config
.
decod
ing
=
decode_confs
config
.
decod
e
=
decode_confs
if
args
.
opts
:
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
config
.
freeze
()
...
...
paddlespeech/s2t/exps/u2/config.py
浏览文件 @
41eeed04
...
@@ -29,7 +29,7 @@ U2Model.params(_C)
...
@@ -29,7 +29,7 @@ U2Model.params(_C)
U2Trainer
.
params
(
_C
)
U2Trainer
.
params
(
_C
)
_C
.
decod
ing
=
U2Tester
.
params
()
_C
.
decod
e
=
U2Tester
.
params
()
def
get_cfg_defaults
():
def
get_cfg_defaults
():
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录