Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
44743622
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
1 年多 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
44743622
编写于
11月 05, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
filter example; cmvn stride and window int; libri/s1 conf
上级
18d9abc7
变更
18
隐藏空白更改
内联
并排
Showing
18 changed file
with
195 addition
and
49 deletion
+195
-49
examples/aishell/s0/local/data.sh
examples/aishell/s0/local/data.sh
+2
-2
examples/aishell/s1/local/data.sh
examples/aishell/s1/local/data.sh
+2
-2
examples/callcenter/s1/local/data.sh
examples/callcenter/s1/local/data.sh
+2
-2
examples/dataset/librispeech/librispeech.py
examples/dataset/librispeech/librispeech.py
+5
-5
examples/librispeech/s0/local/data.sh
examples/librispeech/s0/local/data.sh
+2
-2
examples/librispeech/s1/conf/chunk_conformer.yaml
examples/librispeech/s1/conf/chunk_conformer.yaml
+2
-2
examples/librispeech/s1/conf/chunk_transformer.yaml
examples/librispeech/s1/conf/chunk_transformer.yaml
+2
-2
examples/librispeech/s1/conf/conformer.yaml
examples/librispeech/s1/conf/conformer.yaml
+2
-2
examples/librispeech/s1/conf/preprocess.yaml
examples/librispeech/s1/conf/preprocess.yaml
+29
-0
examples/librispeech/s1/conf/transformer.yaml
examples/librispeech/s1/conf/transformer.yaml
+2
-2
examples/librispeech/s1/local/data.sh
examples/librispeech/s1/local/data.sh
+30
-15
examples/ted_en_zh/t0/local/data.sh
examples/ted_en_zh/t0/local/data.sh
+2
-2
examples/timit/s1/local/data.sh
examples/timit/s1/local/data.sh
+2
-2
examples/tiny/s0/local/data.sh
examples/tiny/s0/local/data.sh
+2
-2
examples/tiny/s1/local/data.sh
examples/tiny/s1/local/data.sh
+2
-2
utils/compute_mean_std.py
utils/compute_mean_std.py
+4
-4
utils/format_data.py
utils/format_data.py
+1
-1
utils/remove_longshortdata.py
utils/remove_longshortdata.py
+102
-0
未找到文件。
examples/aishell/s0/local/data.sh
浏览文件 @
44743622
...
...
@@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path
=
"data/manifest.train.raw"
\
--spectrum_type
=
"linear"
\
--delta_delta
=
false
\
--stride_ms
=
10
.0
\
--window_ms
=
20
.0
\
--stride_ms
=
10
\
--window_ms
=
20
\
--sample_rate
=
16000
\
--use_dB_normalization
=
True
\
--num_samples
=
2000
\
...
...
examples/aishell/s1/local/data.sh
浏览文件 @
44743622
...
...
@@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type
=
"fbank"
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--stride_ms
=
10
.0
\
--window_ms
=
25
.0
\
--stride_ms
=
10
\
--window_ms
=
25
\
--sample_rate
=
16000
\
--use_dB_normalization
=
False
\
--num_samples
=
-1
\
...
...
examples/callcenter/s1/local/data.sh
浏览文件 @
44743622
...
...
@@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type
=
"fbank"
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--stride_ms
=
10
.0
\
--window_ms
=
25
.0
\
--stride_ms
=
10
\
--window_ms
=
25
\
--sample_rate
=
8000
\
--use_dB_normalization
=
False
\
--num_samples
=
-1
\
...
...
examples/dataset/librispeech/librispeech.py
浏览文件 @
44743622
...
...
@@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
print
(
"Creating manifest %s ..."
%
manifest_path
)
json_lines
=
[]
total_sec
=
0.0
total_
text
=
0.0
total_
char
=
0.0
total_num
=
0
for
subfolder
,
_
,
filelist
in
sorted
(
os
.
walk
(
data_dir
)):
...
...
@@ -89,7 +89,7 @@ def create_manifest(data_dir, manifest_path):
text_filepath
=
os
.
path
.
join
(
subfolder
,
text_filelist
[
0
])
for
line
in
io
.
open
(
text_filepath
,
encoding
=
"utf8"
):
segments
=
line
.
strip
().
split
()
n
_token
=
len
(
segments
[
1
:])
n
chars
=
len
(
segments
[
1
:])
text
=
' '
.
join
(
segments
[
1
:]).
lower
()
audio_filepath
=
os
.
path
.
abspath
(
...
...
@@ -110,7 +110,7 @@ def create_manifest(data_dir, manifest_path):
}))
total_sec
+=
duration
total_
text
+=
n_token
total_
char
+=
nchars
total_num
+=
1
with
codecs
.
open
(
manifest_path
,
'w'
,
'utf-8'
)
as
out_file
:
...
...
@@ -125,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
print
(
f
"
{
subset
}
:"
,
file
=
f
)
print
(
f
"
{
total_num
}
utts"
,
file
=
f
)
print
(
f
"
{
total_sec
/
(
60
*
60
)
}
h"
,
file
=
f
)
print
(
f
"
{
total_
text
}
text
"
,
file
=
f
)
print
(
f
"
{
total_
text
/
total_sec
}
text
/sec"
,
file
=
f
)
print
(
f
"
{
total_
char
}
char
"
,
file
=
f
)
print
(
f
"
{
total_
char
/
total_sec
}
char
/sec"
,
file
=
f
)
print
(
f
"
{
total_sec
/
total_num
}
sec/utt"
,
file
=
f
)
...
...
examples/librispeech/s0/local/data.sh
浏览文件 @
44743622
...
...
@@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type
=
"linear"
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--stride_ms
=
10
.0
\
--window_ms
=
20
.0
\
--stride_ms
=
10
\
--window_ms
=
20
\
--use_dB_normalization
=
True
\
--num_workers
=
${
num_workers
}
\
--output_path
=
"data/mean_std.json"
...
...
examples/librispeech/s1/conf/chunk_conformer.yaml
浏览文件 @
44743622
...
...
@@ -15,7 +15,7 @@ collator:
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/
augmentation.json
augmentation_config
:
conf/
preprocess.yaml
batch_size
:
16
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
...
...
@@ -38,7 +38,7 @@ collator:
# network architecture
model
:
cmvn_file
:
"
data/mean_std.json"
cmvn_file
:
cmvn_file_type
:
"
json"
# encoder related
encoder
:
conformer
...
...
examples/librispeech/s1/conf/chunk_transformer.yaml
浏览文件 @
44743622
...
...
@@ -15,7 +15,7 @@ collator:
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/
augmentation.json
augmentation_config
:
conf/
preprocess.yaml
batch_size
:
64
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
...
...
@@ -38,7 +38,7 @@ collator:
# network architecture
model
:
cmvn_file
:
"
data/mean_std.json"
cmvn_file
:
cmvn_file_type
:
"
json"
# encoder related
encoder
:
transformer
...
...
examples/librispeech/s1/conf/conformer.yaml
浏览文件 @
44743622
...
...
@@ -15,7 +15,7 @@ collator:
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/
augmentation.json
augmentation_config
:
conf/
preprocess.yaml
batch_size
:
16
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
...
...
@@ -38,7 +38,7 @@ collator:
# network architecture
model
:
cmvn_file
:
"
data/mean_std.json"
cmvn_file
:
cmvn_file_type
:
"
json"
# encoder related
encoder
:
conformer
...
...
examples/librispeech/s1/conf/preprocess.yaml
0 → 100644
浏览文件 @
44743622
process
:
# extract kaldi fbank from PCM
-
type
:
fbank_kaldi
fs
:
16000
n_mels
:
80
n_shift
:
160
win_length
:
400
dither
:
true
-
type
:
cmvn_json
cmvn_path
:
data/mean_std.json
# the following three processes are together known as SpecAugment
-
type
:
time_warp
max_time_warp
:
5
inplace
:
true
mode
:
PIL
-
type
:
freq_mask
F
:
30
n_mask
:
2
inplace
:
true
replace_with_zero
:
false
-
type
:
time_mask
T
:
40
n_mask
:
2
inplace
:
true
replace_with_zero
:
false
examples/librispeech/s1/conf/transformer.yaml
浏览文件 @
44743622
...
...
@@ -15,7 +15,7 @@ collator:
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/
augmentation.json
augmentation_config
:
conf/
preprocess.yaml
batch_size
:
32
raw_wav
:
True
# use raw_wav or kaldi feature
spectrum_type
:
fbank
#linear, mfcc, fbank
...
...
@@ -38,7 +38,7 @@ collator:
# network architecture
model
:
cmvn_file
:
"
data/mean_std.json"
cmvn_file
:
cmvn_file_type
:
"
json"
# encoder related
encoder
:
transformer
...
...
examples/librispeech/s1/local/data.sh
浏览文件 @
44743622
...
...
@@ -8,6 +8,11 @@ nbpe=5000
bpemode
=
unigram
bpeprefix
=
"data/bpe_
${
bpemode
}
_
${
nbpe
}
"
stride_ms
=
10
window_ms
=
25
sample_rate
=
16000
feat_dim
=
80
source
${
MAIN_ROOT
}
/utils/parse_options.sh
...
...
@@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
exit
1
fi
for
s
et
in
train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other
;
do
mv
data/manifest.
${
s
et
}
data/manifest.
${
set
}
.raw
for
s
ub
in
train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other
;
do
mv
data/manifest.
${
s
ub
}
data/manifest.
${
sub
}
.raw
done
rm
-rf
data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for
s
et
in
train-clean-100 train-clean-360 train-other-500
;
do
cat
data/manifest.
${
s
et
}
.raw
>>
data/manifest.train.raw
for
s
ub
in
train-clean-100 train-clean-360 train-other-500
;
do
cat
data/manifest.
${
s
ub
}
.raw
>>
data/manifest.train.raw
done
for
s
et
in
dev-clean dev-other
;
do
cat
data/manifest.
${
s
et
}
.raw
>>
data/manifest.dev.raw
for
s
ub
in
dev-clean dev-other
;
do
cat
data/manifest.
${
s
ub
}
.raw
>>
data/manifest.dev.raw
done
for
s
et
in
test-clean test-other
;
do
cat
data/manifest.
${
s
et
}
.raw
>>
data/manifest.test.raw
for
s
ub
in
test-clean test-other
;
do
cat
data/manifest.
${
s
ub
}
.raw
>>
data/manifest.test.raw
done
fi
...
...
@@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path
=
"data/manifest.train.raw"
\
--num_samples
=
-1
\
--spectrum_type
=
"fbank"
\
--feat_dim
=
80
\
--feat_dim
=
${
feat_dim
}
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--stride_ms
=
10.0
\
--window_ms
=
25.0
\
--sample_rate
=
${
sample_rate
}
\
--stride_ms
=
${
stride_ms
}
\
--window_ms
=
${
window_ms
}
\
--use_dB_normalization
=
False
\
--num_workers
=
${
num_workers
}
\
--output_path
=
"data/mean_std.json"
...
...
@@ -85,15 +90,15 @@ fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# format manifest with tokenids, vocab size
for
s
et
in
train dev
test
dev-clean dev-other test-clean test-other
;
do
for
s
ub
in
train dev
test
dev-clean dev-other test-clean test-other
;
do
{
python3
${
MAIN_ROOT
}
/utils/format_data.py
\
--cmvn_path
"data/mean_std.json"
\
--unit_type
"spm"
\
--spm_model_prefix
${
bpeprefix
}
\
--vocab_path
=
"data/vocab.txt"
\
--manifest_path
=
"data/manifest.
${
s
et
}
.raw"
\
--output_path
=
"data/manifest.
${
s
et
}
"
--manifest_path
=
"data/manifest.
${
s
ub
}
.raw"
\
--output_path
=
"data/manifest.
${
s
ub
}
"
if
[
$?
-ne
0
]
;
then
echo
"Formt mnaifest failed. Terminated."
...
...
@@ -102,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
}
&
done
wait
for
sub
in
train dev
;
do
mv
data/manifest.
${
sub
}
data/manifest.
${
sub
}
.fmt
done
fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
for
sub
in
train dev
;
do
remove_longshortdata.py
--maxframes
3000
--maxchars
400
--stride_ms
${
stride_ms
}
data/manifest.
${
sub
}
.fmt data/manifest.
${
sub
}
done
fi
echo
"LibriSpeech Data preparation done."
...
...
examples/ted_en_zh/t0/local/data.sh
浏览文件 @
44743622
...
...
@@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim
=
80
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--stride_ms
=
10
.0
\
--window_ms
=
25
.0
\
--stride_ms
=
10
\
--window_ms
=
25
\
--use_dB_normalization
=
False
\
--num_workers
=
${
num_workers
}
\
--output_path
=
"data/mean_std.json"
...
...
examples/timit/s1/local/data.sh
浏览文件 @
44743622
...
...
@@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim
=
80
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--stride_ms
=
10
.0
\
--window_ms
=
25
.0
\
--stride_ms
=
10
\
--window_ms
=
25
\
--use_dB_normalization
=
False
\
--num_workers
=
${
num_workers
}
\
--output_path
=
"data/mean_std.json"
...
...
examples/tiny/s0/local/data.sh
浏览文件 @
44743622
...
...
@@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type
=
"linear"
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--stride_ms
=
10
.0
\
--window_ms
=
20
.0
\
--stride_ms
=
10
\
--window_ms
=
20
\
--use_dB_normalization
=
False
\
--num_workers
=
2
\
--output_path
=
"data/mean_std.json"
...
...
examples/tiny/s1/local/data.sh
浏览文件 @
44743622
...
...
@@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim
=
80
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--stride_ms
=
10
.0
\
--window_ms
=
25
.0
\
--stride_ms
=
10
\
--window_ms
=
25
\
--use_dB_normalization
=
False
\
--num_workers
=
2
\
--output_path
=
"data/mean_std.json"
...
...
utils/compute_mean_std.py
浏览文件 @
44743622
...
...
@@ -33,8 +33,8 @@ add_arg('spectrum_type', str,
choices
=
[
'linear'
,
'mfcc'
,
'fbank'
])
add_arg
(
'feat_dim'
,
int
,
13
,
"Audio feature dim."
)
add_arg
(
'delta_delta'
,
bool
,
False
,
"Audio feature with delta delta."
)
add_arg
(
'stride_ms'
,
float
,
10.
0
,
"stride length in ms."
)
add_arg
(
'window_ms'
,
float
,
20.
0
,
"stride length in ms."
)
add_arg
(
'stride_ms'
,
int
,
1
0
,
"stride length in ms."
)
add_arg
(
'window_ms'
,
int
,
2
0
,
"stride length in ms."
)
add_arg
(
'sample_rate'
,
int
,
16000
,
"target sample rate."
)
add_arg
(
'use_dB_normalization'
,
bool
,
True
,
"do dB normalization."
)
add_arg
(
'target_dB'
,
int
,
-
20
,
"target dB."
)
...
...
@@ -61,8 +61,8 @@ def main():
spectrum_type
=
args
.
spectrum_type
,
feat_dim
=
args
.
feat_dim
,
delta_delta
=
args
.
delta_delta
,
stride_ms
=
args
.
stride_ms
,
window_ms
=
args
.
window_ms
,
stride_ms
=
float
(
args
.
stride_ms
)
,
window_ms
=
float
(
args
.
window_ms
)
,
n_fft
=
None
,
max_freq
=
None
,
target_sample_rate
=
args
.
sample_rate
,
...
...
utils/format_data.py
浏览文件 @
44743622
...
...
@@ -122,7 +122,7 @@ def main():
fout
.
write
(
json
.
dumps
(
output_json
)
+
'
\n
'
)
count
+=
1
print
(
f
"Examples number:
{
count
}
"
)
print
(
f
"
{
args
.
manifest_paths
}
Examples number:
{
count
}
"
)
fout
.
close
()
...
...
utils/remove_longshortdata.py
0 → 100755
浏览文件 @
44743622
#!/usr/bin/env python3
"""remove longshort data from manifest"""
import
logging
import
argparse
import
jsonlines
from
paddlespeech.s2t.utils.cli_utils
import
get_commandline_args
# manifest after format
# each jsonlines record looks like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
def get_parser():
    """Build the command-line parser for the manifest length filter.

    The parser accepts a verbosity flag, a set of integer options
    (input/output index, frame limits, character limits, stride) and two
    positional paths: the manifest to read and the manifest to write.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser(
        description="remove longshort data from format manifest",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument(
        "--verbose", "-V", default=0, type=int, help="Verbose option")

    # Every remaining option is an integer; register them from a table so
    # the flag, its default and its help text sit side by side.
    int_options = (
        ("--iaxis", 0, "multi inputs index, 0 is the first"),
        ("--oaxis", 0, "multi outputs index, 0 is the first"),
        ("--maxframes", 2000, "maxframes"),
        ("--minframes", 10, "minframes"),
        ("--maxchars", 200, "max tokens"),
        ("--minchars", 0, "min tokens"),
        ("--stride_ms", 10, "stride in ms unit."),
    )
    for flag, default_value, help_text in int_options:
        parser.add_argument(
            flag, default=default_value, type=int, help=help_text)

    # Positional arguments: source manifest first, destination second.
    positionals = (
        ("rspecifier", "jsonl format manifest. e.g. manifest.jsonl"),
        ("wspecifier_or_wxfilename", "Write specifier. e.g. manifest.jsonl"),
    )
    for dest, help_text in positionals:
        parser.add_argument(dest, type=str, help=help_text)

    return parser
def filter_input(args, line):
    """Tell whether an utterance's input length is outside the frame limits.

    Args:
        args: namespace providing ``iaxis``, ``sound``, ``stride_ms``,
            ``minframes`` and ``maxframes``.
        line: one manifest record; ``line['input'][args.iaxis]['shape'][0]``
            is used as the input length.

    Returns:
        bool: True when the utterance should be dropped, i.e. its frame
        count is below ``args.minframes`` or above ``args.maxframes``.
    """
    entry = line['input'][args.iaxis]
    # second to frame: for sound inputs the length is converted with the
    # stride (length * 1000 / stride_ms); otherwise it is used as-is.
    nframe = (entry['shape'][0] * 1000 / args.stride_ms
              if args.sound else entry['shape'][0])
    return nframe < args.minframes or nframe > args.maxframes
def filter_output(args, line):
    """Tell whether an utterance's target text length is outside the limits.

    The length is the number of characters in
    ``line['output'][args.oaxis]['text']``.  The output list is indexed
    with ``args.oaxis`` (the "multi outputs index" option); indexing it
    with the input index would ignore ``--oaxis`` and measure the wrong
    entry whenever the two indices differ.

    Args:
        args: namespace providing ``oaxis``, ``minchars`` and ``maxchars``.
        line: one manifest record with an ``'output'`` list.

    Returns:
        bool: True when the utterance should be dropped, i.e. the text is
        shorter than ``args.minchars`` or longer than ``args.maxchars``.
    """
    nchars = len(line['output'][args.oaxis]['text'])
    return nchars < args.minchars or nchars > args.maxchars
def main():
    """Filter a jsonlines manifest by input and output length.

    Reads every record from ``args.rspecifier``, drops those for which
    ``filter_input`` or ``filter_output`` returns True, and writes the
    remaining records to ``args.wspecifier_or_wxfilename``.  Logging is at
    INFO level when ``--verbose`` is positive and WARN otherwise.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=level, format=logfmt)
    logging.info(get_commandline_args())

    with jsonlines.open(args.rspecifier, 'r') as reader:
        lines = list(reader)
    logging.info(f"Example: {len(lines)}")

    # filter_input reads args.sound, so it must always be set (the attribute
    # name has to match exactly).  An empty manifest has no first record to
    # inspect; keep the default and simply write an empty output.
    args.sound = False
    if lines:
        feat = lines[0]['input'][args.iaxis]['feat']
        # Take the extension after the last dot and drop a trailing
        # ":offset" part (e.g. "xxx.ark:123"), then compare against the
        # exact extensions rather than doing a substring match.
        extension = feat.split('.')[-1].split(':')[0]
        args.sound = extension not in ('ark', 'scp')

    count = 0
    n_filtered = 0  # avoid shadowing the builtin ``filter``
    with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
        for line in lines:
            if filter_input(args, line) or filter_output(args, line):
                n_filtered += 1
                continue
            writer.write(line)
            count += 1
    # Same message as before ("<kept>\<filtered>"), with the backslash
    # written as a valid escape sequence.
    logging.info(f"Example after filter: {count}\\{n_filtered}")


if __name__ == '__main__':
    main()
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录