PaddlePaddle / DeepSpeech

Commit b7b1bda3
Authored Sep 28, 2021 by Hui Zhang
Parent: f628e218

    test refactor collator
Showing 47 changed files with 125 additions and 163 deletions (+125, -163).

Changed files:
deepspeech/exps/u2_st/model.py  (+2, -4)
deepspeech/frontend/featurizer/audio_featurizer.py  (+16, -16)
deepspeech/frontend/featurizer/speech_featurizer.py  (+3, -33)
deepspeech/frontend/utility.py  (+1, -0)
deepspeech/io/collator.py  (+58, -50)
deepspeech/io/dataset.py  (+3, -18)
deepspeech/io/reader.py  (+1, -1)
docs/src/data_preparation.md  (+1, -1)
docs/src/deepspeech_architecture.md  (+1, -1)
examples/1xt2x/aishell/conf/deepspeech2.yaml  (+1, -1)
examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml  (+1, -1)
examples/1xt2x/librispeech/conf/deepspeech2.yaml  (+1, -1)
examples/aishell/s0/conf/deepspeech2.yaml  (+1, -1)
examples/aishell/s0/conf/deepspeech2_online.yaml  (+1, -1)
examples/aishell/s0/local/data.sh  (+1, -1)
examples/aishell/s1/conf/chunk_conformer.yaml  (+1, -1)
examples/aishell/s1/conf/conformer.yaml  (+1, -1)
examples/aishell/s1/local/data.sh  (+1, -1)
examples/callcenter/s1/conf/chunk_conformer.yaml  (+1, -1)
examples/callcenter/s1/conf/conformer.yaml  (+1, -1)
examples/callcenter/s1/local/data.sh  (+1, -1)
examples/librispeech/s0/conf/deepspeech2.yaml  (+1, -1)
examples/librispeech/s0/conf/deepspeech2_online.yaml  (+1, -1)
examples/librispeech/s0/local/data.sh  (+1, -1)
examples/librispeech/s1/conf/chunk_conformer.yaml  (+1, -1)
examples/librispeech/s1/conf/chunk_transformer.yaml  (+1, -1)
examples/librispeech/s1/conf/conformer.yaml  (+1, -1)
examples/librispeech/s1/conf/transformer.yaml  (+1, -1)
examples/librispeech/s1/local/data.sh  (+1, -1)
examples/librispeech/s2/conf/chunk_conformer.yaml  (+1, -1)
examples/librispeech/s2/conf/chunk_transformer.yaml  (+1, -1)
examples/librispeech/s2/conf/conformer.yaml  (+1, -1)
examples/librispeech/s2/local/data.sh  (+1, -1)
examples/ted_en_zh/t0/conf/transformer.yaml  (+1, -1)
examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml  (+1, -1)
examples/ted_en_zh/t0/local/data.sh  (+1, -1)
examples/timit/s1/conf/transformer.yaml  (+1, -1)
examples/timit/s1/local/data.sh  (+1, -1)
examples/tiny/s0/conf/deepspeech2.yaml  (+1, -1)
examples/tiny/s0/conf/deepspeech2_online.yaml  (+1, -1)
examples/tiny/s0/local/data.sh  (+1, -1)
examples/tiny/s1/conf/chunk_confermer.yaml  (+1, -1)
examples/tiny/s1/conf/chunk_transformer.yaml  (+1, -1)
examples/tiny/s1/conf/conformer.yaml  (+1, -1)
examples/tiny/s1/conf/transformer.yaml  (+1, -1)
examples/tiny/s1/local/data.sh  (+1, -1)
utils/compute_mean_std.py  (+2, -2)
deepspeech/exps/u2_st/model.py

@@ -31,7 +31,6 @@ from yacs.config import CfgNode
 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.collator import TripletSpeechCollator
 from deepspeech.io.dataset import ManifestDataset
-from deepspeech.io.dataset import TripletManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2_st import U2STModel
@@ -249,12 +248,11 @@ class U2STTrainer(Trainer):
             config.collator.keep_transcription_text = False

             # train/valid dataset, return token ids
-            Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset
             config.data.manifest = config.data.train_manifest
-            train_dataset = Dataset.from_config(config)
+            train_dataset = ManifestDataset.from_config(config)
             config.data.manifest = config.data.dev_manifest
-            dev_dataset = Dataset.from_config(config)
+            dev_dataset = ManifestDataset.from_config(config)

             if config.model.model_conf.asr_weight > 0.:
                 Collator = TripletSpeechCollator
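Note: with TripletManifestDataset removed, the ST and joint ST+ASR setups now build the same ManifestDataset, and only the collator still switches on asr_weight. A minimal sketch of the resulting selection logic; the SpeechCollator fallback branch is an assumption (only the triplet branch is visible in the hunk above), though it is consistent with this file's imports.

```python
# Sketch, not verbatim repo code: the dataset class is now fixed,
# only the collator still varies with asr_weight.
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)  # always the plain dataset

if config.model.model_conf.asr_weight > 0.:
    Collator = TripletSpeechCollator  # batches 'text' and 'text1'
else:
    Collator = SpeechCollator         # assumed fallback: batches 'text' only
```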
deepspeech/frontend/featurizer/audio_featurizer.py

@@ -24,15 +24,15 @@ class AudioFeaturizer():
     Currently, it supports feature types of linear spectrogram and mfcc.

-    :param specgram_type: Specgram feature type. Options: 'linear'.
-    :type specgram_type: str
+    :param spectrum_type: Specgram feature type. Options: 'linear'.
+    :type spectrum_type: str
     :param stride_ms: Striding size (in milliseconds) for generating frames.
     :type stride_ms: float
     :param window_ms: Window size (in milliseconds) for generating frames.
     :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
+    :param max_freq: When spectrum_type is 'linear', only FFT bins
                      corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_feq is the
+                     returned; when spectrum_type is 'mfcc', max_feq is the
                      highest band edge of mel filters.
     :types max_freq: None|float
     :param target_sample_rate: Audio are resampled (if upsampling or
@@ -47,7 +47,7 @@ class AudioFeaturizer():
     """

     def __init__(self,
-                 specgram_type: str='linear',
+                 spectrum_type: str='linear',
                  feat_dim: int=None,
                  delta_delta: bool=False,
                  stride_ms=10.0,
@@ -58,7 +58,7 @@ class AudioFeaturizer():
                  use_dB_normalization=True,
                  target_dB=-20,
                  dither=1.0):
-        self._specgram_type = specgram_type
+        self._spectrum_type = spectrum_type
         # mfcc and fbank using `feat_dim`
         self._feat_dim = feat_dim
         # mfcc and fbank using `delta-delta`
@@ -113,27 +113,27 @@ class AudioFeaturizer():
     def feature_size(self):
         """audio feature size"""
         feat_dim = 0
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             fft_point = self._window_ms if self._fft_point is None else self._fft_point
             feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + 1)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             # mfcc, delta, delta-delta
             feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             # fbank, delta, delta-delta
             feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear." % self._spectrum_type)
         return feat_dim

     def _compute_specgram(self, audio_segment):
         """Extract various audio features."""
         sample_rate = audio_segment.sample_rate
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             samples = audio_segment.samples
             return self._compute_linear_specgram(samples,
@@ -141,7 +141,7 @@ class AudioFeaturizer():
                 stride_ms=self._stride_ms,
                 window_ms=self._window_ms,
                 max_freq=self._max_freq)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             samples = audio_segment.to('int16')
             return self._compute_mfcc(samples,
@@ -152,7 +152,7 @@ class AudioFeaturizer():
                 max_freq=self._max_freq,
                 dither=self._dither,
                 delta_delta=self._delta_delta)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             samples = audio_segment.to('int16')
             return self._compute_fbank(samples,
@@ -164,8 +164,8 @@ class AudioFeaturizer():
                 dither=self._dither,
                 delta_delta=self._delta_delta)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear." % self._spectrum_type)

     def _specgram_real(self, samples, window_size, stride_size, sample_rate):
         """Compute the spectrogram for samples from a real signal."""
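Note: the change in this file is a pure rename of specgram_type to spectrum_type (parameter, attribute, and error messages). Callers migrate by swapping the keyword; a construction sketch using the fbank settings from the s1 recipes below, assuming the remaining constructor defaults are kept and the repo is importable:

```python
# Migration sketch: `specgram_type` -> `spectrum_type`.
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer

featurizer = AudioFeaturizer(
    spectrum_type='fbank',  # was: specgram_type='fbank'
    feat_dim=80,
    delta_delta=False,
    stride_ms=10.0)
```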
deepspeech/frontend/featurizer/speech_featurizer.py

@@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

 class SpeechFeaturizer():
-    """Speech featurizer, for extracting features from both audio and transcript
-    contents of SpeechSegment.
-
-    Currently, for audio parts, it supports feature types of linear
-    spectrogram and mfcc; for transcript parts, it only supports char-level
-    tokenizing and conversion into a list of token indices. Note that the
-    token indexing order follows the given vocabulary file.
-
-    :param vocab_filepath: Filepath to load vocabulary for token indices
-                           conversion.
-    :type specgram_type: str
-    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
-    :type specgram_type: str
-    :param stride_ms: Striding size (in milliseconds) for generating frames.
-    :type stride_ms: float
-    :param window_ms: Window size (in milliseconds) for generating frames.
-    :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
-                     corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_freq is the
-                     highest band edge of mel filters.
-    :types max_freq: None|float
-    :param target_sample_rate: Speech are resampled (if upsampling or
-                               downsampling is allowed) to this before
-                               extracting spectrogram features.
-    :type target_sample_rate: float
-    :param use_dB_normalization: Whether to normalize the audio to a certain
-                                 decibels before extracting the features.
-    :type use_dB_normalization: bool
-    :param target_dB: Target audio decibels for normalization.
-    :type target_dB: float
+    """Speech and Text feature extraction.
     """

     def __init__(self,
                  unit_type,
                  vocab_filepath,
                  spm_model_prefix=None,
-                 specgram_type='linear',
+                 spectrum_type='linear',
                  feat_dim=None,
                  delta_delta=False,
                  stride_ms=10.0,
@@ -70,7 +40,7 @@ class SpeechFeaturizer():
         self.window_ms = window_ms

         self.audio_feature = AudioFeaturizer(
-            specgram_type=specgram_type,
+            spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
             stride_ms=stride_ms,
deepspeech/frontend/utility.py

@@ -15,6 +15,7 @@
 import json
 import math
+import tarfile
 from collections import namedtuple
 from typing import List
 from typing import Optional
 from typing import Text
deepspeech/io/collator.py

@@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]

 logger = Log(__name__).getlog()


+def tokenids(text, keep_transcription_text):
+    # for training text is token ids
+    tokens = text  # token ids
+    if keep_transcription_text:
+        # text is string, convert to unicode ord
+        assert isinstance(text, str), (type(text), text)
+        tokens = [ord(t) for t in text]
+
+    tokens = np.array(tokens, dtype=np.int64)
+    return tokens
+
+
 class SpeechCollatorBase():
     def __init__(self,
@@ -150,7 +163,6 @@ class SpeechCollatorBase():
         # extract speech feature
         spectrum, transcript_part = self._speech_featurizer.featurize(
             speech_segment, self.keep_transcription_text)
-
         # CMVN spectrum
         if self._normalizer:
             spectrum = self._normalizer.apply(spectrum)
@@ -163,38 +175,35 @@ class SpeechCollatorBase():
         """batch examples

         Args:
-            batch ([List]): batch is (audio, text)
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
                 audio (np.ndarray) shape (T, D)
                 text (List[int] or str): shape (U,)

         Returns:
-            tuple(audio, text, audio_lens, text_lens): batched data.
-                audio : (B, Tmax, D)
-                audio_lens: (B)
-                text : (B, Umax)
-                text_lens: (B)
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : (B, Umax)
+                olens: (B,)
         """
         audios = []
         audio_lens = []
         texts = []
         text_lens = []
         utts = []
-        for utt, audio, text in batch:
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+            audio = item['feat']
+            text = item['text']
             audio, text = self.process_utterance(audio, text)
-            #utt
-            utts.append(utt)
             # audio
             audios.append(audio)  # [T, D]
             audio_lens.append(audio.shape[0])
             # text
             # for training, text is token ids, else text is string, convert to unicode ord
-            tokens = []
-            if self.keep_transcription_text:
-                assert isinstance(text, str), (type(text), text)
-                tokens = [ord(t) for t in text]
-            else:
-                tokens = text  # token ids
-            tokens = np.array(tokens, dtype=np.int64)
+            tokens = tokenids(text, self.keep_transcription_text)
             texts.append(tokens)
             text_lens.append(tokens.shape[0])
@@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator):
         """batch examples

         Args:
-            batch ([List]): batch is (audio, text)
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
                 audio (np.ndarray) shape (T, D)
                 text (List[int] or str): shape (U,)

         Returns:
-            tuple(audio, text, audio_lens, text_lens): batched data.
-                audio : (B, Tmax, D)
-                audio_lens: (B)
-                text : (B, Umax)
-                text_lens: (B)
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : [(B, Umax), (B, Umax)]
+                olens: [(B,), (B,)]
         """
+        utts = []
         audios = []
         audio_lens = []
         translation_text = []
@@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator):
         transcription_text = []
         transcription_text_lens = []

-        utts = []
-        for utt, audio, translation, transcription in batch:
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+            audio = item['feat']
+            translation = item['text']
+            transcription = item['text1']
             audio, translation, transcription = self.process_utterance(
                 audio, translation, transcription)
-            #utt
-            utts.append(utt)
             # audio
             audios.append(audio)  # [T, D]
             audio_lens.append(audio.shape[0])
             # text
             # for training, text is token ids
             # else text is string, convert to unicode ord
             tokens = [[], []]
             for idx, text in enumerate([translation, transcription]):
-                if self.keep_transcription_text:
-                    assert isinstance(text, str), (type(text), text)
-                    tokens[idx] = [ord(t) for t in text]
-                else:
-                    tokens[idx] = text  # token ids
-                tokens[idx] = np.array(tokens[idx], dtype=np.int64)
+                tokens[idx] = tokenids(text, self.keep_transcription_text)

             translation_text.append(tokens[0])
             translation_text_lens.append(tokens[0].shape[0])
             transcription_text.append(tokens[1])
             transcription_text_lens.append(tokens[1].shape[0])

-        padded_audios = pad_sequence(
-            audios, padding_value=0.0).astype(np.float32)  #[B, T, D]
-        audio_lens = np.array(audio_lens).astype(np.int64)
-        padded_translation = pad_sequence(
-            translation_text, padding_value=IGNORE_ID).astype(np.int64)
+        xs_pad = pad_list(audios, 0.0).astype(np.float32)  #[B, T, D]
+        ilens = np.array(audio_lens).astype(np.int64)
+        padded_translation = pad_list(translation_text,
+                                      IGNORE_ID).astype(np.int64)
         translation_lens = np.array(translation_text_lens).astype(np.int64)
-        padded_transcription = pad_sequence(
-            transcription_text, padding_value=IGNORE_ID).astype(np.int64)
+        padded_transcription = pad_list(transcription_text,
+                                        IGNORE_ID).astype(np.int64)
         transcription_lens = np.array(transcription_text_lens).astype(np.int64)
-        return utts, padded_audios, audio_lens, (
-            padded_translation, padded_transcription), (translation_lens,
-                                                        transcription_lens)
+        ys_pad = (padded_translation, padded_transcription)
+        olens = (translation_lens, transcription_lens)
+        return utts, xs_pad, ilens, ys_pad, olens
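Note: the substantive cleanup here is twofold: the duplicated string-vs-token-ids conversion is factored into the module-level tokenids helper, and the triplet collator now pads with pad_list and returns a flat (utts, xs_pad, ilens, ys_pad, olens) tuple instead of nesting the padded texts inside the return expression. A standalone copy of the helper to show its behavior (numpy only, mirroring the hunk above):

```python
import numpy as np

def tokenids(text, keep_transcription_text):
    # for training, text already arrives as token ids
    tokens = text
    if keep_transcription_text:
        # text is a raw string; convert each char to its unicode code point
        assert isinstance(text, str), (type(text), text)
        tokens = [ord(t) for t in text]
    return np.array(tokens, dtype=np.int64)

print(tokenids([5, 9, 2], keep_transcription_text=False))  # [5 9 2]
print(tokenids("abc", keep_transcription_text=True))       # [97 98 99]
```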
deepspeech/io/dataset.py

@@ -19,7 +19,7 @@ from yacs.config import CfgNode
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log

-__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
+__all__ = ["ManifestDataset", "TransformDataset"]

 logger = Log(__name__).getlog()
@@ -107,21 +107,7 @@ class ManifestDataset(Dataset):
         return len(self._manifest)

     def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance["text"]
-
-
-class TripletManifestDataset(ManifestDataset):
-    """
-        For Joint Training of Speech Translation and ASR.
-        text: translation,
-        text1: transcript.
-    """
-
-    def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance["text"], instance[
-            "text1"]
+        return self._manifest[idx]


 class TransformDataset(Dataset):
@@ -273,5 +259,4 @@ class AudioDataset(Dataset):
         return len(self.minibatch)

     def __getitem__(self, idx):
-        instance = self.minibatch[idx]
-        return instance["utt"], instance["feat"], instance["text"]
+        return self.minibatch[idx]
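Note: with the tuple-returning __getitem__ gone, a dataset item is simply the manifest entry itself, and the collators look fields up by key ('utt', 'feat', 'text', plus 'text1' in the triplet case). A toy illustration of the new contract; the manifest entry below is invented:

```python
# Toy stand-in for the dict-returning __getitem__ introduced above.
toy_manifest = [
    {"utt": "utt_001", "feat": "path/to/utt_001.wav",
     "text": "translation ids or string", "text1": "transcript ids or string"},
]

class ToyManifestDataset:
    def __init__(self, manifest):
        self._manifest = manifest

    def __len__(self):
        return len(self._manifest)

    def __getitem__(self, idx):
        return self._manifest[idx]  # whole dict, as in the diff

item = ToyManifestDataset(toy_manifest)[0]
print(item["utt"], item["text1"])
```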
deepspeech/io/reader.py

@@ -322,7 +322,7 @@ class LoadInputsAndTargets():
                 "Not supported: loader_type={}".format(filetype))

     def file_type(self, filepath):
-        suffix = filepath.split(":")[0].split('.')[1]
+        suffix = filepath.split(":")[0].split('.')[-1]
         if suffix == 'ark':
             return 'mat'
         elif suffix == 'scp':
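Note: this one-character change fixes extension detection for Kaldi-style "path:offset" specifiers whose path contains more than one dot: split('.')[1] returns the first interior segment, not the extension. A quick standalone reproduction (the path is invented):

```python
filepath = "data/feats.train.ark:42"   # hypothetical specifier

base = filepath.split(":")[0]          # "data/feats.train.ark"
print(base.split('.')[1])              # "train"  (old behaviour, wrong)
print(base.split('.')[-1])             # "ark"    (new behaviour, correct)
```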
docs/src/data_preparation.md

@@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w
 ```bash
 python3 utils/compute_mean_std.py \
 --num_samples 2000 \
---specgram_type linear \
+--spectrum_type linear \
 --manifest_path examples/librispeech/data/manifest.train \
 --output_path examples/librispeech/data/mean_std.npz
 ```
docs/src/deepspeech_architecture.md

@@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute
 cd examples/aishell/s0
 python3 ../../../utils/compute_mean_std.py \
 --manifest_path="data/manifest.train.raw" \
---specgram_type="linear" \
+--spectrum_type="linear" \
 --delta_delta=false \
 --stride_ms=10.0 \
 --window_ms=20.0 \
examples/1xt2x/aishell/conf/deepspeech2.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

examples/1xt2x/librispeech/conf/deepspeech2.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

examples/aishell/s0/conf/deepspeech2.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

examples/aishell/s0/conf/deepspeech2_online.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear #linear, mfcc, fbank
+  spectrum_type: linear #linear, mfcc, fbank
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

examples/aishell/s0/local/data.sh

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --stride_ms=10.0 \
     --window_ms=20.0 \

examples/aishell/s1/conf/chunk_conformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/aishell/s1/conf/conformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/aishell/s1/local/data.sh

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \

examples/callcenter/s1/conf/chunk_conformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/callcenter/s1/conf/conformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/callcenter/s1/local/data.sh

@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \

examples/librispeech/s0/conf/deepspeech2.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None

examples/librispeech/s0/conf/deepspeech2_online.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None

examples/librispeech/s0/local/data.sh

@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=2000 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \

examples/librispeech/s1/conf/chunk_conformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/librispeech/s1/conf/chunk_transformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/librispeech/s1/conf/conformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/librispeech/s1/conf/transformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/librispeech/s1/local/data.sh

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

examples/librispeech/s2/conf/chunk_conformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/librispeech/s2/conf/chunk_transformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/librispeech/s2/conf/conformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/librispeech/s2/local/data.sh

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

examples/ted_en_zh/t0/conf/transformer.yaml

@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml

@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/ted_en_zh/t0/local/data.sh

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.train.raw" \
    --num_samples=-1 \
-   --specgram_type="fbank" \
+   --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
    --sample_rate=16000 \

examples/timit/s1/conf/transformer.yaml

@@ -17,7 +17,7 @@ collator:
   augmentation_config: ""
   batch_size: 64
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/timit/s1/local/data.sh

@@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.train.raw" \
    --num_samples=-1 \
-   --specgram_type="fbank" \
+   --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
    --sample_rate=16000 \

examples/tiny/s0/conf/deepspeech2.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

examples/tiny/s0/conf/deepspeech2_online.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

examples/tiny/s0/local/data.sh

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.tiny.raw" \
    --num_samples=64 \
-   --specgram_type="linear" \
+   --spectrum_type="linear" \
    --delta_delta=false \
    --sample_rate=16000 \
    --stride_ms=10.0 \

examples/tiny/s1/conf/chunk_confermer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/tiny/s1/conf/chunk_transformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/tiny/s1/conf/conformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/tiny/s1/conf/transformer.yaml

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True  # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

examples/tiny/s1/local/data.sh

@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.tiny.raw" \
    --num_samples=64 \
-   --specgram_type="fbank" \
+   --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
    --sample_rate=16000 \
utils/compute_mean_std.py

@@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('num_samples',      int,    2000,    "# of samples to for statistics.")
-add_arg('specgram_type',    str,
+add_arg('spectrum_type',    str,
         'linear',
         "Audio feature type. Options: linear, mfcc, fbank.",
         choices=['linear', 'mfcc', 'fbank'])
@@ -58,7 +58,7 @@ def main():
     augmentation_pipeline = AugmentationPipeline('{}')
     audio_featurizer = AudioFeaturizer(
-        specgram_type=args.specgram_type,
+        spectrum_type=args.spectrum_type,
         feat_dim=args.feat_dim,
         delta_delta=args.delta_delta,
         stride_ms=args.stride_ms,
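Note: since the rename lands in the argparse options as well, scripts that still pass --specgram_type will now fail at parse time. A standalone reduction of the renamed option (the repo's add_arg wrapper is simplified to plain argparse here):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--spectrum_type', type=str, default='linear',
                    choices=['linear', 'mfcc', 'fbank'],
                    help="Audio feature type. Options: linear, mfcc, fbank.")

args = parser.parse_args(['--spectrum_type', 'fbank'])
print(args.spectrum_type)  # fbank
# parser.parse_args(['--specgram_type', 'fbank']) now exits with
# "unrecognized arguments", the user-visible effect of the rename.
```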