Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
b7b1bda3
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
接近 2 年 前同步成功
通知
209
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b7b1bda3
编写于
9月 28, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
test refactor collator
上级
f628e218
变更
47
隐藏空白更改
内联
并排
Showing
47 changed file
with
125 addition
and
163 deletion
+125
-163
deepspeech/exps/u2_st/model.py
deepspeech/exps/u2_st/model.py
+2
-4
deepspeech/frontend/featurizer/audio_featurizer.py
deepspeech/frontend/featurizer/audio_featurizer.py
+16
-16
deepspeech/frontend/featurizer/speech_featurizer.py
deepspeech/frontend/featurizer/speech_featurizer.py
+3
-33
deepspeech/frontend/utility.py
deepspeech/frontend/utility.py
+1
-0
deepspeech/io/collator.py
deepspeech/io/collator.py
+58
-50
deepspeech/io/dataset.py
deepspeech/io/dataset.py
+3
-18
deepspeech/io/reader.py
deepspeech/io/reader.py
+1
-1
docs/src/data_preparation.md
docs/src/data_preparation.md
+1
-1
docs/src/deepspeech_architecture.md
docs/src/deepspeech_architecture.md
+1
-1
examples/1xt2x/aishell/conf/deepspeech2.yaml
examples/1xt2x/aishell/conf/deepspeech2.yaml
+1
-1
examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml
examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml
+1
-1
examples/1xt2x/librispeech/conf/deepspeech2.yaml
examples/1xt2x/librispeech/conf/deepspeech2.yaml
+1
-1
examples/aishell/s0/conf/deepspeech2.yaml
examples/aishell/s0/conf/deepspeech2.yaml
+1
-1
examples/aishell/s0/conf/deepspeech2_online.yaml
examples/aishell/s0/conf/deepspeech2_online.yaml
+1
-1
examples/aishell/s0/local/data.sh
examples/aishell/s0/local/data.sh
+1
-1
examples/aishell/s1/conf/chunk_conformer.yaml
examples/aishell/s1/conf/chunk_conformer.yaml
+1
-1
examples/aishell/s1/conf/conformer.yaml
examples/aishell/s1/conf/conformer.yaml
+1
-1
examples/aishell/s1/local/data.sh
examples/aishell/s1/local/data.sh
+1
-1
examples/callcenter/s1/conf/chunk_conformer.yaml
examples/callcenter/s1/conf/chunk_conformer.yaml
+1
-1
examples/callcenter/s1/conf/conformer.yaml
examples/callcenter/s1/conf/conformer.yaml
+1
-1
examples/callcenter/s1/local/data.sh
examples/callcenter/s1/local/data.sh
+1
-1
examples/librispeech/s0/conf/deepspeech2.yaml
examples/librispeech/s0/conf/deepspeech2.yaml
+1
-1
examples/librispeech/s0/conf/deepspeech2_online.yaml
examples/librispeech/s0/conf/deepspeech2_online.yaml
+1
-1
examples/librispeech/s0/local/data.sh
examples/librispeech/s0/local/data.sh
+1
-1
examples/librispeech/s1/conf/chunk_conformer.yaml
examples/librispeech/s1/conf/chunk_conformer.yaml
+1
-1
examples/librispeech/s1/conf/chunk_transformer.yaml
examples/librispeech/s1/conf/chunk_transformer.yaml
+1
-1
examples/librispeech/s1/conf/conformer.yaml
examples/librispeech/s1/conf/conformer.yaml
+1
-1
examples/librispeech/s1/conf/transformer.yaml
examples/librispeech/s1/conf/transformer.yaml
+1
-1
examples/librispeech/s1/local/data.sh
examples/librispeech/s1/local/data.sh
+1
-1
examples/librispeech/s2/conf/chunk_conformer.yaml
examples/librispeech/s2/conf/chunk_conformer.yaml
+1
-1
examples/librispeech/s2/conf/chunk_transformer.yaml
examples/librispeech/s2/conf/chunk_transformer.yaml
+1
-1
examples/librispeech/s2/conf/conformer.yaml
examples/librispeech/s2/conf/conformer.yaml
+1
-1
examples/librispeech/s2/local/data.sh
examples/librispeech/s2/local/data.sh
+1
-1
examples/ted_en_zh/t0/conf/transformer.yaml
examples/ted_en_zh/t0/conf/transformer.yaml
+1
-1
examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
+1
-1
examples/ted_en_zh/t0/local/data.sh
examples/ted_en_zh/t0/local/data.sh
+1
-1
examples/timit/s1/conf/transformer.yaml
examples/timit/s1/conf/transformer.yaml
+1
-1
examples/timit/s1/local/data.sh
examples/timit/s1/local/data.sh
+1
-1
examples/tiny/s0/conf/deepspeech2.yaml
examples/tiny/s0/conf/deepspeech2.yaml
+1
-1
examples/tiny/s0/conf/deepspeech2_online.yaml
examples/tiny/s0/conf/deepspeech2_online.yaml
+1
-1
examples/tiny/s0/local/data.sh
examples/tiny/s0/local/data.sh
+1
-1
examples/tiny/s1/conf/chunk_confermer.yaml
examples/tiny/s1/conf/chunk_confermer.yaml
+1
-1
examples/tiny/s1/conf/chunk_transformer.yaml
examples/tiny/s1/conf/chunk_transformer.yaml
+1
-1
examples/tiny/s1/conf/conformer.yaml
examples/tiny/s1/conf/conformer.yaml
+1
-1
examples/tiny/s1/conf/transformer.yaml
examples/tiny/s1/conf/transformer.yaml
+1
-1
examples/tiny/s1/local/data.sh
examples/tiny/s1/local/data.sh
+1
-1
utils/compute_mean_std.py
utils/compute_mean_std.py
+2
-2
未找到文件。
deepspeech/exps/u2_st/model.py
浏览文件 @
b7b1bda3
...
@@ -31,7 +31,6 @@ from yacs.config import CfgNode
...
@@ -31,7 +31,6 @@ from yacs.config import CfgNode
from
deepspeech.io.collator
import
SpeechCollator
from
deepspeech.io.collator
import
SpeechCollator
from
deepspeech.io.collator
import
TripletSpeechCollator
from
deepspeech.io.collator
import
TripletSpeechCollator
from
deepspeech.io.dataset
import
ManifestDataset
from
deepspeech.io.dataset
import
ManifestDataset
from
deepspeech.io.dataset
import
TripletManifestDataset
from
deepspeech.io.sampler
import
SortagradBatchSampler
from
deepspeech.io.sampler
import
SortagradBatchSampler
from
deepspeech.io.sampler
import
SortagradDistributedBatchSampler
from
deepspeech.io.sampler
import
SortagradDistributedBatchSampler
from
deepspeech.models.u2_st
import
U2STModel
from
deepspeech.models.u2_st
import
U2STModel
...
@@ -249,12 +248,11 @@ class U2STTrainer(Trainer):
...
@@ -249,12 +248,11 @@ class U2STTrainer(Trainer):
config
.
collator
.
keep_transcription_text
=
False
config
.
collator
.
keep_transcription_text
=
False
# train/valid dataset, return token ids
# train/valid dataset, return token ids
Dataset
=
TripletManifestDataset
if
config
.
model
.
model_conf
.
asr_weight
>
0.
else
ManifestDataset
config
.
data
.
manifest
=
config
.
data
.
train_manifest
config
.
data
.
manifest
=
config
.
data
.
train_manifest
train_dataset
=
Dataset
.
from_config
(
config
)
train_dataset
=
Manifest
Dataset
.
from_config
(
config
)
config
.
data
.
manifest
=
config
.
data
.
dev_manifest
config
.
data
.
manifest
=
config
.
data
.
dev_manifest
dev_dataset
=
Dataset
.
from_config
(
config
)
dev_dataset
=
Manifest
Dataset
.
from_config
(
config
)
if
config
.
model
.
model_conf
.
asr_weight
>
0.
:
if
config
.
model
.
model_conf
.
asr_weight
>
0.
:
Collator
=
TripletSpeechCollator
Collator
=
TripletSpeechCollator
...
...
deepspeech/frontend/featurizer/audio_featurizer.py
浏览文件 @
b7b1bda3
...
@@ -24,15 +24,15 @@ class AudioFeaturizer():
...
@@ -24,15 +24,15 @@ class AudioFeaturizer():
Currently, it supports feature types of linear spectrogram and mfcc.
Currently, it supports feature types of linear spectrogram and mfcc.
:param spec
gra
m_type: Specgram feature type. Options: 'linear'.
:param spec
tru
m_type: Specgram feature type. Options: 'linear'.
:type spec
gra
m_type: str
:type spec
tru
m_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:type window_ms: float
:param max_freq: When spec
gra
m_type is 'linear', only FFT bins
:param max_freq: When spec
tru
m_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
corresponding to frequencies between [0, max_freq] are
returned; when spec
gra
m_type is 'mfcc', max_feq is the
returned; when spec
tru
m_type is 'mfcc', max_feq is the
highest band edge of mel filters.
highest band edge of mel filters.
:types max_freq: None|float
:types max_freq: None|float
:param target_sample_rate: Audio are resampled (if upsampling or
:param target_sample_rate: Audio are resampled (if upsampling or
...
@@ -47,7 +47,7 @@ class AudioFeaturizer():
...
@@ -47,7 +47,7 @@ class AudioFeaturizer():
"""
"""
def
__init__
(
self
,
def
__init__
(
self
,
spec
gra
m_type
:
str
=
'linear'
,
spec
tru
m_type
:
str
=
'linear'
,
feat_dim
:
int
=
None
,
feat_dim
:
int
=
None
,
delta_delta
:
bool
=
False
,
delta_delta
:
bool
=
False
,
stride_ms
=
10.0
,
stride_ms
=
10.0
,
...
@@ -58,7 +58,7 @@ class AudioFeaturizer():
...
@@ -58,7 +58,7 @@ class AudioFeaturizer():
use_dB_normalization
=
True
,
use_dB_normalization
=
True
,
target_dB
=-
20
,
target_dB
=-
20
,
dither
=
1.0
):
dither
=
1.0
):
self
.
_spec
gram_type
=
specgra
m_type
self
.
_spec
trum_type
=
spectru
m_type
# mfcc and fbank using `feat_dim`
# mfcc and fbank using `feat_dim`
self
.
_feat_dim
=
feat_dim
self
.
_feat_dim
=
feat_dim
# mfcc and fbank using `delta-delta`
# mfcc and fbank using `delta-delta`
...
@@ -113,27 +113,27 @@ class AudioFeaturizer():
...
@@ -113,27 +113,27 @@ class AudioFeaturizer():
def
feature_size
(
self
):
def
feature_size
(
self
):
"""audio feature size"""
"""audio feature size"""
feat_dim
=
0
feat_dim
=
0
if
self
.
_spec
gra
m_type
==
'linear'
:
if
self
.
_spec
tru
m_type
==
'linear'
:
fft_point
=
self
.
_window_ms
if
self
.
_fft_point
is
None
else
self
.
_fft_point
fft_point
=
self
.
_window_ms
if
self
.
_fft_point
is
None
else
self
.
_fft_point
feat_dim
=
int
(
fft_point
*
(
self
.
_target_sample_rate
/
1000
)
/
2
+
feat_dim
=
int
(
fft_point
*
(
self
.
_target_sample_rate
/
1000
)
/
2
+
1
)
1
)
elif
self
.
_spec
gra
m_type
==
'mfcc'
:
elif
self
.
_spec
tru
m_type
==
'mfcc'
:
# mfcc, delta, delta-delta
# mfcc, delta, delta-delta
feat_dim
=
int
(
self
.
_feat_dim
*
feat_dim
=
int
(
self
.
_feat_dim
*
3
)
if
self
.
_delta_delta
else
int
(
self
.
_feat_dim
)
3
)
if
self
.
_delta_delta
else
int
(
self
.
_feat_dim
)
elif
self
.
_spec
gra
m_type
==
'fbank'
:
elif
self
.
_spec
tru
m_type
==
'fbank'
:
# fbank, delta, delta-delta
# fbank, delta, delta-delta
feat_dim
=
int
(
self
.
_feat_dim
*
feat_dim
=
int
(
self
.
_feat_dim
*
3
)
if
self
.
_delta_delta
else
int
(
self
.
_feat_dim
)
3
)
if
self
.
_delta_delta
else
int
(
self
.
_feat_dim
)
else
:
else
:
raise
ValueError
(
"Unknown spec
gra
m_type %s. "
raise
ValueError
(
"Unknown spec
tru
m_type %s. "
"Supported values: linear."
%
self
.
_spec
gra
m_type
)
"Supported values: linear."
%
self
.
_spec
tru
m_type
)
return
feat_dim
return
feat_dim
def
_compute_specgram
(
self
,
audio_segment
):
def
_compute_specgram
(
self
,
audio_segment
):
"""Extract various audio features."""
"""Extract various audio features."""
sample_rate
=
audio_segment
.
sample_rate
sample_rate
=
audio_segment
.
sample_rate
if
self
.
_spec
gra
m_type
==
'linear'
:
if
self
.
_spec
tru
m_type
==
'linear'
:
samples
=
audio_segment
.
samples
samples
=
audio_segment
.
samples
return
self
.
_compute_linear_specgram
(
return
self
.
_compute_linear_specgram
(
samples
,
samples
,
...
@@ -141,7 +141,7 @@ class AudioFeaturizer():
...
@@ -141,7 +141,7 @@ class AudioFeaturizer():
stride_ms
=
self
.
_stride_ms
,
stride_ms
=
self
.
_stride_ms
,
window_ms
=
self
.
_window_ms
,
window_ms
=
self
.
_window_ms
,
max_freq
=
self
.
_max_freq
)
max_freq
=
self
.
_max_freq
)
elif
self
.
_spec
gra
m_type
==
'mfcc'
:
elif
self
.
_spec
tru
m_type
==
'mfcc'
:
samples
=
audio_segment
.
to
(
'int16'
)
samples
=
audio_segment
.
to
(
'int16'
)
return
self
.
_compute_mfcc
(
return
self
.
_compute_mfcc
(
samples
,
samples
,
...
@@ -152,7 +152,7 @@ class AudioFeaturizer():
...
@@ -152,7 +152,7 @@ class AudioFeaturizer():
max_freq
=
self
.
_max_freq
,
max_freq
=
self
.
_max_freq
,
dither
=
self
.
_dither
,
dither
=
self
.
_dither
,
delta_delta
=
self
.
_delta_delta
)
delta_delta
=
self
.
_delta_delta
)
elif
self
.
_spec
gra
m_type
==
'fbank'
:
elif
self
.
_spec
tru
m_type
==
'fbank'
:
samples
=
audio_segment
.
to
(
'int16'
)
samples
=
audio_segment
.
to
(
'int16'
)
return
self
.
_compute_fbank
(
return
self
.
_compute_fbank
(
samples
,
samples
,
...
@@ -164,8 +164,8 @@ class AudioFeaturizer():
...
@@ -164,8 +164,8 @@ class AudioFeaturizer():
dither
=
self
.
_dither
,
dither
=
self
.
_dither
,
delta_delta
=
self
.
_delta_delta
)
delta_delta
=
self
.
_delta_delta
)
else
:
else
:
raise
ValueError
(
"Unknown spec
gra
m_type %s. "
raise
ValueError
(
"Unknown spec
tru
m_type %s. "
"Supported values: linear."
%
self
.
_spec
gra
m_type
)
"Supported values: linear."
%
self
.
_spec
tru
m_type
)
def
_specgram_real
(
self
,
samples
,
window_size
,
stride_size
,
sample_rate
):
def
_specgram_real
(
self
,
samples
,
window_size
,
stride_size
,
sample_rate
):
"""Compute the spectrogram for samples from a real signal."""
"""Compute the spectrogram for samples from a real signal."""
...
...
deepspeech/frontend/featurizer/speech_featurizer.py
浏览文件 @
b7b1bda3
...
@@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
...
@@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
class
SpeechFeaturizer
():
class
SpeechFeaturizer
():
"""Speech featurizer, for extracting features from both audio and transcript
"""Speech and Text feature extraction.
contents of SpeechSegment.
Currently, for audio parts, it supports feature types of linear
spectrogram and mfcc; for transcript parts, it only supports char-level
tokenizing and conversion into a list of token indices. Note that the
token indexing order follows the given vocabulary file.
:param vocab_filepath: Filepath to load vocabulary for token indices
conversion.
:type specgram_type: str
:param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
:type specgram_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: When specgram_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned; when specgram_type is 'mfcc', max_freq is the
highest band edge of mel filters.
:types max_freq: None|float
:param target_sample_rate: Speech are resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibels before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""
"""
def
__init__
(
self
,
def
__init__
(
self
,
unit_type
,
unit_type
,
vocab_filepath
,
vocab_filepath
,
spm_model_prefix
=
None
,
spm_model_prefix
=
None
,
spec
gra
m_type
=
'linear'
,
spec
tru
m_type
=
'linear'
,
feat_dim
=
None
,
feat_dim
=
None
,
delta_delta
=
False
,
delta_delta
=
False
,
stride_ms
=
10.0
,
stride_ms
=
10.0
,
...
@@ -70,7 +40,7 @@ class SpeechFeaturizer():
...
@@ -70,7 +40,7 @@ class SpeechFeaturizer():
self
.
window_ms
=
window_ms
self
.
window_ms
=
window_ms
self
.
audio_feature
=
AudioFeaturizer
(
self
.
audio_feature
=
AudioFeaturizer
(
spec
gram_type
=
specgra
m_type
,
spec
trum_type
=
spectru
m_type
,
feat_dim
=
feat_dim
,
feat_dim
=
feat_dim
,
delta_delta
=
delta_delta
,
delta_delta
=
delta_delta
,
stride_ms
=
stride_ms
,
stride_ms
=
stride_ms
,
...
...
deepspeech/frontend/utility.py
浏览文件 @
b7b1bda3
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
import
json
import
json
import
math
import
math
import
tarfile
import
tarfile
from
collections
import
namedtuple
from
typing
import
List
from
typing
import
List
from
typing
import
Optional
from
typing
import
Optional
from
typing
import
Text
from
typing
import
Text
...
...
deepspeech/io/collator.py
浏览文件 @
b7b1bda3
...
@@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]
...
@@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]
logger
=
Log
(
__name__
).
getlog
()
logger
=
Log
(
__name__
).
getlog
()
def
tokenids
(
text
,
keep_transcription_text
):
# for training text is token ids
tokens
=
text
# token ids
if
keep_transcription_text
:
# text is string, convert to unicode ord
assert
isinstance
(
text
,
str
),
(
type
(
text
),
text
)
tokens
=
[
ord
(
t
)
for
t
in
text
]
tokens
=
np
.
array
(
tokens
,
dtype
=
np
.
int64
)
return
tokens
class
SpeechCollatorBase
():
class
SpeechCollatorBase
():
def
__init__
(
def
__init__
(
self
,
self
,
...
@@ -150,7 +163,6 @@ class SpeechCollatorBase():
...
@@ -150,7 +163,6 @@ class SpeechCollatorBase():
# extract speech feature
# extract speech feature
spectrum
,
transcript_part
=
self
.
_speech_featurizer
.
featurize
(
spectrum
,
transcript_part
=
self
.
_speech_featurizer
.
featurize
(
speech_segment
,
self
.
keep_transcription_text
)
speech_segment
,
self
.
keep_transcription_text
)
# CMVN spectrum
# CMVN spectrum
if
self
.
_normalizer
:
if
self
.
_normalizer
:
spectrum
=
self
.
_normalizer
.
apply
(
spectrum
)
spectrum
=
self
.
_normalizer
.
apply
(
spectrum
)
...
@@ -163,38 +175,35 @@ class SpeechCollatorBase():
...
@@ -163,38 +175,35 @@ class SpeechCollatorBase():
"""batch examples
"""batch examples
Args:
Args:
batch (
[List]): batch is (audio, text)
batch (
List[Dict]): batch is [dict(audio, text, ...)]
audio (np.ndarray) shape (T, D)
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
text (List[int] or str): shape (U,)
Returns:
Returns:
tuple(audio, text, audio_lens, text_lens): batched data.
tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
audio : (B, Tmax, D)
utts: (B,)
audio_lens: (B)
xs_pad : (B, Tmax, D)
text : (B, Umax)
ilens: (B,)
text_lens: (B)
ys_pad : (B, Umax)
olens: (B,)
"""
"""
audios
=
[]
audios
=
[]
audio_lens
=
[]
audio_lens
=
[]
texts
=
[]
texts
=
[]
text_lens
=
[]
text_lens
=
[]
utts
=
[]
utts
=
[]
for
utt
,
audio
,
text
in
batch
:
for
idx
,
item
in
enumerate
(
batch
):
utts
.
append
(
item
[
'utt'
])
audio
=
item
[
'feat'
]
text
=
item
[
'text'
]
audio
,
text
=
self
.
process_utterance
(
audio
,
text
)
audio
,
text
=
self
.
process_utterance
(
audio
,
text
)
#utt
utts
.
append
(
utt
)
# audio
audios
.
append
(
audio
)
# [T, D]
audios
.
append
(
audio
)
# [T, D]
audio_lens
.
append
(
audio
.
shape
[
0
])
audio_lens
.
append
(
audio
.
shape
[
0
])
# text
# for training, text is token ids, else text is string, convert to unicode ord
tokens
=
tokenids
(
text
,
self
.
keep_transcription_text
)
tokens
=
[]
if
self
.
keep_transcription_text
:
assert
isinstance
(
text
,
str
),
(
type
(
text
),
text
)
tokens
=
[
ord
(
t
)
for
t
in
text
]
else
:
tokens
=
text
# token ids
tokens
=
np
.
array
(
tokens
,
dtype
=
np
.
int64
)
texts
.
append
(
tokens
)
texts
.
append
(
tokens
)
text_lens
.
append
(
tokens
.
shape
[
0
])
text_lens
.
append
(
tokens
.
shape
[
0
])
...
@@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator):
...
@@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator):
"""batch examples
"""batch examples
Args:
Args:
batch (
[List]): batch is (audio, text)
batch (
List[Dict]): batch is [dict(audio, text, ...)]
audio (np.ndarray) shape (T, D)
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
text (List[int] or str): shape (U,)
Returns:
Returns:
tuple(audio, text, audio_lens, text_lens): batched data.
tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
audio : (B, Tmax, D)
utts: (B,)
audio_lens: (B)
xs_pad : (B, Tmax, D)
text : (B, Umax)
ilens: (B,)
text_lens: (B)
ys_pad : [(B, Umax), (B, Umax)]
olens: [(B,), (B,)]
"""
"""
utts
=
[]
audios
=
[]
audios
=
[]
audio_lens
=
[]
audio_lens
=
[]
translation_text
=
[]
translation_text
=
[]
...
@@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator):
...
@@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator):
transcription_text
=
[]
transcription_text
=
[]
transcription_text_lens
=
[]
transcription_text_lens
=
[]
utts
=
[]
for
idx
,
item
in
enumerate
(
batch
):
for
utt
,
audio
,
translation
,
transcription
in
batch
:
utts
.
append
(
item
[
'utt'
])
audio
=
item
[
'feat'
]
translation
=
item
[
'text'
]
transcription
=
item
[
'text1'
]
audio
,
translation
,
transcription
=
self
.
process_utterance
(
audio
,
translation
,
transcription
=
self
.
process_utterance
(
audio
,
translation
,
transcription
)
audio
,
translation
,
transcription
)
#utt
utts
.
append
(
utt
)
# audio
audios
.
append
(
audio
)
# [T, D]
audios
.
append
(
audio
)
# [T, D]
audio_lens
.
append
(
audio
.
shape
[
0
])
audio_lens
.
append
(
audio
.
shape
[
0
])
# text
# for training, text is token ids
# else text is string, convert to unicode ord
tokens
=
[[],
[]]
tokens
=
[[],
[]]
for
idx
,
text
in
enumerate
([
translation
,
transcription
]):
for
idx
,
text
in
enumerate
([
translation
,
transcription
]):
if
self
.
keep_transcription_text
:
tokens
[
idx
]
=
tokenids
(
text
,
self
.
keep_transcription_text
)
assert
isinstance
(
text
,
str
),
(
type
(
text
),
text
)
tokens
[
idx
]
=
[
ord
(
t
)
for
t
in
text
]
else
:
tokens
[
idx
]
=
text
# token ids
tokens
[
idx
]
=
np
.
array
(
tokens
[
idx
],
dtype
=
np
.
int64
)
translation_text
.
append
(
tokens
[
0
])
translation_text
.
append
(
tokens
[
0
])
translation_text_lens
.
append
(
tokens
[
0
].
shape
[
0
])
translation_text_lens
.
append
(
tokens
[
0
].
shape
[
0
])
transcription_text
.
append
(
tokens
[
1
])
transcription_text
.
append
(
tokens
[
1
])
transcription_text_lens
.
append
(
tokens
[
1
].
shape
[
0
])
transcription_text_lens
.
append
(
tokens
[
1
].
shape
[
0
])
padded_audios
=
pad_sequence
(
xs_pad
=
pad_list
(
audios
,
0.0
).
astype
(
np
.
float32
)
#[B, T, D]
audios
,
padding_value
=
0.0
).
astype
(
np
.
float32
)
#[B, T, D]
ilens
=
np
.
array
(
audio_lens
).
astype
(
np
.
int64
)
audio_lens
=
np
.
array
(
audio_lens
).
astype
(
np
.
int64
)
padded_translation
=
pad_
sequence
(
padded_translation
=
pad_
list
(
translation_text
,
translation_text
,
padding_value
=
IGNORE_ID
).
astype
(
np
.
int64
)
IGNORE_ID
).
astype
(
np
.
int64
)
translation_lens
=
np
.
array
(
translation_text_lens
).
astype
(
np
.
int64
)
translation_lens
=
np
.
array
(
translation_text_lens
).
astype
(
np
.
int64
)
padded_transcription
=
pad_sequence
(
transcription_text
,
padding_value
=
IGNORE_ID
).
astype
(
np
.
int64
)
padded_transcription
=
pad_list
(
transcription_text
,
IGNORE_ID
).
astype
(
np
.
int64
)
transcription_lens
=
np
.
array
(
transcription_text_lens
).
astype
(
np
.
int64
)
transcription_lens
=
np
.
array
(
transcription_text_lens
).
astype
(
np
.
int64
)
return
utts
,
padded_audios
,
audio_lens
,
(
padded_translation
,
padded_transcription
),
(
translation_lens
,
ys_pad
=
(
padded_translation
,
padded_transcription
)
transcription_lens
)
olens
=
(
translation_lens
,
transcription_lens
)
return
utts
,
xs_pad
,
ilens
,
ys_pad
,
olens
deepspeech/io/dataset.py
浏览文件 @
b7b1bda3
...
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
...
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from
deepspeech.frontend.utility
import
read_manifest
from
deepspeech.frontend.utility
import
read_manifest
from
deepspeech.utils.log
import
Log
from
deepspeech.utils.log
import
Log
__all__
=
[
"ManifestDataset"
,
"Tr
ipletManifestDataset"
,
"Tr
ansformDataset"
]
__all__
=
[
"ManifestDataset"
,
"TransformDataset"
]
logger
=
Log
(
__name__
).
getlog
()
logger
=
Log
(
__name__
).
getlog
()
...
@@ -107,21 +107,7 @@ class ManifestDataset(Dataset):
...
@@ -107,21 +107,7 @@ class ManifestDataset(Dataset):
return
len
(
self
.
_manifest
)
return
len
(
self
.
_manifest
)
def
__getitem__
(
self
,
idx
):
def
__getitem__
(
self
,
idx
):
instance
=
self
.
_manifest
[
idx
]
return
self
.
_manifest
[
idx
]
return
instance
[
"utt"
],
instance
[
"feat"
],
instance
[
"text"
]
class
TripletManifestDataset
(
ManifestDataset
):
"""
For Joint Training of Speech Translation and ASR.
text: translation,
text1: transcript.
"""
def
__getitem__
(
self
,
idx
):
instance
=
self
.
_manifest
[
idx
]
return
instance
[
"utt"
],
instance
[
"feat"
],
instance
[
"text"
],
instance
[
"text1"
]
class
TransformDataset
(
Dataset
):
class
TransformDataset
(
Dataset
):
...
@@ -273,5 +259,4 @@ class AudioDataset(Dataset):
...
@@ -273,5 +259,4 @@ class AudioDataset(Dataset):
return
len
(
self
.
minibatch
)
return
len
(
self
.
minibatch
)
def
__getitem__
(
self
,
idx
):
def
__getitem__
(
self
,
idx
):
instance
=
self
.
minibatch
[
idx
]
return
self
.
minibatch
[
idx
]
return
instance
[
"utt"
],
instance
[
"feat"
],
instance
[
"text"
]
deepspeech/io/reader.py
浏览文件 @
b7b1bda3
...
@@ -322,7 +322,7 @@ class LoadInputsAndTargets():
...
@@ -322,7 +322,7 @@ class LoadInputsAndTargets():
"Not supported: loader_type={}"
.
format
(
filetype
))
"Not supported: loader_type={}"
.
format
(
filetype
))
def
file_type
(
self
,
filepath
):
def
file_type
(
self
,
filepath
):
suffix
=
filepath
.
split
(
":"
)[
0
].
split
(
'.'
)[
1
]
suffix
=
filepath
.
split
(
":"
)[
0
].
split
(
'.'
)[
-
1
]
if
suffix
==
'ark'
:
if
suffix
==
'ark'
:
return
'mat'
return
'mat'
elif
suffix
==
'scp'
:
elif
suffix
==
'scp'
:
...
...
docs/src/data_preparation.md
浏览文件 @
b7b1bda3
...
@@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w
...
@@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w
```
bash
```
bash
python3 utils/compute_mean_std.py
\
python3 utils/compute_mean_std.py
\
--num_samples
2000
\
--num_samples
2000
\
--spec
gra
m_type
linear
\
--spec
tru
m_type
linear
\
--manifest_path
examples/librispeech/data/manifest.train
\
--manifest_path
examples/librispeech/data/manifest.train
\
--output_path
examples/librispeech/data/mean_std.npz
--output_path
examples/librispeech/data/mean_std.npz
```
```
...
...
docs/src/deepspeech_architecture.md
浏览文件 @
b7b1bda3
...
@@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute
...
@@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute
cd examples/aishell/s0
cd examples/aishell/s0
python3 ../../../utils/compute_mean_std.py \
python3 ../../../utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--manifest_path="data/manifest.train.raw" \
--spec
gra
m_type="linear" \
--spec
tru
m_type="linear" \
--delta_delta=false \
--delta_delta=false \
--stride_ms=10.0 \
--stride_ms=10.0 \
--window_ms=20.0 \
--window_ms=20.0 \
...
...
examples/1xt2x/aishell/conf/deepspeech2.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
random_seed
:
0
random_seed
:
0
spm_model_prefix
:
spm_model_prefix
:
spec
gra
m_type
:
linear
spec
tru
m_type
:
linear
feat_dim
:
feat_dim
:
delta_delta
:
False
delta_delta
:
False
stride_ms
:
10.0
stride_ms
:
10.0
...
...
examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
random_seed
:
0
random_seed
:
0
spm_model_prefix
:
spm_model_prefix
:
spec
gra
m_type
:
linear
spec
tru
m_type
:
linear
feat_dim
:
feat_dim
:
delta_delta
:
False
delta_delta
:
False
stride_ms
:
10.0
stride_ms
:
10.0
...
...
examples/1xt2x/librispeech/conf/deepspeech2.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
random_seed
:
0
random_seed
:
0
spm_model_prefix
:
spm_model_prefix
:
spec
gra
m_type
:
linear
spec
tru
m_type
:
linear
feat_dim
:
feat_dim
:
delta_delta
:
False
delta_delta
:
False
stride_ms
:
10.0
stride_ms
:
10.0
...
...
examples/aishell/s0/conf/deepspeech2.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
random_seed
:
0
random_seed
:
0
spm_model_prefix
:
spm_model_prefix
:
spec
gra
m_type
:
linear
spec
tru
m_type
:
linear
feat_dim
:
feat_dim
:
delta_delta
:
False
delta_delta
:
False
stride_ms
:
10.0
stride_ms
:
10.0
...
...
examples/aishell/s0/conf/deepspeech2_online.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
random_seed
:
0
random_seed
:
0
spm_model_prefix
:
spm_model_prefix
:
spec
gra
m_type
:
linear
#linear, mfcc, fbank
spec
tru
m_type
:
linear
#linear, mfcc, fbank
feat_dim
:
feat_dim
:
delta_delta
:
False
delta_delta
:
False
stride_ms
:
10.0
stride_ms
:
10.0
...
...
examples/aishell/s0/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers
=
$(
nproc
)
num_workers
=
$(
nproc
)
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.train.raw"
\
--manifest_path
=
"data/manifest.train.raw"
\
--spec
gra
m_type
=
"linear"
\
--spec
tru
m_type
=
"linear"
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--stride_ms
=
10.0
\
--stride_ms
=
10.0
\
--window_ms
=
20.0
\
--window_ms
=
20.0
\
...
...
examples/aishell/s1/conf/chunk_conformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
32
batch_size
:
32
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/aishell/s1/conf/conformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
64
batch_size
:
64
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/aishell/s1/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers
=
$(
nproc
)
num_workers
=
$(
nproc
)
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.train.raw"
\
--manifest_path
=
"data/manifest.train.raw"
\
--spec
gra
m_type
=
"fbank"
\
--spec
tru
m_type
=
"fbank"
\
--feat_dim
=
80
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--stride_ms
=
10.0
\
--stride_ms
=
10.0
\
...
...
examples/callcenter/s1/conf/chunk_conformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
32
batch_size
:
32
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/callcenter/s1/conf/conformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
32
batch_size
:
32
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/callcenter/s1/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers
=
$(
nproc
)
num_workers
=
$(
nproc
)
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.train.raw"
\
--manifest_path
=
"data/manifest.train.raw"
\
--spec
gra
m_type
=
"fbank"
\
--spec
tru
m_type
=
"fbank"
\
--feat_dim
=
80
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--stride_ms
=
10.0
\
--stride_ms
=
10.0
\
...
...
examples/librispeech/s0/conf/deepspeech2.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
random_seed
:
0
random_seed
:
0
spm_model_prefix
:
spm_model_prefix
:
spec
gra
m_type
:
linear
spec
tru
m_type
:
linear
target_sample_rate
:
16000
target_sample_rate
:
16000
max_freq
:
None
max_freq
:
None
n_fft
:
None
n_fft
:
None
...
...
examples/librispeech/s0/conf/deepspeech2_online.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
random_seed
:
0
random_seed
:
0
spm_model_prefix
:
spm_model_prefix
:
spec
gra
m_type
:
linear
spec
tru
m_type
:
linear
target_sample_rate
:
16000
target_sample_rate
:
16000
max_freq
:
None
max_freq
:
None
n_fft
:
None
n_fft
:
None
...
...
examples/librispeech/s0/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.train.raw"
\
--manifest_path
=
"data/manifest.train.raw"
\
--num_samples
=
2000
\
--num_samples
=
2000
\
--spec
gra
m_type
=
"linear"
\
--spec
tru
m_type
=
"linear"
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--sample_rate
=
16000
\
--stride_ms
=
10.0
\
--stride_ms
=
10.0
\
...
...
examples/librispeech/s1/conf/chunk_conformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
16
batch_size
:
16
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/librispeech/s1/conf/chunk_transformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
64
batch_size
:
64
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/librispeech/s1/conf/conformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
32
batch_size
:
32
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/librispeech/s1/conf/transformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
32
batch_size
:
32
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/librispeech/s1/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.train.raw"
\
--manifest_path
=
"data/manifest.train.raw"
\
--num_samples
=
-1
\
--num_samples
=
-1
\
--spec
gra
m_type
=
"fbank"
\
--spec
tru
m_type
=
"fbank"
\
--feat_dim
=
80
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--sample_rate
=
16000
\
...
...
examples/librispeech/s2/conf/chunk_conformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
16
batch_size
:
16
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/librispeech/s2/conf/chunk_transformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
64
batch_size
:
64
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/librispeech/s2/conf/conformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
16
batch_size
:
16
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/librispeech/s2/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.train.raw"
\
--manifest_path
=
"data/manifest.train.raw"
\
--num_samples
=
-1
\
--num_samples
=
-1
\
--spec
gra
m_type
=
"fbank"
\
--spec
tru
m_type
=
"fbank"
\
--feat_dim
=
80
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--sample_rate
=
16000
\
...
...
examples/ted_en_zh/t0/conf/transformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
# augmentation_config: conf/augmentation.json
# augmentation_config: conf/augmentation.json
batch_size
:
10
batch_size
:
10
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
# augmentation_config: conf/augmentation.json
# augmentation_config: conf/augmentation.json
batch_size
:
10
batch_size
:
10
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/ted_en_zh/t0/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.train.raw"
\
--manifest_path
=
"data/manifest.train.raw"
\
--num_samples
=
-1
\
--num_samples
=
-1
\
--spec
gra
m_type
=
"fbank"
\
--spec
tru
m_type
=
"fbank"
\
--feat_dim
=
80
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--sample_rate
=
16000
\
...
...
examples/timit/s1/conf/transformer.yaml
浏览文件 @
b7b1bda3
...
@@ -17,7 +17,7 @@ collator:
...
@@ -17,7 +17,7 @@ collator:
augmentation_config
:
"
"
augmentation_config
:
"
"
batch_size
:
64
batch_size
:
64
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/timit/s1/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.train.raw"
\
--manifest_path
=
"data/manifest.train.raw"
\
--num_samples
=
-1
\
--num_samples
=
-1
\
--spec
gra
m_type
=
"fbank"
\
--spec
tru
m_type
=
"fbank"
\
--feat_dim
=
80
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--sample_rate
=
16000
\
...
...
examples/tiny/s0/conf/deepspeech2.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
random_seed
:
0
random_seed
:
0
spm_model_prefix
:
spm_model_prefix
:
spec
gra
m_type
:
linear
spec
tru
m_type
:
linear
feat_dim
:
feat_dim
:
delta_delta
:
False
delta_delta
:
False
stride_ms
:
10.0
stride_ms
:
10.0
...
...
examples/tiny/s0/conf/deepspeech2_online.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
random_seed
:
0
random_seed
:
0
spm_model_prefix
:
spm_model_prefix
:
spec
gra
m_type
:
linear
spec
tru
m_type
:
linear
feat_dim
:
feat_dim
:
delta_delta
:
False
delta_delta
:
False
stride_ms
:
10.0
stride_ms
:
10.0
...
...
examples/tiny/s0/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.tiny.raw"
\
--manifest_path
=
"data/manifest.tiny.raw"
\
--num_samples
=
64
\
--num_samples
=
64
\
--spec
gra
m_type
=
"linear"
\
--spec
tru
m_type
=
"linear"
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--sample_rate
=
16000
\
--stride_ms
=
10.0
\
--stride_ms
=
10.0
\
...
...
examples/tiny/s1/conf/chunk_confermer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
4
batch_size
:
4
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/tiny/s1/conf/chunk_transformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
4
batch_size
:
4
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/tiny/s1/conf/conformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
4
batch_size
:
4
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/tiny/s1/conf/transformer.yaml
浏览文件 @
b7b1bda3
...
@@ -18,7 +18,7 @@ collator:
...
@@ -18,7 +18,7 @@ collator:
augmentation_config
:
conf/augmentation.json
augmentation_config
:
conf/augmentation.json
batch_size
:
4
batch_size
:
4
raw_wav
:
True
# use raw_wav or kaldi feature
raw_wav
:
True
# use raw_wav or kaldi feature
spec
gra
m_type
:
fbank
#linear, mfcc, fbank
spec
tru
m_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
feat_dim
:
80
delta_delta
:
False
delta_delta
:
False
dither
:
1.0
dither
:
1.0
...
...
examples/tiny/s1/local/data.sh
浏览文件 @
b7b1bda3
...
@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...
@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.tiny.raw"
\
--manifest_path
=
"data/manifest.tiny.raw"
\
--num_samples
=
64
\
--num_samples
=
64
\
--spec
gra
m_type
=
"fbank"
\
--spec
tru
m_type
=
"fbank"
\
--feat_dim
=
80
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--delta_delta
=
false
\
--sample_rate
=
16000
\
--sample_rate
=
16000
\
...
...
utils/compute_mean_std.py
浏览文件 @
b7b1bda3
...
@@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
...
@@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
# yapf: disable
add_arg
(
'num_samples'
,
int
,
2000
,
"# of samples to for statistics."
)
add_arg
(
'num_samples'
,
int
,
2000
,
"# of samples to for statistics."
)
add_arg
(
'spec
gra
m_type'
,
str
,
add_arg
(
'spec
tru
m_type'
,
str
,
'linear'
,
'linear'
,
"Audio feature type. Options: linear, mfcc, fbank."
,
"Audio feature type. Options: linear, mfcc, fbank."
,
choices
=
[
'linear'
,
'mfcc'
,
'fbank'
])
choices
=
[
'linear'
,
'mfcc'
,
'fbank'
])
...
@@ -58,7 +58,7 @@ def main():
...
@@ -58,7 +58,7 @@ def main():
augmentation_pipeline
=
AugmentationPipeline
(
'{}'
)
augmentation_pipeline
=
AugmentationPipeline
(
'{}'
)
audio_featurizer
=
AudioFeaturizer
(
audio_featurizer
=
AudioFeaturizer
(
spec
gram_type
=
args
.
specgra
m_type
,
spec
trum_type
=
args
.
spectru
m_type
,
feat_dim
=
args
.
feat_dim
,
feat_dim
=
args
.
feat_dim
,
delta_delta
=
args
.
delta_delta
,
delta_delta
=
args
.
delta_delta
,
stride_ms
=
args
.
stride_ms
,
stride_ms
=
args
.
stride_ms
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录