Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
b7674866
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b7674866
编写于
4月 09, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refactor audio feat
上级
177f463d
变更
14
显示空白变更内容
内联
并排
Showing
14 changed file
with
103 addition
and
55 deletion
+103
-55
.notebook/jit_infer.ipynb
.notebook/jit_infer.ipynb
+2
-0
deepspeech/exps/deepspeech2/bin/deploy/runtime.py
deepspeech/exps/deepspeech2/bin/deploy/runtime.py
+2
-0
deepspeech/exps/deepspeech2/bin/deploy/server.py
deepspeech/exps/deepspeech2/bin/deploy/server.py
+2
-0
deepspeech/exps/deepspeech2/bin/tune.py
deepspeech/exps/deepspeech2/bin/tune.py
+2
-0
deepspeech/exps/deepspeech2/config.py
deepspeech/exps/deepspeech2/config.py
+4
-2
deepspeech/exps/deepspeech2/model.py
deepspeech/exps/deepspeech2/model.py
+6
-0
deepspeech/frontend/featurizer/audio_featurizer.py
deepspeech/frontend/featurizer/audio_featurizer.py
+18
-14
deepspeech/frontend/featurizer/speech_featurizer.py
deepspeech/frontend/featurizer/speech_featurizer.py
+2
-2
deepspeech/frontend/featurizer/text_featurizer.py
deepspeech/frontend/featurizer/text_featurizer.py
+10
-7
deepspeech/io/__init__.py
deepspeech/io/__init__.py
+2
-0
deepspeech/io/dataset.py
deepspeech/io/dataset.py
+7
-1
examples/tiny/s0/local/data.sh
examples/tiny/s0/local/data.sh
+3
-1
utils/build_vocab.py
utils/build_vocab.py
+31
-24
utils/compute_mean_std.py
utils/compute_mean_std.py
+12
-4
未找到文件。
.notebook/jit_infer.ipynb
浏览文件 @
b7674866
...
@@ -307,6 +307,8 @@
...
@@ -307,6 +307,8 @@
" max_freq=config.data.max_freq,\n",
" max_freq=config.data.max_freq,\n",
" target_sample_rate=config.data.target_sample_rate,\n",
" target_sample_rate=config.data.target_sample_rate,\n",
" specgram_type=config.data.specgram_type,\n",
" specgram_type=config.data.specgram_type,\n",
" feat_dim=config.data.feat_dim,\n",
" delta_delta=config.data.delat_delta,\n",
" use_dB_normalization=config.data.use_dB_normalization,\n",
" use_dB_normalization=config.data.use_dB_normalization,\n",
" target_dB=config.data.target_dB,\n",
" target_dB=config.data.target_dB,\n",
" random_seed=config.data.random_seed,\n",
" random_seed=config.data.random_seed,\n",
...
...
deepspeech/exps/deepspeech2/bin/deploy/runtime.py
浏览文件 @
b7674866
...
@@ -98,6 +98,8 @@ def start_server(config, args):
...
@@ -98,6 +98,8 @@ def start_server(config, args):
max_freq
=
config
.
data
.
max_freq
,
max_freq
=
config
.
data
.
max_freq
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
specgram_type
=
config
.
data
.
specgram_type
,
specgram_type
=
config
.
data
.
specgram_type
,
feat_dim
=
config
.
data
.
feat_dim
,
delta_delta
=
config
.
data
.
delat_delta
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
target_dB
=
config
.
data
.
target_dB
,
target_dB
=
config
.
data
.
target_dB
,
random_seed
=
config
.
data
.
random_seed
,
random_seed
=
config
.
data
.
random_seed
,
...
...
deepspeech/exps/deepspeech2/bin/deploy/server.py
浏览文件 @
b7674866
...
@@ -50,6 +50,8 @@ def start_server(config, args):
...
@@ -50,6 +50,8 @@ def start_server(config, args):
max_freq
=
config
.
data
.
max_freq
,
max_freq
=
config
.
data
.
max_freq
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
specgram_type
=
config
.
data
.
specgram_type
,
specgram_type
=
config
.
data
.
specgram_type
,
feat_dim
=
config
.
data
.
feat_dim
,
delta_delta
=
config
.
data
.
delat_delta
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
target_dB
=
config
.
data
.
target_dB
,
target_dB
=
config
.
data
.
target_dB
,
random_seed
=
config
.
data
.
random_seed
,
random_seed
=
config
.
data
.
random_seed
,
...
...
deepspeech/exps/deepspeech2/bin/tune.py
浏览文件 @
b7674866
...
@@ -56,6 +56,8 @@ def tune(config, args):
...
@@ -56,6 +56,8 @@ def tune(config, args):
max_freq
=
config
.
data
.
max_freq
,
max_freq
=
config
.
data
.
max_freq
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
specgram_type
=
config
.
data
.
specgram_type
,
specgram_type
=
config
.
data
.
specgram_type
,
feat_dim
=
config
.
data
.
feat_dim
,
delta_delta
=
config
.
data
.
delat_delta
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
target_dB
=
config
.
data
.
target_dB
,
target_dB
=
config
.
data
.
target_dB
,
random_seed
=
config
.
data
.
random_seed
,
random_seed
=
config
.
data
.
random_seed
,
...
...
deepspeech/exps/deepspeech2/config.py
浏览文件 @
b7674866
...
@@ -32,8 +32,10 @@ _C.data = CN(
...
@@ -32,8 +32,10 @@ _C.data = CN(
window_ms
=
20.0
,
# ms
window_ms
=
20.0
,
# ms
n_fft
=
None
,
# fft points
n_fft
=
None
,
# fft points
max_freq
=
None
,
# None for samplerate/2
max_freq
=
None
,
# None for samplerate/2
specgram_type
=
'linear'
,
# 'linear', 'mfcc'
specgram_type
=
'linear'
,
# 'linear', 'mfcc', 'fbank'
target_sample_rate
=
16000
,
# sample rate
feat_dim
=
0
,
# 'mfcc', 'fbank'
delat_delta
=
False
,
# 'mfcc', 'fbank'
target_sample_rate
=
16000
,
# target sample rate
use_dB_normalization
=
True
,
use_dB_normalization
=
True
,
target_dB
=-
20
,
target_dB
=-
20
,
random_seed
=
0
,
random_seed
=
0
,
...
...
deepspeech/exps/deepspeech2/model.py
浏览文件 @
b7674866
...
@@ -163,6 +163,8 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -163,6 +163,8 @@ class DeepSpeech2Trainer(Trainer):
max_freq
=
config
.
data
.
max_freq
,
max_freq
=
config
.
data
.
max_freq
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
specgram_type
=
config
.
data
.
specgram_type
,
specgram_type
=
config
.
data
.
specgram_type
,
feat_dim
=
config
.
data
.
feat_dim
,
delta_delta
=
config
.
data
.
delat_delta
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
target_dB
=
config
.
data
.
target_dB
,
target_dB
=
config
.
data
.
target_dB
,
random_seed
=
config
.
data
.
random_seed
,
random_seed
=
config
.
data
.
random_seed
,
...
@@ -183,6 +185,8 @@ class DeepSpeech2Trainer(Trainer):
...
@@ -183,6 +185,8 @@ class DeepSpeech2Trainer(Trainer):
max_freq
=
config
.
data
.
max_freq
,
max_freq
=
config
.
data
.
max_freq
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
specgram_type
=
config
.
data
.
specgram_type
,
specgram_type
=
config
.
data
.
specgram_type
,
feat_dim
=
config
.
data
.
feat_dim
,
delta_delta
=
config
.
data
.
delat_delta
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
target_dB
=
config
.
data
.
target_dB
,
target_dB
=
config
.
data
.
target_dB
,
random_seed
=
config
.
data
.
random_seed
,
random_seed
=
config
.
data
.
random_seed
,
...
@@ -378,6 +382,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -378,6 +382,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
max_freq
=
config
.
data
.
max_freq
,
max_freq
=
config
.
data
.
max_freq
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
target_sample_rate
=
config
.
data
.
target_sample_rate
,
specgram_type
=
config
.
data
.
specgram_type
,
specgram_type
=
config
.
data
.
specgram_type
,
feat_dim
=
config
.
data
.
feat_dim
,
delta_delta
=
config
.
data
.
delat_delta
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
use_dB_normalization
=
config
.
data
.
use_dB_normalization
,
target_dB
=
config
.
data
.
target_dB
,
target_dB
=
config
.
data
.
target_dB
,
random_seed
=
config
.
data
.
random_seed
,
random_seed
=
config
.
data
.
random_seed
,
...
...
deepspeech/frontend/featurizer/audio_featurizer.py
浏览文件 @
b7674866
...
@@ -61,7 +61,9 @@ class AudioFeaturizer(object):
...
@@ -61,7 +61,9 @@ class AudioFeaturizer(object):
use_dB_normalization
=
True
,
use_dB_normalization
=
True
,
target_dB
=-
20
):
target_dB
=-
20
):
self
.
_specgram_type
=
specgram_type
self
.
_specgram_type
=
specgram_type
# mfcc and fbank using `feat_dim`
self
.
_feat_dim
=
feat_dim
self
.
_feat_dim
=
feat_dim
# mfcc and fbank using `delta-delta`
self
.
_delta_delta
=
delta_delta
self
.
_delta_delta
=
delta_delta
self
.
_stride_ms
=
stride_ms
self
.
_stride_ms
=
stride_ms
self
.
_window_ms
=
window_ms
self
.
_window_ms
=
window_ms
...
@@ -130,25 +132,28 @@ class AudioFeaturizer(object):
...
@@ -130,25 +132,28 @@ class AudioFeaturizer(object):
"""Extract various audio features."""
"""Extract various audio features."""
if
self
.
_specgram_type
==
'linear'
:
if
self
.
_specgram_type
==
'linear'
:
return
self
.
_compute_linear_specgram
(
return
self
.
_compute_linear_specgram
(
samples
,
sample_rate
,
self
.
_stride_ms
,
self
.
_window_ms
,
samples
,
self
.
_max_freq
)
sample_rate
,
stride_ms
=
self
.
_stride_ms
,
window_ms
=
self
.
_window_ms
,
max_freq
=
self
.
_max_freq
)
elif
self
.
_specgram_type
==
'mfcc'
:
elif
self
.
_specgram_type
==
'mfcc'
:
return
self
.
_compute_mfcc
(
return
self
.
_compute_mfcc
(
samples
,
samples
,
sample_rate
,
sample_rate
,
self
.
_stride_ms
,
feat_dim
=
self
.
_feat_dim
,
s
elf
.
_feat_dim
,
s
tride_ms
=
self
.
_stride_ms
,
self
.
_window_ms
,
window_ms
=
self
.
_window_ms
,
self
.
_max_freq
,
max_freq
=
self
.
_max_freq
,
delta_delta
=
self
.
_delta_delta
)
delta_delta
=
self
.
_delta_delta
)
elif
self
.
_specgram_type
==
'fbank'
:
elif
self
.
_specgram_type
==
'fbank'
:
return
self
.
_compute_fbank
(
return
self
.
_compute_fbank
(
samples
,
samples
,
sample_rate
,
sample_rate
,
self
.
_stride_ms
,
feat_dim
=
self
.
_feat_dim
,
s
elf
.
_feat_dim
,
s
tride_ms
=
self
.
_stride_ms
,
self
.
_window_ms
,
window_ms
=
self
.
_window_ms
,
self
.
_max_freq
,
max_freq
=
self
.
_max_freq
,
delta_delta
=
self
.
_delta_delta
)
delta_delta
=
self
.
_delta_delta
)
else
:
else
:
raise
ValueError
(
"Unknown specgram_type %s. "
raise
ValueError
(
"Unknown specgram_type %s. "
...
@@ -323,10 +328,9 @@ class AudioFeaturizer(object):
...
@@ -323,10 +328,9 @@ class AudioFeaturizer(object):
winstep
=
0.001
*
stride_ms
,
winstep
=
0.001
*
stride_ms
,
nfilt
=
feat_dim
,
nfilt
=
feat_dim
,
nfft
=
512
,
nfft
=
512
,
lowfreq
=
max_freq
,
lowfreq
=
0
,
highfreq
=
None
,
highfreq
=
max_freq
,
preemph
=
0.97
,
preemph
=
0.97
,)
winfunc
=
lambda
x
:
np
.
ones
((
x
,
)))
fbank_feat
=
np
.
transpose
(
fbank_feat
)
fbank_feat
=
np
.
transpose
(
fbank_feat
)
if
delta_delta
:
if
delta_delta
:
fbank_feat
=
self
.
_concat_delta_delta
(
fbank_feat
)
fbank_feat
=
self
.
_concat_delta_delta
(
fbank_feat
)
...
...
deepspeech/frontend/featurizer/speech_featurizer.py
浏览文件 @
b7674866
...
@@ -56,8 +56,8 @@ class SpeechFeaturizer(object):
...
@@ -56,8 +56,8 @@ class SpeechFeaturizer(object):
vocab_filepath
,
vocab_filepath
,
spm_model_prefix
=
None
,
spm_model_prefix
=
None
,
specgram_type
=
'linear'
,
specgram_type
=
'linear'
,
feat_dim
=
13
,
feat_dim
=
None
,
delta_delta
=
Tru
e
,
delta_delta
=
Fals
e
,
stride_ms
=
10.0
,
stride_ms
=
10.0
,
window_ms
=
20.0
,
window_ms
=
20.0
,
n_fft
=
None
,
n_fft
=
None
,
...
...
deepspeech/frontend/featurizer/text_featurizer.py
浏览文件 @
b7674866
...
@@ -43,6 +43,15 @@ class TextFeaturizer(object):
...
@@ -43,6 +43,15 @@ class TextFeaturizer(object):
self
.
sp
=
spm
.
SentencePieceProcessor
()
self
.
sp
=
spm
.
SentencePieceProcessor
()
self
.
sp
.
Load
(
spm_model
)
self
.
sp
.
Load
(
spm_model
)
def
tokenize
(
self
,
text
):
if
self
.
unit_type
==
'char'
:
tokens
=
self
.
char_tokenize
(
text
)
elif
self
.
unit_type
==
'word'
:
tokens
=
self
.
word_tokenize
(
text
)
else
:
# spm
tokens
=
self
.
spm_tokenize
(
text
)
return
tokens
def
featurize
(
self
,
text
):
def
featurize
(
self
,
text
):
"""Convert text string to a list of token indices in char-level.Note
"""Convert text string to a list of token indices in char-level.Note
that the token indexing order follows the given vocabulary file.
that the token indexing order follows the given vocabulary file.
...
@@ -52,13 +61,7 @@ class TextFeaturizer(object):
...
@@ -52,13 +61,7 @@ class TextFeaturizer(object):
:return: List of char-level token indices.
:return: List of char-level token indices.
:rtype: List[int]
:rtype: List[int]
"""
"""
if
self
.
unit_type
==
'char'
:
tokens
=
self
.
tokenize
(
text
)
tokens
=
self
.
char_tokenize
(
text
)
elif
self
.
unit_type
==
'word'
:
tokens
=
self
.
word_tokenize
(
text
)
else
:
tokens
=
self
.
spm_tokenize
(
text
)
ids
=
[]
ids
=
[]
for
token
in
tokens
:
for
token
in
tokens
:
token
=
token
if
token
in
self
.
_vocab_dict
else
self
.
unk
token
=
token
if
token
in
self
.
_vocab_dict
else
self
.
unk
...
...
deepspeech/io/__init__.py
浏览文件 @
b7674866
...
@@ -55,6 +55,8 @@ def create_dataloader(manifest_path,
...
@@ -55,6 +55,8 @@ def create_dataloader(manifest_path,
window_ms
=
window_ms
,
window_ms
=
window_ms
,
max_freq
=
max_freq
,
max_freq
=
max_freq
,
specgram_type
=
specgram_type
,
specgram_type
=
specgram_type
,
feat_dim
=
config
.
data
.
feat_dim
,
delta_delta
=
config
.
data
.
delat_delta
,
use_dB_normalization
=
use_dB_normalization
,
use_dB_normalization
=
use_dB_normalization
,
random_seed
=
random_seed
,
random_seed
=
random_seed
,
keep_transcription_text
=
keep_transcription_text
)
keep_transcription_text
=
keep_transcription_text
)
...
...
deepspeech/io/dataset.py
浏览文件 @
b7674866
...
@@ -51,6 +51,8 @@ class ManifestDataset(Dataset):
...
@@ -51,6 +51,8 @@ class ManifestDataset(Dataset):
max_freq
=
None
,
max_freq
=
None
,
target_sample_rate
=
16000
,
target_sample_rate
=
16000
,
specgram_type
=
'linear'
,
specgram_type
=
'linear'
,
feat_dim
=
None
,
delta_delta
=
False
,
use_dB_normalization
=
True
,
use_dB_normalization
=
True
,
target_dB
=-
20
,
target_dB
=-
20
,
random_seed
=
0
,
random_seed
=
0
,
...
@@ -71,7 +73,9 @@ class ManifestDataset(Dataset):
...
@@ -71,7 +73,9 @@ class ManifestDataset(Dataset):
n_fft (int, optional): fft points for rfft. Defaults to None.
n_fft (int, optional): fft points for rfft. Defaults to None.
max_freq (int, optional): max cut freq. Defaults to None.
max_freq (int, optional): max cut freq. Defaults to None.
target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
specgram_type (str, optional): 'linear' or 'mfcc'. Defaults to 'linear'.
specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
target_dB (int, optional): target dB. Defaults to -20.
target_dB (int, optional): target dB. Defaults to -20.
random_seed (int, optional): for random generator. Defaults to 0.
random_seed (int, optional): for random generator. Defaults to 0.
...
@@ -89,6 +93,8 @@ class ManifestDataset(Dataset):
...
@@ -89,6 +93,8 @@ class ManifestDataset(Dataset):
vocab_filepath
=
vocab_filepath
,
vocab_filepath
=
vocab_filepath
,
spm_model_prefix
=
spm_model_prefix
,
spm_model_prefix
=
spm_model_prefix
,
specgram_type
=
specgram_type
,
specgram_type
=
specgram_type
,
feat_dim
=
feat_dim
,
delta_delta
=
delta_delta
,
stride_ms
=
stride_ms
,
stride_ms
=
stride_ms
,
window_ms
=
window_ms
,
window_ms
=
window_ms
,
n_fft
=
n_fft
,
n_fft
=
n_fft
,
...
...
examples/tiny/s0/local/data.sh
浏览文件 @
b7674866
...
@@ -40,7 +40,9 @@ fi
...
@@ -40,7 +40,9 @@ fi
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
python3
${
MAIN_ROOT
}
/utils/compute_mean_std.py
\
--manifest_path
=
"data/manifest.tiny.raw"
\
--manifest_path
=
"data/manifest.tiny.raw"
\
--num_samples
=
64
\
--num_samples
=
64
\
--specgram_type
=
"linear"
\
--specgram_type
=
"fbank"
\
--feat_dim
=
80
\
--delta_delta
=
false
\
--output_path
=
"data/mean_std.npz"
--output_path
=
"data/mean_std.npz"
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
...
...
utils/build_vocab.py
浏览文件 @
b7674866
...
@@ -54,17 +54,13 @@ add_arg('spm_model_prefix', str, "spm_model_%(spm_mode)_%(count_threshold)",
...
@@ -54,17 +54,13 @@ add_arg('spm_model_prefix', str, "spm_model_%(spm_mode)_%(count_threshold)",
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
def
count_manifest
(
counter
,
manifest_path
):
def
count_manifest
(
counter
,
text_feature
,
manifest_path
):
manifest_jsons
=
read_manifest
(
manifest_path
)
manifest_jsons
=
read_manifest
(
manifest_path
)
for
line_json
in
manifest_jsons
:
for
line_json
in
manifest_jsons
:
if
args
.
unit_type
==
'char'
:
line
=
text_feature
.
tokenize
(
line_json
[
'text'
])
for
char
in
line_json
[
'text'
]:
counter
.
update
(
line
)
counter
.
update
(
char
)
elif
args
.
unit_type
==
'word'
:
def
dump_text_manifest
(
fileobj
,
manifest_path
):
for
word
in
line_json
[
'text'
].
split
():
counter
.
update
(
word
)
def
read_text_manifest
(
fileobj
,
manifest_path
):
manifest_jsons
=
read_manifest
(
manifest_path
)
manifest_jsons
=
read_manifest
(
manifest_path
)
for
line_json
in
manifest_jsons
:
for
line_json
in
manifest_jsons
:
fileobj
.
write
(
line_json
[
'text'
]
+
"
\n
"
)
fileobj
.
write
(
line_json
[
'text'
]
+
"
\n
"
)
...
@@ -77,9 +73,11 @@ def main():
...
@@ -77,9 +73,11 @@ def main():
fout
.
write
(
UNK
+
'
\n
'
)
# <unk> must be 1
fout
.
write
(
UNK
+
'
\n
'
)
# <unk> must be 1
if
args
.
unit_type
!=
'spm'
:
if
args
.
unit_type
!=
'spm'
:
text_feature
=
TextFeaturizer
(
args
.
unit_type
,
args
.
vocab_path
)
counter
=
Counter
()
counter
=
Counter
()
for
manifest_path
in
args
.
manifest_paths
:
for
manifest_path
in
args
.
manifest_paths
:
count_manifest
(
counter
,
manifest_path
)
count_manifest
(
counter
,
text_feature
,
manifest_path
)
count_sorted
=
sorted
(
counter
.
items
(),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
count_sorted
=
sorted
(
counter
.
items
(),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
for
char
,
count
in
count_sorted
:
for
char
,
count
in
count_sorted
:
...
@@ -93,7 +91,7 @@ def main():
...
@@ -93,7 +91,7 @@ def main():
fp
=
tempfile
.
NamedTemporaryFile
(
mode
=
'w'
,
delete
=
False
)
fp
=
tempfile
.
NamedTemporaryFile
(
mode
=
'w'
,
delete
=
False
)
for
manifest_path
in
args
.
manifest_paths
:
for
manifest_path
in
args
.
manifest_paths
:
read
_text_manifest
(
fp
,
manifest_path
)
dump
_text_manifest
(
fp
,
manifest_path
)
fp
.
close
()
fp
.
close
()
# train
# train
spm
.
SentencePieceTrainer
.
Train
(
spm
.
SentencePieceTrainer
.
Train
(
...
@@ -108,20 +106,29 @@ def main():
...
@@ -108,20 +106,29 @@ def main():
# encode
# encode
text_feature
=
TextFeaturizer
(
args
.
unit_type
,
args
.
vocab_path
,
args
.
spm_model_prefix
)
text_feature
=
TextFeaturizer
(
args
.
unit_type
,
args
.
vocab_path
,
args
.
spm_model_prefix
)
vocabs
=
set
()
# vocabs = set()
# for manifest_path in args.manifest_paths:
# manifest_jsons = read_manifest(manifest_path)
# for line_json in manifest_jsons:
# line = line_json['text']
# enc_line = text_feature.spm_tokenize(line)
# for code in enc_line:
# vocabs.add(code)
# #print(" ".join(enc_line))
# vocabs_sorted = sorted(vocabs)
# for unit in vocabs_sorted:
# fout.write(unit + "\n")
counter
=
Counter
()
for
manifest_path
in
args
.
manifest_paths
:
for
manifest_path
in
args
.
manifest_paths
:
manifest_jsons
=
read_manifest
(
manifest_path
)
count_manifest
(
counter
,
text_feature
,
manifest_path
)
for
line_json
in
manifest_jsons
:
line
=
line_json
[
'text'
]
count_sorted
=
sorted
(
counter
.
items
(),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
enc_line
=
text_feature
.
spm_tokenize
(
line
)
for
token
,
count
in
count_sorted
:
for
code
in
enc_line
:
fout
.
write
(
token
+
'
\n
'
)
vocabs
.
add
(
code
)
#print(" ".join(enc_line))
print
(
f
"spm vocab size:
{
len
(
count_sorted
)
}
"
)
vocabs_sorted
=
sorted
(
vocabs
)
for
unit
in
vocabs_sorted
:
fout
.
write
(
unit
+
"
\n
"
)
print
(
f
"spm vocab size:
{
len
(
vocabs_sorted
)
}
"
)
fout
.
write
(
SOS
+
"
\n
"
)
# <sos/eos>
fout
.
write
(
SOS
+
"
\n
"
)
# <sos/eos>
fout
.
close
()
fout
.
close
()
...
...
utils/compute_mean_std.py
浏览文件 @
b7674866
...
@@ -28,12 +28,13 @@ add_arg('specgram_type', str,
...
@@ -28,12 +28,13 @@ add_arg('specgram_type', str,
'linear'
,
'linear'
,
"Audio feature type. Options: linear, mfcc, fbank."
,
"Audio feature type. Options: linear, mfcc, fbank."
,
choices
=
[
'linear'
,
'mfcc'
,
'fbank'
])
choices
=
[
'linear'
,
'mfcc'
,
'fbank'
])
add_arg
(
'feat_dim'
,
int
,
add_arg
(
'feat_dim'
,
int
,
13
,
"Audio feature dim."
)
13
,
"Audio feature dim."
)
add_arg
(
'delta_delta'
,
bool
,
add_arg
(
'delta_delta'
,
bool
,
False
,
False
,
"Audio feature with delta delta."
)
"Audio feature with delta delta."
)
add_arg
(
'stride_ms'
,
float
,
10.0
,
"stride length in ms."
)
add_arg
(
'window_ms'
,
float
,
20.0
,
"stride length in ms."
)
add_arg
(
'sample_rate'
,
int
,
16000
,
"target sample rate."
)
add_arg
(
'manifest_path'
,
str
,
add_arg
(
'manifest_path'
,
str
,
'data/librispeech/manifest.train'
,
'data/librispeech/manifest.train'
,
"Filepath of manifest to compute normalizer's mean and stddev."
)
"Filepath of manifest to compute normalizer's mean and stddev."
)
...
@@ -51,7 +52,14 @@ def main():
...
@@ -51,7 +52,14 @@ def main():
audio_featurizer
=
AudioFeaturizer
(
audio_featurizer
=
AudioFeaturizer
(
specgram_type
=
args
.
specgram_type
,
specgram_type
=
args
.
specgram_type
,
feat_dim
=
args
.
feat_dim
,
feat_dim
=
args
.
feat_dim
,
delta_delta
=
args
.
delta_delta
)
delta_delta
=
args
.
delta_delta
,
stride_ms
=
args
.
stride_ms
,
window_ms
=
args
.
window_ms
,
n_fft
=
None
,
max_freq
=
None
,
target_sample_rate
=
args
.
sample_rate
,
use_dB_normalization
=
True
,
target_dB
=-
20
)
def
augment_and_featurize
(
audio_segment
):
def
augment_and_featurize
(
audio_segment
):
augmentation_pipeline
.
transform_audio
(
audio_segment
)
augmentation_pipeline
.
transform_audio
(
audio_segment
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录