Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
17092cbb
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
17092cbb
编写于
10月 09, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
D,T to T,D
上级
0c9fbaf7
变更
9
显示空白变更内容
内联
并排
Showing
9 changed file
with
122 addition
and
149 deletion
+122
-149
deepspeech/frontend/audio.py
deepspeech/frontend/audio.py
+13
-7
deepspeech/frontend/augmentor/spec_augment.py
deepspeech/frontend/augmentor/spec_augment.py
+4
-4
deepspeech/frontend/featurizer/__init__.py
deepspeech/frontend/featurizer/__init__.py
+3
-0
deepspeech/frontend/featurizer/audio_featurizer.py
deepspeech/frontend/featurizer/audio_featurizer.py
+48
-38
deepspeech/frontend/featurizer/speech_featurizer.py
deepspeech/frontend/featurizer/speech_featurizer.py
+33
-86
deepspeech/frontend/normalizer.py
deepspeech/frontend/normalizer.py
+9
-8
deepspeech/frontend/speech.py
deepspeech/frontend/speech.py
+8
-3
deepspeech/io/collator.py
deepspeech/io/collator.py
+2
-2
examples/librispeech/s1/path.sh
examples/librispeech/s1/path.sh
+2
-1
未找到文件。
deepspeech/frontend/audio.py
浏览文件 @
17092cbb
...
@@ -24,8 +24,10 @@ import soundfile
...
@@ -24,8 +24,10 @@ import soundfile
import
soxbindings
as
sox
import
soxbindings
as
sox
from
scipy
import
signal
from
scipy
import
signal
from
.utility
import
subfile_from_tar
class
AudioSegment
(
object
):
class
AudioSegment
():
"""Monaural audio segment abstraction.
"""Monaural audio segment abstraction.
:param samples: Audio samples [num_samples x num_channels].
:param samples: Audio samples [num_samples x num_channels].
...
@@ -68,16 +70,20 @@ class AudioSegment(object):
...
@@ -68,16 +70,20 @@ class AudioSegment(object):
self
.
duration
,
self
.
rms_db
))
self
.
duration
,
self
.
rms_db
))
@
classmethod
@
classmethod
def
from_file
(
cls
,
file
):
def
from_file
(
cls
,
file
,
infos
=
None
):
"""Create audio segment from audio file.
"""Create audio segment from audio file.
:param filepath: Filepath or file object to audio file.
Args:
:type filepath: str|file
filepath (str|file): Filepath or file object to audio file.
:return: Audio segment instance.
infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
:rtype: AudioSegment
Returns:
AudioSegment: Audio segment instance.
"""
"""
if
isinstance
(
file
,
str
)
and
re
.
findall
(
r
".seqbin_\d+$"
,
file
):
if
isinstance
(
file
,
str
)
and
re
.
findall
(
r
".seqbin_\d+$"
,
file
):
return
cls
.
from_sequence_file
(
file
)
return
cls
.
from_sequence_file
(
file
)
elif
isinstance
(
file
,
str
)
and
file
.
startswith
(
'tar:'
):
return
cls
.
from_file
(
subfile_from_tar
(
file
,
infos
))
else
:
else
:
samples
,
sample_rate
=
soundfile
.
read
(
file
,
dtype
=
'float32'
)
samples
,
sample_rate
=
soundfile
.
read
(
file
,
dtype
=
'float32'
)
return
cls
(
samples
,
sample_rate
)
return
cls
(
samples
,
sample_rate
)
...
...
deepspeech/frontend/augmentor/spec_augment.py
浏览文件 @
17092cbb
...
@@ -133,7 +133,7 @@ class SpecAugmentor(AugmentorBase):
...
@@ -133,7 +133,7 @@ class SpecAugmentor(AugmentorBase):
return
self
.
_time_mask
return
self
.
_time_mask
def
__repr__
(
self
):
def
__repr__
(
self
):
return
f
"specaug: F-
{
F
}
, T-
{
T
}
, F-n-
{
n_freq_masks
}
, T-n-
{
n_time_masks
}
"
return
f
"specaug: F-
{
self
.
F
}
, T-
{
self
.
T
}
, F-n-
{
self
.
n_freq_masks
}
, T-n-
{
self
.
n_time_masks
}
"
def
time_warp
(
self
,
x
,
mode
=
'PIL'
):
def
time_warp
(
self
,
x
,
mode
=
'PIL'
):
"""time warp for spec augment
"""time warp for spec augment
...
...
deepspeech/frontend/featurizer/__init__.py
浏览文件 @
17092cbb
...
@@ -11,3 +11,6 @@
...
@@ -11,3 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
from
.audio_featurizer
import
AudioFeaturizer
#noqa: F401
from
.speech_featurizer
import
SpeechFeaturizer
from
.text_featurizer
import
TextFeaturizer
deepspeech/frontend/featurizer/audio_featurizer.py
浏览文件 @
17092cbb
...
@@ -18,7 +18,7 @@ from python_speech_features import logfbank
...
@@ -18,7 +18,7 @@ from python_speech_features import logfbank
from
python_speech_features
import
mfcc
from
python_speech_features
import
mfcc
class
AudioFeaturizer
(
object
):
class
AudioFeaturizer
():
"""Audio featurizer, for extracting features from audio contents of
"""Audio featurizer, for extracting features from audio contents of
AudioSegment or SpeechSegment.
AudioSegment or SpeechSegment.
...
@@ -167,32 +167,6 @@ class AudioFeaturizer(object):
...
@@ -167,32 +167,6 @@ class AudioFeaturizer(object):
raise
ValueError
(
"Unknown spectrum_type %s. "
raise
ValueError
(
"Unknown spectrum_type %s. "
"Supported values: linear."
%
self
.
_spectrum_type
)
"Supported values: linear."
%
self
.
_spectrum_type
)
def
_compute_linear_specgram
(
self
,
samples
,
sample_rate
,
stride_ms
=
10.0
,
window_ms
=
20.0
,
max_freq
=
None
,
eps
=
1e-14
):
"""Compute the linear spectrogram from FFT energy."""
if
max_freq
is
None
:
max_freq
=
sample_rate
/
2
if
max_freq
>
sample_rate
/
2
:
raise
ValueError
(
"max_freq must not be greater than half of "
"sample rate."
)
if
stride_ms
>
window_ms
:
raise
ValueError
(
"Stride size must not be greater than "
"window size."
)
stride_size
=
int
(
0.001
*
sample_rate
*
stride_ms
)
window_size
=
int
(
0.001
*
sample_rate
*
window_ms
)
specgram
,
freqs
=
self
.
_specgram_real
(
samples
,
window_size
=
window_size
,
stride_size
=
stride_size
,
sample_rate
=
sample_rate
)
ind
=
np
.
where
(
freqs
<=
max_freq
)[
0
][
-
1
]
+
1
return
np
.
log
(
specgram
[:
ind
,
:]
+
eps
)
def
_specgram_real
(
self
,
samples
,
window_size
,
stride_size
,
sample_rate
):
def
_specgram_real
(
self
,
samples
,
window_size
,
stride_size
,
sample_rate
):
"""Compute the spectrogram for samples from a real signal."""
"""Compute the spectrogram for samples from a real signal."""
# extract strided windows
# extract strided windows
...
@@ -217,26 +191,65 @@ class AudioFeaturizer(object):
...
@@ -217,26 +191,65 @@ class AudioFeaturizer(object):
freqs
=
float
(
sample_rate
)
/
window_size
*
np
.
arange
(
fft
.
shape
[
0
])
freqs
=
float
(
sample_rate
)
/
window_size
*
np
.
arange
(
fft
.
shape
[
0
])
return
fft
,
freqs
return
fft
,
freqs
def
_compute_linear_specgram
(
self
,
samples
,
sample_rate
,
stride_ms
=
10.0
,
window_ms
=
20.0
,
max_freq
=
None
,
eps
=
1e-14
):
"""Compute the linear spectrogram from FFT energy.
Args:
samples ([type]): [description]
sample_rate ([type]): [description]
stride_ms (float, optional): [description]. Defaults to 10.0.
window_ms (float, optional): [description]. Defaults to 20.0.
max_freq ([type], optional): [description]. Defaults to None.
eps ([type], optional): [description]. Defaults to 1e-14.
Raises:
ValueError: [description]
ValueError: [description]
Returns:
np.ndarray: log spectrogram, (time, freq)
"""
if
max_freq
is
None
:
max_freq
=
sample_rate
/
2
if
max_freq
>
sample_rate
/
2
:
raise
ValueError
(
"max_freq must not be greater than half of "
"sample rate."
)
if
stride_ms
>
window_ms
:
raise
ValueError
(
"Stride size must not be greater than "
"window size."
)
stride_size
=
int
(
0.001
*
sample_rate
*
stride_ms
)
window_size
=
int
(
0.001
*
sample_rate
*
window_ms
)
specgram
,
freqs
=
self
.
_specgram_real
(
samples
,
window_size
=
window_size
,
stride_size
=
stride_size
,
sample_rate
=
sample_rate
)
ind
=
np
.
where
(
freqs
<=
max_freq
)[
0
][
-
1
]
+
1
# (freq, time)
spec
=
np
.
log
(
specgram
[:
ind
,
:]
+
eps
)
return
np
.
transpose
(
spec
)
def
_concat_delta_delta
(
self
,
feat
):
def
_concat_delta_delta
(
self
,
feat
):
"""append delat, delta-delta feature.
"""append delat, delta-delta feature.
Args:
Args:
feat (np.ndarray): (
D, T
)
feat (np.ndarray): (
T, D
)
Returns:
Returns:
np.ndarray: feat with delta-delta, (
3*D, T
)
np.ndarray: feat with delta-delta, (
T, 3*D
)
"""
"""
feat
=
np
.
transpose
(
feat
)
# Deltas
# Deltas
d_feat
=
delta
(
feat
,
2
)
d_feat
=
delta
(
feat
,
2
)
# Deltas-Deltas
# Deltas-Deltas
dd_feat
=
delta
(
feat
,
2
)
dd_feat
=
delta
(
feat
,
2
)
# transpose
feat
=
np
.
transpose
(
feat
)
d_feat
=
np
.
transpose
(
d_feat
)
dd_feat
=
np
.
transpose
(
dd_feat
)
# concat above three features
# concat above three features
concat_feat
=
np
.
concatenate
((
feat
,
d_feat
,
dd_feat
))
concat_feat
=
np
.
concatenate
((
feat
,
d_feat
,
dd_feat
)
,
axis
=
1
)
return
concat_feat
return
concat_feat
def
_compute_mfcc
(
self
,
def
_compute_mfcc
(
self
,
...
@@ -292,7 +305,6 @@ class AudioFeaturizer(object):
...
@@ -292,7 +305,6 @@ class AudioFeaturizer(object):
ceplifter
=
22
,
ceplifter
=
22
,
useEnergy
=
True
,
useEnergy
=
True
,
winfunc
=
'povey'
)
winfunc
=
'povey'
)
mfcc_feat
=
np
.
transpose
(
mfcc_feat
)
if
delta_delta
:
if
delta_delta
:
mfcc_feat
=
self
.
_concat_delta_delta
(
mfcc_feat
)
mfcc_feat
=
self
.
_concat_delta_delta
(
mfcc_feat
)
return
mfcc_feat
return
mfcc_feat
...
@@ -346,8 +358,6 @@ class AudioFeaturizer(object):
...
@@ -346,8 +358,6 @@ class AudioFeaturizer(object):
remove_dc_offset
=
True
,
remove_dc_offset
=
True
,
preemph
=
0.97
,
preemph
=
0.97
,
wintype
=
'povey'
)
wintype
=
'povey'
)
fbank_feat
=
np
.
transpose
(
fbank_feat
)
if
delta_delta
:
if
delta_delta
:
fbank_feat
=
self
.
_concat_delta_delta
(
fbank_feat
)
fbank_feat
=
self
.
_concat_delta_delta
(
fbank_feat
)
return
fbank_feat
return
fbank_feat
deepspeech/frontend/featurizer/speech_featurizer.py
浏览文件 @
17092cbb
...
@@ -16,38 +16,8 @@ from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
...
@@ -16,38 +16,8 @@ from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
from
deepspeech.frontend.featurizer.text_featurizer
import
TextFeaturizer
from
deepspeech.frontend.featurizer.text_featurizer
import
TextFeaturizer
class
SpeechFeaturizer
(
object
):
class
SpeechFeaturizer
():
"""Speech featurizer, for extracting features from both audio and transcript
"""Speech and Text feature extraction.
contents of SpeechSegment.
Currently, for audio parts, it supports feature types of linear
spectrogram and mfcc; for transcript parts, it only supports char-level
tokenizing and conversion into a list of token indices. Note that the
token indexing order follows the given vocabulary file.
:param vocab_filepath: Filepath to load vocabulary for token indices
conversion.
:type spectrum_type: str
:param spectrum_type: Specgram feature type. Options: 'linear', 'mfcc'.
:type spectrum_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: When spectrum_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned; when spectrum_type is 'mfcc', max_freq is the
highest band edge of mel filters.
:types max_freq: None|float
:param target_sample_rate: Speech are resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibels before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""
"""
def
__init__
(
self
,
def
__init__
(
self
,
...
@@ -64,8 +34,12 @@ class SpeechFeaturizer(object):
...
@@ -64,8 +34,12 @@ class SpeechFeaturizer(object):
target_sample_rate
=
16000
,
target_sample_rate
=
16000
,
use_dB_normalization
=
True
,
use_dB_normalization
=
True
,
target_dB
=-
20
,
target_dB
=-
20
,
dither
=
1.0
):
dither
=
1.0
,
self
.
_audio_featurizer
=
AudioFeaturizer
(
maskctc
=
False
):
self
.
stride_ms
=
stride_ms
self
.
window_ms
=
window_ms
self
.
audio_feature
=
AudioFeaturizer
(
spectrum_type
=
spectrum_type
,
spectrum_type
=
spectrum_type
,
feat_dim
=
feat_dim
,
feat_dim
=
feat_dim
,
delta_delta
=
delta_delta
,
delta_delta
=
delta_delta
,
...
@@ -77,8 +51,14 @@ class SpeechFeaturizer(object):
...
@@ -77,8 +51,14 @@ class SpeechFeaturizer(object):
use_dB_normalization
=
use_dB_normalization
,
use_dB_normalization
=
use_dB_normalization
,
target_dB
=
target_dB
,
target_dB
=
target_dB
,
dither
=
dither
)
dither
=
dither
)
self
.
_text_featurizer
=
TextFeaturizer
(
unit_type
,
vocab_filepath
,
self
.
feature_size
=
self
.
audio_feature
.
feature_size
spm_model_prefix
)
self
.
text_feature
=
TextFeaturizer
(
unit_type
=
unit_type
,
vocab_filepath
=
vocab_filepath
,
spm_model_prefix
=
spm_model_prefix
,
maskctc
=
maskctc
)
self
.
vocab_size
=
self
.
text_feature
.
vocab_size
def
featurize
(
self
,
speech_segment
,
keep_transcription_text
):
def
featurize
(
self
,
speech_segment
,
keep_transcription_text
):
"""Extract features for speech segment.
"""Extract features for speech segment.
...
@@ -94,66 +74,33 @@ class SpeechFeaturizer(object):
...
@@ -94,66 +74,33 @@ class SpeechFeaturizer(object):
Returns:
Returns:
tuple: 1) spectrogram audio feature in 2darray, 2) list oftoken indices.
tuple: 1) spectrogram audio feature in 2darray, 2) list oftoken indices.
"""
"""
spec_feature
=
self
.
_audio_featurizer
.
featurize
(
speech_segment
)
spec_feature
=
self
.
audio_feature
.
featurize
(
speech_segment
)
if
keep_transcription_text
:
if
keep_transcription_text
:
return
spec_feature
,
speech_segment
.
transcript
return
spec_feature
,
speech_segment
.
transcript
if
speech_segment
.
has_token
:
if
speech_segment
.
has_token
:
text_ids
=
speech_segment
.
token_ids
text_ids
=
speech_segment
.
token_ids
else
:
else
:
text_ids
=
self
.
_text_featurizer
.
featurize
(
text_ids
=
self
.
text_feature
.
featurize
(
speech_segment
.
transcript
)
speech_segment
.
transcript
)
return
spec_feature
,
text_ids
return
spec_feature
,
text_ids
@
property
def
text_featurize
(
self
,
text
,
keep_transcription_text
):
def
vocab_size
(
self
):
"""Extract features for speech segment.
"""Return the vocabulary size.
Returns:
int: Vocabulary size.
"""
return
self
.
_text_featurizer
.
vocab_size
@
property
def
vocab_list
(
self
):
"""Return the vocabulary in list.
Returns:
List[str]:
"""
return
self
.
_text_featurizer
.
vocab_list
@
property
def
vocab_dict
(
self
):
"""Return the vocabulary in dict.
Returns:
Dict[str, int]:
"""
return
self
.
_text_featurizer
.
vocab_dict
@
property
def
feature_size
(
self
):
"""Return the audio feature size.
Returns:
1. For audio parts, extract the audio features.
int: audio feature size.
2. For transcript parts, keep the original text or convert text string
"""
to a list of token indices in char-level.
return
self
.
_audio_featurizer
.
feature_size
@
property
Args:
def
stride_ms
(
self
):
text (str): text.
"""time length in `ms` unit per frame
keep_transcription_text (bool): True, keep transcript text, False, token ids
Returns:
Returns:
float: time(ms)/frame
(str|List[int]): text, or list of token indices.
"""
"""
return
self
.
_audio_featurizer
.
stride_ms
if
keep_transcription_text
:
return
text
@
property
def
text_feature
(
self
):
"""Return the text feature object.
Returns:
text_ids
=
self
.
text_feature
.
featurize
(
text
)
TextFeaturizer: object.
return
text_ids
"""
return
self
.
_text_featurizer
deepspeech/frontend/normalizer.py
浏览文件 @
17092cbb
...
@@ -40,21 +40,21 @@ class CollateFunc(object):
...
@@ -40,21 +40,21 @@ class CollateFunc(object):
number
=
0
number
=
0
for
item
in
batch
:
for
item
in
batch
:
audioseg
=
AudioSegment
.
from_file
(
item
[
'feat'
])
audioseg
=
AudioSegment
.
from_file
(
item
[
'feat'
])
feat
=
self
.
feature_func
(
audioseg
)
#(
D, T
)
feat
=
self
.
feature_func
(
audioseg
)
#(
T, D
)
sums
=
np
.
sum
(
feat
,
axis
=
1
)
sums
=
np
.
sum
(
feat
,
axis
=
0
)
if
mean_stat
is
None
:
if
mean_stat
is
None
:
mean_stat
=
sums
mean_stat
=
sums
else
:
else
:
mean_stat
+=
sums
mean_stat
+=
sums
square_sums
=
np
.
sum
(
np
.
square
(
feat
),
axis
=
1
)
square_sums
=
np
.
sum
(
np
.
square
(
feat
),
axis
=
0
)
if
var_stat
is
None
:
if
var_stat
is
None
:
var_stat
=
square_sums
var_stat
=
square_sums
else
:
else
:
var_stat
+=
square_sums
var_stat
+=
square_sums
number
+=
feat
.
shape
[
1
]
number
+=
feat
.
shape
[
0
]
return
number
,
mean_stat
,
var_stat
return
number
,
mean_stat
,
var_stat
...
@@ -120,7 +120,7 @@ class FeatureNormalizer(object):
...
@@ -120,7 +120,7 @@ class FeatureNormalizer(object):
"""Normalize features to be of zero mean and unit stddev.
"""Normalize features to be of zero mean and unit stddev.
:param features: Input features to be normalized.
:param features: Input features to be normalized.
:type features: ndarray, shape (
D, T
)
:type features: ndarray, shape (
T, D
)
:param eps: added to stddev to provide numerical stablibity.
:param eps: added to stddev to provide numerical stablibity.
:type eps: float
:type eps: float
:return: Normalized features.
:return: Normalized features.
...
@@ -130,9 +130,10 @@ class FeatureNormalizer(object):
...
@@ -130,9 +130,10 @@ class FeatureNormalizer(object):
def
_read_mean_std_from_file
(
self
,
filepath
,
eps
=
1e-20
):
def
_read_mean_std_from_file
(
self
,
filepath
,
eps
=
1e-20
):
"""Load mean and std from file."""
"""Load mean and std from file."""
mean
,
istd
=
load_cmvn
(
filepath
,
filetype
=
'json'
)
filetype
=
filepath
.
split
(
"."
)[
-
1
]
self
.
_mean
=
np
.
expand_dims
(
mean
,
axis
=-
1
)
mean
,
istd
=
load_cmvn
(
filepath
,
filetype
=
filetype
)
self
.
_istd
=
np
.
expand_dims
(
istd
,
axis
=-
1
)
self
.
_mean
=
np
.
expand_dims
(
mean
,
axis
=
0
)
self
.
_istd
=
np
.
expand_dims
(
istd
,
axis
=
0
)
def
write_to_file
(
self
,
filepath
):
def
write_to_file
(
self
,
filepath
):
"""Write the mean and stddev to the file.
"""Write the mean and stddev to the file.
...
...
deepspeech/frontend/speech.py
浏览文件 @
17092cbb
...
@@ -68,7 +68,12 @@ class SpeechSegment(AudioSegment):
...
@@ -68,7 +68,12 @@ class SpeechSegment(AudioSegment):
return
not
self
.
__eq__
(
other
)
return
not
self
.
__eq__
(
other
)
@
classmethod
@
classmethod
def
from_file
(
cls
,
filepath
,
transcript
,
tokens
=
None
,
token_ids
=
None
):
def
from_file
(
cls
,
filepath
,
transcript
,
tokens
=
None
,
token_ids
=
None
,
infos
=
None
):
"""Create speech segment from audio file and corresponding transcript.
"""Create speech segment from audio file and corresponding transcript.
Args:
Args:
...
@@ -76,12 +81,12 @@ class SpeechSegment(AudioSegment):
...
@@ -76,12 +81,12 @@ class SpeechSegment(AudioSegment):
transcript (str): Transcript text for the speech.
transcript (str): Transcript text for the speech.
tokens (List[str], optional): text tokens. Defaults to None.
tokens (List[str], optional): text tokens. Defaults to None.
token_ids (List[int], optional): text token ids. Defaults to None.
token_ids (List[int], optional): text token ids. Defaults to None.
infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
Returns:
Returns:
SpeechSegment: Speech segment instance.
SpeechSegment: Speech segment instance.
"""
"""
audio
=
AudioSegment
.
from_file
(
filepath
,
infos
)
audio
=
AudioSegment
.
from_file
(
filepath
)
return
cls
(
audio
.
samples
,
audio
.
sample_rate
,
transcript
,
tokens
,
return
cls
(
audio
.
samples
,
audio
.
sample_rate
,
transcript
,
tokens
,
token_ids
)
token_ids
)
...
...
deepspeech/io/collator.py
浏览文件 @
17092cbb
...
@@ -56,8 +56,8 @@ class SpeechCollator():
...
@@ -56,8 +56,8 @@ class SpeechCollator():
for
utt
,
audio
,
text
in
batch
:
for
utt
,
audio
,
text
in
batch
:
utts
.
append
(
utt
)
utts
.
append
(
utt
)
# audio
# audio
audios
.
append
(
audio
.
T
)
# [T, D]
audios
.
append
(
audio
)
# [T, D]
audio_lens
.
append
(
audio
.
shape
[
1
])
audio_lens
.
append
(
audio
.
shape
[
0
])
# text
# text
# for training, text is token ids
# for training, text is token ids
# else text is string, convert to unicode ord
# else text is string, convert to unicode ord
...
...
examples/librispeech/s1/path.sh
浏览文件 @
17092cbb
...
@@ -3,6 +3,7 @@ export MAIN_ROOT=${PWD}/../../../
...
@@ -3,6 +3,7 @@ export MAIN_ROOT=${PWD}/../../../
export
PATH
=
${
MAIN_ROOT
}
:
${
MAIN_ROOT
}
/utils:
${
PATH
}
export
PATH
=
${
MAIN_ROOT
}
:
${
MAIN_ROOT
}
/utils:
${
PATH
}
export
LC_ALL
=
C
export
LC_ALL
=
C
export
PYTHONDONTWRITEBYTECODE
=
1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export
PYTHONIOENCODING
=
UTF-8
export
PYTHONIOENCODING
=
UTF-8
export
PYTHONPATH
=
${
MAIN_ROOT
}
:
${
PYTHONPATH
}
export
PYTHONPATH
=
${
MAIN_ROOT
}
:
${
PYTHONPATH
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录