Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
70ebbfd8
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
70ebbfd8
编写于
2月 10, 2022
作者:
小湉湉
提交者:
GitHub
2月 10, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1432 from Jackwaterveg/fix
[Bug fix] fix resample
上级
e0280ff9
9a55783a
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
15 addition
and
10 deletion
+15
-10
paddlespeech/cli/asr/infer.py
paddlespeech/cli/asr/infer.py
+4
-2
paddlespeech/s2t/transform/perturb.py
paddlespeech/s2t/transform/perturb.py
+2
-1
paddlespeech/s2t/transform/spectrogram.py
paddlespeech/s2t/transform/spectrogram.py
+4
-3
paddlespeech/vector/exps/ge2e/audio_processor.py
paddlespeech/vector/exps/ge2e/audio_processor.py
+5
-4
未找到文件。
paddlespeech/cli/asr/infer.py
浏览文件 @
70ebbfd8
...
@@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
...
@@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
audio
=
audio
[:,
0
]
audio
=
audio
[:,
0
]
# pcm16 -> pcm 32
# pcm16 -> pcm 32
audio
=
self
.
_pcm16to32
(
audio
)
audio
=
self
.
_pcm16to32
(
audio
)
audio
=
librosa
.
resample
(
audio
,
audio_sample_rate
,
audio
=
librosa
.
resample
(
self
.
sample_rate
)
audio
,
orig_sr
=
audio_sample_rate
,
target_sr
=
self
.
sample_rate
)
audio_sample_rate
=
self
.
sample_rate
audio_sample_rate
=
self
.
sample_rate
# pcm32 -> pcm 16
# pcm32 -> pcm 16
audio
=
self
.
_pcm32to16
(
audio
)
audio
=
self
.
_pcm32to16
(
audio
)
...
...
paddlespeech/s2t/transform/perturb.py
浏览文件 @
70ebbfd8
...
@@ -90,7 +90,8 @@ class SpeedPerturbation():
...
@@ -90,7 +90,8 @@ class SpeedPerturbation():
# Note1: resample requires the sampling-rate of input and output,
# Note1: resample requires the sampling-rate of input and output,
# but actually only the ratio is used.
# but actually only the ratio is used.
y
=
librosa
.
resample
(
x
,
ratio
,
1
,
res_type
=
self
.
res_type
)
y
=
librosa
.
resample
(
x
,
orig_sr
=
ratio
,
target_sr
=
1
,
res_type
=
self
.
res_type
)
if
self
.
keep_length
:
if
self
.
keep_length
:
diff
=
abs
(
len
(
x
)
-
len
(
y
))
diff
=
abs
(
len
(
x
)
-
len
(
y
))
...
...
paddlespeech/s2t/transform/spectrogram.py
浏览文件 @
70ebbfd8
...
@@ -38,7 +38,7 @@ def stft(x,
...
@@ -38,7 +38,7 @@ def stft(x,
x
=
np
.
stack
(
x
=
np
.
stack
(
[
[
librosa
.
stft
(
librosa
.
stft
(
x
[:,
ch
],
y
=
x
[:,
ch
],
n_fft
=
n_fft
,
n_fft
=
n_fft
,
hop_length
=
n_shift
,
hop_length
=
n_shift
,
win_length
=
win_length
,
win_length
=
win_length
,
...
@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
...
@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
x
=
np
.
stack
(
x
=
np
.
stack
(
[
[
librosa
.
istft
(
librosa
.
istft
(
x
[:,
ch
].
T
,
# [Time, Freq] -> [Freq, Time]
y
=
x
[:,
ch
].
T
,
# [Time, Freq] -> [Freq, Time]
hop_length
=
n_shift
,
hop_length
=
n_shift
,
win_length
=
win_length
,
win_length
=
win_length
,
window
=
window
,
window
=
window
,
...
@@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
...
@@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
# spc: (Time, Channel, Freq) or (Time, Freq)
# spc: (Time, Channel, Freq) or (Time, Freq)
spc
=
np
.
abs
(
x_stft
)
spc
=
np
.
abs
(
x_stft
)
# mel_basis: (Mel_freq, Freq)
# mel_basis: (Mel_freq, Freq)
mel_basis
=
librosa
.
filters
.
mel
(
fs
,
n_fft
,
n_mels
,
fmin
,
fmax
)
mel_basis
=
librosa
.
filters
.
mel
(
sr
=
fs
,
n_fft
=
n_fft
,
n_mels
=
n_mels
,
fmin
=
fmin
,
fmax
=
fmax
)
# lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
# lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
lmspc
=
np
.
log10
(
np
.
maximum
(
eps
,
np
.
dot
(
spc
,
mel_basis
.
T
)))
lmspc
=
np
.
log10
(
np
.
maximum
(
eps
,
np
.
dot
(
spc
,
mel_basis
.
T
)))
...
...
paddlespeech/vector/exps/ge2e/audio_processor.py
浏览文件 @
70ebbfd8
...
@@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int,
...
@@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int,
partial_utterance_n_frames : int
partial_utterance_n_frames : int
the number of mel spectrogram frames in each partial utterance.
the number of mel spectrogram frames in each partial utterance.
min_pad_coverage : int
min_pad_coverage : int
when reaching the last partial utterance, it may or may not have enough frames.
when reaching the last partial utterance, it may or may not have enough frames.
If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
...
@@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int,
...
@@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int,
by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
Returns
Returns
----------
----------
the waveform slices and mel spectrogram slices as lists of array slices.
the waveform slices and mel spectrogram slices as lists of array slices.
Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances.
Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances.
"""
"""
assert
0
<=
overlap
<
1
assert
0
<=
overlap
<
1
...
@@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):
...
@@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):
# Resample if numpy.array is passed and sr does not match
# Resample if numpy.array is passed and sr does not match
if
source_sr
is
not
None
and
source_sr
!=
self
.
sampling_rate
:
if
source_sr
is
not
None
and
source_sr
!=
self
.
sampling_rate
:
wav
=
librosa
.
resample
(
wav
,
source_sr
,
self
.
sampling_rate
)
wav
=
librosa
.
resample
(
wav
,
orig_sr
=
source_sr
,
target_sr
=
self
.
sampling_rate
)
# loudness normalization
# loudness normalization
wav
=
normalize_volume
(
wav
=
normalize_volume
(
...
@@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):
...
@@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):
def
melspectrogram
(
self
,
wav
):
def
melspectrogram
(
self
,
wav
):
mel
=
librosa
.
feature
.
melspectrogram
(
mel
=
librosa
.
feature
.
melspectrogram
(
wav
,
y
=
wav
,
sr
=
self
.
sampling_rate
,
sr
=
self
.
sampling_rate
,
n_fft
=
self
.
n_fft
,
n_fft
=
self
.
n_fft
,
hop_length
=
self
.
hop_length
,
hop_length
=
self
.
hop_length
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录