提交 f657ee64 编写于 作者: K KP

Feature alignment.

上级 67dcff2f
...@@ -37,6 +37,7 @@ class Spectrogram(nn.Layer): ...@@ -37,6 +37,7 @@ class Spectrogram(nn.Layer):
hop_length: Optional[int]=None, hop_length: Optional[int]=None,
win_length: Optional[int]=None, win_length: Optional[int]=None,
window: str='hann', window: str='hann',
power: float=2.0,
center: bool=True, center: bool=True,
pad_mode: str='reflect', pad_mode: str='reflect',
dtype: str=paddle.float32): dtype: str=paddle.float32):
...@@ -54,6 +55,7 @@ class Spectrogram(nn.Layer): ...@@ -54,6 +55,7 @@ class Spectrogram(nn.Layer):
The following window names are supported: 'hamming','hann','kaiser','gaussian', The following window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'. 'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann' The default value is 'hann'
power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length] If False, frame t begins at x[t * hop_length]
The default value is True The default value is True
...@@ -68,6 +70,9 @@ class Spectrogram(nn.Layer): ...@@ -68,6 +70,9 @@ class Spectrogram(nn.Layer):
""" """
super(Spectrogram, self).__init__() super(Spectrogram, self).__init__()
assert power > 0, 'Power of spectrogram must be > 0.'
self.power = power
if win_length is None: if win_length is None:
win_length = n_fft win_length = n_fft
...@@ -85,7 +90,7 @@ class Spectrogram(nn.Layer): ...@@ -85,7 +90,7 @@ class Spectrogram(nn.Layer):
def forward(self, x): def forward(self, x):
stft = self._stft(x) stft = self._stft(x)
spectrogram = paddle.square(paddle.abs(stft)) spectrogram = paddle.pow(paddle.abs(stft), self.power)
return spectrogram return spectrogram
...@@ -96,6 +101,7 @@ class MelSpectrogram(nn.Layer): ...@@ -96,6 +101,7 @@ class MelSpectrogram(nn.Layer):
hop_length: Optional[int]=None, hop_length: Optional[int]=None,
win_length: Optional[int]=None, win_length: Optional[int]=None,
window: str='hann', window: str='hann',
power: float=2.0,
center: bool=True, center: bool=True,
pad_mode: str='reflect', pad_mode: str='reflect',
n_mels: int=64, n_mels: int=64,
...@@ -120,6 +126,7 @@ class MelSpectrogram(nn.Layer): ...@@ -120,6 +126,7 @@ class MelSpectrogram(nn.Layer):
The following window names are supported: 'hamming','hann','kaiser','gaussian', The following window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'. 'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann' The default value is 'hann'
power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length] If False, frame t begins at x[t * hop_length]
The default value is True The default value is True
...@@ -142,6 +149,7 @@ class MelSpectrogram(nn.Layer): ...@@ -142,6 +149,7 @@ class MelSpectrogram(nn.Layer):
hop_length=hop_length, hop_length=hop_length,
win_length=win_length, win_length=win_length,
window=window, window=window,
power=power,
center=center, center=center,
pad_mode=pad_mode, pad_mode=pad_mode,
dtype=dtype) dtype=dtype)
...@@ -176,6 +184,7 @@ class LogMelSpectrogram(nn.Layer): ...@@ -176,6 +184,7 @@ class LogMelSpectrogram(nn.Layer):
hop_length: Optional[int]=None, hop_length: Optional[int]=None,
win_length: Optional[int]=None, win_length: Optional[int]=None,
window: str='hann', window: str='hann',
power: float=2.0,
center: bool=True, center: bool=True,
pad_mode: str='reflect', pad_mode: str='reflect',
n_mels: int=64, n_mels: int=64,
...@@ -214,11 +223,8 @@ class LogMelSpectrogram(nn.Layer): ...@@ -214,11 +223,8 @@ class LogMelSpectrogram(nn.Layer):
htk (bool): whether to use HTK formula in computing fbank matrix. htk (bool): whether to use HTK formula in computing fbank matrix.
norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
You can specify norm=1.0/2.0 to use customized p-norm normalization. You can specify norm=1.0/2.0 to use customized p-norm normalization.
ref_value (float): the reference value. If smaller than 1.0, the db level ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
Otherwise, the db level is pushed down.
magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
e.g., 1e-3.
top_db (float): the maximum db value of resulting spectrum, above which the top_db (float): the maximum db value of resulting spectrum, above which the
spectrum is clipped(to top_db). spectrum is clipped(to top_db).
dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
...@@ -232,6 +238,7 @@ class LogMelSpectrogram(nn.Layer): ...@@ -232,6 +238,7 @@ class LogMelSpectrogram(nn.Layer):
hop_length=hop_length, hop_length=hop_length,
win_length=win_length, win_length=win_length,
window=window, window=window,
power=power,
center=center, center=center,
pad_mode=pad_mode, pad_mode=pad_mode,
n_mels=n_mels, n_mels=n_mels,
...@@ -246,7 +253,6 @@ class LogMelSpectrogram(nn.Layer): ...@@ -246,7 +253,6 @@ class LogMelSpectrogram(nn.Layer):
self.top_db = top_db self.top_db = top_db
def forward(self, x): def forward(self, x):
# import ipdb; ipdb.set_trace()
mel_feature = self._melspectrogram(x) mel_feature = self._melspectrogram(x)
log_mel_feature = power_to_db( log_mel_feature = power_to_db(
mel_feature, mel_feature,
...@@ -264,6 +270,7 @@ class MFCC(nn.Layer): ...@@ -264,6 +270,7 @@ class MFCC(nn.Layer):
hop_length: Optional[int]=None, hop_length: Optional[int]=None,
win_length: Optional[int]=None, win_length: Optional[int]=None,
window: str='hann', window: str='hann',
power: float=2.0,
center: bool=True, center: bool=True,
pad_mode: str='reflect', pad_mode: str='reflect',
n_mels: int=64, n_mels: int=64,
...@@ -291,6 +298,7 @@ class MFCC(nn.Layer): ...@@ -291,6 +298,7 @@ class MFCC(nn.Layer):
The following window names are supported: 'hamming','hann','kaiser','gaussian', The following window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'. 'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann' The default value is 'hann'
power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length] If False, frame t begins at x[t * hop_length]
The default value is True The default value is True
...@@ -303,11 +311,8 @@ class MFCC(nn.Layer): ...@@ -303,11 +311,8 @@ class MFCC(nn.Layer):
htk (bool): whether to use HTK formula in computing fbank matrix. htk (bool): whether to use HTK formula in computing fbank matrix.
norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
You can specify norm=1.0/2.0 to use customized p-norm normalization. You can specify norm=1.0/2.0 to use customized p-norm normalization.
ref_value (float): the reference value. If smaller than 1.0, the db level ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
Otherwise, the db level is pushed down.
magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
e.g., 1e-3.
top_db (float): the maximum db value of resulting spectrum, above which the top_db (float): the maximum db value of resulting spectrum, above which the
spectrum is clipped(to top_db). spectrum is clipped(to top_db).
dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
...@@ -322,6 +327,7 @@ class MFCC(nn.Layer): ...@@ -322,6 +327,7 @@ class MFCC(nn.Layer):
hop_length=hop_length, hop_length=hop_length,
win_length=win_length, win_length=win_length,
window=window, window=window,
power=power,
center=center, center=center,
pad_mode=pad_mode, pad_mode=pad_mode,
n_mels=n_mels, n_mels=n_mels,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册