From 6ec25e3cc3b26f3491f7aa8ace8ff64ecdcad3d9 Mon Sep 17 00:00:00 2001 From: ranchlai Date: Tue, 20 Jul 2021 12:16:51 +0800 Subject: [PATCH] Export parameters for setting power_to_db in LogMel transform (#5326) * added sound classification * added license, clean code, add pre-commit * update req * moved to PaddlePaddle-models * code re-structure * update README.md * update README.md * Update README.md * add audioset training * default resample mode to kaiser_fast * delete some comments * precommit check * sha->rev * add config.ymal * remove SoundClassification from paddlespeech, since it's in PaddleAudio now * add labels * remove old labels * update code * empty * #5300 * add evaluate, etc * remove trace| * import evaluate * path update * precommit check * recover slowfast * restore README.md to paddle:develop * refactor * update readme * update README.md * refactor * refactor * refactor * refactor * precommit fixed * update README.md * Update README.md * Update README.md * Update train.py changed prefixed, removed some comments * add wav file for testing * bug fixed eval,new checkpoint map=0.416 * Update README.md * added dcase task1b example * update README.md * code fixed for last review * fixed level string formatting * fixed according to PR reviews * added wav2vec2.0 * restore datasets * add license, remove scipy, move test_audio to cloud * remove 3rd-party dependency:pathos * add testing for wav2vec2 * update README.md * updated README.md, added librispeech results * Revert "updated README.md, added librispeech results" This reverts commit da4012958e8e0bf2d7f4b608f74518583dd7d73b. 
* code fixed from reviews * add librispeech test * remove pathos imports * updated README.md * update README.md * minor-fix according to code reviews * updated README_LP.MD * fixed according to code review * fixed according to code review * added preprocessing example * removed dcase2021_task1b from examples * remove preprocessing from examples * added amsoftmax to losses * added eer/min_dcf to metrics * updated __init__.py * add stft,spectrogram, melspectrogram, log-melspectrogram * add _internal, transform, functional to imports * add new module: functional * add new module: window.py to _internal/ * add corresponding new unit-test for the new modules * added ISTFT * clean code and docstring, clean unit test * clean code and docstring * functional * added back preprocessing * add README.md * remove preprocessing for now * clean code, add doc * change _internal to signal * add new transforms * add new functionals * add eps to amsoftmax, return the prediction * add ffmpeg backend * remove dithering in depth-convert, add ffmpeg to backend * add Mudecode/enccode/RandomCodec * changed variable name, fixed bug * use namedtuple for returning * refactor utils * refactor * add melspectrogram/spectrogram, add doc string * add doc string, clean code * rename window to windowing * updated docstring, minor bug fixed * move losses.py to future examples * remove mu_encode/decode * refactor * move metrics to future examples * remove features/ * naming changes for mu law algorithms * update test, add testing utils * fixed import * fixed import * fixed duplicate output in logging * add code examples, shape info, etc * add doc for public functions * make backend controllable * fixed coding style in docstring * export parameters for power_to_db LogMel transform * default to_db to False to be consistent with functional * fixed typo in docstring --- PaddleAudio/paddleaudio/functional.py | 2 +- PaddleAudio/paddleaudio/transforms.py | 36 +++++++++++++++++++---- 
PaddleAudio/paddleaudio/utils/_librosa.py | 6 ++-- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/PaddleAudio/paddleaudio/functional.py b/PaddleAudio/paddleaudio/functional.py index a9c95e91..025e5903 100644 --- a/PaddleAudio/paddleaudio/functional.py +++ b/PaddleAudio/paddleaudio/functional.py @@ -492,7 +492,7 @@ def power_to_db(magnitude: Tensor, amin(float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin). top_db(float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to to_db). + spectrum is clipped(to top_db). Returns: The spectrogram in log-scale. shape: diff --git a/PaddleAudio/paddleaudio/transforms.py b/PaddleAudio/paddleaudio/transforms.py index 7cb8edb1..d9065c20 100644 --- a/PaddleAudio/paddleaudio/transforms.py +++ b/PaddleAudio/paddleaudio/transforms.py @@ -338,7 +338,10 @@ class LogMelSpectrogram(nn.Layer): power: float = 2.0, n_mels: int = 64, f_min: float = 0.0, - f_max: Optional[float] = None): + f_max: Optional[float] = None, + ref_value: float = 1.0, + amin: float = 1e-10, + top_db: Optional[float] = 80.0): """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, typically an audio waveform. @@ -366,6 +369,13 @@ class LogMelSpectrogram(nn.Layer): n_mels(int): the mel bins. f_min(float): the lower cut-off frequency, below which the filter response is zero. f_max(float): the upper cut-off frequency, above which the filter response is zeros. + ref_value(float): the reference value. If smaller than 1.0, the db level + of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. + amin(float): the minimum value of input magnitude, below which the input + magnitude is clipped(to amin). For numerical stability, set amin to a larger value, + e.g., 1e-3. + top_db(float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). 
Notes: The LogMelSpectrogram transform relies on MelSpectrogram transform to compute @@ -388,13 +398,29 @@ class LogMelSpectrogram(nn.Layer): """ super(LogMelSpectrogram, self).__init__() - self._melspectrogram = MelSpectrogram(sr, n_fft, hop_length, win_length, - window, center, pad_mode, power, - n_mels, f_min, f_max) + + self._melspectrogram = MelSpectrogram(sr=sr, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + power=power, + n_mels=n_mels, + f_min=f_min, + f_max=f_max) + + self.ref_value = ref_value + self.amin = amin + self.top_db = top_db def forward(self, x: Tensor) -> Tensor: mel_feature = self._melspectrogram(x) - log_mel_feature = F.power_to_db(mel_feature) + log_mel_feature = F.power_to_db(mel_feature, + ref_value=self.ref_value, + amin=self.amin, + top_db=self.top_db) return log_mel_feature def __repr__(self): diff --git a/PaddleAudio/paddleaudio/utils/_librosa.py b/PaddleAudio/paddleaudio/utils/_librosa.py index bf4c87ed..40dc6b91 100644 --- a/PaddleAudio/paddleaudio/utils/_librosa.py +++ b/PaddleAudio/paddleaudio/utils/_librosa.py @@ -443,7 +443,7 @@ def melspectrogram(x: array, center: bool = True, pad_mode: str = 'reflect', power: float = 2.0, - to_db: bool = True, + to_db: bool = False, ref: float = 1.0, amin: float = 1e-10, top_db: Optional[float] = None) -> array: @@ -454,12 +454,12 @@ def melspectrogram(x: array, window_size: int, typically 512, 1024, 2048, etc. The window size for framing, also used as n_fft for stft Returns: - The mel-spectrogram in power scale or db scale(default) + The mel-spectrogram in power scale(default) or db scale(when to_db is True) Notes: 1. sr is default to 16000, which is commonly used in speech/speaker processing. 2. when fmax is None, it is set to sr//2. - 3. this function will convert mel-spectrogram to db scale by default, which is different + 3. this function can convert mel-spectrogram to db scale when to_db is True(off by default), which is different from that of librosa. 
""" _check_audio(x, mono=True) -- GitLab