From 6ec25e3cc3b26f3491f7aa8ace8ff64ecdcad3d9 Mon Sep 17 00:00:00 2001
From: ranchlai <ranchlai@163.com>
Date: Tue, 20 Jul 2021 12:16:51 +0800
Subject: [PATCH]  Export parameters for setting power_to_db in LogMel
 transform (#5326)

* added sound classification

* added license, clean code, add pre-commit

* update req

* moved to PaddlePaddle-models

* code re-structure

* update README.md

* update README.md

* Update README.md

* add audioset training

* default resample mode to kaiser_fast

* delete some comments

* precommit check

* sha->rev

* add config.yaml

* remove SoundClassification from paddlespeech, since it's in PaddleAudio now

* add labels

* remove old labels

* update code

* empty

* #5300

* add evaluate, etc

* remove trace

* import evaluate

* path update

* precommit check

* recover slowfast

* restore README.md to paddle:develop

* refactor

* update readme

* update README.md

* refactor

* refactor

* refactor

* refactor

* precommit fixed

* update README.md

* Update README.md

* Update README.md

* Update train.py

changed prefix, removed some comments

* add wav file for testing

* bug fixed eval,new checkpoint map=0.416

* Update README.md

* added dcase task1b example

* update README.md

* code fixed for last review

* fixed level string formatting

* fixed according to PR reviews

* added wav2vec2.0

* restore datasets

* add license, remove scipy, move test_audio to cloud

* remove 3rd-party dependency:pathos

* add testing for wav2vec2

* update README.md

* updated README.md, added librispeech results

* Revert "updated README.md, added librispeech results"

This reverts commit da4012958e8e0bf2d7f4b608f74518583dd7d73b.

* code fixed from reviews

* add librispeech test

* remove pathos imports

* updated README.md

* update README.md

* minor-fix according to code reviews

* updated README_LP.MD

* fixed according to code review

* fixed according to code review

* added preprocessing example

* removed dcase2021_task1b from examples

* remove preprocessing from examples

* added amsoftmax to losses

* added eer/min_dcf to metrics

* updated __init__.py

* add stft,spectrogram, melspectrogram, log-melspectrogram

* add _internal, transform, functional to imports

* add new module: functional

* add new module: window.py to _internal/

* add corresponding new unit-test for the new modules

* added ISTFT

* clean code and docstring, clean unit test

* clean code and docstring

* functional

* added back preprocessing

* add README.md

* remove preprocessing for now

* clean code, add doc

* change _internal to signal

* add new transforms

* add new functionals

* add eps to amsoftmax, return the prediction

* add ffmpeg backend

* remove dithering in depth-convert, add ffmpeg to backend

* add Mudecode/encode/RandomCodec

* changed variable name, fixed bug

* use namedtuple for returning

* refactor utils

* refactor

* add melspectrogram/spectrogram, add doc string

* add doc string, clean code

* rename window to windowing

* updated docstring, minor bug fixed

* move losses.py to future examples

* remove mu_encode/decode

* refactor

* move metrics to future examples

* remove features/

* naming changes for mu law algorithms

* update test, add testing utils

* fixed import

* fixed import

* fixed duplicate output in logging

* add code examples, shape info, etc

* add doc for public functions

* make backend controllable

* fixed coding style in docstring

* export parameters for power_to_db LogMel transform

* default to_db to False to be consistent with functional

* fixed typo in docstring
---
 PaddleAudio/paddleaudio/functional.py     |  2 +-
 PaddleAudio/paddleaudio/transforms.py     | 36 +++++++++++++++++++----
 PaddleAudio/paddleaudio/utils/_librosa.py |  6 ++--
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/PaddleAudio/paddleaudio/functional.py b/PaddleAudio/paddleaudio/functional.py
index a9c95e91..025e5903 100644
--- a/PaddleAudio/paddleaudio/functional.py
+++ b/PaddleAudio/paddleaudio/functional.py
@@ -492,7 +492,7 @@ def power_to_db(magnitude: Tensor,
         amin(float): the minimum value of input magnitude, below which the input
             magnitude is clipped(to amin).
         top_db(float): the maximum db value of resulting spectrum, above which the
-            spectrum is clipped(to to_db).
+            spectrum is clipped(to top_db).
     Returns:
         The spectrogram in log-scale.
     shape:
diff --git a/PaddleAudio/paddleaudio/transforms.py b/PaddleAudio/paddleaudio/transforms.py
index 7cb8edb1..d9065c20 100644
--- a/PaddleAudio/paddleaudio/transforms.py
+++ b/PaddleAudio/paddleaudio/transforms.py
@@ -338,7 +338,10 @@ class LogMelSpectrogram(nn.Layer):
                  power: float = 2.0,
                  n_mels: int = 64,
                  f_min: float = 0.0,
-                 f_max: Optional[float] = None):
+                 f_max: Optional[float] = None,
+                 ref_value: float = 1.0,
+                 amin: float = 1e-10,
+                 top_db: Optional[float] = 80.0):
         """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
         typically an audio waveform.
 
@@ -366,6 +369,13 @@ class LogMelSpectrogram(nn.Layer):
             n_mels(int): the mel bins.
             f_min(float): the lower cut-off frequency, below which the filter response is zero.
             f_max(float): the upper cut-off frequency, above which the filter response is zeros.
+            ref_value(float): the reference value. If smaller than 1.0, the db level
+                of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
+            amin(float): the minimum value of input magnitude, below which the input
+                magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
+                e.g., 1e-3.
+            top_db(float): the maximum db value of resulting spectrum, above which the
+                spectrum is clipped(to top_db).
 
         Notes:
             The LogMelSpectrogram transform relies on MelSpectrogram transform to compute
@@ -388,13 +398,29 @@ class LogMelSpectrogram(nn.Layer):
 
         """
         super(LogMelSpectrogram, self).__init__()
-        self._melspectrogram = MelSpectrogram(sr, n_fft, hop_length, win_length,
-                                              window, center, pad_mode, power,
-                                              n_mels, f_min, f_max)
+
+        self._melspectrogram = MelSpectrogram(sr=sr,
+                                              n_fft=n_fft,
+                                              hop_length=hop_length,
+                                              win_length=win_length,
+                                              window=window,
+                                              center=center,
+                                              pad_mode=pad_mode,
+                                              power=power,
+                                              n_mels=n_mels,
+                                              f_min=f_min,
+                                              f_max=f_max)
+
+        self.ref_value = ref_value
+        self.amin = amin
+        self.top_db = top_db
 
     def forward(self, x: Tensor) -> Tensor:
         mel_feature = self._melspectrogram(x)
-        log_mel_feature = F.power_to_db(mel_feature)
+        log_mel_feature = F.power_to_db(mel_feature,
+                                        ref_value=self.ref_value,
+                                        amin=self.amin,
+                                        top_db=self.top_db)
         return log_mel_feature
 
     def __repr__(self):
diff --git a/PaddleAudio/paddleaudio/utils/_librosa.py b/PaddleAudio/paddleaudio/utils/_librosa.py
index bf4c87ed..40dc6b91 100644
--- a/PaddleAudio/paddleaudio/utils/_librosa.py
+++ b/PaddleAudio/paddleaudio/utils/_librosa.py
@@ -443,7 +443,7 @@ def melspectrogram(x: array,
                    center: bool = True,
                    pad_mode: str = 'reflect',
                    power: float = 2.0,
-                   to_db: bool = True,
+                   to_db: bool = False,
                    ref: float = 1.0,
                    amin: float = 1e-10,
                    top_db: Optional[float] = None) -> array:
@@ -454,12 +454,12 @@ def melspectrogram(x: array,
         window_size: int, typically 512, 1024, 2048, etc.
         The window size for framing, also used as n_fft for stft
     Returns:
-        The mel-spectrogram in power scale or db scale(default)
+        The mel-spectrogram in power scale(default) or db scale
 
     Notes:
     1. sr is default to 16000, which is commonly used in speech/speaker processing.
     2. when fmax is None, it is set to sr//2.
-    3. this function will convert mel-spectrogram to db scale by default, which is different
+    3. this function does not convert mel-spectrogram to db scale by default, which is consistent with
     that of librosa.
     """
     _check_audio(x, mono=True)
-- 
GitLab