add audio utils

b769579e · Hui Zhang · 7635f98b · b769579e · b769579e · b769579e
4 changed file
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 """Contains data helper functions."""

+import numpy as np
+import math
 import json
 import codecs
 import os
@@ -50,3 +52,85 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
                json_data["duration"] >= min_duration):
            manifest.append(json_data)
    return manifest
+
+
+def rms_to_db(rms: float):
+    """Convert a root-mean-square amplitude to decibels.
+
+    Args:
+        rms (float): RMS amplitude; anything not above 1e-16 is floored to it.
+    Returns:
+        float: 20 * log10 of the floored amplitude.
+    """
+    return math.log10(rms if rms > 1e-16 else 1e-16) * 20.0
+
+
+def rms_to_dbfs(rms: float):
+    """Convert a root-mean-square amplitude to dBFS.
+
+    Reference:
+    https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
+    Audio is a mix of sine waves; a sine wave of amplitude 1 has an RMS of
+    0.7071, equal to -3.0103 dB, so the dB value is shifted by that offset:
+        dBFS = dB - 3.0103    (e.g. 0 dB = -3.0103 dBFS)
+
+    Args:
+        rms (float): root mean square
+    Returns:
+        float: dBFS, i.e. rms_to_db(rms) minus 3.0103
+    """
+    level_db = rms_to_db(rms)
+    return level_db - 3.0103
+
+
+def max_dbfs(sample_data: np.ndarray):
+    """Peak dBFS based on the sample with the largest magnitude.
+
+    Args:
+        sample_data (np.ndarray): float array, [-1, 1].
+
+    Returns:
+        float: dBFS of the largest absolute sample value.
+    """
+    # Peak-based level; will prevent overdrive if used for normalization.
+    return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))
+
+
+def mean_dbfs(sample_data):
+    """Average dBFS derived from the RMS energy of the samples.
+
+    Args:
+        sample_data (np.ndarray): float array, [-1, 1].
+
+    Returns:
+        float: dBFS of the root mean square, with squares taken in float64.
+    """
+    mean_square = np.mean(np.square(sample_data, dtype=np.float64))
+    return rms_to_dbfs(math.sqrt(mean_square))
+
+
+def gain_db_to_ratio(gain_db: float):
+    """Convert a gain in dB to a linear amplitude ratio.
+
+    Args:
+        gain_db (float): gain in dB
+    Returns:
+        float: amplitude scale factor, 10 raised to (gain_db / 20)
+    """
+    exponent = gain_db / 20.0
+    return math.pow(10.0, exponent)
+
+
+def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103):
+    """Normalize audio so that its peak level reaches the target dBFS.
+
+    Args:
+        sample_data (np.ndarray): input wave samples, [-1, 1].
+        dbfs (float, optional): target dBFS. Defaults to -3.0103.
+
+    Returns:
+        np.ndarray: normalized wave, clipped to [-1, 1]
+    """
+    # Gain (in dB) needed to move the current peak level to the target.
+    ratio = gain_db_to_ratio(dbfs - max_dbfs(sample_data))
+    return np.clip(sample_data * ratio, -1.0, 1.0)
--- a/deepspeech/modules/encoder_layer.py
+++ b/deepspeech/modules/encoder_layer.py
@@ -133,7 +133,7 @@ class ConformerEncoderLayer(nn.Layer):
    def __init__(
            self,
            size: int,
-            self_attn: int,
+            self_attn: nn.Layer,
            feed_forward: Optional[nn.Layer]=None,
            feed_forward_macaron: Optional[nn.Layer]=None,
            conv_module: Optional[nn.Layer]=None,

--- a/deepspeech/utils/common.py
+++ b/deepspeech/utils/common.py
@@ -110,4 +110,4 @@ def log_add(args: List[int]) -> float:
        return -float('inf')
    a_max = max(args)
    lsp = math.log(sum(math.exp(a - a_max) for a in args))
-    return a_max + lsp
+    return a_max + lsp
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ tensorboardX
 yacs
 typeguard
 pre-commit
+paddlepaddle-gpu==2.0.0