提交 b769579e 编写于 作者: H Hui Zhang

add audio utils

上级 7635f98b
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
# limitations under the License. # limitations under the License.
"""Contains data helper functions.""" """Contains data helper functions."""
import numpy as np
import math
import json import json
import codecs import codecs
import os import os
...@@ -50,3 +52,85 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): ...@@ -50,3 +52,85 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
json_data["duration"] >= min_duration): json_data["duration"] >= min_duration):
manifest.append(json_data) manifest.append(json_data)
return manifest return manifest
def rms_to_db(rms: float):
    """Convert a root-mean-square amplitude to decibels.

    The input is floored at 1e-16 before the logarithm is taken, so zero or
    negative values produce -320.0 dB instead of a math domain error.

    Args:
        rms (float): root-mean-square amplitude.

    Returns:
        float: ``20 * log10(rms)`` evaluated on the floored amplitude.
    """
    floor = 1e-16
    # Same selection as max(floor, rms): keep rms only when strictly larger.
    clamped = rms if rms > floor else floor
    return 20.0 * math.log10(clamped)
def rms_to_dbfs(rms: float):
    """Convert a root-mean-square amplitude to dBFS.

    Background:
    https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/

    A sine wave with amplitude 1 has an RMS of 0.7071, which is -3.0103 dB.
    The convention used by this function is:

        dB   = dBFS + 3.0103
        dBFS = dB - 3.0103

    so, for example, 0 dB corresponds to -3.0103 dBFS.

    Args:
        rms (float): root-mean-square amplitude.

    Returns:
        float: level in dBFS, i.e. ``rms_to_db(rms) - 3.0103``.
    """
    sine_offset_db = 3.0103
    level_db = rms_to_db(rms)
    return level_db - sine_offset_db
def max_dbfs(sample_data: np.ndarray) -> float:
    """Return the peak level of a signal in dBFS.

    The peak is the largest absolute sample value, taken from the minimum
    and maximum of the array, and is converted with ``rms_to_dbfs``.
    Normalizing against this value prevents overdrive (clipping), because
    no sample can exceed the peak.

    Args:
        sample_data (np.ndarray): float samples, expected in [-1, 1].

    Returns:
        float: peak level in dBFS.

    Raises:
        ValueError: if ``sample_data`` is empty (raised by the NumPy
            min/max reductions).
    """
    lowest = np.min(sample_data)
    highest = np.max(sample_data)
    # The sample with the largest magnitude may be on either side of zero.
    peak = max(abs(lowest), abs(highest))
    return rms_to_dbfs(peak)
def mean_dbfs(sample_data):
    """Return the mean level of a signal in dBFS, based on its RMS energy.

    The samples are squared in float64, averaged, and the square root of
    that mean (the RMS) is converted with ``rms_to_dbfs``.

    Args:
        sample_data (np.ndarray): float samples, expected in [-1, 1].

    Returns:
        float: RMS-based level in dBFS.
    """
    # Square in float64 so narrower input dtypes do not lose precision.
    squared = np.square(sample_data, dtype=np.float64)
    root_mean_square = math.sqrt(np.mean(squared))
    return rms_to_dbfs(root_mean_square)
def gain_db_to_ratio(gain_db: float):
    """Convert a gain in decibels to a linear amplitude ratio.

    Args:
        gain_db (float): gain in dB.

    Returns:
        float: amplitude scale factor, ``10 ** (gain_db / 20)``.
    """
    # 20 dB corresponds to one decade of amplitude.
    decades = gain_db / 20.0
    return math.pow(10.0, decades)
def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103) -> np.ndarray:
    """Normalize audio so that its peak level matches a target dBFS.

    The gain needed to move the current peak level (``max_dbfs``) to the
    target is converted to a linear ratio and applied to every sample. The
    scaled samples are then limited to the range [-1, 1].

    Args:
        sample_data (np.ndarray): input wave samples, expected in [-1, 1].
        dbfs (float, optional): target peak level in dBFS.
            Defaults to -3.0103.

    Returns:
        np.ndarray: normalized wave, clipped to [-1, 1].
    """
    gain = gain_db_to_ratio(dbfs - max_dbfs(sample_data))
    # Clip so a target above the full-scale peak cannot push samples
    # outside the valid range.
    return np.clip(sample_data * gain, -1.0, 1.0)
...@@ -133,7 +133,7 @@ class ConformerEncoderLayer(nn.Layer): ...@@ -133,7 +133,7 @@ class ConformerEncoderLayer(nn.Layer):
def __init__( def __init__(
self, self,
size: int, size: int,
self_attn: int, self_attn: nn.Layer,
feed_forward: Optional[nn.Layer]=None, feed_forward: Optional[nn.Layer]=None,
feed_forward_macaron: Optional[nn.Layer]=None, feed_forward_macaron: Optional[nn.Layer]=None,
conv_module: Optional[nn.Layer]=None, conv_module: Optional[nn.Layer]=None,
......
...@@ -6,3 +6,4 @@ tensorboardX ...@@ -6,3 +6,4 @@ tensorboardX
yacs yacs
typeguard typeguard
pre-commit pre-commit
paddlepaddle-gpu==2.0.0
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册