utility.py 8.6 KB
Newer Older
H
Hui Zhang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
"""Contains data helper functions."""
15 16
import json
import math
17 18 19
from typing import List
from typing import Optional
from typing import Text
20

21
import jsonlines
22
import numpy as np
H
Hui Zhang 已提交
23

24
from deepspeech.utils.log import Log
25

26
# Module-level logger, obtained from the project's `Log` helper and keyed by
# this module's name.
logger = Log(__name__).getlog()
27

28
# Public API of this module: helper functions first, then the special
# vocabulary tokens.
__all__ = [
    "load_dict",
    "load_cmvn",
    "read_manifest",
    "rms_to_db",
    "rms_to_dbfs",
    "max_dbfs",
    "mean_dbfs",
    "gain_db_to_ratio",
    "normalize_audio",
    "SOS",
    "EOS",
    "UNK",
    "BLANK",
    "MASKCTC",
]

# Sentinel id; not exported through `__all__`.
IGNORE_ID = -1
# `sos` and `eos` share the same token string.
SOS = "<eos>"
EOS = SOS
# Token string for unknown symbols.
UNK = "<unk>"
# Blank token string; `load_dict` inserts it at index 0 when missing.
BLANK = "<blank>"
# Mask token string; `load_dict` appends it for the non-autoregressive
# maskctc model.
MASKCTC = "<mask>"


def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
    """Load the vocabulary file into a list of tokens.

    Each line contributes one token: the text before the first space of the
    stripped line. Special tokens are then added if they are missing.

    Args:
        dict_path (Optional[Text]): path of the vocabulary file, or None.
        maskctc (bool, optional): when True, also make sure `MASKCTC` is in
            the vocabulary. Defaults to False.

    Returns:
        Optional[List[Text]]: token list, or None when `dict_path` is None.
    """
    if dict_path is None:
        return None

    # Read as UTF-8 explicitly so the result does not depend on the locale's
    # default encoding.
    with open(dict_path, "r", encoding="utf-8") as f:
        char_list = [entry.strip().split(" ")[0] for entry in f]

    # `BLANK` goes first; `EOS` and the mask token go last.
    if BLANK not in char_list:
        char_list.insert(0, BLANK)
    if EOS not in char_list:
        char_list.append(EOS)
    # for non-autoregressive maskctc model
    if maskctc and MASKCTC not in char_list:
        char_list.append(MASKCTC)
    return char_list
58 59 60 61 62 63 64 65 66 67


def read_manifest(
        manifest_path,
        max_input_len=float('inf'),
        min_input_len=0.0,
        max_output_len=float('inf'),
        min_output_len=0.0,
        max_output_input_ratio=float('inf'),
        min_output_input_ratio=0.0, ):
    """Load a jsonlines manifest and keep the entries that pass the filters.

    The input length of an entry is `feat_shape[0]` and the output length is
    `token_shape[0]`; each falls back to 1.0 when its key is absent. All
    bounds are inclusive.

    Args:
        manifest_path: Manifest file to load and parse.
        max_input_len (float, optional): maximum input seq length,
            in seconds for raw wav, in frame numbers for feature data.
            Defaults to float('inf').
        min_input_len (float, optional): minimum input seq length,
            in seconds for raw wav, in frame numbers for feature data.
            Defaults to 0.0.
        max_output_len (float, optional): maximum output seq length,
            in modeling units. Defaults to float('inf').
        min_output_len (float, optional): minimum output seq length,
            in modeling units. Defaults to 0.0.
        max_output_input_ratio (float, optional):
            maximum output seq length / input seq length ratio.
            Defaults to float('inf').
        min_output_input_ratio (float, optional):
            minimum output seq length / input seq length ratio.
            Defaults to 0.0.

    Raises:
        ZeroDivisionError: If an entry has an input length of 0.
        Exceptions raised by `jsonlines` while opening or reading the
        manifest are not caught here.

    Returns:
        List[dict]: Manifest entries that satisfy every bound.
    """
    manifest = []
    with jsonlines.open(manifest_path, 'r') as reader:
        for json_data in reader:
            feat_len = json_data["feat_shape"][
                0] if 'feat_shape' in json_data else 1.0
            token_len = json_data["token_shape"][
                0] if 'token_shape' in json_data else 1.0
            # Computed once and reused for both ratio bounds.
            ratio = token_len / feat_len
            keep = (min_input_len <= feat_len <= max_input_len and
                    min_output_len <= token_len <= max_output_len and
                    min_output_input_ratio <= ratio <= max_output_input_ratio)
            if keep:
                manifest.append(json_data)
    return manifest
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129


def rms_to_db(rms: float):
    """Convert a root-mean-square amplitude to decibels.

    Args:
        rms (float): root mean square amplitude.

    Returns:
        float: level in dB, computed as 20 * log10(rms) with `rms` floored
            at 1e-16 so that zero or negative input stays finite.
    """
    floored = max(1e-16, rms)
    level = math.log10(floored)
    return 20.0 * level


def rms_to_dbfs(rms: float):
    """Convert a root-mean-square amplitude to dBFS.

    See https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
    Audio is treated as a mix of sine waves; a sine wave of amplitude 1 has
    an RMS of 0.7071, which equals -3.0103 dB. Hence:

        dB = dBFS + 3.0103
        dBFS = dB - 3.0103
        e.g. 0 dB = -3.0103 dBFS

    Args:
        rms (float): root mean square amplitude.

    Returns:
        float: level in dBFS.
    """
    level_db = rms_to_db(rms)
    return level_db - 3.0103


def max_dbfs(sample_data: np.ndarray):
    """Peak dBFS based on the sample with the largest magnitude.

    Using this level as the reference for normalization prevents overdrive,
    because no sample exceeds the peak.

    Args:
        sample_data (np.ndarray): float array, [-1, 1].

    Returns:
        float: dBFS of the largest absolute sample value.
    """
    # Largest magnitude in one pass; same value as
    # max(abs(min), abs(max)) over the array.
    peak = np.max(np.abs(sample_data))
    return rms_to_dbfs(peak)


def mean_dbfs(sample_data: np.ndarray):
    """Mean dBFS based on the RMS energy of the samples.

    Args:
        sample_data (np.ndarray): float array, [-1, 1].

    Returns:
        float: dBFS of the root mean square of all samples.
    """
    # Square in float64 regardless of the input dtype before averaging.
    mean_square = np.mean(np.square(sample_data, dtype=np.float64))
    return rms_to_dbfs(math.sqrt(mean_square))


def gain_db_to_ratio(gain_db: float):
    """Convert a gain in dB to a linear amplitude ratio.

    Args:
        gain_db (float): gain in dB.

    Returns:
        float: amplitude scale factor, 10 ** (gain_db / 20).
    """
    exponent = gain_db / 20.0
    return math.pow(10.0, exponent)


def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103):
    """Normalize audio so that its peak sits at the target dBFS.

    Args:
        sample_data (np.ndarray): input wave samples, [-1, 1].
        dbfs (float, optional): target dBFS. Defaults to -3.0103.

    Returns:
        np.ndarray: normalized wave, clipped to [-1, 1].
    """
    # Gain needed to move the current peak level to the target level.
    gain = gain_db_to_ratio(dbfs - max_dbfs(sample_data))
    scaled = sample_data * gain
    # Clip the upper bound first, then the lower bound.
    upper_clipped = np.minimum(scaled, 1.0)
    return np.maximum(upper_clipped, -1.0)


def _load_json_cmvn(json_cmvn_file):
    """ Load the json format cmvn stats file and calculate cmvn

    Args:
        json_cmvn_file: cmvn stats file in json format

    Returns:
        a numpy array of [means, vars]
    """
    with open(json_cmvn_file) as f:
        cmvn_stats = json.load(f)

    means = cmvn_stats['mean_stat']
    variance = cmvn_stats['var_stat']
    count = cmvn_stats['frame_num']
    for i in range(len(means)):
        means[i] /= count
        variance[i] = variance[i] / count - means[i] * means[i]
        if variance[i] < 1.0e-20:
            variance[i] = 1.0e-20
        variance[i] = 1.0 / math.sqrt(variance[i])
    cmvn = np.array([means, variance])
    return cmvn


def _load_kaldi_cmvn(kaldi_cmvn_file):
    """ Load the kaldi format cmvn stats file and calculate cmvn

    Args:
        kaldi_cmvn_file:  kaldi text style global cmvn file, which
           is generated by:
           compute-cmvn-stats --binary=false scp:feats.scp global_cmvn

    Returns:
        a numpy array of [means, vars]
    """
    means = []
    variance = []
    with open(kaldi_cmvn_file, 'r') as fid:
        # kaldi binary file start with '\0B'
        if fid.read(2) == '\0B':
            logger.error('kaldi cmvn binary file is not supported, please '
                         'recompute it by: compute-cmvn-stats --binary=false '
                         ' scp:feats.scp global_cmvn')
            sys.exit(1)
        fid.seek(0)
        arr = fid.read().split()
        assert (arr[0] == '[')
        assert (arr[-2] == '0')
        assert (arr[-1] == ']')
        feat_dim = int((len(arr) - 2 - 2) / 2)
        for i in range(1, feat_dim + 1):
            means.append(float(arr[i]))
        count = float(arr[feat_dim + 1])
        for i in range(feat_dim + 2, 2 * feat_dim + 2):
            variance.append(float(arr[i]))

    for i in range(len(means)):
        means[i] /= count
        variance[i] = variance[i] / count - means[i] * means[i]
        if variance[i] < 1.0e-20:
            variance[i] = 1.0e-20
        variance[i] = 1.0 / math.sqrt(variance[i])
    cmvn = np.array([means, variance])
    return cmvn


def load_cmvn(cmvn_file: str, filetype: str):
    """load cmvn from file.

    Args:
        cmvn_file (str): cmvn path.
        filetype (str): file type, one of [npz, json, kaldi];
            matched case-insensitively.

    Raises:
        ValueError: file type not support.

    Returns:
        Tuple[np.ndarray, np.ndarray]: mean, istd
    """
    # Normalize case before dispatching so e.g. "JSON" is accepted; unknown
    # types fall through to the ValueError below.
    filetype = filetype.lower()
    if filetype == "json":
        cmvn = _load_json_cmvn(cmvn_file)
    elif filetype == "kaldi":
        cmvn = _load_kaldi_cmvn(cmvn_file)
    elif filetype == "npz":
        eps = 1e-14
        # Close the archive as soon as both arrays have been read.
        with np.load(cmvn_file) as npzfile:
            mean = np.squeeze(npzfile["mean"])
            std = np.squeeze(npzfile["std"])
        # eps keeps the inverse finite when std is 0.
        istd = 1 / (std + eps)
        cmvn = [mean, istd]
    else:
        raise ValueError(f"cmvn file type no support: {filetype}")
    return cmvn[0], cmvn[1]