# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains data helper functions."""
import json
import math
import sys
import tarfile
from collections import namedtuple
from typing import List
from typing import Optional
from typing import Text

import jsonlines
import numpy as np

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = [
    "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
    "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
]

IGNORE_ID = -1
# `sos` and `eos` share the same token
SOS = "<eos>"
EOS = SOS
UNK = "<unk>"
BLANK = "<blank>"
MASKCTC = "<mask>"
SPACE = "<space>"


def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
    if dict_path is None:
        return None

    with open(dict_path, "r") as f:
        dictionary = f.readlines()
    char_list = [entry.strip().split(" ")[0] for entry in dictionary]
    if BLANK not in char_list:
        char_list.insert(0, BLANK)
    if EOS not in char_list:
        char_list.append(EOS)
    # for non-autoregressive maskctc model
    if maskctc and MASKCTC not in char_list:
        char_list.append(MASKCTC)
    return char_list

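
# Illustrative usage sketch (not part of the original API). The temporary file,
# its contents and the token names written below are hypothetical; they only
# show the one-token-per-line "<token> <id>" dictionary format `load_dict`
# expects.
def _example_load_dict():
    import os
    import tempfile  # local imports, only needed for this sketch

    vocab_path = os.path.join(tempfile.mkdtemp(), "vocab.txt")
    with open(vocab_path, "w") as f:
        f.write("<blank> 0\n<unk> 1\na 2\nb 3\n")

    char_list = load_dict(vocab_path, maskctc=True)
    # `<blank>` stays at index 0, `<eos>` is appended because it is missing,
    # and `<mask>` is appended because maskctc=True.
    assert char_list == ["<blank>", "<unk>", "a", "b", EOS, MASKCTC]
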

def read_manifest(
        manifest_path,
        max_input_len=float('inf'),
        min_input_len=0.0,
        max_output_len=float('inf'),
        min_output_len=0.0,
        max_output_input_ratio=float('inf'),
        min_output_input_ratio=0.0, ):
    """Load and parse manifest file.

    Args:
        manifest_path (str): Manifest file to load and parse.
        max_input_len (float, optional): maximum input seq length,
            in seconds for raw wav, in frame numbers for feature data.
            Defaults to float('inf').
        min_input_len (float, optional): minimum input seq length,
            in seconds for raw wav, in frame numbers for feature data.
            Defaults to 0.0.
        max_output_len (float, optional): maximum output seq length,
            in modeling units. Defaults to float('inf').
        min_output_len (float, optional): minimum output seq length,
            in modeling units. Defaults to 0.0.
        max_output_input_ratio (float, optional): maximum output seq length /
            input seq length ratio. Defaults to float('inf').
        min_output_input_ratio (float, optional): minimum output seq length /
            input seq length ratio. Defaults to 0.0.

    Raises:
        IOError: If the manifest file cannot be parsed.

    Returns:
        List[dict]: Manifest parsing results.
    """

    manifest = []
    with jsonlines.open(manifest_path, 'r') as reader:
        for json_data in reader:
            feat_len = json_data["feat_shape"][
                0] if 'feat_shape' in json_data else 1.0
            token_len = json_data["token_shape"][
                0] if 'token_shape' in json_data else 1.0
            conditions = [
                feat_len >= min_input_len,
                feat_len <= max_input_len,
                token_len >= min_output_len,
                token_len <= max_output_len,
                token_len / feat_len >= min_output_input_ratio,
                token_len / feat_len <= max_output_input_ratio,
            ]
            if all(conditions):
                manifest.append(json_data)
    return manifest

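
# Illustrative sketch: build a tiny manifest with `jsonlines` and filter it.
# The temporary path, the "utt" key and the shape values are hypothetical;
# only "feat_shape" and "token_shape" are the fields `read_manifest` inspects.
def _example_read_manifest():
    import os
    import tempfile  # local imports, only needed for this sketch

    path = os.path.join(tempfile.mkdtemp(), "manifest.train")
    records = [
        {"utt": "utt1", "feat_shape": [120, 80], "token_shape": [12]},
        {"utt": "utt2", "feat_shape": [3000, 80], "token_shape": [5]},
    ]
    with jsonlines.open(path, mode='w') as writer:
        for record in records:
            writer.write(record)

    # Keep only utterances whose input length is at most 1000;
    # utt2 (feat_len 3000) is filtered out.
    manifest = read_manifest(path, max_input_len=1000)
    assert [m["utt"] for m in manifest] == ["utt1"]
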

# Tar file reading helpers
TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])


def parse_tar(file):
    """Parse a tar file to get a tarfile object
    and a map containing tarinfoes
    """
    result = {}
    f = tarfile.open(file)
    for tarinfo in f.getmembers():
        result[tarinfo.name] = tarinfo
    return f, result


def subfile_from_tar(file, local_data=None):
    """Get subfile object from tar.

    tar:tarpath#filename

    It will return a subfile object from tar file
    and cached tar file info for next reading request.
    """
    tarpath, filename = file.split(':', 1)[1].split('#', 1)

    if local_data is None:
        local_data = TarLocalData(tar2info={}, tar2object={})

    assert isinstance(local_data, TarLocalData)

    if tarpath not in local_data.tar2info:
        fobj, infos = parse_tar(tarpath)
        local_data.tar2info[tarpath] = infos
        local_data.tar2object[tarpath] = fobj
    else:
        fobj = local_data.tar2object[tarpath]
        infos = local_data.tar2info[tarpath]
    return fobj.extractfile(infos[filename])

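
# Illustrative sketch: pack one hypothetical member into a tar archive in a
# temporary directory, then read it back through the "tar:tarpath#filename"
# spec understood by `subfile_from_tar`.
def _example_subfile_from_tar():
    import os
    import tempfile  # local imports, only needed for this sketch

    tmpdir = tempfile.mkdtemp()
    member = os.path.join(tmpdir, "utt1.txt")
    with open(member, "w") as f:
        f.write("hello")
    tar_path = os.path.join(tmpdir, "data.tar")
    with tarfile.open(tar_path, "w") as tar:
        tar.add(member, arcname="utt1.txt")

    local_data = TarLocalData(tar2info={}, tar2object={})
    fobj = subfile_from_tar(f"tar:{tar_path}#utt1.txt", local_data)
    assert fobj.read() == b"hello"
    # The parsed tar is cached, so a later read reuses the open file object.
    assert tar_path in local_data.tar2object
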

def rms_to_db(rms: float):
    """Root Mean Square to dB.

    Args:
        rms ([float]): root mean square

    Returns:
        float: dB
    """
    return 20.0 * math.log10(max(1e-16, rms))


def rms_to_dbfs(rms: float):
    """Root Mean Square to dBFS.
    https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
    Audio is a mix of sine waves; a full-scale (amplitude 1.0) sine wave has
    an RMS of 0.7071, i.e. -3.0103 dB.

    dB = dBFS + 3.0103
    dBFS = dB - 3.0103
    e.g. 0 dB = -3.0103 dBFS

    Args:
        rms ([float]): root mean square

    Returns:
        float: dBFS
    """
    return rms_to_db(rms) - 3.0103


def max_dbfs(sample_data: np.ndarray):
    """Peak dBFS based on the maximum energy sample.

    Args:
        sample_data ([np.ndarray]): float array, [-1, 1].

    Returns:
        float: dBFS
    """
    # Peak dBFS based on the maximum energy sample. Will prevent overdrive if used for normalization.
    return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))


def mean_dbfs(sample_data):
    """Peak dBFS based on the RMS energy.

    Args:
        sample_data ([np.ndarray]): float array, [-1, 1].

    Returns:
        float: dBFS
    """
    return rms_to_dbfs(
        math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))


def gain_db_to_ratio(gain_db: float):
    """dB to ratio

    Args:
        gain_db (float): gain in dB

    Returns:
        float: amplitude scale ratio
    """
    return math.pow(10.0, gain_db / 20.0)

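
# Worked numeric sketch of the dB helpers above (values checked with
# math.isclose): amplitudes lie in [-1, 1], dB is relative to an RMS of 1.0,
# and dBFS is offset by -3.0103 dB, the RMS level of a full-scale sine wave
# per the docstrings above.
def _example_db_helpers():
    assert math.isclose(rms_to_db(1.0), 0.0, abs_tol=1e-9)
    assert math.isclose(rms_to_db(0.1), -20.0, abs_tol=1e-9)
    assert math.isclose(rms_to_dbfs(1.0), -3.0103, abs_tol=1e-9)

    # A +6.0206 dB gain roughly doubles the amplitude; -20 dB scales it by 0.1.
    assert math.isclose(gain_db_to_ratio(6.0206), 2.0, rel_tol=1e-4)
    assert math.isclose(gain_db_to_ratio(-20.0), 0.1, rel_tol=1e-9)

    samples = np.array([0.0, 0.5, -1.0, 0.25])
    # Peak amplitude 1.0 -> rms_to_dbfs(1.0) = -3.0103 dBFS.
    assert math.isclose(max_dbfs(samples), -3.0103, abs_tol=1e-9)
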

def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103):
    """Nomalize audio to dBFS.

    Args:
        sample_data (np.ndarray): input wave samples, [-1, 1].
        dbfs (float, optional): target dBFS. Defaults to -3.0103.

    Returns:
        np.ndarray: normalized wave
    """
    return np.maximum(
        np.minimum(sample_data * gain_db_to_ratio(dbfs - max_dbfs(sample_data)),
                   1.0), -1.0)

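
# Illustrative sketch with made-up samples: a quiet signal with peak 0.1 sits
# at about -23.01 dBFS; normalizing to the default -3.0103 dBFS target scales
# it by roughly a factor of 10, so the peak lands at (or is clipped to) 1.0.
def _example_normalize_audio():
    samples = np.array([0.0, 0.05, -0.1, 0.02])
    assert math.isclose(max_dbfs(samples), -23.0103, abs_tol=1e-9)

    normalized = normalize_audio(samples)  # default dbfs=-3.0103
    assert np.allclose(normalized, [0.0, 0.5, -1.0, 0.2])
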

def _load_json_cmvn(json_cmvn_file):
    """ Load the json format cmvn stats file and calculate cmvn

    Args:
        json_cmvn_file: cmvn stats file in json format

    Returns:
        a numpy array of [means, istds] (inverse standard deviations)
    """
    with open(json_cmvn_file) as f:
        cmvn_stats = json.load(f)

    means = cmvn_stats['mean_stat']
    variance = cmvn_stats['var_stat']
    count = cmvn_stats['frame_num']
    for i in range(len(means)):
        means[i] /= count
        variance[i] = variance[i] / count - means[i] * means[i]
        if variance[i] < 1.0e-20:
            variance[i] = 1.0e-20
        variance[i] = 1.0 / math.sqrt(variance[i])
    cmvn = np.array([means, variance])
    return cmvn


def _load_kaldi_cmvn(kaldi_cmvn_file):
    """ Load the kaldi format cmvn stats file and calculate cmvn

    Args:
        kaldi_cmvn_file:  kaldi text style global cmvn file, which
           is generated by:
           compute-cmvn-stats --binary=false scp:feats.scp global_cmvn

    Returns:
        a numpy array of [means, istds] (inverse standard deviations)
    """
    means = []
    variance = []
    with open(kaldi_cmvn_file, 'r') as fid:
        # kaldi binary files start with '\0B'
        if fid.read(2) == '\0B':
            logger.error('kaldi cmvn binary file is not supported, please '
                         'recompute it by: compute-cmvn-stats --binary=false '
                         ' scp:feats.scp global_cmvn')
            sys.exit(1)
        fid.seek(0)
        arr = fid.read().split()
        assert (arr[0] == '[')
        assert (arr[-2] == '0')
        assert (arr[-1] == ']')
        feat_dim = int((len(arr) - 2 - 2) / 2)
        for i in range(1, feat_dim + 1):
            means.append(float(arr[i]))
        count = float(arr[feat_dim + 1])
        for i in range(feat_dim + 2, 2 * feat_dim + 2):
            variance.append(float(arr[i]))

    for i in range(len(means)):
        means[i] /= count
        variance[i] = variance[i] / count - means[i] * means[i]
        if variance[i] < 1.0e-20:
            variance[i] = 1.0e-20
        variance[i] = 1.0 / math.sqrt(variance[i])
    cmvn = np.array([means, variance])
    return cmvn

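
# Illustrative sketch of the text layout `_load_kaldi_cmvn` expects: a
# 2 x (feat_dim + 1) Kaldi matrix with per-dimension sums and the frame count
# on the first row, sums of squares and a trailing 0 on the second. The
# statistics below are made-up toy numbers.
def _example_load_kaldi_cmvn():
    import tempfile  # local import, only needed for this sketch

    content = "[ 2.0 4.0 2\n  4.0 16.0 0 ]\n"
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
        f.write(content)
        path = f.name

    cmvn = _load_kaldi_cmvn(path)
    # means = [2/2, 4/2]; istds = 1/sqrt([4/2 - 1, 16/2 - 4]) = [1.0, 0.5]
    assert np.allclose(cmvn, [[1.0, 2.0], [1.0, 0.5]])
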

def load_cmvn(cmvn_file: str, filetype: str):
    """load cmvn from file.

    Args:
        cmvn_file (str): cmvn path.
        filetype (str): file type, one of [npz, json, kaldi].

    Raises:
        ValueError: file type not supported.

    Returns:
        Tuple[np.ndarray, np.ndarray]: mean, istd
    """
    filetype = filetype.lower()
    assert filetype in ['npz', 'json', 'kaldi'], filetype
    if filetype == "json":
        cmvn = _load_json_cmvn(cmvn_file)
    elif filetype == "kaldi":
        cmvn = _load_kaldi_cmvn(cmvn_file)
    elif filetype == "npz":
        eps = 1e-14
        npzfile = np.load(cmvn_file)
        mean = np.squeeze(npzfile["mean"])
        std = np.squeeze(npzfile["std"])
        istd = 1 / (std + eps)
        cmvn = [mean, istd]
    else:
        raise ValueError(f"cmvn file type no support: {filetype}")
    return cmvn[0], cmvn[1]
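

# Illustrative end-to-end sketch of the "npz" branch: save made-up mean/std
# stats with NumPy, reload them through `load_cmvn`, and apply the usual
# (feat - mean) * istd normalization to a random feature matrix. The file
# name and the statistics are hypothetical.
def _example_load_cmvn_npz():
    import os
    import tempfile  # local imports, only needed for this sketch

    mean = np.array([1.0, 2.0, 3.0])
    std = np.array([0.5, 1.0, 2.0])
    path = os.path.join(tempfile.mkdtemp(), "cmvn.npz")
    np.savez(path, mean=mean, std=std)

    loaded_mean, istd = load_cmvn(path, filetype="npz")
    assert np.allclose(loaded_mean, mean)
    assert np.allclose(istd, 1.0 / (std + 1e-14))

    feats = np.random.randn(100, 3)  # (num_frames, feat_dim)
    normalized = (feats - loaded_mean) * istd
    assert normalized.shape == feats.shape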