audio.py 15.3 KB
Newer Older
K
KP 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 15

# Names exported by `from <this module> import *`.
__all__ = [
    # backend selection
    'set_backend',
    'get_backends',
    # waveform processing
    'resample',
    'to_mono',
    'depth_convert',
    'normalize',
    # file input / output
    'save_wav',
    'load',
]
import os
K
KP 已提交
26
import warnings
27
from typing import List, Optional, Tuple, Type, Union
K
KP 已提交
28 29

import numpy as np
30 31 32
import resampy
import soundfile as sf
from numpy import ndarray as array
K
KP 已提交
33 34
from scipy.io import wavfile

35
from ..utils import ParameterError
36
from ._ffmpeg import DecodingError, FFmpegAudioFile
K
KP 已提交
37

38 39 40
# Option tables used by the functions in this module.
NORMALMIZE_TYPES = ['linear', 'gaussian']  # listed in normalize()'s error message
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']  # validated by to_mono()
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']  # validated by resample()
SUPPORT_BACKENDS = ['ffmpeg', 'soundfile']  # validated by set_backend()

# Small constant used to avoid division by zero / exact full-scale factors.
EPS = 1e-8

# Currently selected decoding backend; None lets load() try soundfile first
# and fall back to ffmpeg.
BACK_END = None


def set_backend(backend: Union[str, None] = 'ffmpeg'):
    """Select the audio decoding backend used by this module.

    Parameters:
        backend(str|None): name of the backend to use. A falsy value such as
            None is stored without validation; load() then tries soundfile
            first and falls back to ffmpeg.

    Raises:
        ParameterError: if a non-empty name is not listed in SUPPORT_BACKENDS.

    Notes:
        Use get_backends() to get available backends.

    """
    global BACK_END
    accepted = not backend or backend in SUPPORT_BACKENDS
    if not accepted:
        raise ParameterError(f'Unsupported backend {backend} ,' +
                             f'supported backends are {SUPPORT_BACKENDS}')
    BACK_END = backend


def get_backends() -> List[str]:
    """Return the names of the supported decoding backends.

    A fresh list is returned on every call so that a caller mutating the
    result cannot alter the module-level SUPPORT_BACKENDS table that
    set_backend() validates against.
    """
    return list(SUPPORT_BACKENDS)

K
KP 已提交
68

69 70 71 72
def _safe_cast(y: array, dtype: Union[type, str]) -> array:
    """Data type casting in a safe way, i.e., prevent overflow or underflow.
    Notes:
        This function is used internally.
73
    """
74
    return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
75

K
KP 已提交
76

77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
def _ffmpeg_load(file: os.PathLike,
                 offset: Optional[float] = None,
                 duration: Optional[float] = None) -> Tuple[array, int]:
    """Load audio file using audioread ffmpeg backend.

    Parameters:
        file(os.PathLike): path handed to FFmpegAudioFile.
        offset(float|None): seconds to drop from the start of the audio.
        duration(float|None): maximum length, in seconds, kept after the
            offset; None keeps everything.

    Returns:
        (wav, sr): the decoded data interpreted as int16 and the sample rate
        reported by the decoder. wav has shape [n] when the decoder reports
        one channel and [channels, n] otherwise.

    Notes:
        This function is for internal use only.
    """
    with FFmpegAudioFile(file) as f:
        sr = f.samplerate
        # Join once instead of growing a bytes object chunk by chunk.
        buffer = b''.join(f.read_data())
    wav = np.frombuffer(buffer, dtype='int16')
    if f.channels != 1:
        # Frames are reshaped to [n, channels], then moved to [channels, n].
        wav = wav.reshape((
            -1,
            f.channels,
        )).transpose(1, 0)
    # Trim along the last (sample) axis; slicing axis 0 would drop channels
    # instead of samples for multi-channel audio.
    if offset:
        wav = wav[..., int(offset * sr):]
    if duration is not None:
        frame_duration = int(duration * sr)
        wav = wav[..., :frame_duration]

    return wav, sr


def _sound_file_load(file: os.PathLike,
                     offset: Optional[float] = None,
                     dtype: str = 'int16',
                     duration: Optional[int] = None) -> Tuple[array, int]:
    """Read audio through the soundfile (libsndfile) library.

    Parameters:
        file(os.PathLike): path passed to soundfile.SoundFile.
        offset(float|None): seconds to skip before reading.
        dtype(str): dtype requested from soundfile for the samples.
        duration(int|None): seconds to read; None reads to the end.

    Returns:
        (y, sr): the transposed result of the read and the file's sample rate.

    Reference:
        http://www.mega-nerd.com/libsndfile/#Features
    Notes:
        This function is for internal use only.
    """
    with sf.SoundFile(file) as sf_desc:
        sr_native = sf_desc.samplerate
        if offset:
            # Position the read cursor at the requested start frame.
            sf_desc.seek(int(offset * sr_native))
        # frames=-1 asks soundfile for everything up to the end of the file.
        frame_duration = -1 if duration is None else int(duration * sr_native)
        frames = sf_desc.read(frames=frame_duration,
                              dtype=dtype,
                              always_2d=False)
        y = frames.T

    return y, sf_desc.samplerate
127 128


129
def _sox_file_load():
130 131
    """Load audio using sox library.
    This function loads audio file using sox.
132

133 134 135 136
    Reference:
        http://sox.sourceforge.net/
    Notes:
        This function is for internal use only.
137
    """
138
    raise NotImplementedError()
K
KP 已提交
139

140

141 142 143
def depth_convert(y: array, dtype: Union[type, str]) -> array:
    """Convert audio array to target dtype safely.

    The waveform is rescaled as well as cast: floating point data is
    multiplied by the integer maximum (and clipped) when converted to an
    integer dtype, and integer data is divided by its integer maximum when
    converted to a floating point dtype.

    Parameters:
        y(array): the input audio array of shape [n,], [1,n] or [2,n].
        dtype(str|type): the target dtype, given as a name or as anything
            np.dtype() understands (e.g. np.int16). The following dtypes are
            supported: 'int16', 'int8', 'float32' and 'float64'.

    Returns:
        The converted array; `y` itself is returned when it already has the
        target dtype.

    Raises:
        ParameterError: if the dtype of `y` or the target dtype is unsupported.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    FLOAT_DTYPE = ['float32', 'float64']

    src = y.dtype.name
    if src not in SUPPORT_DTYPE:
        raise ParameterError(
            f'Unsupported audio dtype, ' +
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')

    # Reduce the target to its canonical name so that 'int16', np.int16 and
    # np.dtype('int16') are all accepted. None is rejected explicitly because
    # np.dtype(None) silently means float64.
    try:
        dst = None if dtype is None else np.dtype(dtype).name
    except (TypeError, ValueError):
        dst = None
    if dst not in SUPPORT_DTYPE:
        raise ParameterError(
            f'Unsupported audio dtype, ' +
            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if dst == src:
        return y

    if src in FLOAT_DTYPE and dst in FLOAT_DTYPE:
        # Float <-> float: clip to the target's finite range, then cast.
        # np.finfo is required here; np.iinfo raises for float dtypes.
        limits = np.finfo(np.dtype(dst))
        return np.clip(y, limits.min, limits.max).astype(dst)

    if dst in FLOAT_DTYPE:
        # Integer -> float: divide by the source integer maximum.
        return y.astype(dst) / np.iinfo(y.dtype).max

    if src in FLOAT_DTYPE:
        # Float -> integer: scale by the integer maximum and saturate.
        limits = np.iinfo(dst)
        return np.clip(y * limits.max, limits.min, limits.max).astype(dst)

    if dst == 'int16':
        # int8 -> int16; EPS keeps the factor just below the exact ratio.
        factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
        return (y.astype('float32') * factor).astype('int16')

    # int16 -> int8; widen to int32 first so the multiplication cannot wrap.
    scaled = y.astype('int32') * np.iinfo('int8').max / \
        np.iinfo('int16').max
    return scaled.astype('int8')


195 196 197 198
def resample(y: array,
             src_sr: int,
             target_sr: int,
             mode: str = 'kaiser_fast') -> array:
    """Resample an audio array from `src_sr` to `target_sr`.

    Parameters:
        y(array): the input audio; must be a numpy array.
        src_sr(int): sample rate of `y`.
        target_sr(int): desired sample rate.
        mode(str): resampy filter name, one of RESAMPLE_MODES.

    Returns:
        The array returned by resampy.resample.

    Raises:
        TypeError: if `y` is not a numpy array.
        ParameterError: if `mode` is not in RESAMPLE_MODES.

    Notes:
        1. This function uses resampy.resample to do the resampling.
        2. The default mode is kaiser_fast.  For better audio quality,
            use mode = 'kaiser_best'
    """
    wants_best_quality = mode == 'kaiser_best'
    if wants_best_quality:
        # The warning is issued before any validation, as a heads-up only.
        slow_notice = (
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}.'
            f'This function is pretty slow, '
            f'we recommend the mode kaiser_fast in large scale audio training')
        warnings.warn(slow_notice)

    is_numpy_input = isinstance(y, np.ndarray)
    if not is_numpy_input:
        raise TypeError(
            f'Only support numpy array, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    resampled = resampy.resample(y, src_sr, target_sr, filter=mode)
    return resampled
220 221


222
def to_mono(y: array, merge_type: str = 'ch0') -> array:
    """Convert stereo audio to mono audio.

    Parameters:
        y(array): the input audio array of shape [n] or [2,n], where n is the
            number of audio samples. A [1,n] array is also accepted and its
            only channel is returned.
        merge_type(str): the type of algorithm for merging. Supported types are
            "average": the audio samples from both channels are averaged.
            "ch0": all audio samples from channel 0 are taken as output.
            "ch1": all audio samples from channel 1 are taken as output.
            "random": all audio samples from channel 0 or 1 are taken as output.
            The default value is "ch0".
    Returns:
        The mono (single-channel) audio.
    Raises:
        ParameterError: if merge_type is unknown, y has more than two
            dimensions, or averaging is requested for an unsupported dtype.
    Notes:
        This function will keep the audio dtype and will automatically handle the averaging precision
        for int16 or int8 dtype.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array,  y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y
    if y.shape[0] == 1:
        # Only one channel is present: 'ch1' (and sometimes 'random') would
        # index past the end, and averaging would be a no-op.
        return y[0]

    if merge_type == 'ch0':
        return y[0]
    if merge_type == 'ch1':
        return y[1]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # merge_type == 'average': the mean is computed per dtype family.
    if y.dtype in ['float32', 'float64']:
        # ndarray.mean keeps the floating point dtype of its input.
        return y.mean(0)
    if y.dtype in ['int16', 'int8']:
        # The mean of integers is floating point; clip and cast back so the
        # caller gets the dtype it passed in.
        limits = np.iinfo(y.dtype)
        return np.clip(y.mean(0), limits.min, limits.max).astype(y.dtype)
    raise ParameterError(f'Unsupported dtype: {y.dtype}')
272 273 274 275 276


def normalize(y: array,
              norm_type: str = 'linear',
              mul_factor: float = 1.0) -> array:
    """Normalize the input audio.

    Parameters:
        y(array): the input audio array.
        norm_type(str): normalization algorithm. Supported types are
            'linear': the audio is divided by its peak absolute value (plus
                EPS, to avoid division by zero) and multiplied by mul_factor.
            'gaussian': the mean is subtracted and the result is divided by
                the standard deviation (at least EPS), then multiplied by
                mul_factor.
            The default value is 'linear'.
        mul_factor(float): additional multiplication factor after normalization.
            The default value is 1.0.
    Returns:
        The normalized audio array.
    Raises:
        NotImplementedError: if norm_type is neither 'linear' nor 'gaussian'.
    Notes:
        The audio will be converted to float32, unless its dtype is originally
        float32 or float64.
    """
    already_float = y.dtype in ['float32', 'float64']
    samples = y if already_float else y.astype('float32')

    if norm_type == 'linear':
        peak = np.max(np.abs(samples))
        scale = 1.0 / (peak + EPS)
        return samples * scale * mul_factor

    if norm_type == 'gaussian':
        centre = np.mean(samples)
        spread = np.std(samples)
        spread = max(spread, EPS)  # guard against a zero deviation
        return mul_factor * (samples - centre) / spread

    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')


307
def save_wav(y: array, sr: int, file: os.PathLike) -> None:
    """Save audio file to disk.

    This function saves audio to disk using scipy.io.wavfile, with additional step
    to convert input waveform to int16 unless it already is int16.

    Parameters:
        y(array): the audio data, of shape [n] or [channels, n].
        sr(int): the sample rate of the audio data. If sr does not match the actual audio data,
            the resulting file will encounter play-back problems.
        file(str|os.PathLike): destination path; it must end with '.wav'.
    Raises:
        ParameterError: if the file name does not end with '.wav' or sr is
            not positive.
    Notes:
        The function only supports raw wav format.
    """
    # Accept str, bytes and os.PathLike (e.g. pathlib.Path) alike; a Path
    # object has no .endswith().
    path = os.fsdecode(file)

    if y.ndim == 2 and y.shape[0] > y.shape[1]:
        warnings.warn(
            f'The audio array tried to saved has {y.shape[0]} channels ' +
            f'and the wave length is {y.shape[1]}. It\'s that what you mean?' +
            f'If not, try to tranpose the array before saving.')
    if not path.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        raise ParameterError(
            f'Sample rate should be larger than 0, recieved sr = {sr}')

    # Everything except int16 is converted, int8 included: 8-bit WAV PCM is
    # unsigned, so writing signed int8 samples as-is yields a distorted file.
    if y.dtype != 'int16':
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    # The transpose hands scipy a [n, channels] array for 2-D input.
    wavfile.write(path, sr, y_out.T)
K
KP 已提交
341 342 343


def load(
        file: os.PathLike,
        sr: Optional[int] = None,
        mono: bool = True,
        merge_type: str = 'average',  # ch0,ch1,random,average
        normal: bool = True,
        norm_type: str = 'linear',
        norm_mul_factor: float = 1.0,
        offset: float = 0.0,
        duration: Optional[float] = None,
        dtype: str = 'float32',
        resample_mode: str = 'kaiser_fast') -> Tuple[array, int]:
    """Load audio file from disk.

    The backend chosen with set_backend() is used. When no backend is
    selected, soundfile is tried first and ffmpeg is used as a fallback.

    Parameters:
        file(os.PathLike): the path of the file. URLs are not supported.
        sr(int|None): the target sample rate after loaded. If None, the original (native)
            sample rate is deduced from the file itself and no resampling is performed.
            If the native sample rate is different from specified target sample rate, resampling
            is performed according to resample_mode parameter.
            The default value is None.
        mono(bool): whether to convert audio to mono using algorithm specified in merge_type parameter
            if it is originally stereo. See to_mono() for more details.
            The default value is True.
        merge_type(str): the merging algorithm. See to_mono() for more details.
            The default value is 'average'.
        normal(bool): whether to normalize the audio waveform. If True, the audio will be normalized using algorithm
            specified in norm_type. See normalize() for more details.
            The default value is True.
        norm_type(str): normalization algorithm. Supported types are 'linear' and 'gaussian'. See normalize() for
            more details. The default value is 'linear'.
        norm_mul_factor(float): additional multiplication factor for normalization. See normalize() for more details.
            The default value is 1.0.
        offset(float): the time (in seconds) for offsetting the audio after loaded, e.g., set offset=1.0 to load all data
            after 1.0 second. If the audio duration is less than offset, an empty array is returned.
            The default value is 0.
        duration(float|None): the audio length measured in seconds after it is loaded. If None, or the actual audio
            duration is less than specified duration, the actual audio array is returned without padding.
            The default value is None.
        dtype(str): the target dtype of the return audio array. The dynamic range of audio samples will be
            adjusted according to dtype.
        resample_mode(str): the algorithm used in resampling. See resample() for more details.

    Returns:
        (y, sr): the audio array and its sample rate. When no samples were
        read, y is an empty array of the requested dtype.

    Raises:
        FileNotFoundError, if audio file is not found
        DecodingError, if audio file is not supported

    """
    if BACK_END == 'ffmpeg':
        y, r = _ffmpeg_load(file, offset=offset, duration=duration)
    elif BACK_END == 'soundfile':
        y, r = _sound_file_load(file,
                                offset=offset,
                                dtype=dtype,
                                duration=duration)
    else:
        try:
            y, r = _sound_file_load(file,
                                    offset=offset,
                                    dtype=dtype,
                                    duration=duration)
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f'Trying to load a file that doesnot exist {file}') from err
        except Exception:
            # soundfile could not read the file; try ffmpeg instead.
            # `Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            try:
                y, r = _ffmpeg_load(file, offset=offset, duration=duration)
            except DecodingError as err:
                raise DecodingError(
                    f'Failed to load and decode file {file}') from err

    if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)):
        # Empty audio: keep the (array, sample_rate) shape of the normal
        # return value so that `y, sr = load(...)` still unpacks.
        return np.array([], dtype=dtype), (r if sr is None else sr)

    if mono:
        y = to_mono(y, merge_type)

    if sr is not None and sr != r:
        y = resample(y, r, sr, mode=resample_mode)
        r = sr

    if normal:
        y = normalize(y, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # still need to do normalization, before depth conversion
        y = normalize(y, 'linear', 1.0)

    y = depth_convert(y, dtype)
    return y, r