# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Iterable, List, Optional, Tuple, TypeVar import numpy as np from numpy import ndarray as array from paddleaudio.backends import depth_convert from paddleaudio.utils import ParameterError from paddleaudio.utils._librosa import pad_center __all__ = [ 'depth_augment', 'spect_augment', 'random_crop1d', 'center_crop1d', 'random_crop2d', 'adaptive_spect_augment', 'random_crop_or_pad1d', 'center_crop_or_pad1d', ] def randint(high: int) -> int: """Generate one random integer in range [0 high) This is a helper function for random data augmentaiton """ return int(np.random.randint(0, high=high)) def rand() -> float: """Generate one floating-point number in range [0 1) This is a helper function for random data augmentaiton """ return float(np.random.rand(1)) def depth_augment(y: array, choices: List[str] = ['int8', 'int16'], probs: List[float] = [0.5, 0.5]) -> array: """ Audio depth augmentation Do audio depth augmentation to simulate the distortion brought by quantization. """ assert len(probs) == len( choices ), 'number of choices {} must be equal to size of probs {}'.format( len(choices), len(probs)) depth = np.random.choice(choices, p=probs) src_depth = y.dtype y1 = depth_convert(y, depth) y2 = depth_convert(y1, src_depth) return y2 def adaptive_spect_augment(spect: array, tempo_axis: int = 0, level: float = 0.1) -> array: """Do adpative spectrogram augmentation The level of the augmentation is gowern by the paramter level, ranging from 0 to 1, with 0 represents no augmentation。 """ assert spect.ndim == 2., 'only supports 2d tensor or numpy array' if tempo_axis == 0: nt, nf = spect.shape else: nf, nt = spect.shape time_mask_width = int(nt * level * 0.5) freq_mask_width = int(nf * level * 0.5) num_time_mask = int(10 * level) num_freq_mask = int(10 * level) if tempo_axis == 0: for _ in range(num_time_mask): start = randint(nt - time_mask_width) spect[start:start + time_mask_width, :] = 0 for _ in range(num_freq_mask): start = randint(nf - freq_mask_width) spect[:, start:start + freq_mask_width] = 0 else: for _ in range(num_time_mask): start = randint(nt - time_mask_width) spect[:, start:start + time_mask_width] = 0 for _ in range(num_freq_mask): start = randint(nf - freq_mask_width) spect[start:start + freq_mask_width, :] = 0 return spect def spect_augment(spect: array, tempo_axis: int = 0, max_time_mask: int = 3, max_freq_mask: int = 3, max_time_mask_width: int = 30, max_freq_mask_width: int = 20) -> array: """Do spectrogram augmentation in both time and freq axis """ assert spect.ndim == 2., 'only supports 2d tensor or numpy array' if tempo_axis == 0: nt, nf = spect.shape else: nf, nt = spect.shape num_time_mask = randint(max_time_mask) num_freq_mask = randint(max_freq_mask) time_mask_width = randint(max_time_mask_width) freq_mask_width = randint(max_freq_mask_width) if tempo_axis == 0: for _ in range(num_time_mask): start = randint(nt - time_mask_width) spect[start:start + time_mask_width, :] = 0 for _ in range(num_freq_mask): start = randint(nf - freq_mask_width) spect[:, start:start + freq_mask_width] = 0 else: for _ in range(num_time_mask): start = randint(nt - time_mask_width) spect[:, start:start + time_mask_width] = 0 for _ in range(num_freq_mask): start = randint(nf - freq_mask_width) spect[start:start + freq_mask_width, :] = 0 return spect def random_crop1d(y: array, crop_len: int) -> array: """ Do random cropping on 1d input signal The input is a 1d signal, typically a sound waveform """ if y.ndim != 1: raise ParameterError('only accept 1d numpy array') n = len(y) idx = randint(n - crop_len) return y[idx:idx + crop_len] def center_crop1d(y: array, crop_len: int) -> array: """ Do random cropping on 1d input signal The input is a 1d signal, typically a sound waveform """ if y.ndim != 1: raise ParameterError( f'only accept 1d numpy array, but received y.ndim={y.ndim}') n = len(y) start = (n - crop_len) // 2 return y[start:start + crop_len] def random_crop_or_pad1d(y: array, crop_len: int) -> array: """ Do random cropping or padding to the target length defined by crop_len,given a 1d input signal The input is a 1d signal, typically a sound waveform. """ if y.ndim != 1: raise ParameterError( f'only accept 1d numpy array, but received y.ndim={y.ndim}') n = len(y) if crop_len == n: return y elif crop_len > n: return pad_center(y, crop_len) else: return random_crop1d(y, crop_len) def center_crop_or_pad1d(y: array, crop_len: int) -> array: """ Do random cropping or padding to the target length defined by crop_len,given a 1d input signal The input is a 1d signal, typically a sound waveform. """ if y.ndim != 1: raise ParameterError( f'only accept 1d numpy array, but received y.ndim={y.ndim}') n = len(y) if crop_len == n: return y elif crop_len > n: return pad_center(y, crop_len) else: return center_crop1d(y, crop_len) def random_crop2d(s: array, crop_len: int, tempo_axis: int = 0) -> array: """ Do random cropping for 2D array, typically a spectrogram. The cropping is done in temporal direction on the time-freq input signal. """ if tempo_axis >= s.ndim: raise ParameterError('axis out of range') n = s.shape[tempo_axis] idx = randint(high=n - crop_len) sli = [slice(None) for i in range(s.ndim)] sli[tempo_axis] = slice(idx, idx + crop_len) out = s[tuple(sli)] return out