from typing import Tuple import numpy as np import paddle from paddle import Tensor from paddle import nn from paddle.nn import functional as F import soundfile as sf from .common import get_window, dft_matrix def read(wavpath:str, sr:int = None, dtype='int16')->Tuple[int, np.ndarray]: wav, r_sr = sf.read(wavpath, dtype=dtype) if sr: assert sr == r_sr return r_sr, wav def frames(x: Tensor, num_samples: Tensor, sr: int, win_length: float, stride_length: float, clip: bool = False) -> Tuple[Tensor, Tensor]: """Extract frames from audio. Parameters ---------- x : Tensor Shape (B, T), batched waveform. num_samples : Tensor Shape (B, ), number of samples of each waveform. sr: int Sampling Rate. win_length : float Window length in ms. stride_length : float Stride length in ms. clip : bool, optional Whether to clip audio that does not fit into the last frame, by default True Returns ------- frames : Tensor Shape (B, T', win_length). num_frames : Tensor Shape (B, ) number of valid frames """ assert stride_length <= win_length stride_length = int(stride_length * sr) win_length = int(win_length * sr) num_frames = (num_samples - win_length) // stride_length padding = (0, 0) if not clip: num_frames += 1 need_samples = num_frames * stride_length + win_length padding = (0, need_samples - num_samples - 1) weight = paddle.eye(win_length).unsqueeze(1) #[win_length, 1, win_length] frames = F.conv1d(x.unsqueeze(-1), weight, padding=padding, stride=(stride_length, ), data_format='NLC') return frames, num_frames class STFT(nn.Layer): """A module for computing stft transformation in a differentiable way. http://practicalcryptography.com/miscellaneous/machine-learning/intuitive-guide-discrete-fourier-transform/ Parameters ------------ n_fft : int Number of samples in a frame. sr: int Number of Samplilng rate. stride_length : float Number of samples shifted between adjacent frames. win_length : float Length of the window. clip: bool Whether to clip audio is necesaary. """ def __init__(self, n_fft: int, sr: int, win_length: float, stride_length: float, window_type: str = None, clip: bool = False): super().__init__() self.sr = sr self.win_length = int(win_length * sr) self.stride_length = int(stride_length * sr) self.clip = clip self.n_fft = n_fft self.n_bin = 1 + n_fft // 2 w_real, w_imag, kernel_size = dft_matrix(self.n_fft, self.win_length, self.n_bin) # calculate window window = get_window(window_type, kernel_size) # (2 * n_bins, kernel_size) w = np.concatenate([w_real, w_imag], axis=0) w = w * window # (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size) w = np.expand_dims(w, 1) weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) self.register_buffer("weight", weight) def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]: """Compute the stft transform. Parameters ------------ x : Tensor [shape=(B, T)] The input waveform. num_samples : Tensor [shape=(B,)] Number of samples of each waveform. Returns ------------ C : Tensor Shape(B, T', n_bins, 2) Spectrogram. num_frames: Tensor Shape (B,) number of samples of each spectrogram """ num_frames = (num_samples - self.win_length) // self.stride_length padding = (0, 0) if not self.clip: num_frames += 1 need_samples = num_frames * self.stride_length + self.win_length padding = (0, need_samples - num_samples - 1) batch_size, _ = paddle.shape(x) x = x.unsqueeze(-1) C = F.conv1d(x, self.weight, stride=(self.stride_length, ), padding=padding, data_format="NLC") C = paddle.reshape(C, [batch_size, -1, 2, self.n_bin]) C = C.transpose([0, 1, 3, 2]) return C, num_frames def powspec(C:Tensor) -> Tensor: """Compute the power spectrum. Args: C (Tensor): [B, T, C, 2] Returns: Tensor: [B, T, C] """ real, imag = paddle.chunk(C, 2, axis=-1) return paddle.square(real.squeeze(-1)) + paddle.square(imag.squeeze(-1)) def magspec(C: Tensor, eps=1e-10) -> Tensor: """Compute the magnitude spectrum. Args: C (Tensor): [B, T, C, 2] eps (float): epsilon. Returns: Tensor: [B, T, C] """ pspec = powspec(C) return paddle.sqrt(pspec + eps)