refactor paddleaudio, test=doc

c437a7c5 · Hui Zhang · a9422260 · c437a7c5 · c437a7c5 · a9422260
30 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -30,5 +30,6 @@ tools/OpenBLAS/
 tools/Miniconda3-latest-Linux-x86_64.sh
 tools/activate_python.sh
 tools/miniconda.sh
+tools/CRF++-0.58/
 *output/
--- a/paddleaudio/CHANGELOG.md
+++ b/paddleaudio/CHANGELOG.md
 # Changelog
+Date: 2022-2-25, Author: Hui Zhang.
+  - Refactor architecture.
\ No newline at end of file
--- a/paddleaudio/features/augment.py
+++ b/paddleaudio/features/augment.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import List
-import numpy as np
-from numpy import ndarray as array
-from ..backends import depth_convert
-from ..utils import ParameterError
-__all__ = [
-    'depth_augment',
-    'spect_augment',
-    'random_crop1d',
-    'random_crop2d',
-    'adaptive_spect_augment',
-]
-def randint(high: int) -> int:
-    """Generate one random integer in range [0 high)
-     This is a helper function for random data augmentaiton
-    """
-    return int(np.random.randint(0, high=high))
-def rand() -> float:
-    """Generate one floating-point number in range [0 1)
-    This is a helper function for random data augmentaiton
-    """
-    return float(np.random.rand(1))
-def depth_augment(y: array,
-                  choices: List=['int8', 'int16'],
-                  probs: List[float]=[0.5, 0.5]) -> array:
-    """ Audio depth augmentation
-    Do audio depth augmentation to simulate the distortion brought by quantization.
-    """
-    assert len(probs) == len(
-        choices
-    ), 'number of choices {} must be equal to size of probs {}'.format(
-        len(choices), len(probs))
-    depth = np.random.choice(choices, p=probs)
-    src_depth = y.dtype
-    y1 = depth_convert(y, depth)
-    y2 = depth_convert(y1, src_depth)
-    return y2
-def adaptive_spect_augment(spect: array, tempo_axis: int=0,
-                           level: float=0.1) -> array:
-    """Do adpative spectrogram augmentation
-    The level of the augmentation is gowern by the paramter level,
-    ranging from 0 to 1, with 0 represents no augmentation。
-    """
-    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
-    if tempo_axis == 0:
-        nt, nf = spect.shape
-    else:
-        nf, nt = spect.shape
-    time_mask_width = int(nt * level * 0.5)
-    freq_mask_width = int(nf * level * 0.5)
-    num_time_mask = int(10 * level)
-    num_freq_mask = int(10 * level)
-    if tempo_axis == 0:
-        for _ in range(num_time_mask):
-            start = randint(nt - time_mask_width)
-            spect[start:start + time_mask_width, :] = 0
-        for _ in range(num_freq_mask):
-            start = randint(nf - freq_mask_width)
-            spect[:, start:start + freq_mask_width] = 0
-    else:
-        for _ in range(num_time_mask):
-            start = randint(nt - time_mask_width)
-            spect[:, start:start + time_mask_width] = 0
-        for _ in range(num_freq_mask):
-            start = randint(nf - freq_mask_width)
-            spect[start:start + freq_mask_width, :] = 0
-    return spect
-def spect_augment(spect: array,
-                  tempo_axis: int=0,
-                  max_time_mask: int=3,
-                  max_freq_mask: int=3,
-                  max_time_mask_width: int=30,
-                  max_freq_mask_width: int=20) -> array:
-    """Do spectrogram augmentation in both time and freq axis
-    Reference:
-    """
-    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
-    if tempo_axis == 0:
-        nt, nf = spect.shape
-    else:
-        nf, nt = spect.shape
-    num_time_mask = randint(max_time_mask)
-    num_freq_mask = randint(max_freq_mask)
-    time_mask_width = randint(max_time_mask_width)
-    freq_mask_width = randint(max_freq_mask_width)
-    if tempo_axis == 0:
-        for _ in range(num_time_mask):
-            start = randint(nt - time_mask_width)
-            spect[start:start + time_mask_width, :] = 0
-        for _ in range(num_freq_mask):
-            start = randint(nf - freq_mask_width)
-            spect[:, start:start + freq_mask_width] = 0
-    else:
-        for _ in range(num_time_mask):
-            start = randint(nt - time_mask_width)
-            spect[:, start:start + time_mask_width] = 0
-        for _ in range(num_freq_mask):
-            start = randint(nf - freq_mask_width)
-            spect[start:start + freq_mask_width, :] = 0
-    return spect
-def random_crop1d(y: array, crop_len: int) -> array:
-    """ Do random cropping on 1d input signal
-    The input is a 1d signal, typically a sound waveform
-    """
-    if y.ndim != 1:
-        'only accept 1d tensor or numpy array'
-    n = len(y)
-    idx = randint(n - crop_len)
-    return y[idx:idx + crop_len]
-def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
-    """ Do random cropping for 2D array, typically a spectrogram.
-    The cropping is done in temporal direction on the time-freq input signal.
-    """
-    if tempo_axis >= s.ndim:
-        raise ParameterError('axis out of range')
-    n = s.shape[tempo_axis]
-    idx = randint(high=n - crop_len)
-    sli = [slice(None) for i in range(s.ndim)]
-    sli[tempo_axis] = slice(idx, idx + crop_len)
-    out = s[tuple(sli)]
-    return out
--- a/paddleaudio/backends/__init__.py
+++ b/paddleaudio/backends/__init__.py
@@ -11,4 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .audio import *
--- a/paddleaudio/__init__.py
+++ b/paddleaudio/__init__.py
@@ -11,5 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .backends import *
-from .features import *
--- a/paddleaudio/paddleaudio/backends/soundfile_backend.py
+++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py
--- a/paddleaudio/paddleaudio/backends/sox_backend.py
+++ b/paddleaudio/paddleaudio/backends/sox_backend.py
--- a/paddleaudio/datasets/__init__.py
+++ b/paddleaudio/datasets/__init__.py
--- a/paddleaudio/datasets/dataset.py
+++ b/paddleaudio/datasets/dataset.py
--- a/paddleaudio/datasets/esc50.py
+++ b/paddleaudio/datasets/esc50.py
--- a/paddleaudio/datasets/gtzan.py
+++ b/paddleaudio/datasets/gtzan.py
--- a/paddleaudio/datasets/tess.py
+++ b/paddleaudio/datasets/tess.py
--- a/paddleaudio/datasets/urban_sound.py
+++ b/paddleaudio/datasets/urban_sound.py
--- a/paddleaudio/features/__init__.py
+++ b/paddleaudio/features/__init__.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .augment import *
-from .core import *
+from .librosa import Spectrogram
-from .spectrum import *
+from .librosa import MelSpectrogram
+from .librosa import LogMelSpectrogram
\ No newline at end of file
--- a/paddleaudio/features/spectrum.py
+++ b/paddleaudio/features/spectrum.py
@@ -19,7 +19,7 @@ from typing import Union
 import paddle
 import paddle.nn as nn
-from .window import get_window
+from ..functional.window import get_window
 __all__ = [
    'Spectrogram',

--- a/paddleaudio/paddleaudio/functional/__init__.py
+++ b/paddleaudio/paddleaudio/functional/__init__.py
--- a/paddleaudio/features/core.py
+++ b/paddleaudio/features/core.py
@@ -21,11 +21,14 @@ import numpy as np
 import scipy
 from numpy import ndarray as array
 from numpy.lib.stride_tricks import as_strided
-from scipy.signal import get_window
+from scipy import signal
 from ..utils import ParameterError
+from ..backends import depth_convert
 __all__ = [
+    # dsp
    'stft',
    'mfcc',
    'hz_to_mel',
@@ -38,6 +41,12 @@ __all__ = [
    'spectrogram',
    'mu_encode',
    'mu_decode',
+    # augmentation
+    'depth_augment',
+    'spect_augment',
+    'random_crop1d',
+    'random_crop2d',
+    'adaptive_spect_augment',
 ]
@@ -303,7 +312,7 @@ def stft(x: array,
    if hop_length is None:
        hop_length = int(win_length // 4)
-    fft_window = get_window(window, win_length, fftbins=True)
+    fft_window = signal.get_window(window, win_length, fftbins=True)
    # Pad the window out to n_fft size
    fft_window = pad_center(fft_window, n_fft)
@@ -576,3 +585,145 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
        y = y * 2 / mu - 1
    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
    return x
+def randint(high: int) -> int:
+    """Generate one random integer in range [0 high)
+     This is a helper function for random data augmentation
+    """
+    return int(np.random.randint(0, high=high))
+def rand() -> float:
+    """Generate one floating-point number in range [0 1)
+    This is a helper function for random data augmentation
+    """
+    return float(np.random.rand(1))
+def depth_augment(y: array,
+                  choices: List=['int8', 'int16'],
+                  probs: List[float]=[0.5, 0.5]) -> array:
+    """ Audio depth augmentation
+    Do audio depth augmentation to simulate the distortion brought by quantization.
+    """
+    assert len(probs) == len(
+        choices
+    ), 'number of choices {} must be equal to size of probs {}'.format(
+        len(choices), len(probs))
+    depth = np.random.choice(choices, p=probs)
+    src_depth = y.dtype
+    y1 = depth_convert(y, depth)
+    y2 = depth_convert(y1, src_depth)
+    return y2
+def adaptive_spect_augment(spect: array, tempo_axis: int=0,
+                           level: float=0.1) -> array:
+    """Do adaptive spectrogram augmentation
+    The level of the augmentation is governed by the parameter level,
+    ranging from 0 to 1, with 0 representing no augmentation.
+    """
+    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
+    if tempo_axis == 0:
+        nt, nf = spect.shape
+    else:
+        nf, nt = spect.shape
+    time_mask_width = int(nt * level * 0.5)
+    freq_mask_width = int(nf * level * 0.5)
+    num_time_mask = int(10 * level)
+    num_freq_mask = int(10 * level)
+    if tempo_axis == 0:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[start:start + time_mask_width, :] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[:, start:start + freq_mask_width] = 0
+    else:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[:, start:start + time_mask_width] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[start:start + freq_mask_width, :] = 0
+    return spect
+def spect_augment(spect: array,
+                  tempo_axis: int=0,
+                  max_time_mask: int=3,
+                  max_freq_mask: int=3,
+                  max_time_mask_width: int=30,
+                  max_freq_mask_width: int=20) -> array:
+    """Do spectrogram augmentation in both time and freq axis
+    Reference:
+    """
+    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
+    if tempo_axis == 0:
+        nt, nf = spect.shape
+    else:
+        nf, nt = spect.shape
+    num_time_mask = randint(max_time_mask)
+    num_freq_mask = randint(max_freq_mask)
+    time_mask_width = randint(max_time_mask_width)
+    freq_mask_width = randint(max_freq_mask_width)
+    if tempo_axis == 0:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[start:start + time_mask_width, :] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[:, start:start + freq_mask_width] = 0
+    else:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[:, start:start + time_mask_width] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[start:start + freq_mask_width, :] = 0
+    return spect
+def random_crop1d(y: array, crop_len: int) -> array:
+    """ Do random cropping on 1d input signal
+    The input is a 1d signal, typically a sound waveform
+    """
+    if y.ndim != 1:
+        'only accept 1d tensor or numpy array'
+    n = len(y)
+    idx = randint(n - crop_len)
+    return y[idx:idx + crop_len]
+def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
+    """ Do random cropping for 2D array, typically a spectrogram.
+    The cropping is done in temporal direction on the time-freq input signal.
+    """
+    if tempo_axis >= s.ndim:
+        raise ParameterError('axis out of range')
+    n = s.shape[tempo_axis]
+    idx = randint(high=n - crop_len)
+    sli = [slice(None) for i in range(s.ndim)]
+    sli[tempo_axis] = slice(idx, idx + crop_len)
+    out = s[tuple(sli)]
+    return out
\ No newline at end of file
--- a/paddleaudio/features/window.py
+++ b/paddleaudio/features/window.py
@@ -20,6 +20,19 @@ from paddle import Tensor
 __all__ = [
    'get_window',
+    # windows
+    'taylor',
+    'hamming',
+    'hann',
+    'tukey',
+    'kaiser',
+    'gaussian',
+    'exponential',
+    'triang',
+    'bohman',
+    'blackman',
+    'cosine',
 ]
@@ -73,6 +86,21 @@ def general_gaussian(M: int, p, sig, sym: bool=True,
    return _truncate(w, needs_trunc)
+def general_cosine(M: int, a: float, sym: bool=True,
+                   dtype: str='float64') -> Tensor:
+    """Compute a generic weighted sum of cosine terms window.
+    This function is consistent with scipy.signal.windows.general_cosine().
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
+    w = paddle.zeros((M, ), dtype=dtype)
+    for k in range(len(a)):
+        w += a[k] * paddle.cos(k * fac)
+    return _truncate(w, needs_trunc)
 def general_hamming(M: int, alpha: float, sym: bool=True,
                    dtype: str='float64') -> Tensor:
    """Compute a generalized Hamming window.
@@ -143,21 +171,6 @@ def taylor(M: int,
    return _truncate(w, needs_trunc)
-def general_cosine(M: int, a: float, sym: bool=True,
-                   dtype: str='float64') -> Tensor:
-    """Compute a generic weighted sum of cosine terms window.
-    This function is consistent with scipy.signal.windows.general_cosine().
-    """
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-    M, needs_trunc = _extend(M, sym)
-    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
-    w = paddle.zeros((M, ), dtype=dtype)
-    for k in range(len(a)):
-        w += a[k] * paddle.cos(k * fac)
-    return _truncate(w, needs_trunc)
 def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Hamming window.
    The Hamming window is a taper formed by using a raised cosine with
@@ -375,6 +388,7 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _truncate(w, needs_trunc)
+## factory function
 def get_window(window: Union[str, Tuple[str, float]],
               win_length: int,
               fftbins: bool=True,

--- a/paddleaudio/paddleaudio/io/__init__.py
+++ b/paddleaudio/paddleaudio/io/__init__.py
+from .audio import save_wav
+from .audio import load
+from .audio import normalize
+from .audio import to_mono
+from .audio import resample
+from .audio import depth_convert
\ No newline at end of file
--- a/paddleaudio/backends/audio.py
+++ b/paddleaudio/backends/audio.py
--- a/paddleaudio/paddleaudio/kaldi/__init__.py
+++ b/paddleaudio/paddleaudio/kaldi/__init__.py
--- a/paddleaudio/paddleaudio/sox_effects/__init__.py
+++ b/paddleaudio/paddleaudio/sox_effects/__init__.py
--- a/paddleaudio/utils/__init__.py
+++ b/paddleaudio/utils/__init__.py
@@ -11,8 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .download import *
-from .env import *
+from .env import USER_HOME
-from .error import *
+from .env import PPAUDIO_HOME
-from .log import *
+from .env import MODEL_HOME
-from .time import *
+from .env import DATA_HOME
+from .download import decompress
+from .download import download_and_decompress
+from .download import load_state_dict_from_url
+from .error import ParameterError
+from .log import logger
+from .log import Logger
+from .time import Timer
+from .time import seconds_to_hms
--- a/paddleaudio/utils/download.py
+++ b/paddleaudio/utils/download.py
@@ -22,6 +22,11 @@ from .log import logger
 download.logger = logger
+__all__ = [
+    'decompress',
+    'download_and_decompress',
+    'load_state_dict_from_url',
+]
 def decompress(file: str):
    """

--- a/paddleaudio/utils/env.py
+++ b/paddleaudio/utils/env.py
@@ -20,6 +20,12 @@ PPAUDIO_HOME     -->  the root directory for storing PaddleAudio related data. D
 '''
 import os
+__all__ = [
+    'USER_HOME',
+    'PPAUDIO_HOME',
+'MODEL_HOME' ,
+'DATA_HOME' ,
+]
 def _get_user_home():
    return os.path.expanduser('~')

--- a/paddleaudio/utils/error.py
+++ b/paddleaudio/utils/error.py
--- a/paddleaudio/utils/log.py
+++ b/paddleaudio/utils/log.py
@@ -19,7 +19,10 @@ import time
 import colorlog
-loggers = {}
+__all__ = [
+    'Logger',
+    'logger',
+]
 log_config = {
    'DEBUG': {

--- a/paddleaudio/utils/time.py
+++ b/paddleaudio/utils/time.py
@@ -14,6 +14,10 @@
 import math
 import time
+__all__ = [
+    'Timer',
+    'seconds_to_hms',
+]
 class Timer(object):
    '''Calculate runing speed and estimated time of arrival(ETA)'''

--- a/setup_audio.py
+++ b/setup_audio.py
@@ -14,7 +14,7 @@
 import setuptools
 # set the version here
-VERSION = '0.1.0'
+VERSION = '0.2.0'
 def write_version_py(filename='paddleaudio/__init__.py'):

--- a/requirements.txt
+++ b/requirements.txt
-ConfigArgParse
-coverage
-editdistance
-g2p_en
-g2pM
-gpustat
-h5py
-inflect
-jieba
-jsonlines
-kaldiio
-librosa
-loguru
-matplotlib
-nara_wpe
-nltk
-paddleaudio
-paddlenlp
-paddlespeech_ctcdecoders
-paddlespeech_feat
-pandas
-phkit
-Pillow
-praatio==5.0.0
-pre-commit
-pybind11
-pypi-kenlm
-pypinyin
-python-dateutil
-pyworld
-resampy==0.2.2
-sacrebleu
-scipy
-sentencepiece~=0.1.96
-snakeviz
-soundfile~=0.10
-sox
-soxbindings
-textgrid
-timer
-tqdm
-typeguard
-unidecode
-visualdl
-webrtcvad
-yacs~=0.1.8
-yq
-zhon