more backend api

f55c4573 · Hui Zhang · da6692c7 · f55c4573 · f55c4573 · f55c4573
19 changed files
--- a/cmake/summary.cmake
+++ b/cmake/summary.cmake
@@ -35,6 +35,7 @@ function (onnx_print_configuration_summary)
  message(STATUS "  BUILD_ONNX_PYTHON         : ${BUILD_ONNX_PYTHON}")
  message(STATUS "    Python version        : ${Python_VERSION}")
  message(STATUS "    Python executable     : ${Python_EXECUTABLE}")
-  message(STATUS "    Python includes       : ${Python_INCLUDE_DIRS}")
+  message(STATUS "    Python includes       : ${Python_INCLUDE_DIR}")
+  message(STATUS "    Python libraries      : ${Python_LIBRARY}")

 endfunction()
\ No newline at end of file
--- a/paddlespeech/__init__.py
+++ b/paddlespeech/__init__.py
@@ -12,5 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import _locale
-
 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
+
+from . import audio
+# _init_audio_backend must be called after audio import
+audio.backends.utils._init_audio_backend()
+
+__all__ = [
+    "audio"
+]
--- a/paddlespeech/audio/__init__.py
+++ b/paddlespeech/audio/__init__.py
@@ -11,12 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from . import compliance
 from . import datasets
 from . import features
 from . import functional
 from . import io
 from . import metric
-from . import sox_effects
-from .backends import load
-from .backends import save
+from . import utils
+
+from ._ops import ops
+
+from paddlespeech.audio.backends import get_audio_backend, list_audio_backends, set_audio_backend
+
+__all__ = [
+    "io",
+    "compliance",
+    "datasets",
+    "functional",
+    "features",
+    "utils",
+    "ops",
+    "list_audio_backends",
+    "get_audio_backend",
+    "set_audio_backend",
+]
\ No newline at end of file
--- a/paddlespeech/audio/_extension.py
+++ b/paddlespeech/audio/_extension.py
@@ -44,7 +44,7 @@ def _load_lib(lib: str) -> bool:
    path = _get_lib_path(lib)
    if not path.exists():
        return False
-    paddlespeech.ops.load_library(path)
+    paddlespeech.audio.ops.load_library(path)
    return True


@@ -56,7 +56,7 @@ def _init_ffmpeg():
    if _FFMPEG_INITIALIZED:
        return

-    if not paddlespeech.ops.paddlleaudio.is_ffmpeg_available():
+    if not paddlespeech.audio.ops.paddlleaudio.is_ffmpeg_available():
        raise RuntimeError(
            "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
        )
@@ -67,11 +67,11 @@ def _init_ffmpeg():
        raise ImportError(
            "FFmpeg libraries are not found. Please install FFmpeg.") from err

-    import paddllespeech._paddlleaudio_ffmpeg  # noqa
+    import paddllespeech.audio._paddlleaudio_ffmpeg  # noqa

-    paddlespeech.ops.paddlleaudio.ffmpeg_init()
-    if paddlespeech.ops.paddlleaudio.ffmpeg_get_log_level() > 8:
-        paddlespeech.ops.paddlleaudio.ffmpeg_set_log_level(8)
+    paddlespeech.audio.ops.paddlleaudio.ffmpeg_init()
+    if paddlespeech.audio.ops.paddlleaudio.ffmpeg_get_log_level() > 8:
+        paddlespeech.audio.ops.paddlleaudio.ffmpeg_set_log_level(8)

    _FFMPEG_INITIALIZED = True

@@ -84,7 +84,7 @@ def _init_extension():
    _load_lib("libpaddleaudio")
    # This import is for initializing the methods registered via PyBind11
    # This has to happen after the base library is loaded
-    from paddlespeech import _paddleaudio  # noqa
+    from paddlespeech.audio import _paddleaudio  # noqa

    # Because this part is executed as part of `import torchaudio`, we ignore the
    # initialization failure.

--- a/paddlespeech/audio/backends/__init__.py
+++ b/paddlespeech/audio/backends/__init__.py
@@ -11,9 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .soundfile_backend import depth_convert
-from .soundfile_backend import load
-from .soundfile_backend import normalize
-from .soundfile_backend import resample
-from .soundfile_backend import save
-from .soundfile_backend import to_mono
+
+# flake8: noqa
+from . import utils
+from .utils import get_audio_backend, list_audio_backends, set_audio_backend
\ No newline at end of file
--- a/paddlespeech/audio/backends/soundfile_backend.py
+++ b/paddlespeech/audio/backends/soundfile_backend.py
@@ -23,11 +23,11 @@ import soundfile as sf
 from scipy.io import wavfile

 from ..utils import ParameterError
+from ..utils import depth_convert

 __all__ = [
    'resample',
    'to_mono',
-    'depth_convert',
    'normalize',
    'save',
    'load',
@@ -117,78 +117,6 @@ def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
    return y_out


-def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
-    """Data type casting in a safe way, i.e., prevent overflow or underflow.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        dtype (Union[type, str]): Data type of waveform.
-
-    Returns:
-        np.ndarray: `y` after safe casting.
-    """
-    if 'float' in str(y.dtype):
-        return np.clip(y, np.finfo(dtype).min,
-                       np.finfo(dtype).max).astype(dtype)
-    else:
-        return np.clip(y, np.iinfo(dtype).min,
-                       np.iinfo(dtype).max).astype(dtype)
-
-
-def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
-    """Convert audio array to target dtype safely. This function convert audio waveform to a target dtype, with addition steps of
-    preventing overflow/underflow and preserving audio range.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        dtype (Union[type, str]): Data type of waveform.
-
-    Returns:
-        np.ndarray: `y` after safe casting.
-    """
-
-    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
-    if y.dtype not in SUPPORT_DTYPE:
-        raise ParameterError(
-            'Unsupported audio dtype, '
-            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
-
-    if dtype not in SUPPORT_DTYPE:
-        raise ParameterError(
-            'Unsupported audio dtype, '
-            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
-
-    if dtype == y.dtype:
-        return y
-
-    if dtype == 'float64' and y.dtype == 'float32':
-        return _safe_cast(y, dtype)
-    if dtype == 'float32' and y.dtype == 'float64':
-        return _safe_cast(y, dtype)
-
-    if dtype == 'int16' or dtype == 'int8':
-        if y.dtype in ['float64', 'float32']:
-            factor = np.iinfo(dtype).max
-            y = np.clip(y * factor, np.iinfo(dtype).min,
-                        np.iinfo(dtype).max).astype(dtype)
-            y = y.astype(dtype)
-        else:
-            if dtype == 'int16' and y.dtype == 'int8':
-                factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
-                y = y.astype('float32') * factor
-                y = y.astype('int16')
-
-            else:  # dtype == 'int8' and y.dtype=='int16':
-                y = y.astype('int32') * np.iinfo('int8').max / \
-                    np.iinfo('int16').max
-                y = y.astype('int8')
-
-    if dtype in ['float32', 'float64']:
-        org_dtype = y.dtype
-        y = y.astype(dtype) / np.iinfo(org_dtype).max
-    return y
-
-
 def sound_file_load(file: os.PathLike,
                    offset: Optional[float]=None,
                    dtype: str='int16',
@@ -323,3 +251,7 @@ def load(

    y = depth_convert(y, dtype)
    return y, r
+
+
+def info(filepath: str) -> None:
+    raise RuntimeError("No audio I/O backend is available.")
\ No newline at end of file
--- a/paddlespeech/audio/backends/sox_backend.py
+++ b/paddlespeech/audio/backends/sox_backend.py
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- a/paddlespeech/audio/compliance/librosa.py
+++ b/paddlespeech/audio/compliance/librosa.py
@@ -22,7 +22,7 @@ import scipy
 from numpy.lib.stride_tricks import as_strided
 from scipy import signal

-from ..backends import depth_convert
+from ..utils import depth_convert
 from ..utils import ParameterError

 __all__ = [

--- a/paddlespeech/audio/datasets/dataset.py
+++ b/paddlespeech/audio/datasets/dataset.py
@@ -16,7 +16,6 @@ from typing import List
 import numpy as np
 import paddle

-from ..backends import load as load_audio
 from ..compliance.kaldi import fbank as kaldi_fbank
 from ..compliance.kaldi import mfcc as kaldi_mfcc
 from ..compliance.librosa import melspectrogram
@@ -70,9 +69,9 @@ class AudioClassificationDataset(paddle.io.Dataset):
        file, label = self.files[idx], self.labels[idx]

        if self.sample_rate is None:
-            waveform, sample_rate = load_audio(file)
+            waveform, sample_rate = paddlespeech.audio.load(file)
        else:
-            waveform, sample_rate = load_audio(file, sr=self.sample_rate)
+            waveform, sample_rate = paddlespeech.audio.load(file, sr=self.sample_rate)

        feat_func = feat_funcs[self.feat_type]


--- a/paddlespeech/audio/datasets/rirs_noises.py
+++ b/paddlespeech/audio/datasets/rirs_noises.py
@@ -20,8 +20,6 @@ from typing import List
 from paddle.io import Dataset
 from tqdm import tqdm

-from ..backends import load as load_audio
-from ..backends import save as save_wav
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from .dataset import feat_funcs
@@ -105,7 +103,7 @@ class OpenRIRNoise(Dataset):
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

-        waveform, sr = load_audio(record['wav'])
+        waveform, sr = paddlespeech.audio.load(record['wav'])

        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
@@ -128,7 +126,7 @@ class OpenRIRNoise(Dataset):

    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
-        waveform, sr = load_audio(wav_file)
+        waveform, sr = paddlespeech.audio.load(wav_file)
        audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
        audio_duration = waveform.shape[0] / sr

@@ -143,7 +141,7 @@ class OpenRIRNoise(Dataset):
                end_sample = int(float(e) * sr)
                new_wav_file = os.path.join(self.base_path,
                                            audio_id + f'_chunk_{idx+1:02}.wav')
-                save_wav(waveform[start_sample:end_sample], sr, new_wav_file)
+                paddlespeech.audio.save(waveform[start_sample:end_sample], sr, new_wav_file)
                # id, duration, new_wav
                ret.append([chunk, self.chunk_duration, new_wav_file])
        else:  # Keep whole audio.

--- a/paddlespeech/audio/datasets/voxceleb.py
+++ b/paddlespeech/audio/datasets/voxceleb.py
@@ -23,7 +23,6 @@ from paddle.io import Dataset
 from pathos.multiprocessing import Pool
 from tqdm import tqdm

-from ..backends import load as load_audio
 from ..utils import DATA_HOME
 from ..utils import decompress
 from ..utils.download import download_and_decompress
@@ -192,7 +191,7 @@ class VoxCeleb(Dataset):
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

-        waveform, sr = load_audio(record['wav'])
+        waveform, sr = paddlespeech.audio.load(record['wav'])

        # random select a chunk audio samples from the audio
        if self.random_chunk:
@@ -231,7 +230,7 @@ class VoxCeleb(Dataset):

    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
-        waveform, sr = load_audio(wav_file)
+        waveform, sr = paddlespeech.audio.load(wav_file)
        spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
        audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
        audio_duration = waveform.shape[0] / sr

--- a/paddlespeech/audio/sox_effects/__init__.py
+++ b/paddlespeech/audio/sox_effects/__init__.py
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- a/paddlespeech/audio/utils/__init__.py
+++ b/paddlespeech/audio/utils/__init__.py
@@ -13,11 +13,18 @@
 # limitations under the License.
 from ...cli.utils import DATA_HOME
 from ...cli.utils import MODEL_HOME
+
 from .download import decompress
 from .download import download_and_decompress
 from .download import load_state_dict_from_url
+
 from .error import ParameterError
+
 from .log import Logger
 from .log import logger
+
 from .time import seconds_to_hms
 from .time import Timer
+
+from .numeric import pcm16to32
+from .numeric import depth_convert
\ No newline at end of file
--- a/paddlespeech/audio/utils/numeric.py
+++ b/paddlespeech/audio/utils/numeric.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
+from typing import Union

+__all__ = [
+    "pcm16to32",
+    "depth_convert"
+]

 def pcm16to32(audio: np.ndarray) -> np.ndarray:
    """pcm int16 to float32
@@ -28,3 +33,87 @@ def pcm16to32(audio: np.ndarray) -> np.ndarray:
        bits = np.iinfo(np.int16).bits
        audio = audio / (2**(bits - 1))
    return audio
+
+
+def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Data type casting in a safe way, i.e., prevent overflow or underflow.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform.
+
+    Returns:
+        np.ndarray: `y` after safe casting.
+    """
+    if 'float' in str(y.dtype):
+        return np.clip(y, np.finfo(dtype).min,
+                       np.finfo(dtype).max).astype(dtype)
+    else:
+        return np.clip(y, np.iinfo(dtype).min,
+                       np.iinfo(dtype).max).astype(dtype)
+
+
+def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Convert audio array to target dtype safely.
+
+    This function converts an audio waveform to a target dtype, with the
+    additional steps of preventing overflow/underflow and preserving the
+    audio range.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform.
+
+    Returns:
+        np.ndarray: `y` after safe casting.
+
+    Raises:
+        ParameterError: If `y.dtype` or `dtype` is not a supported dtype.
+    """
+    # Imported locally: this module has no file-level import of ParameterError.
+    from .error import ParameterError
+
+    # NOTE(review): this module does not define EPS, which the int8 -> int16
+    # branch below needs; 1e-8 is an assumed value -- TODO confirm against the
+    # constant used by the backend this function was moved from.
+    EPS = 1e-8
+
+    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
+    if y.dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype == y.dtype:
+        return y
+
+    if dtype == 'float64' and y.dtype == 'float32':
+        return _safe_cast(y, dtype)
+    if dtype == 'float32' and y.dtype == 'float64':
+        return _safe_cast(y, dtype)
+
+    if dtype == 'int16' or dtype == 'int8':
+        if y.dtype in ['float64', 'float32']:
+            factor = np.iinfo(dtype).max
+            y = np.clip(y * factor, np.iinfo(dtype).min,
+                        np.iinfo(dtype).max).astype(dtype)
+        else:
+            if dtype == 'int16' and y.dtype == 'int8':
+                factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
+                y = y.astype('float32') * factor
+                y = y.astype('int16')
+
+            else:  # dtype == 'int8' and y.dtype=='int16':
+                y = y.astype('int32') * np.iinfo('int8').max / \
+                    np.iinfo('int16').max
+                y = y.astype('int8')
+
+    if dtype in ['float32', 'float64']:
+        org_dtype = y.dtype
+        y = y.astype(dtype) / np.iinfo(org_dtype).max
+    return y
\ No newline at end of file
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -27,7 +27,7 @@ from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.compliance.librosa import melspectrogram
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification

--- a/paddlespeech/cls/exps/panns/deploy/predict.py
+++ b/paddlespeech/cls/exps/panns/deploy/predict.py
@@ -18,7 +18,7 @@ import numpy as np
 from paddle import inference
 from scipy.special import softmax

-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.datasets import ESC50
 from paddlespeech.audio.features import melspectrogram


--- a/paddlespeech/server/engine/vector/python/vector_engine.py
+++ b/paddlespeech/server/engine/vector/python/vector_engine.py
@@ -17,7 +17,7 @@ from collections import OrderedDict
 import numpy as np
 import paddle

-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.compliance.librosa import melspectrogram
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.vector.infer import VectorExecutor

--- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
@@ -18,7 +18,7 @@ import time
 import paddle
 from yacs.config import CfgNode

-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.batch import feature_normalize

--- a/tools/setup_helpers/extension.py
+++ b/tools/setup_helpers/extension.py
@@ -90,7 +90,7 @@ class CMakeBuild(build_ext):
            f"-DCMAKE_INSTALL_PREFIX={extdir}",
            "-DCMAKE_VERBOSE_MAKEFILE=ON",
            f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}",
-            f"-DPYTHON_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}",
+            f"-DPython_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}",
            f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
            f"-DBUILD_MAD:BOOL={'ON' if _BUILD_MAD else 'OFF'}",
            # f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}",