merge audio

38c55e44 · Yang Zhou · cffe555c · a2e8b76a · 38c55e44 · 38c55e44
9 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -57,7 +57,7 @@ include(openblas)
 # packages
 find_package(Python3 COMPONENTS Interpreter Development)
-find_package(pybind11 CONFIG)
+find_package(pybind11 CONFIG REQUIRED)
 # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g")

--- a/cmake/summary.cmake
+++ b/cmake/summary.cmake
@@ -37,5 +37,9 @@ function (onnx_print_configuration_summary)
  message(STATUS "    Python executable     : ${Python_EXECUTABLE}")
  message(STATUS "    Python includes       : ${Python_INCLUDE_DIR}")
  message(STATUS "    Python libraries      : ${Python_LIBRARY}")
+  message(STATUS "  PYBIND11                  : ${pybind11_FOUND}")
+  message(STATUS "    Pybind11 version        : ${pybind11_VERSION}")
+  message(STATUS "    Pybind11 include        : ${pybind11_INCLUDE_DIR}")
+  message(STATUS "    Pybind11 includes       : ${pybind11_INCLUDE_DIRS}")
+  message(STATUS "    Pybind11 libraries      : ${pybind11_LIBRARIES}")
 endfunction()
\ No newline at end of file
--- a/paddlespeech/audio/_internal/module_utils.py
+++ b/paddlespeech/audio/_internal/module_utils.py
@@ -5,6 +5,7 @@ from typing import Optional
 #code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py
 def is_module_available(*modules: str) -> bool:
    r"""Returns if a top-level module with :attr:`name` exists *without**
    importing it. This is generally safer than try-catch block around a

--- a/paddlespeech/audio/backends/no_backend.py
+++ b/paddlespeech/audio/backends/no_backend.py
@@ -8,21 +8,25 @@ from paddle import Tensor
 #code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
 def load(
-    filepath: Union[str, Path],
+        filepath: Union[str, Path],
-    out: Optional[Tensor] = None,
+        out: Optional[Tensor]=None,
-    normalization: Union[bool, float, Callable] = True,
+        normalization: Union[bool, float, Callable]=True,
-    channels_first: bool = True,
+        channels_first: bool=True,
-    num_frames: int = 0,
+        num_frames: int=0,
-    offset: int = 0,
+        offset: int=0,
-    filetype: Optional[str] = None,
+        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
-) -> Tuple[Tensor, int]:
    raise RuntimeError("No audio I/O backend is available.")
-def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
+def save(filepath: str,
+         src: Tensor,
+         sample_rate: int,
+         precision: int=16,
+         channels_first: bool=True) -> None:
    raise RuntimeError("No audio I/O backend is available.")
 def info(filepath: str) -> None:
    raise RuntimeError("No audio I/O backend is available.")
\ No newline at end of file
--- a/paddlespeech/audio/backends/sox_io_backend.py
+++ b/paddlespeech/audio/backends/sox_io_backend.py
 from pathlib import Path
 from typing import Callable
 from typing import Optional
@@ -43,17 +42,20 @@ _fallback_load = _fail_load
 _fallback_load_filebj = _fail_load_fileobj
 def load(
-    filepath: Union[str, Path],
+        filepath: Union[str, Path],
-    out: Optional[Tensor] = None,
+        out: Optional[Tensor]=None,
-    normalization: Union[bool, float, Callable] = True,
+        normalization: Union[bool, float, Callable]=True,
-    channels_first: bool = True,
+        channels_first: bool=True,
-    num_frames: int = 0,
+        num_frames: int=0,
-    offset: int = 0,
+        offset: int=0,
-    filetype: Optional[str] = None,
+        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
-) -> Tuple[Tensor, int]:
    raise RuntimeError("No audio I/O backend is available.")
-def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
+def save(filepath: str, 
+         src: Tensor, 
+         sample_rate: int, 
+         precision: int = 16, 
+         channels_first: bool = True) -> None:
    raise RuntimeError("No audio I/O backend is available.")
 @_mod_utils.requires_sox()

--- a/paddlespeech/audio/backends/utils.py
+++ b/paddlespeech/audio/backends/utils.py
@@ -40,7 +40,8 @@ def set_audio_backend(backend: Optional[str]):
            of the system. If ``None`` is provided the  current backend is unassigned.
    """
    if backend is not None and backend not in list_audio_backends():
-        raise RuntimeError(f'Backend "{backend}" is not one of ' f"available backends: {list_audio_backends()}.")
+        raise RuntimeError(f'Backend "{backend}" is not one of '
+                           f"available backends: {list_audio_backends()}.")
    if backend is None:
        module = no_backend
@@ -76,6 +77,7 @@ def _init_audio_backend():
        warnings.warn("No audio backend is available.")
        set_audio_backend(None)
 def get_audio_backend() -> Optional[str]:
    """Get the name of the current backend
@@ -88,4 +90,4 @@ def get_audio_backend() -> Optional[str]:
        return "sox_io"
    if paddlespeech.audio.load == soundfile_backend.load:
        return "soundfile"
    raise ValueError("Unknown backend.")
\ No newline at end of file
--- a/paddlespeech/audio/kaldi/kaldi.py
+++ b/paddlespeech/audio/kaldi/kaldi.py
@@ -27,37 +27,38 @@ __all__ = [
 @module_utils.requires_kaldi()
-def fbank(wav,
+def fbank(
-          samp_freq: int=16000,
+        wav,
-          frame_shift_ms: float=10.0,
+        samp_freq: int=16000,
-          frame_length_ms: float=25.0,
+        frame_shift_ms: float=10.0,
-          dither: float=0.0,
+        frame_length_ms: float=25.0,
-          preemph_coeff: float=0.97,
+        dither: float=0.0,
-          remove_dc_offset: bool=True,
+        preemph_coeff: float=0.97,
-          window_type: str='povey',
+        remove_dc_offset: bool=True,
-          round_to_power_of_two: bool=True,
+        window_type: str='povey',
-          blackman_coeff: float=0.42,
+        round_to_power_of_two: bool=True,
-          snip_edges: bool=True,
+        blackman_coeff: float=0.42,
-          allow_downsample: bool=False,
+        snip_edges: bool=True,
-          allow_upsample: bool=False,
+        allow_downsample: bool=False,
-          max_feature_vectors: int=-1,
+        allow_upsample: bool=False,
-          num_bins: int=23,
+        max_feature_vectors: int=-1,
-          low_freq: float=20,
+        num_bins: int=23,
-          high_freq: float=0,
+        low_freq: float=20,
-          vtln_low: float=100,
+        high_freq: float=0,
-          vtln_high: float=-500,
+        vtln_low: float=100,
-          debug_mel: bool=False,
+        vtln_high: float=-500,
-          htk_mode: bool=False,
+        debug_mel: bool=False,
-          use_energy: bool=False, # fbank opts
+        htk_mode: bool=False,
-          energy_floor: float=0.0,
+        use_energy: bool=False,  # fbank opts
-          raw_energy: bool=True,
+        energy_floor: float=0.0,
-          htk_compat: bool=False,
+        raw_energy: bool=True,
-          use_log_fbank: bool=True,
+        htk_compat: bool=False,
-          use_power: bool=True):
+        use_log_fbank: bool=True,
+        use_power: bool=True):
    frame_opts = FrameExtractionOptions()
    mel_opts = MelBanksOptions()
    fbank_opts = FbankOptions()
-    frame_opts.samp_freq = samp_freq  
+    frame_opts.samp_freq = samp_freq
    frame_opts.frame_shift_ms = frame_shift_ms
    frame_opts.frame_length_ms = frame_length_ms
    frame_opts.dither = dither
@@ -71,7 +72,7 @@ def fbank(wav,
    frame_opts.allow_upsample = allow_upsample
    frame_opts.max_feature_vectors = max_feature_vectors
-    mel_opts.num_bins = num_bins  
+    mel_opts.num_bins = num_bins
    mel_opts.low_freq = low_freq
    mel_opts.high_freq = high_freq
    mel_opts.vtln_low = vtln_low
@@ -79,7 +80,7 @@ def fbank(wav,
    mel_opts.debug_mel = debug_mel
    mel_opts.htk_mode = htk_mode
-    fbank_opts.use_energy = use_energy  
+    fbank_opts.use_energy = use_energy
    fbank_opts.energy_floor = energy_floor
    fbank_opts.raw_energy = raw_energy
    fbank_opts.htk_compat = htk_compat
@@ -88,6 +89,7 @@ def fbank(wav,
    feat = ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
    return feat
 @module_utils.requires_kaldi()
 def pitch(wav,
          samp_freq: int=16000,
@@ -114,7 +116,7 @@ def pitch(wav,
    pitch_opts.samp_freq = samp_freq
    pitch_opts.frame_shift_ms = frame_shift_ms
    pitch_opts.frame_length_ms = frame_length_ms
-    pitch_opts.preemph_coeff = preemph_coeff 
+    pitch_opts.preemph_coeff = preemph_coeff
    pitch_opts.min_f0 = min_f0
    pitch_opts.max_f0 = max_f0
    pitch_opts.soft_min_f0 = soft_min_f0

--- a/paddlespeech/audio/src/CMakeLists.txt
+++ b/paddlespeech/audio/src/CMakeLists.txt
@@ -105,7 +105,7 @@ function(define_extension name sources include_dirs libraries definitions)
  add_library(${name} SHARED ${sources})
  target_compile_definitions(${name} PRIVATE "${definitions}")
  target_include_directories(
-    ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${include_dirs})
+    ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${pybind11_INCLUDE_DIR} ${include_dirs})
  target_link_libraries(
    ${name}
    ${libraries}

--- a/paddlespeech/audio/src/pybind/kaldi/feature_common.h
+++ b/paddlespeech/audio/src/pybind/kaldi/feature_common.h
@@ -14,8 +14,8 @@
 #pragma once
-#include <pybind11/numpy.h>
+#include "pybind11/pybind11.h"
-#include <pybind11/pybind11.h>
+#include "pybind11/numpy.h"
 #include "feat/feature-window.h"
 namespace paddleaudio {