add sox, kaldi feature to paddleaudio && make it work on mac, windows (#2663)

* add paddleaudio sox && kaldifeature * rm redundant files that compile audio * mv audio test into paddleaudio * fix test bug * add paddleaudio test * rm redundant comment * disable sox on Windows * rm io in __init__ * fix Windows cblas compile error

add sox, kaldi feature to paddleaudio && make it work on mac, windows (#2663)
* add paddleaudio sox && kaldifeature * rm redundant files that compile audio * mv audio test into paddleaudio * fix test bug * add paddleaudio test * rm redundant comment * disable sox on Windows * rm io in __init__ * fix Windows cblas compile error
0ffcd477 · YangZhou · GitHub · 62fe3d44 · 0ffcd477 · 0ffcd477
81 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,10 @@ paddlespeech/audio/_paddleaudio.so
 paddlespeech/audio/lib/libpaddleaudio.so
 paddlespeech/version.py
+audio/dist/
+audio/fc_patch/
+audio/paddleaudio/version.py
 docs/build/
 docs/topic/ctc/warp-ctc/

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,16 +13,14 @@ if(NOT CMAKE_VERSION VERSION_LESS 3.15.0)
  cmake_policy(SET CMP0092 NEW)
 endif()
+project(paddleaudio)
-project(paddlespeech)
 # check and set CMAKE_CXX_STANDARD
 string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
 if(env_cxx_standard GREATER -1)
  message(
      WARNING "C++ standard version definition detected in environment variable."
-      "paddlespeech requires -std=c++14. Please remove -std=c++ settings in your environment.")
+      "paddleaudio requires -std=c++14. Please remove -std=c++ settings in your environment.")
 endif()
@@ -33,8 +31,6 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_VERBOSE_MAKEFILE ON)
 # Options
 option(BUILD_SOX "Build libsox statically" ON)
 option(BUILD_MAD "Enable libmad" ON)
@@ -50,18 +46,21 @@ set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake;${PROJEC
 set(FETCHCONTENT_QUIET off)
 get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
 set(FETCHCONTENT_BASE_DIR ${fc_patch})
+set(THIRD_PARTY_PATH ${fc_patch})
 include(openblas)
+if (NOT PY_VERSION)
+  set(PY_VERSION 3.7)
+endif()
+set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 include(pybind)
 # packages
 find_package(Python3 COMPONENTS Interpreter Development)
-#find_package(pybind11 CONFIG REQUIRED)
 # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g")
-add_subdirectory(paddlespeech/audio)
+add_subdirectory(paddleaudio)
 # Summary
 include(cmake/summary.cmake)

--- a/cmake/FindGFortranLibs.cmake
+++ b/cmake/FindGFortranLibs.cmake
--- a/audio/cmake/external/openblas.cmake
+++ b/audio/cmake/external/openblas.cmake
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Fetches and builds OpenBLAS as an external project and exposes it to the
+# rest of the build. Inputs: THIRD_PARTY_PATH (root for checkout/install) and,
+# optionally, NPROC (parallel job count for the make-based build).
+# Outputs: CBLAS_LIBRARIES / CBLAS_INC_DIR cache entries, plus the `openblas`
+# target (and `libopenblas` on non-Windows platforms).
+include(ExternalProject)
+set(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas)
+set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
+set(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
+set(CBLAS_TAG v0.3.10)
+if(NOT WIN32)
+  # NOTE(review): CACHE ... FORCE overwrites any value the user supplied for
+  # these entries on every configure; kept as-is for compatibility.
+  set(CBLAS_LIBRARIES
+      "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "openblas library." FORCE)
+  set(CBLAS_INC_DIR
+      "${CBLAS_INSTALL_DIR}/include"
+      CACHE PATH "openblas include directory." FORCE)
+  # NOTE(review): OPENBLAS_CC is only forwarded to make through COMMON_ARGS,
+  # which is populated on Apple only; on other platforms the value below is
+  # computed but never used -- confirm this is intended.
+  set(OPENBLAS_CC
+      "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+  if(APPLE)
+    set(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+  endif()
+  set(OPTIONAL_ARGS "")
+  set(COMMON_ARGS "")
+  if(APPLE)
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
+      set(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
+    endif()
+    set(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1)
+  endif()
+  # A bare `make -j` (no number) places no limit on parallel jobs. When the
+  # caller did not provide NPROC, fall back to the detected processor count.
+  if(DEFINED NPROC AND NOT "${NPROC}" STREQUAL "")
+    set(OPENBLAS_BUILD_JOBS ${NPROC})
+  else()
+    include(ProcessorCount)
+    ProcessorCount(OPENBLAS_BUILD_JOBS)
+    if(OPENBLAS_BUILD_JOBS EQUAL 0)
+      # ProcessorCount reports 0 when it cannot determine the count.
+      set(OPENBLAS_BUILD_JOBS 1)
+    endif()
+  endif()
+  ExternalProject_Add(
+    OPENBLAS
+    GIT_REPOSITORY ${CBLAS_REPOSITORY}
+    GIT_TAG ${CBLAS_TAG}
+    GIT_SHALLOW YES
+    PREFIX ${CBLAS_PREFIX_DIR}
+    INSTALL_DIR ${CBLAS_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND make -j${OPENBLAS_BUILD_JOBS} ${COMMON_ARGS} ${OPTIONAL_ARGS}
+    INSTALL_COMMAND make install PREFIX=<INSTALL_DIR>
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
+  # Read the install location back from the external project and wrap the
+  # static archive in an imported target that depends on the external build.
+  ExternalProject_Get_Property(OPENBLAS INSTALL_DIR)
+  set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR})
+  add_library(openblas STATIC IMPORTED)
+  add_dependencies(openblas OPENBLAS)
+  set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran)
+  set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a)
+  # Directory-scoped: every target defined after this include sees these paths.
+  link_directories(${OpenBLAS_INSTALL_PREFIX}/lib)
+  include_directories(${OpenBLAS_INSTALL_PREFIX}/include)
+  set(OPENBLAS_LIBRARIES
+      ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a
+  )
+  # Interface target carrying the include directory and the archive to link.
+  add_library(libopenblas INTERFACE)
+  add_dependencies(libopenblas openblas)
+  target_include_directories(libopenblas INTERFACE ${OpenBLAS_INSTALL_PREFIX}/include/openblas)
+  target_link_libraries(libopenblas INTERFACE ${OPENBLAS_LIBRARIES})
+else()
+  # Windows: OpenBLAS is configured through its CMake build with
+  # clang-cl (C/C++) and flang (Fortran) instead of the make-based build.
+  set(CBLAS_LIBRARIES
+      "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "openblas library." FORCE)
+  set(CBLAS_INC_DIR
+      "${CBLAS_INSTALL_DIR}/include/openblas"
+      CACHE PATH "openblas include directory." FORCE)
+  ExternalProject_Add(
+    extern_openblas
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY ${CBLAS_REPOSITORY}
+    GIT_TAG ${CBLAS_TAG}
+    PREFIX ${CBLAS_PREFIX_DIR}
+    INSTALL_DIR ${CBLAS_INSTALL_DIR}
+    BUILD_IN_SOURCE 0
+    UPDATE_COMMAND ""
+    CMAKE_ARGS -DCMAKE_C_COMPILER=clang-cl
+               -DCMAKE_CXX_COMPILER=clang-cl
+               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+               -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
+               -DCMAKE_BUILD_TYPE=Release #${THIRD_PARTY_BUILD_TYPE}
+               -DCMAKE_MT=mt
+               -DUSE_THREAD=OFF
+               -DBUILD_WITHOUT_LAPACK=NO
+               -DCMAKE_Fortran_COMPILER=flang
+               -DNOFORTRAN=0
+               -DDYNAMIC_ARCH=ON
+               #${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS
+      -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
+      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+      -DCMAKE_BUILD_TYPE:STRING=Release #${THIRD_PARTY_BUILD_TYPE}
+    # ninja need to know where openblas.lib comes from
+    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
+  set(OPENBLAS_SHARED_LIB
+      ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
+  # `openblas` here only orders the build after extern_openblas; the include
+  # path and the library are applied directory-wide by the two calls below.
+  add_library(openblas INTERFACE)
+  add_dependencies(openblas extern_openblas)
+  include_directories(${CBLAS_INC_DIR})
+  link_libraries(${CBLAS_LIBRARIES})
+endif()
--- a/cmake/external/pybind.cmake
+++ b/cmake/external/pybind.cmake
--- a/cmake/summary.cmake
+++ b/cmake/summary.cmake
--- a/audio/paddleaudio/CMakeLists.txt
+++ b/audio/paddleaudio/CMakeLists.txt
+add_subdirectory(third_party)
+add_subdirectory(src)
--- a/audio/paddleaudio/__init__.py
+++ b/audio/paddleaudio/__init__.py
@@ -11,12 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from . import _extension
 from . import backends
 from . import compliance
 from . import datasets
 from . import features
 from . import functional
-from . import io
 from . import metric
 from . import sox_effects
 from . import utils
--- a/audio/paddleaudio/_extension.py
+++ b/audio/paddleaudio/_extension.py
+import os
+import warnings
+from pathlib import Path
+from ._internal import module_utils as _mod_utils  # noqa: F401
+import contextlib
+import ctypes
+import os
+import sys
+import types
+# Query `hasattr` only once.
+_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
+                                                               'setdlopenflags')
+@contextlib.contextmanager
+def dl_open_guard():
+    """Context manager that sets the RTLD_GLOBAL dynamic linker flag while a
+    shared library is being opened.
+    See https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html
+    When the interpreter does not expose ``sys.getdlopenflags`` and
+    ``sys.setdlopenflags`` this is a no-op. The previous flags are restored
+    when the ``with`` block exits, including when it exits with an exception.
+    """
+    if not _SET_GLOBAL_FLAGS:
+        yield
+        return
+    old_flags = sys.getdlopenflags()
+    sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
+    try:
+        yield
+    finally:
+        # Restore even if loading the library raised, so a failed load does
+        # not leave RTLD_GLOBAL set for every later import.
+        sys.setdlopenflags(old_flags)
+def resolve_library_path(path: str) -> str:
+    """Return ``path`` as resolved by ``os.path.realpath``."""
+    resolved = os.path.realpath(path)
+    return resolved
+class _Ops(types.ModuleType):
+    """Module object named ``paddleaudio.ops`` that loads shared libraries
+    and remembers which ones it has loaded."""
+    #__file__ = '_ops.py'
+    def __init__(self):
+        super().__init__('paddleaudio.ops')
+        # Resolved paths of every library loaded through `load_library`.
+        self.loaded_libraries = set()
+    def load_library(self, path):
+        """
+        Load the shared library at ``path`` into the current process.
+        The library is opened with ``ctypes.CDLL`` inside ``dl_open_guard``,
+        which runs the library's static (global) initialization code. Typical
+        use is ``paddleaudio.ops.load_library('path/to/libcustom.so')``.
+        Once loaded, the resolved path is added to
+        ``paddleaudio.ops.loaded_libraries``, a set that may be inspected
+        for the paths of all libraries loaded using this function.
+        Args:
+            path (str): A path to a shared library to load.
+        """
+        resolved = resolve_library_path(path)
+        with dl_open_guard():
+            # https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
+            ctypes.CDLL(resolved)
+        self.loaded_libraries.add(resolved)
+_LIB_DIR = Path(__file__).parent / "lib"
+def _get_lib_path(lib: str):
+    """Return the path of library ``lib`` inside the package's ``lib``
+    directory, with the ``pyd`` suffix when ``os.name`` is ``"nt"`` and
+    ``so`` otherwise."""
+    if os.name == "nt":
+        extension = "pyd"
+    else:
+        extension = "so"
+    return _LIB_DIR / "{}.{}".format(lib, extension)
+def _load_lib(lib: str) -> bool:
+    """Load an extension library by name.
+    Note:
+        In case `paddleaudio` is deployed in `pex` format, the library file
+        is not in a standard location.
+        In this case, we expect that `libpaddleaudio` is available somewhere
+        in the search path of the dynamic loading mechanism, so that importing
+        `_paddleaudio` will have the library loader find and load `libpaddleaudio`.
+        This is the reason why this function does not raise an error when the
+        library file is not found.
+    Returns:
+        bool:
+            True if the library file is found AND the library loaded without failure.
+            False (after emitting a warning) if the library file is not found.
+    Raises:
+        Exception:
+            If the library file is found but loading it fails, the exception
+            raised by the underlying `ctypes.CDLL` call (typically `OSError`
+            for a missing dynamic dependency) is passed through as-is.
+            This behavior was chosen because the expected failure case is not
+            recoverable: if a dependency is missing, users have to install it.
+    """
+    lib_file = _get_lib_path(lib)
+    if lib_file.exists():
+        ops.load_library(lib_file)
+        return True
+    warnings.warn("lib path is not exists:" + str(lib_file))
+    return False
+_FFMPEG_INITIALIZED = False
+def _init_ffmpeg():
+    """Initialize the optional FFmpeg integration once per process.
+    Subsequent calls return immediately after the first success.
+    NOTE(review): assumes the `_paddleaudio` extension exports
+    `is_ffmpeg_available`, `ffmpeg_init`, `ffmpeg_get_log_level` and
+    `ffmpeg_set_log_level`, and that a `_paddleaudio_ffmpeg` module exists --
+    confirm against the C++ bindings.
+    Raises:
+        RuntimeError: If the extension reports that FFmpeg is not available.
+        ImportError: If loading the FFmpeg library raises ``OSError``.
+    """
+    global _FFMPEG_INITIALIZED
+    if _FFMPEG_INITIALIZED:
+        return
+    # Imported locally: this module never binds the name `paddleaudio` at
+    # module level, so referring to it directly raised NameError.
+    from paddleaudio import _paddleaudio
+    if not _paddleaudio.is_ffmpeg_available():
+        raise RuntimeError(
+            "paddleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddleaudio."
+        )
+    try:
+        _load_lib("libpaddleaudio_ffmpeg")
+    except OSError as err:
+        raise ImportError(
+            "FFmpeg libraries are not found. Please install FFmpeg.") from err
+    import paddleaudio._paddleaudio_ffmpeg  # noqa
+    _paddleaudio.ffmpeg_init()
+    # Lower the FFmpeg log level to 8 when it is currently higher.
+    if _paddleaudio.ffmpeg_get_log_level() > 8:
+        _paddleaudio.ffmpeg_set_log_level(8)
+    _FFMPEG_INITIALIZED = True
+def _init_extension():
+    """Load the C++ extension, if it is available, when this module is imported.
+    Emits a warning and returns when `paddleaudio._paddleaudio` is not
+    available. Otherwise loads `libpaddleaudio`, imports the binding module,
+    and then attempts FFmpeg initialization, ignoring any failure of that step.
+    """
+    if not _mod_utils.is_module_available("paddleaudio._paddleaudio"):
+        warnings.warn("paddleaudio C++ extension is not available.")
+        return
+    _load_lib("libpaddleaudio")
+    # This import is for initializing the methods registered via PyBind11
+    # This has to happen after the base library is loaded
+    from paddleaudio import _paddleaudio  # noqa
+    # Because this part is executed as part of `import paddleaudio`, we ignore the
+    # initialization failure.
+    # If the FFmpeg integration is not properly initialized, then detailed error
+    # will be raised when client code attempts to import the dedicated feature.
+    try:
+        _init_ffmpeg()
+    except Exception:
+        pass
+# `ops` must exist before `_init_extension()` runs: `_load_lib` calls
+# `ops.load_library`.
+ops = _Ops()
+_init_extension()
--- a/audio/paddleaudio/_internal/module_utils.py
+++ b/audio/paddleaudio/_internal/module_utils.py
@@ -2,8 +2,9 @@ import importlib.util
 import warnings
 from functools import wraps
 from typing import Optional
+import platform
-#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py
+#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py with modification.
 def is_module_available(*modules: str) -> bool:
@@ -127,6 +128,8 @@ def requires_soundfile():
 def is_sox_available():
+    if platform.system() == "Windows": # not support sox in windows
+        return False
    return is_module_available("paddleaudio._paddleaudio")

--- a/audio/paddleaudio/functional/window.py
+++ b/audio/paddleaudio/functional/window.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,127 +18,156 @@ from typing import Union
 import paddle
 from paddle import Tensor
-__all__ = [
-    'get_window',
-]
+class WindowFunctionRegister(object):
+    """Registry that maps a function's ``__name__`` to the function itself."""
+    def __init__(self):
+        self._functions_dict = {}
+    def register(self):
+        """Return a decorator that records the decorated function under its
+        ``__name__`` and returns the function unchanged."""
+        def add_subfunction(func):
+            self._functions_dict[func.__name__] = func
+            return func
+        return add_subfunction
+    def get(self, name):
+        """Return the function registered as ``name``; raises ``KeyError``
+        when nothing is registered under that name."""
+        registry = self._functions_dict
+        return registry[name]
+window_function_register = WindowFunctionRegister()
+@window_function_register.register()
 def _cat(x: List[Tensor], data_type: str) -> Tensor:
    l = [paddle.to_tensor(_, data_type) for _ in x]
    return paddle.concat(l)
+@window_function_register.register()
 def _acosh(x: Union[Tensor, float]) -> Tensor:
    if isinstance(x, float):
        return math.log(x + math.sqrt(x**2 - 1))
    return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
+@window_function_register.register()
 def _extend(M: int, sym: bool) -> bool:
-    """Extend window by 1 sample if needed for DFT-even symmetry. """
+    """Extend window by 1 sample if needed for DFT-even symmetry."""
    if not sym:
        return M + 1, True
    else:
        return M, False
+@window_function_register.register()
 def _len_guards(M: int) -> bool:
-    """Handle small or incorrect window lengths. """
+    """Handle small or incorrect window lengths."""
    if int(M) != M or M < 0:
        raise ValueError('Window length M must be a non-negative integer')
    return M <= 1
+@window_function_register.register()
 def _truncate(w: Tensor, needed: bool) -> Tensor:
-    """Truncate window by 1 sample if needed for DFT-even symmetry. """
+    """Truncate window by 1 sample if needed for DFT-even symmetry."""
    if needed:
        return w[:-1]
    else:
        return w
-def _general_gaussian(M: int, p, sig, sym: bool=True,
+@window_function_register.register()
-                      dtype: str='float64') -> Tensor:
+def _general_gaussian(
+    M: int, p, sig, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a window with a generalized Gaussian shape.
    This function is consistent with scipy.signal.windows.general_gaussian().
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
-    w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p))
+    w = paddle.exp(-0.5 * paddle.abs(n / sig) ** (2 * p))
    return _truncate(w, needs_trunc)
-def _general_cosine(M: int, a: float, sym: bool=True,
+@window_function_register.register()
-                    dtype: str='float64') -> Tensor:
+def _general_cosine(
+    M: int, a: float, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a generic weighted sum of cosine terms window.
    This function is consistent with scipy.signal.windows.general_cosine().
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
-    w = paddle.zeros((M, ), dtype=dtype)
+    w = paddle.zeros((M,), dtype=dtype)
    for k in range(len(a)):
        w += a[k] * paddle.cos(k * fac)
    return _truncate(w, needs_trunc)
-def _general_hamming(M: int, alpha: float, sym: bool=True,
+@window_function_register.register()
-                     dtype: str='float64') -> Tensor:
+def _general_hamming(
+    M: int, alpha: float, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a generalized Hamming window.
    This function is consistent with scipy.signal.windows.general_hamming()
    """
-    return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
+    return _general_cosine(M, [alpha, 1.0 - alpha], sym, dtype=dtype)
-def _taylor(M: int,
+@window_function_register.register()
-            nbar=4,
+def _taylor(
-            sll=30,
+    M: int, nbar=4, sll=30, norm=True, sym: bool = True, dtype: str = 'float64'
-            norm=True,
+) -> Tensor:
-            sym: bool=True,
-            dtype: str='float64') -> Tensor:
    """Compute a Taylor window.
    The Taylor window taper function approximates the Dolph-Chebyshev window's
    constant sidelobe level for a parameterized number of near-in sidelobes.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    # Original text uses a negative sidelobe level parameter and then negates
    # it in the calculation of B. To keep consistent with other methods we
    # assume the sidelobe level parameter to be positive.
-    B = 10**(sll / 20)
+    B = 10 ** (sll / 20)
    A = _acosh(B) / math.pi
-    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
+    s2 = nbar**2 / (A**2 + (nbar - 0.5) ** 2)
    ma = paddle.arange(1, nbar, dtype=dtype)
-    Fm = paddle.empty((nbar - 1, ), dtype=dtype)
+    Fm = paddle.empty((nbar - 1,), dtype=dtype)
    signs = paddle.empty_like(ma)
    signs[::2] = 1
    signs[1::2] = -1
    m2 = ma * ma
    for mi in range(len(ma)):
-        numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
+        numer = signs[mi] * paddle.prod(
-                                                           ))
+            1 - m2[mi] / s2 / (A**2 + (ma - 0.5) ** 2)
+        )
        if mi == 0:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
        elif mi == len(ma) - 1:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
        else:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
+            denom = (
-                mi] / m2[mi + 1:])
+                2
+                * paddle.prod(1 - m2[mi] / m2[:mi])
+                * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
+            )
        Fm[mi] = numer / denom
    def W(n):
        return 1 + 2 * paddle.matmul(
            Fm.unsqueeze(0),
-            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
+            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2.0 + 0.5) / M),
+        )
    w = W(paddle.arange(0, M, dtype=dtype))
@@ -150,7 +179,8 @@ def _taylor(M: int,
    return _truncate(w, needs_trunc)
-def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _hamming(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hamming window.
    The Hamming window is a taper formed by using a raised cosine with
    non-zero endpoints, optimized to minimize the nearest side lobe.
@@ -158,7 +188,8 @@ def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_hamming(M, 0.54, sym, dtype=dtype)
-def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _hann(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hann window.
    The Hann window is a taper formed by using a raised cosine or sine-squared
    with ends that touch zero.
@@ -166,15 +197,18 @@ def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_hamming(M, 0.5, sym, dtype=dtype)
-def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _tukey(
+    M: int, alpha=0.5, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a Tukey window.
    The Tukey window is also known as a tapered cosine window.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    if alpha <= 0:
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    elif alpha >= 1.0:
        return hann(M, sym=sym)
@@ -182,53 +216,48 @@ def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
    n = paddle.arange(0, M, dtype=dtype)
    width = int(alpha * (M - 1) / 2.0)
-    n1 = n[0:width + 1]
+    n1 = n[0 : width + 1]
-    n2 = n[width + 1:M - width - 1]
+    n2 = n[width + 1 : M - width - 1]
-    n3 = n[M - width - 1:]
+    n3 = n[M - width - 1 :]
    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
    w2 = paddle.ones(n2.shape, dtype=dtype)
-    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
+    w3 = 0.5 * (
-                                          (M - 1))))
+        1
+        + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / (M - 1)))
+    )
    w = paddle.concat([w1, w2, w3])
    return _truncate(w, needs_trunc)
-def _kaiser(M: int, beta: float, sym: bool=True,
+@window_function_register.register()
-            dtype: str='float64') -> Tensor:
+def _gaussian(
-    """Compute a Kaiser window.
+    M: int, std: float, sym: bool = True, dtype: str = 'float64'
-    The Kaiser window is a taper formed by using a Bessel function.
+) -> Tensor:
-    """
-    raise NotImplementedError()
-def _gaussian(M: int, std: float, sym: bool=True,
-              dtype: str='float64') -> Tensor:
    """Compute a Gaussian window.
    The Gaussian widows has a Gaussian shape defined by the standard deviation(std).
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
    sig2 = 2 * std * std
-    w = paddle.exp(-n**2 / sig2)
+    w = paddle.exp(-(n**2) / sig2)
    return _truncate(w, needs_trunc)
-def _exponential(M: int,
+@window_function_register.register()
-                 center=None,
+def _exponential(
-                 tau=1.,
+    M: int, center=None, tau=1.0, sym: bool = True, dtype: str = 'float64'
-                 sym: bool=True,
+) -> Tensor:
-                 dtype: str='float64') -> Tensor:
+    """Compute an exponential (or Poisson) window."""
-    """Compute an exponential (or Poisson) window. """
    if sym and center is not None:
        raise ValueError("If sym==True, center must be None.")
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    if center is None:
@@ -240,11 +269,11 @@ def _exponential(M: int,
    return _truncate(w, needs_trunc)
-def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
-    """Compute a triangular window.
+def _triang(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
-    """
+    """Compute a triangular window."""
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
@@ -258,23 +287,26 @@ def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _truncate(w, needs_trunc)
-def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _bohman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Bohman window.
    The Bohman window is the autocorrelation of a cosine window.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
    w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
-        math.pi * fac)
+        math.pi * fac
+    )
    w = _cat([0, w, 0], dtype)
    return _truncate(w, needs_trunc)
-def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _blackman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Blackman window.
    The Blackman window is a taper formed by using the first three terms of
    a summation of cosines. It was designed to have close to the minimal
@@ -284,31 +316,44 @@ def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
-def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
-    """Compute a window with a simple cosine shape.
+def _cosine(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
-    """
+    """Compute a window with a simple cosine shape."""
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
-    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5))
+    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + 0.5))
    return _truncate(w, needs_trunc)
-def get_window(window: Union[str, Tuple[str, float]],
+def get_window(
+    window: Union[str, Tuple[str, float]],
    win_length: int,
-               fftbins: bool=True,
+    fftbins: bool = True,
-               dtype: str='float64') -> Tensor:
+    dtype: str = 'float64',
+) -> Tensor:
    """Return a window of a given length and type.
    Args:
-        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
+        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
        win_length (int): Number of samples.
        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
        dtype (str, optional): The data type of the return window. Defaults to 'float64'.
    Returns:
        Tensor: The window represented as a tensor.
+    Examples:
+        .. code-block:: python
+            import paddle
+            n_fft = 512
+            cosine_window = paddle.audio.functional.get_window('cosine', n_fft)
+            std = 7
+            gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft)
    """
    sym = not fftbins
@@ -319,19 +364,22 @@ def get_window(window: Union[str, Tuple[str, float]],
            args = window[1:]
    elif isinstance(window, str):
        if window in ['gaussian', 'exponential']:
-            raise ValueError("The '" + window + "' window needs one or "
+            raise ValueError(
-                             "more parameters -- pass a tuple.")
+                "The '" + window + "' window needs one or "
+                "more parameters -- pass a tuple."
+            )
        else:
            winstr = window
    else:
-        raise ValueError("%s as window type is not supported." %
+        raise ValueError(
-                         str(type(window)))
+            "%s as window type is not supported." % str(type(window))
+        )
    try:
-        winfunc = eval('_' + winstr)
+        winfunc = window_function_register.get('_' + winstr)
    except KeyError as e:
        raise ValueError("Unknown window type.") from e
-    params = (win_length, ) + args
+    params = (win_length,) + args
    kwargs = {'sym': sym}
    return winfunc(*params, dtype=dtype, **kwargs)
--- a/audio/paddleaudio/io/__init__.py
+++ b/audio/paddleaudio/io/__init__.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- a/tests/unit/audio/features/__init__.py
+++ b/tests/unit/audio/features/__init__.py
@@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .kaldi import fbank
+from .kaldi import pitch
--- a/audio/paddleaudio/kaldi/kaldi.py
+++ b/audio/paddleaudio/kaldi/kaldi.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddleaudio
+from paddleaudio._internal import module_utils 
+__all__ = [
+    'fbank',
+    'pitch',
+]
+@module_utils.requires_kaldi()
+def fbank(
+        wav,
+        samp_freq: int=16000,
+        frame_shift_ms: float=10.0,
+        frame_length_ms: float=25.0,
+        dither: float=0.0,
+        preemph_coeff: float=0.97,
+        remove_dc_offset: bool=True,
+        window_type: str='povey',
+        round_to_power_of_two: bool=True,
+        blackman_coeff: float=0.42,
+        snip_edges: bool=True,
+        allow_downsample: bool=False,
+        allow_upsample: bool=False,
+        max_feature_vectors: int=-1,
+        num_bins: int=23,
+        low_freq: float=20,
+        high_freq: float=0,
+        vtln_low: float=100,
+        vtln_high: float=-500,
+        debug_mel: bool=False,
+        htk_mode: bool=False,
+        use_energy: bool=False,  # fbank opts
+        energy_floor: float=0.0,
+        raw_energy: bool=True,
+        htk_compat: bool=False,
+        use_log_fbank: bool=True,
+        use_power: bool=True):
+    frame_opts = paddleaudio._paddleaudio.FrameExtractionOptions()
+    mel_opts = paddleaudio._paddleaudio.MelBanksOptions()
+    fbank_opts = paddleaudio._paddleaudio.FbankOptions()
+    frame_opts.samp_freq = samp_freq
+    frame_opts.frame_shift_ms = frame_shift_ms
+    frame_opts.frame_length_ms = frame_length_ms
+    frame_opts.dither = dither
+    frame_opts.preemph_coeff = preemph_coeff
+    frame_opts.remove_dc_offset = remove_dc_offset
+    frame_opts.window_type = window_type
+    frame_opts.round_to_power_of_two = round_to_power_of_two
+    frame_opts.blackman_coeff = blackman_coeff
+    frame_opts.snip_edges = snip_edges
+    frame_opts.allow_downsample = allow_downsample
+    frame_opts.allow_upsample = allow_upsample
+    frame_opts.max_feature_vectors = max_feature_vectors
+    mel_opts.num_bins = num_bins
+    mel_opts.low_freq = low_freq
+    mel_opts.high_freq = high_freq
+    mel_opts.vtln_low = vtln_low
+    mel_opts.vtln_high = vtln_high
+    mel_opts.debug_mel = debug_mel
+    mel_opts.htk_mode = htk_mode
+    fbank_opts.use_energy = use_energy
+    fbank_opts.energy_floor = energy_floor
+    fbank_opts.raw_energy = raw_energy
+    fbank_opts.htk_compat = htk_compat
+    fbank_opts.use_log_fbank = use_log_fbank
+    fbank_opts.use_power = use_power
+    feat = paddleaudio._paddleaudio.ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
+    return feat
+@module_utils.requires_kaldi()
+def pitch(wav,
+          samp_freq: int=16000,
+          frame_shift_ms: float=10.0,
+          frame_length_ms: float=25.0,
+          preemph_coeff: float=0.0,
+          min_f0: int=50,
+          max_f0: int=400,
+          soft_min_f0: float=10.0,
+          penalty_factor: float=0.1,
+          lowpass_cutoff: int=1000,
+          resample_freq: int=4000,
+          delta_pitch: float=0.005,
+          nccf_ballast: int=7000,
+          lowpass_filter_width: int=1,
+          upsample_filter_width: int=5,
+          max_frames_latency: int=0,
+          frames_per_chunk: int=0,
+          simulate_first_pass_online: bool=False,
+          recompute_frame: int=500,
+          nccf_ballast_online: bool=False,
+          snip_edges: bool=True):
+    pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
+    pitch_opts.samp_freq = samp_freq
+    pitch_opts.frame_shift_ms = frame_shift_ms
+    pitch_opts.frame_length_ms = frame_length_ms
+    pitch_opts.preemph_coeff = preemph_coeff
+    pitch_opts.min_f0 = min_f0
+    pitch_opts.max_f0 = max_f0
+    pitch_opts.soft_min_f0 = soft_min_f0
+    pitch_opts.penalty_factor = penalty_factor
+    pitch_opts.lowpass_cutoff = lowpass_cutoff
+    pitch_opts.resample_freq = resample_freq
+    pitch_opts.delta_pitch = delta_pitch
+    pitch_opts.nccf_ballast = nccf_ballast
+    pitch_opts.lowpass_filter_width = lowpass_filter_width
+    pitch_opts.upsample_filter_width = upsample_filter_width
+    pitch_opts.max_frames_latency = max_frames_latency
+    pitch_opts.frames_per_chunk = frames_per_chunk
+    pitch_opts.simulate_first_pass_online = simulate_first_pass_online
+    pitch_opts.recompute_frame = recompute_frame
+    pitch_opts.nccf_ballast_online = nccf_ballast_online
+    pitch_opts.snip_edges = snip_edges
+    pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
+    return pitch
--- a/audio/paddleaudio/sox_effects/__init__.py
+++ b/audio/paddleaudio/sox_effects/__init__.py
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+from paddleaudio._internal import module_utils as _mod_utils
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
+from .sox_effects import (
-# you may not use this file except in compliance with the License.
+    apply_effects_file,
-# You may obtain a copy of the License at
+    apply_effects_tensor,
-#
+    effect_names,
-#     http://www.apache.org/licenses/LICENSE-2.0
+    init_sox_effects,
-#
+    shutdown_sox_effects,
-# Unless required by applicable law or agreed to in writing, software
+)
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
+if _mod_utils.is_sox_available():
-# limitations under the License.
+    import atexit
+    # Initialise the sox effects once at import time and register the matching
+    # shutdown so it runs when the interpreter exits.
+    init_sox_effects()
+    atexit.register(shutdown_sox_effects)
+__all__ = [
+    "init_sox_effects",
+    "shutdown_sox_effects",
+    "effect_names",
+    "apply_effects_tensor",
+    "apply_effects_file",
+]
--- a/audio/paddleaudio/sox_effects/sox_effects.py
+++ b/audio/paddleaudio/sox_effects/sox_effects.py
+import os
+from typing import List, Optional, Tuple
+import paddle
+import numpy
+from paddleaudio._internal import module_utils as _mod_utils
+from paddleaudio.utils.sox_utils import list_effects
+from paddleaudio import _paddleaudio as paddleaudio
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
+@_mod_utils.requires_sox()
+def init_sox_effects():
+    """Initialize resources required to use sox effects.
+
+    Thin wrapper around the extension's ``sox_effects_initialize_sox_effects``.
+
+    Note:
+        You do not need to call this function manually. It is called
+        automatically when ``paddleaudio.sox_effects`` is imported and sox is
+        available.
+
+    Once initialized, you do not need to call this function again across the multiple uses of
+    sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet.
+    Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing
+    again will result in error.
+    """
+    paddleaudio.sox_effects_initialize_sox_effects()
+@_mod_utils.requires_sox()
+def shutdown_sox_effects():
+    """Clean up resources required to use sox effects.
+
+    Thin wrapper around the extension's ``sox_effects_shutdown_sox_effects``.
+
+    Note:
+        You do not need to call this function manually. It is registered with
+        ``atexit`` when ``paddleaudio.sox_effects`` is imported and sox is
+        available.
+
+    It is safe to call this function multiple times.
+    Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and
+    initializing again will result in error.
+    """
+    paddleaudio.sox_effects_shutdown_sox_effects()
+@_mod_utils.requires_sox()
+def effect_names() -> List[str]:
+    """Gets list of valid sox effect names
+
+    The names are the keys of the mapping returned by
+    ``paddleaudio.utils.sox_utils.list_effects``.
+
+    Returns:
+        List[str]: list of available effect names.
+
+    Example
+        >>> paddleaudio.sox_effects.effect_names()
+        ['allpass', 'band', 'bandpass', ... ]
+    """
+    return list(list_effects().keys())
+@_mod_utils.requires_sox()
+def apply_effects_tensor(
+    tensor: paddle.Tensor,
+    sample_rate: int,
+    effects: List[List[str]],
+    channels_first: bool = True,
+) -> Tuple[paddle.Tensor, int]:
+    """Apply sox effects to given Tensor
+    .. devices:: CPU
+    Note:
+        This function only works on CPU Tensors.
+        This function works in the way very similar to ``sox`` command, however there are slight
+        differences. For example, ``sox`` command adds certain effects automatically (such as
+        ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function does
+        only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also
+        need to give ``rate`` effect with desired sampling rate.).
+    Args:
+        tensor (paddle.Tensor): Input 2D CPU Tensor.
+        sample_rate (int): Sample rate
+        effects (List[List[str]]): List of effects.
+        channels_first (bool, optional): Indicates if the input Tensor's dimension is
+            `[channels, time]` or `[time, channels]`
+    Returns:
+        (Tensor, int): Resulting Tensor and sample rate.
+        The resulting Tensor has the same ``dtype`` as the input Tensor, and
+        the same channels order. The shape of the Tensor can be different based on the
+        effects applied. Sample rate can also be different based on the effects applied.
+    Example - Basic usage
+        >>>
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>>
+        >>> # Generate pseudo wave:
+        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
+        >>> sample_rate = 16000
+        >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
+        >>> waveform.shape
+        paddle.Size([2, 16000])
+        >>> waveform
+        tensor([[ 0.3138,  0.7620, -0.9019,  ..., -0.7495, -0.4935,  0.5442],
+                [-0.0832,  0.0061,  0.8233,  ..., -0.5176, -0.9140, -0.2434]])
+        >>>
+        >>> # Apply effects
+        >>> waveform, sample_rate = apply_effects_tensor(
+        ...     wave_form, sample_rate, effects, channels_first=True)
+        >>>
+        >>> # Check the result
+        >>> # The new waveform is sampling rate 8000, 1 second.
+        >>> # normalization and channel order are preserved
+        >>> waveform.shape
+        paddle.Size([2, 8000])
+        >>> waveform
+        tensor([[ 0.5054, -0.5518, -0.4800,  ..., -0.0076,  0.0096, -0.0110],
+                [ 0.1331,  0.0436, -0.3783,  ..., -0.0035,  0.0012,  0.0008]])
+        >>> sample_rate
+        8000
+    """
+    tensor_np = tensor.numpy()
+    ret = paddleaudio.sox_effects_apply_effects_tensor(tensor_np, sample_rate, effects, channels_first)
+    if ret is not None:
+       return (paddle.to_tensor(ret[0]), ret[1])
+    raise RuntimeError("Failed to apply sox effect")
+@_mod_utils.requires_sox()
+def apply_effects_file(
+    path: str,
+    effects: List[List[str]],
+    normalize: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+) -> Tuple[paddle.Tensor, int]:
+    """Apply sox effects to the audio file and load the resulting data as Tensor
+    Note:
+        This function works in the way very similar to ``sox`` command, however there are slight
+        differences. For example, ``sox`` commnad adds certain effects automatically (such as
+        ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given
+        effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate``
+        effect with desired sampling rate, because internally, ``speed`` effects only alter sampling
+        rate and leave samples untouched.
+    Args:
+        path (path-like object or file-like object):
+        effects (List[List[str]]): List of effects.
+        normalize (bool, optional):
+            When ``True``, this function always return ``float32``, and sample values are
+            normalized to ``[-1.0, 1.0]``.
+            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
+            integer type. This argument has no effect for formats other
+            than integer WAV type.
+        channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Override the format detection with the given format.
+            Providing the argument might help when libsox can not infer the format
+            from header or extension,
+    Returns:
+        (Tensor, int): Resulting Tensor and sample rate.
+        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
+        If ``normalize=False`` and the input audio file is of integer WAV file, then the
+        resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
+        If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
+        otherwise `[time, channel]`.
+    Example - Basic usage
+        >>>
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>>
+        >>> # Apply effects and load data with channels_first=True
+        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
+        >>>
+        >>> # Check the result
+        >>> waveform.shape
+        paddle.Size([2, 8000])
+        >>> waveform
+        tensor([[ 5.1151e-03,  1.8073e-02,  2.2188e-02,  ...,  1.0431e-07,
+                 -1.4761e-07,  1.8114e-07],
+                [-2.6924e-03,  2.1860e-03,  1.0650e-02,  ...,  6.4122e-07,
+                 -5.6159e-07,  4.8103e-07]])
+        >>> sample_rate
+        8000
+    Example - Apply random speed perturbation to dataset
+        >>>
+        >>> # Load data from file, apply random speed perturbation
+        >>> class RandomPerturbationFile(paddle.utils.data.Dataset):
+        ...     \"\"\"Given flist, apply random speed perturbation
+        ...
+        ...     Suppose all the input files are at least one second long.
+        ...     \"\"\"
+        ...     def __init__(self, flist: List[str], sample_rate: int):
+        ...         super().__init__()
+        ...         self.flist = flist
+        ...         self.sample_rate = sample_rate
+        ...
+        ...     def __getitem__(self, index):
+        ...         speed = 0.5 + 1.5 * random.randn()
+        ...         effects = [
+        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
+        ...             ['remix', '-'],  # merge all the channels
+        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
+        ...             ['rate', f'{self.sample_rate}'],
+        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
+        ...             ['trim', '0', '2'],  # get the first 2 seconds
+        ...         ]
+        ...         waveform, _ = paddleaudio.sox_effects.apply_effects_file(
+        ...             self.flist[index], effects)
+        ...         return waveform
+        ...
+        ...     def __len__(self):
+        ...         return len(self.flist)
+        ...
+        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
+        >>> loader = paddle.utils.data.DataLoader(dataset, batch_size=32)
+        >>> for batch in loader:
+        >>>     pass
+    """
+    if hasattr(path, "read"):
+        ret = paddleaudio.apply_effects_fileobj(path, effects, normalize, channels_first, format)
+        if ret is None:
+            raise RuntimeError("Failed to load audio from {}".format(path))
+        return (paddle.to_tensor(ret[0]), ret[1])
+    path = os.fspath(path)
+    ret = paddleaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)
+    if ret is not None:
+        return (paddle.to_tensor(ret[0]), ret[1])
+    raise RuntimeError("Failed to load audio from {}".format(path))
--- a/audio/paddleaudio/src/CMakeLists.txt
+++ b/audio/paddleaudio/src/CMakeLists.txt
+# MSVC: export every symbol from the DLLs built in this directory.
+if(MSVC)
+  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+# macOS: give shared libraries the ".so" suffix.
+if(APPLE)
+  set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
+endif()
+################################################################################
+# libpaddleaudio
+################################################################################
+# Base inputs for the libpaddleaudio target: sources, include directories,
+# link dependencies and compile definitions. The optional-feature section
+# below appends to the last two lists.
+set(
+  LIBPADDLEAUDIO_SOURCES
+  utils.cpp
+  )
+set(
+  LIBPADDLEAUDIO_INCLUDE_DIRS
+  ${PROJECT_SOURCE_DIR}
+  )
+# Intentionally empty until BUILD_SOX / BUILD_KALDI add entries.
+set(
+  LIBPADDLEAUDIO_LINK_LIBRARIES
+  )
+set(
+  LIBPADDLEAUDIO_COMPILE_DEFINITIONS)
+#------------------------------------------------------------------------------#
+# START OF CUSTOMIZATION LOGICS
+#------------------------------------------------------------------------------#
+# With sox enabled: link the libsox target and define INCLUDE_SOX.
+if(BUILD_SOX)
+  list(
+    APPEND
+    LIBPADDLEAUDIO_LINK_LIBRARIES
+    libsox
+    )
+  # Every sox source below is commented out, so this APPEND currently adds
+  # nothing to LIBPADDLEAUDIO_SOURCES.
+  list(
+    APPEND
+    LIBPADDLEAUDIO_SOURCES
+    #sox/io.cpp
+    #sox/utils.cpp
+    #sox/effects.cpp
+    #sox/effects_chain.cpp
+    #sox/types.cpp
+    )
+  list(
+    APPEND
+    LIBPADDLEAUDIO_COMPILE_DEFINITIONS
+    INCLUDE_SOX
+    )
+endif()
+# With kaldi enabled: link the libkaldi target and define INCLUDE_KALDI and
+# COMPILE_WITHOUT_OPENFST.
+if(BUILD_KALDI)
+  list(
+    APPEND
+    LIBPADDLEAUDIO_LINK_LIBRARIES
+    libkaldi
+  )
+  list(
+    APPEND
+    LIBPADDLEAUDIO_COMPILE_DEFINITIONS
+    INCLUDE_KALDI
+    COMPILE_WITHOUT_OPENFST
+  )
+endif()
+#------------------------------------------------------------------------------#
+# END OF CUSTOMIZATION LOGICS
+#------------------------------------------------------------------------------#
+# define_library(<name> <source> <include_dirs> <link_libraries> <compile_defs>)
+#
+# Adds the shared library target <name> without a "lib" prefix (suffix ".pyd"
+# under MSVC) and installs it into "lib". Each list argument is passed as one
+# quoted, semicolon-separated string and expanded inside the function.
+function(define_library name source include_dirs link_libraries compile_defs)
+  add_library(${name} SHARED ${source})
+  target_include_directories(${name} PRIVATE ${include_dirs})
+  target_compile_definitions(${name} PRIVATE ${compile_defs})
+  # Keyword-less signature kept as in the original: its dependencies are
+  # transitive, and the extension target links only against this library.
+  target_link_libraries(${name} ${link_libraries})
+  if(MSVC)
+    set_target_properties(${name} PROPERTIES PREFIX "" SUFFIX ".pyd")
+  else()
+    set_target_properties(${name} PROPERTIES PREFIX "")
+  endif()
+  install(
+    TARGETS ${name}
+    LIBRARY DESTINATION lib
+    RUNTIME DESTINATION lib  # For Windows
+    )
+endfunction()
+# Create the libpaddleaudio target from the lists assembled above.
+define_library(
+  libpaddleaudio
+  "${LIBPADDLEAUDIO_SOURCES}"
+  "${LIBPADDLEAUDIO_INCLUDE_DIRS}"
+  "${LIBPADDLEAUDIO_LINK_LIBRARIES}"
+  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
+)
+# AUDIO_LIBRARY holds the link items for libpaddleaudio as an internal cache
+# entry. Outside macOS the target is wrapped in --no-as-needed/--as-needed
+# linker flags.
+# NOTE(review): the else() branch is also taken on Windows/MSVC, where these
+# -Wl flags are presumably not understood -- confirm AUDIO_LIBRARY is not
+# consumed there.
+if (APPLE)
+  set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
+else()
+  set(AUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
+endif()
+  ################################################################################
+# _paddleaudio.so
+################################################################################
+if (BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
+# On Windows, look up the Python development component for exactly
+# PYTHON_VERSION and link extensions against the imported Python3::Python
+# target through ADDITIONAL_ITEMS (used by define_extension).
+if (WIN32)
+  find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
+  set(ADDITIONAL_ITEMS Python3::Python)
+endif()
+# define_extension(<name> <sources> <include_dirs> <libraries> <definitions>)
+#
+# Adds the Python extension module <name> as a shared library without a "lib"
+# prefix (suffix ".pyd" under MSVC) and installs it into the install root.
+# Besides <include_dirs>, the project source directory and the Python and
+# pybind11 include directories are searched; besides <libraries>,
+# PYTHON_LIBRARY and ADDITIONAL_ITEMS are linked.
+function(define_extension name sources include_dirs libraries definitions)
+  add_library(${name} SHARED ${sources})
+  target_compile_definitions(${name} PRIVATE "${definitions}")
+  target_include_directories(
+    ${name}
+    PRIVATE
+    ${PROJECT_SOURCE_DIR}
+    ${Python_INCLUDE_DIR}
+    ${pybind11_INCLUDE_DIR}
+    ${include_dirs}
+    )
+  target_link_libraries(${name} ${libraries} ${PYTHON_LIBRARY} ${ADDITIONAL_ITEMS})
+  set_target_properties(${name} PROPERTIES PREFIX "")
+  if(MSVC)
+    set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
+  endif()
+  if(APPLE)
+    # Leave undefined symbols to be resolved at load time; see
+    # https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
+    # https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
+    set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+  endif()
+  install(
+    TARGETS ${name}
+    LIBRARY DESTINATION .
+    RUNTIME DESTINATION .  # For Windows
+    )
+endfunction()
+# Sources of the _paddleaudio Python extension; the pybind entry point is
+# always built, feature bindings are appended per enabled option.
+set(
+  EXTENSION_SOURCES
+  pybind/pybind.cpp
+  )
+#----------------------------------------------------------------------------#
+# START OF CUSTOMIZATION LOGICS
+#----------------------------------------------------------------------------#
+# sox bindings.
+if(BUILD_SOX)
+  list(
+    APPEND
+    EXTENSION_SOURCES
+    pybind/sox/effects.cpp
+    pybind/sox/effects_chain.cpp
+    pybind/sox/io.cpp
+    pybind/sox/types.cpp
+    pybind/sox/utils.cpp
+    )
+endif()
+# kaldi feature bindings.
+if(BUILD_KALDI)
+  list(
+    APPEND
+    EXTENSION_SOURCES
+    pybind/kaldi/kaldi_feature_wrapper.cc
+    pybind/kaldi/kaldi_feature.cc
+    )
+endif()
+#----------------------------------------------------------------------------#
+# END OF CUSTOMIZATION LOGICS
+#----------------------------------------------------------------------------#
+# The extension passes no extra include directories, links only against
+# libpaddleaudio and reuses that library's compile definitions.
+define_extension(
+  _paddleaudio
+  "${EXTENSION_SOURCES}"
+  ""
+  libpaddleaudio
+  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
+  )
+# if(BUILD_CTC_DECODER)
+#   set(
+#     DECODER_EXTENSION_SOURCES
+#     decoder/bindings/pybind.cpp
+#     )
+#   define_extension(
+#     _paddleaudio_decoder
+#     "${DECODER_EXTENSION_SOURCES}"
+#     ""
+#     "libpaddleaudio_decoder"
+#     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
+#     )
+# endif()
+# if(USE_FFMPEG)
+#   set(
+#     FFMPEG_EXTENSION_SOURCES
+#     ffmpeg/pybind/typedefs.cpp
+#     ffmpeg/pybind/pybind.cpp
+#     ffmpeg/pybind/stream_reader.cpp
+#     )
+#   define_extension(
+#     _paddleaudio_ffmpeg
+#     "${FFMPEG_EXTENSION_SOURCES}"
+#     "${FFMPEG_INCLUDE_DIRS}"
+#     "libpaddleaudio_ffmpeg"
+#     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
+#     )
+# endif()
+endif()
--- a/audio/paddleaudio/src/optional/COPYING
+++ b/audio/paddleaudio/src/optional/COPYING
+Creative Commons Legal Code
+CC0 1.0 Universal
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+Statement of Purpose
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+4. Limitations and Disclaimers.
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
--- a/audio/paddleaudio/src/optional/optional.hpp
+++ b/audio/paddleaudio/src/optional/optional.hpp
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "pybind11/pybind11.h"
+#include "pybind11/numpy.h"
+#include "feat/feature-window.h"
+namespace paddleaudio {
+namespace kaldi {
+namespace py = pybind11;
+// Chunk-wise feature extractor wrapped around a feature computer type F.
+// F is used through its nested Options type (which exposes frame_opts) and
+// its Dim() and GetFrameOptions() members.
+template <class F>
+class StreamingFeatureTpl {
+  public:
+    typedef typename F::Options Options;
+    // Builds the feature computer and the window function from opts.
+    StreamingFeatureTpl(const Options& opts);
+    // Joins the samples kept from the previous call with wav, stores the
+    // samples beyond the last whole frame shift for the next call and runs
+    // Compute() on the joined buffer. Returns false only when wav is empty.
+    bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
+                        ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+    // Discards the samples carried over between ComputeFeature() calls.
+    void Reset() { remained_wav_.Resize(0); }
+    // Feature dimension reported by the underlying computer.
+    int Dim() { return computer_.Dim(); }
+  private:
+    // Computes features of waves into feats; returns false when waves holds
+    // fewer samples than one analysis window.
+    bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
+                 ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+    // Members are initialised in this declaration order: window_function_ is
+    // constructed before computer_.
+    Options opts_;
+    ::kaldi::FeatureWindowFunction window_function_;
+    // Samples left over from the previous ComputeFeature() call.
+    ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
+    F computer_;
+};
+}  // namespace kaldi
+}  // namespace paddleaudio
+#include "feature_common_inl.h"
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "base/kaldi-common.h"
+namespace paddleaudio {
+namespace kaldi {
+// Builds the feature computer and its window function from `opts`.
+//
+// The initializer list is written in member declaration order (opts_,
+// window_function_, computer_) because that is the order in which C++ runs
+// the initializers regardless of how they are listed; writing them out of
+// order only produces a -Wreorder warning.
+template <class F>
+StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
+    : opts_(opts), window_function_(opts.frame_opts), computer_(opts) {
+    // window_function_ must be built from opts.frame_opts and not from
+    // computer_.GetFrameOptions(): computer_ is declared after
+    // window_function_, so it is not constructed yet at that point and its
+    // frame options would be read uninitialized.
+}
+// Streaming entry point.
+//
+// Prepends the samples cached by the previous call to `wav`, caches the
+// samples that follow the last complete frame shift for the next call, and
+// computes the features of the combined buffer into `feats`.
+//
+// Returns false when `wav` is empty or when the combined buffer is still too
+// short for Compute() to produce a frame (in that case `feats` is left
+// untouched); returns true otherwise.
+template <class F>
+bool StreamingFeatureTpl<F>::ComputeFeature(
+    const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
+    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    ::kaldi::int32 wav_len = wav.Dim();
+    if (wav_len == 0) return false;
+    // prepend the remained waves of the previous call
+    ::kaldi::int32 left_len = remained_wav_.Dim();
+    ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
+    waves.Range(0, left_len).CopyFromVec(remained_wav_);
+    waves.Range(left_len, wav_len).CopyFromVec(wav);
+    // cache the remained waves for the next call
+    const ::kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    ::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
+    ::kaldi::int32 frame_shift = frame_opts.WindowShift();
+    ::kaldi::int32 consumed = frame_shift * num_frames;
+    ::kaldi::int32 left_samples = waves.Dim() - consumed;
+    if (left_samples > 0) {
+        remained_wav_.Resize(left_samples);
+        remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));
+    } else {
+        // The frame count can cover more samples than are available (e.g.
+        // when frames are not snipped at the edges); nothing is left to
+        // cache then, and a negative size must never reach Resize()/Range().
+        remained_wav_.Resize(0);
+    }
+    // compute speech feature; report whether any frame could be produced
+    return Compute(waves, feats);
+}
+// Computes the features of every frame contained in `waves`.
+//
+// `feats` is resized to num_frames * Dim() and filled frame after frame, so
+// frame i occupies the range [i * Dim(), (i + 1) * Dim()).
+// Returns false, leaving `feats` untouched, when `waves` holds fewer samples
+// than one analysis window; returns true otherwise.
+template <class F>
+bool StreamingFeatureTpl<F>::Compute(
+    const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
+    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    // fixed warp factor handed to the computer for every frame
+    const ::kaldi::BaseFloat vtln_warp = 1.0;
+    const ::kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    const ::kaldi::int32 num_samples = waves.Dim();
+    const ::kaldi::int32 frame_length = frame_opts.WindowSize();
+    if (num_samples < frame_length) {
+        return false;
+    }
+    const ::kaldi::int32 num_frames =
+        ::kaldi::NumFrames(num_samples, frame_opts);
+    // the feature dimension does not change inside the loop; read it once
+    const ::kaldi::int32 feat_dim = Dim();
+    feats->Resize(num_frames * feat_dim);
+    ::kaldi::Vector<::kaldi::BaseFloat> window;
+    const bool need_raw_log_energy = computer_.NeedRawLogEnergy();
+    for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
+        ::kaldi::BaseFloat raw_log_energy = 0.0;
+        ::kaldi::ExtractWindow(0,
+                               waves,
+                               frame,
+                               frame_opts,
+                               window_function_,
+                               &window,
+                               need_raw_log_energy ? &raw_log_energy : nullptr);
+        ::kaldi::Vector<::kaldi::BaseFloat> this_feature(feat_dim,
+                                                         ::kaldi::kUndefined);
+        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
+        // copy the frame into its slot of the flat output vector
+        ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
+            feats->Data() + frame * feat_dim, feat_dim);
+        output_row.CopyFromVec(this_feature);
+    }
+    return true;
+}
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
+#include "feat/pitch-functions.h"
+namespace paddleaudio {
+namespace kaldi {
+// Assembles a ::kaldi::FbankOptions from the three option groups and
+// (re)creates the fbank extractor held by the KaldiFeatureWrapper singleton.
+// Always returns true.
+bool InitFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts) {
+    ::kaldi::FbankOptions kaldi_opts;
+    // framing and mel-bank settings are taken over as they are
+    kaldi_opts.frame_opts = frame_opts;
+    kaldi_opts.mel_opts = mel_opts;
+    // fbank-specific switches are copied field by field
+    kaldi_opts.use_energy = fbank_opts.use_energy;
+    kaldi_opts.energy_floor = fbank_opts.energy_floor;
+    kaldi_opts.raw_energy = fbank_opts.raw_energy;
+    kaldi_opts.htk_compat = fbank_opts.htk_compat;
+    kaldi_opts.use_log_fbank = fbank_opts.use_log_fbank;
+    kaldi_opts.use_power = fbank_opts.use_power;
+    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
+    wrapper->InitFbank(kaldi_opts);
+    return true;
+}
+// Feeds one chunk of samples to the fbank extractor of the singleton wrapper
+// and returns whatever feature array it produces for that chunk.
+py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
+    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
+    return wrapper->ComputeFbank(wav);
+}
+// One-shot fbank computation: re-initializes the shared extractor with the
+// given options, computes the features of `wav`, then resets the extractor
+// so no samples are carried over into a later call.
+py::array_t<float> ComputeFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts,
+    const py::array_t<float>& wav) {
+    InitFbank(frame_opts, mel_opts, fbank_opts);
+    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
+    py::array_t<float> feats = wrapper->ComputeFbank(wav);
+    wrapper->ResetFbank();
+    return feats;
+}
+// Resets the fbank extractor held by the singleton wrapper.
+void ResetFbank() {
+    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
+    wrapper->ResetFbank();
+}
+// Runs ::kaldi::ComputeKaldiPitch over the samples of `wav` and returns the
+// resulting feature matrix as a (rows, cols) float array.
+// The samples are read straight from the buffer of `wav` without copying.
+py::array_t<float> ComputeKaldiPitch(
+  const ::kaldi::PitchExtractionOptions& opts,
+  const py::array_t<float>& wav) {
+    py::buffer_info wav_info = wav.request();
+    ::kaldi::SubVector<::kaldi::BaseFloat> samples(
+        static_cast<float*>(wav_info.ptr), wav_info.size);
+    ::kaldi::Matrix<::kaldi::BaseFloat> pitch;
+    ::kaldi::ComputeKaldiPitch(opts, samples, &pitch);
+    const auto num_rows = pitch.NumRows();
+    const auto num_cols = pitch.NumCols();
+    auto result = py::array_t<float>({num_rows, num_cols});
+    // copy row by row: only the first num_cols values of each matrix row
+    // are transferred into the output array
+    for (int r = 0; r < num_rows; ++r) {
+        std::memcpy(result.mutable_data(r),
+                    pitch.Row(r).Data(),
+                    sizeof(float) * num_cols);
+    }
+    return result;
+}
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <string>
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
+#include "feat/pitch-functions.h"
+namespace py = pybind11;
+namespace paddleaudio {
+namespace kaldi {
+// Plain option struct exposed to Python; InitFbank() copies every field into
+// the field of the same name in ::kaldi::FbankOptions.
+struct FbankOptions{
+  // append an extra dimension with energy to the filter banks
+  bool use_energy = false;
+  float energy_floor = 0.0;
+  // If true, compute energy before preemphasis and windowing
+  bool raw_energy = true;
+  // If true, put energy last (if using energy)
+  bool htk_compat = false;
+  // if true (default), produce log-filterbank, else linear
+  bool use_log_fbank = true;
+  // forwarded to ::kaldi::FbankOptions::use_power
+  bool use_power = true;
+  FbankOptions() = default;
+};
+// (Re)creates the shared fbank extractor from the given option groups.
+bool InitFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts);
+// One-shot variant: initializes the extractor, computes the features of
+// `wav` and resets the extractor afterwards.
+py::array_t<float> ComputeFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts,
+    const py::array_t<float>& wav);
+// Streaming variant: computes features for one chunk with the extractor set
+// up by a previous InitFbank() call.
+py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);
+// Resets the shared fbank extractor.
+void ResetFbank();
+// Computes Kaldi pitch features for `wav`; returns a 2-D float array.
+py::array_t<float> ComputeKaldiPitch(
+    const ::kaldi::PitchExtractionOptions& opts,
+    const py::array_t<float>& wav);
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
+namespace paddleaudio {
+namespace kaldi {
+// Returns the single wrapper object shared by the whole module; it is a
+// function-local static, created on first use.
+KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
+    static KaldiFeatureWrapper singleton;
+    return &singleton;
+}
+// Replaces the current fbank extractor (if any) with a new one built from
+// `opts`. Always returns true.
+bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
+    Fbank* fresh = new Fbank(opts);
+    fbank_.reset(fresh);
+    return true;
+}
+// Computes fbank features for the samples in `wav`.
+//
+// Returns a (num_frames, Dim()) float array, or an empty array when no
+// extractor has been created yet, when the extractor reports failure, or
+// when no frame could be produced.
+py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
+    const py::array_t<float> wav) {
+    // InitFbank() has not been called: there is nothing to compute with
+    if (!fbank_) return py::array_t<float>();
+    py::buffer_info info = wav.request();
+    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+    ::kaldi::Vector<::kaldi::BaseFloat> feats;
+    bool flag = fbank_->ComputeFeature(input_wav, &feats);
+    if (flag == false || feats.Dim() == 0) return py::array_t<float>();
+    // copy the flat feature vector into a 1-D array, then view it as 2-D
+    auto result = py::array_t<float>(feats.Dim());
+    py::buffer_info xs = result.request();
+    float* res_ptr = (float*)xs.ptr;
+    for (int idx = 0; idx < feats.Dim(); ++idx) {
+        *res_ptr = feats(idx);
+        res_ptr++;
+    }
+    return result.reshape({feats.Dim() / Dim(), Dim()});
+}
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "base/kaldi-common.h"
+#include "feat/feature-fbank.h"
+#include "paddleaudio/src/pybind/kaldi/feature_common.h"
+namespace paddleaudio {
+namespace kaldi {
+typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
+// Process-wide holder of the fbank extractor used by the Python bindings.
+// Obtain it through GetInstance(); the extractor itself only exists after
+// InitFbank() has been called.
+class KaldiFeatureWrapper {
+  public:
+    // Returns the shared wrapper instance.
+    static KaldiFeatureWrapper* GetInstance();
+    // (Re)creates the fbank extractor from `opts`.
+    bool InitFbank(::kaldi::FbankOptions opts);
+    // Computes fbank features for the samples in `wav`.
+    py::array_t<float> ComputeFbank(const py::array_t<float> wav);
+    // Feature dimension of the extractor, or 0 when none has been created.
+    int Dim() { return fbank_ ? fbank_->Dim() : 0; }
+    // Resets the extractor; a no-op when none has been created, so calling
+    // it before InitFbank() no longer dereferences a null pointer.
+    void ResetFbank() {
+        if (fbank_) fbank_->Reset();
+    }
+  private:
+    std::unique_ptr<paddleaudio::kaldi::Fbank> fbank_;
+};
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/pybind.cpp
+++ b/audio/paddleaudio/src/pybind/pybind.cpp
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
+#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
+#ifdef INCLUDE_SOX
+#include "paddleaudio/src/pybind/sox/io.h"
+#include "paddleaudio/src/pybind/sox/effects.h"
+#endif
+#include <pybind11/stl.h>
+#include <pybind11/pybind11.h>
+// Teach pybind11 how to convert `tl::optional<T>` arguments and return
+// values by reusing its generic `optional_caster`. Only compiled when the
+// sox bindings are enabled.
+#ifdef INCLUDE_SOX
+namespace pybind11 { namespace detail {
+   template <typename T>
+   struct type_caster<tl::optional<T>> : optional_caster<tl::optional<T>> {};
+}}
+#endif
+// Definition of the `_paddleaudio` extension module.
+// The sox bindings are registered only when INCLUDE_SOX is defined and the
+// kaldi bindings only when INCLUDE_KALDI is defined.
+PYBIND11_MODULE(_paddleaudio, m) {
+#ifdef INCLUDE_SOX
+    // file and file-object based sox io
+    m.def("get_info_file",
+          &paddleaudio::sox_io::get_info_file,
+          "Get metadata of audio file.");
+    // support obj later
+    m.def("get_info_fileobj",
+          &paddleaudio::sox_io::get_info_fileobj,
+          "Get metadata of audio in file object.");
+    m.def("load_audio_fileobj",
+          &paddleaudio::sox_io::load_audio_fileobj,
+          "Load audio from file object.");
+    m.def("save_audio_fileobj",
+          &paddleaudio::sox_io::save_audio_fileobj,
+          "Save audio to file obj.");
+    // sox io
+    // NOTE: "sox_io_get_info" binds the same function as "get_info_file".
+     m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file);
+     m.def(
+         "sox_io_load_audio_file",
+         &paddleaudio::sox_io::load_audio_file);
+     m.def(
+         "sox_io_save_audio_file",
+         &paddleaudio::sox_io::save_audio_file);
+     // sox utils
+     m.def("sox_utils_set_seed", &paddleaudio::sox_utils::set_seed);
+     m.def(
+         "sox_utils_set_verbosity",
+         &paddleaudio::sox_utils::set_verbosity);
+     m.def(
+         "sox_utils_set_use_threads",
+         &paddleaudio::sox_utils::set_use_threads);
+     m.def(
+         "sox_utils_set_buffer_size",
+         &paddleaudio::sox_utils::set_buffer_size);
+     m.def(
+         "sox_utils_list_effects",
+         &paddleaudio::sox_utils::list_effects);
+     m.def(
+         "sox_utils_list_read_formats",
+         &paddleaudio::sox_utils::list_read_formats);
+     m.def(
+         "sox_utils_list_write_formats",
+         &paddleaudio::sox_utils::list_write_formats);
+     m.def(
+         "sox_utils_get_buffer_size",
+         &paddleaudio::sox_utils::get_buffer_size);
+     // sox effects
+     m.def("apply_effects_fileobj",
+           &paddleaudio::sox_effects::apply_effects_fileobj,
+           "Decode audio data from file-like obj and apply effects.");
+     m.def("sox_effects_initialize_sox_effects",
+       &paddleaudio::sox_effects::initialize_sox_effects);
+     m.def(
+         "sox_effects_shutdown_sox_effects",
+         &paddleaudio::sox_effects::shutdown_sox_effects);
+     m.def(
+         "sox_effects_apply_effects_tensor",
+         &paddleaudio::sox_effects::apply_effects_tensor);
+     m.def(
+         "sox_effects_apply_effects_file",
+         &paddleaudio::sox_effects::apply_effects_file);
+#endif
+#ifdef INCLUDE_KALDI
+    // kaldi feature functions and the option structs they take; every
+    // option field is exposed as a read/write attribute of the same name
+    m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
+    py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
+        .def(py::init<>())
+        .def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
+        .def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
+        .def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
+        .def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
+        .def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
+        .def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
+        .def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
+        .def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
+        .def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
+        .def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
+        .def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
+        .def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
+        .def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
+        .def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
+        .def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
+        .def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
+        .def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
+        .def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
+        .def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
+        .def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
+    m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
+    py::class_<kaldi::FrameExtractionOptions>(m, "FrameExtractionOptions")
+        .def(py::init<>())            
+        .def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
+        .def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)            
+        .def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
+        .def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)            
+        .def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)            
+        .def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)            
+        .def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
+        .def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)           
+        .def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)          
+        .def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
+        .def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
+        .def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
+        .def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
+    py::class_<kaldi::MelBanksOptions>(m, "MelBanksOptions")
+        .def(py::init<>())
+        .def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
+        .def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
+        .def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
+        .def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
+        .def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
+        .def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
+        .def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);
+    py::class_<paddleaudio::kaldi::FbankOptions>(m, "FbankOptions")
+        .def(py::init<>())
+        .def_readwrite("use_energy", &paddleaudio::kaldi::FbankOptions::use_energy)
+        .def_readwrite("energy_floor", &paddleaudio::kaldi::FbankOptions::energy_floor)
+        .def_readwrite("raw_energy", &paddleaudio::kaldi::FbankOptions::raw_energy)
+        .def_readwrite("htk_compat", &paddleaudio::kaldi::FbankOptions::htk_compat)
+        .def_readwrite("use_log_fbank", &paddleaudio::kaldi::FbankOptions::use_log_fbank)
+        .def_readwrite("use_power", &paddleaudio::kaldi::FbankOptions::use_power);
+#endif
+}
--- a/audio/paddleaudio/src/pybind/sox/effects.cpp
+++ b/audio/paddleaudio/src/pybind/sox/effects.cpp
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp  with modification.
+#include <mutex>
+#include <sox.h>
+#include "paddleaudio/src/pybind/sox/effects.h"
+#include "paddleaudio/src/pybind/sox/effects_chain.h"
+#include "paddleaudio/src/pybind/sox/utils.h"
+using namespace paddleaudio::sox_utils;
+namespace paddleaudio::sox_effects {
+// Streaming decoding over a file-like object is tricky because libsox
+// operates on a FILE pointer. The following is what the `sox` and `play`
+// commands do
+//  - file input -> FILE pointer
+//  - URL input -> call wget in a subprocess and pipe the data -> FILE pointer
+//  - stdin -> FILE pointer
+//
+// We want to, instead, fetch byte strings chunk by chunk, consume them, and
+// discard.
+//
+// Here is the approach
+// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
+// chunk of byte string
+//    This will perform header-based format detection, if necessary, then fill
+//    the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
+//    which returns a FILE* that points to the buffer of the provided byte
+//    string.
+// 2. Each time sox reads a chunk from the FILE*, we update the underlying
+// buffer in a way that it
+//    starts with unseen data, and append the new data read from the given
+//    fileobj. This will trick libsox as if it keeps reading from the FILE*
+//    continuously.
+// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
+//
+// Returns an empty optional when the data cannot be opened or its encoding
+// is unknown; otherwise the decoded tensor and the output sample rate.
+// `normalize` and `channels_first` default to true when not provided.
+auto apply_effects_fileobj(
+    py::object fileobj,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format)
+    -> tl::optional<std::tuple<py::array, int64_t>> {
+  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
+  //
+  // For certain format (such as FLAC), libsox keeps reading the content at
+  // the initialization unless it reaches EOF even when the header is properly
+  // parsed. (Making buffer size 8192, which is way bigger than the header,
+  // resulted in libsox consuming all the buffer content at the time it opens
+  // the file.) Therefore buffer has to always contain valid data, except after
+  // EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
+  // first check if there is enough data to fill the buffer. `read_fileobj`
+  // repeatedly calls `read`  method until it receives the requested length of
+  // bytes or it reaches EOF. If we get bytes shorter than requested, that means
+  // the whole audio data are fetched.
+  //
+  // * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
+  const auto capacity = [&]() {
+    // NOTE:
+    // Use the abstraction provided by `libpaddleaudio` to access the global
+    // config defined by libsox. Directly using `sox_get_globals` function will
+    // end up retrieving the static variable defined in `_paddleaudio`, which is
+    // not correct.
+    const auto bufsiz = get_buffer_size();
+    const int64_t kDefaultCapacityInBytes = 256;
+    return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
+                                              : kDefaultCapacityInBytes;
+  }();
+  // `buffer` owns the memory that `in_buf` points to; it stays alive until
+  // the end of this function, i.e. for the whole run of the chain below.
+  std::string buffer(capacity, '\0');
+  auto* in_buf = const_cast<char*>(buffer.data());
+  auto num_read = read_fileobj(&fileobj, capacity, in_buf);
+  // If the file is shorter than 256, then libsox cannot read the header.
+  // (`capacity` is at least 256, so the buffer always covers this size.)
+  auto in_buffer_size = (num_read > 256) ? num_read : 256;
+  // Open file (this starts reading the header)
+  // When opening a file there are two functions that can touch FILE*.
+  // * `auto_detect_format`
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
+  // * `startread` handler of detected format.
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
+  // To see the handler of a particular format, go to
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
+  // For example, vorbis can be found
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
+  SoxFormat sf(sox_open_mem_read(
+      in_buf,
+      in_buffer_size,
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
+  // In case of streamed data, length can be 0
+  // Opening failed or the encoding could not be determined: report "no
+  // result" to the caller instead of throwing.
+  if (static_cast<sox_format_t*>(sf) == nullptr ||
+      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    return {};
+  }
+  // Prepare output buffer
+  std::vector<sox_sample_t> out_buffer;
+  out_buffer.reserve(sf->signal.length);
+  // Create and run SoxEffectsChain
+  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
+  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
+      /*input_encoding=*/sf->encoding,
+      /*output_encoding=*/get_tensor_encodinginfo(dtype));
+  chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
+  for (const auto& effect : effects) {
+    chain.addEffect(effect);
+  }
+  chain.addOutputBuffer(&out_buffer);
+  chain.run();
+  // Create tensor from buffer
+  bool channels_first_ = channels_first.value_or(true);
+  auto tensor = convert_to_tensor(
+      /*buffer=*/out_buffer.data(),
+      /*num_samples=*/out_buffer.size(),
+      /*num_channels=*/chain.getOutputNumChannels(),
+      dtype,
+      normalize.value_or(true),
+      channels_first_);
+  return std::forward_as_tuple(
+      tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
+}
+// File-local bookkeeping for the libsox effects resources.
+namespace {
+// Lifecycle of the sox effects resources as tracked by this file.
+enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
+// Current state; read and written under SOX_RESOUCE_STATE_MUTEX.
+SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
+// Guards SOX_RESOURCE_STATE in initialize_sox_effects/shutdown_sox_effects.
+std::mutex SOX_RESOUCE_STATE_MUTEX;
+} // namespace
+// Initializes libsox once.
+// Calling it again while initialized is a no-op; calling it after a
+// shutdown, or when sox_init() fails, throws std::runtime_error.
+void initialize_sox_effects() {
+  const std::lock_guard<std::mutex> guard(SOX_RESOUCE_STATE_MUTEX);
+  if (SOX_RESOURCE_STATE == Initialized) {
+    // already up, nothing to do
+    return;
+  }
+  if (SOX_RESOURCE_STATE == ShutDown) {
+    throw std::runtime_error(
+        "SoX Effects has been shut down. Cannot initialize again.");
+  }
+  if (SOX_RESOURCE_STATE == NotInitialized) {
+    if (sox_init() != SOX_SUCCESS) {
+      throw std::runtime_error("Failed to initialize sox effects.");
+    }
+    // only record the new state once sox_init() has succeeded
+    SOX_RESOURCE_STATE = Initialized;
+  }
+}
+// Shuts libsox down.
+// Throws std::runtime_error when sox was never initialized or when
+// sox_quit() fails; calling it again after a shutdown is a no-op.
+void shutdown_sox_effects() {
+  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
+  switch (SOX_RESOURCE_STATE) {
+    case NotInitialized:
+      throw std::runtime_error(
+          "SoX Effects is not initialized. Cannot shutdown.");
+    case Initialized:
+      if (sox_quit() != SOX_SUCCESS) {
+        // this is the shutdown path, so the message must say so
+        throw std::runtime_error("Failed to shutdown sox effects.");
+      }
+      SOX_RESOURCE_STATE = ShutDown;
+      break;
+    case ShutDown:
+      break;
+  }
+}
+// Applies `effects` to an in-memory waveform.
+// The output uses the dtype of the input and is not normalized; returns the
+// processed tensor together with the output sample rate of the chain.
+auto apply_effects_tensor(
+    py::array waveform,
+    int64_t sample_rate,
+    const std::vector<std::vector<std::string>>& effects,
+    bool channels_first) -> std::tuple<py::array, int64_t> {
+  validate_input_tensor(waveform);
+  // input and output side of the chain share the dtype of the waveform
+  const auto wav_dtype = waveform.dtype();
+  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
+      /*input_encoding=*/get_tensor_encodinginfo(wav_dtype),
+      /*output_encoding=*/get_tensor_encodinginfo(wav_dtype));
+  // samples produced by the chain are collected here
+  std::vector<sox_sample_t> samples;
+  samples.reserve(waveform.size());
+  // wire up: tensor input -> effects (in the given order) -> buffer output
+  chain.addInputTensor(&waveform, sample_rate, channels_first);
+  for (size_t i = 0; i < effects.size(); ++i) {
+    chain.addEffect(effects[i]);
+  }
+  chain.addOutputBuffer(&samples);
+  chain.run();
+  // turn the collected samples back into a tensor
+  auto out_tensor = convert_to_tensor(
+      /*buffer=*/samples.data(),
+      /*num_samples=*/samples.size(),
+      /*num_channels=*/chain.getOutputNumChannels(),
+      wav_dtype,
+      /*normalize=*/false,
+      channels_first);
+  std::tuple<py::array, int64_t> result(
+      out_tensor, chain.getOutputSampleRate());
+  return result;
+}
+// Decodes the audio file at `path` and applies `effects` to it.
+// Returns an empty optional when the file cannot be opened or its encoding
+// is unknown; otherwise the resulting tensor and the output sample rate.
+// `normalize` and `channels_first` default to true when not provided.
+auto apply_effects_file(
+    const std::string& path,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format)
+    -> tl::optional<std::tuple<py::array, int64_t>> {
+  // an explicit file type is passed to sox only when the caller gave one
+  const char* filetype = format.has_value() ? format.value().c_str() : nullptr;
+  SoxFormat sf(sox_open_read(
+      path.c_str(),
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/filetype));
+  const bool opened = static_cast<sox_format_t*>(sf) != nullptr;
+  if (!opened || sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    return {};
+  }
+  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
+  // samples produced by the chain are collected here
+  std::vector<sox_sample_t> samples;
+  samples.reserve(sf->signal.length);
+  // wire up: file input -> effects (in the given order) -> buffer output
+  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
+      /*input_encoding=*/sf->encoding,
+      /*output_encoding=*/get_tensor_encodinginfo(dtype));
+  chain.addInputFile(sf);
+  for (size_t i = 0; i < effects.size(); ++i) {
+    chain.addEffect(effects[i]);
+  }
+  chain.addOutputBuffer(&samples);
+  chain.run();
+  // turn the collected samples into a tensor
+  const bool want_channels_first = channels_first.value_or(true);
+  auto tensor = convert_to_tensor(
+      /*buffer=*/samples.data(),
+      /*num_samples=*/samples.size(),
+      /*num_channels=*/chain.getOutputNumChannels(),
+      dtype,
+      normalize.value_or(true),
+      want_channels_first);
+  return std::tuple<py::array, int64_t>(
+      tensor, chain.getOutputSampleRate());
+}
+} // namespace paddleaudio::sox_effects
--- a/audio/paddleaudio/src/pybind/sox/effects.h
+++ b/audio/paddleaudio/src/pybind/sox/effects.h
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h  with modification.
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include "paddleaudio/src/optional/optional.hpp"
+namespace py = pybind11;
+namespace paddleaudio::sox_effects {
+// Decodes audio from a Python file-like object and applies `effects`.
+// Returns an empty optional when the data cannot be decoded, otherwise the
+// resulting tensor and sample rate.
+auto apply_effects_fileobj(
+    py::object fileobj,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format)
+    -> tl::optional<std::tuple<py::array, int64_t>>;
+// Initializes libsox; throws std::runtime_error on failure or when called
+// after a shutdown.
+void initialize_sox_effects();
+// Shuts libsox down; throws std::runtime_error when it was never
+// initialized or when the shutdown fails.
+void shutdown_sox_effects();
+// Applies `effects` to an in-memory waveform; returns the processed tensor
+// and the output sample rate.
+auto apply_effects_tensor(
+    py::array waveform,
+    int64_t sample_rate,
+    const std::vector<std::vector<std::string>>& effects,
+    bool channels_first) -> std::tuple<py::array, int64_t>;
+// Decodes the audio file at `path` and applies `effects`.
+// Returns an empty optional when the file cannot be opened, otherwise the
+// resulting tensor and sample rate.
+auto apply_effects_file(
+    const std::string& path,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format)
+    -> tl::optional<std::tuple<py::array, int64_t>>;
+} // namespace paddleaudio::sox_effects
--- a/audio/paddleaudio/src/pybind/sox/effects_chain.cpp
+++ b/audio/paddleaudio/src/pybind/sox/effects_chain.cpp
--- a/audio/paddleaudio/src/pybind/sox/effects_chain.h
+++ b/audio/paddleaudio/src/pybind/sox/effects_chain.h
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h with modification.
+#pragma once
+#include <sox.h>
+#include "paddleaudio/src/pybind/sox/utils.h"
+namespace paddleaudio::sox_effects_chain {
+// Helper struct to safely close sox_effect_t* pointer returned by
+// sox_create_effect
+// Owning wrapper around a raw sox_effect_t*.
+// Copying and moving are disabled, so exactly one wrapper refers to a given
+// effect; the destructor is defined in the .cpp file (presumably it releases
+// the effect — confirm there).
+struct SoxEffect {
+  // Takes the pointer to wrap.
+  explicit SoxEffect(sox_effect_t* se) noexcept;
+  SoxEffect(const SoxEffect& other) = delete;
+  SoxEffect(const SoxEffect&& other) = delete;
+  auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
+  auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
+  ~SoxEffect();
+  // Implicit access to the wrapped pointer, for passing to the sox C API.
+  operator sox_effect_t*() const;
+  // Member access on the wrapped effect.
+  auto operator->() noexcept -> sox_effect_t*;
+ private:
+  sox_effect_t* se_;
+};
+// Helper struct to safely close sox_effects_chain_t with handy methods
+// Wrapper around a sox_effects_chain_t: an input is added first, then any
+// number of effects, then an output, and finally run() executes the chain
+// (this is the order used by the callers in effects.cpp).
+// Non-copyable and non-movable.
+class SoxEffectsChain {
+  // Encodings given at construction time.
+  const sox_encodinginfo_t in_enc_;
+  const sox_encodinginfo_t out_enc_;
+ protected:
+  // Signal descriptions for the input, intermediate and output side, plus
+  // the underlying sox chain; accessible to subclasses.
+  sox_signalinfo_t in_sig_;
+  sox_signalinfo_t interm_sig_;
+  sox_signalinfo_t out_sig_;
+  sox_effects_chain_t* sec_;
+ public:
+  explicit SoxEffectsChain(
+      sox_encodinginfo_t input_encoding,
+      sox_encodinginfo_t output_encoding);
+  SoxEffectsChain(const SoxEffectsChain& other) = delete;
+  SoxEffectsChain(const SoxEffectsChain&& other) = delete;
+  SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
+  SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
+  ~SoxEffectsChain();
+  // Executes the assembled chain.
+  void run();
+  // Input sources; the pointed-to objects are not owned by the chain and
+  // must stay alive while it is used.
+  void addInputTensor(
+      py::array* waveform,
+      int64_t sample_rate,
+      bool channels_first);
+  void addInputFile(sox_format_t* sf);
+  // Output sinks; same lifetime requirement as for the inputs.
+  void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
+  void addOutputFile(sox_format_t* sf);
+  // Adds one effect, described as a list of strings.
+  void addEffect(const std::vector<std::string> effect);
+  // Properties of the output side of the chain.
+  int64_t getOutputNumChannels();
+  int64_t getOutputSampleRate();
+};
+// SoxEffectsChain variant whose extra endpoints take a py::object* in
+// addition to a sox_format_t and a raw byte buffer.
+// NOTE(review): presumably `fileobj` is a Python file-like object (io.cpp
+// calls its "write" attribute) -- confirm in effects_chain.cpp.
+class SoxEffectsChainPyBind : public SoxEffectsChain {
+ public:
+  // Inherit the base-class constructors. Inherited constructors keep the
+  // accessibility they have in the base class regardless of the section this
+  // using-declaration appears in.
+  using SoxEffectsChain::SoxEffectsChain;
+  // Input end: `buffer` / `buffer_size` describe the byte buffer that
+  // accompanies `fileobj`.
+  void addInputFileObj(sox_format_t* sf,
+                       char* buffer,
+                       uint64_t buffer_size,
+                       py::object* fileobj);
+  // Output end: `buffer` and `buffer_size` are passed by address (io.cpp
+  // hands in the fields that sox_open_memstream_write fills).
+  void addOutputFileObj(sox_format_t* sf,
+                        char** buffer,
+                        size_t* buffer_size,
+                        py::object* fileobj);
+};
+} // namespace paddleaudio::sox_effects_chain
--- a/audio/paddleaudio/src/pybind/sox/io.cpp
+++ b/audio/paddleaudio/src/pybind/sox/io.cpp
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp with modification.
+#include "paddleaudio/src/pybind/sox/io.h"
+#include "paddleaudio/src/pybind/sox/effects.h"
+#include "paddleaudio/src/pybind/sox/types.h"
+#include "paddleaudio/src/pybind/sox/effects_chain.h"
+#include "paddleaudio/src/pybind/sox/utils.h"
+#include "paddleaudio/src/optional/optional.hpp"
+using namespace paddleaudio::sox_utils;
+namespace paddleaudio {
+namespace sox_io {
+// Opens the audio file at `path` with libsox and reports its metadata.
+// `format`, when set, is passed to sox_open_read as the file type; otherwise
+// nullptr is passed.
+// Returns (sample rate, length / channels, number of channels,
+// bits per sample, encoding name).
+auto get_info_file(const std::string &path,
+                   const tl::optional<std::string> &format)
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
+    const char *filetype = nullptr;
+    if (format.has_value()) {
+        filetype = format.value().c_str();
+    }
+    SoxFormat sf(sox_open_read(path.data(),
+                               /*signal=*/nullptr,
+                               /*encoding=*/nullptr,
+                               /*filetype=*/filetype));
+    validate_input_file(sf, path);
+
+    const auto sample_rate = static_cast<int64_t>(sf->signal.rate);
+    // signal.length counts samples over all channels, hence the division.
+    const auto num_frames =
+        static_cast<int64_t>(sf->signal.length / sf->signal.channels);
+    const auto num_channels = static_cast<int64_t>(sf->signal.channels);
+    const auto bits_per_sample =
+        static_cast<int64_t>(sf->encoding.bits_per_sample);
+    return std::make_tuple(sample_rate,
+                           num_frames,
+                           num_channels,
+                           bits_per_sample,
+                           get_encoding(sf->encoding.encoding));
+}
+// Translates an optional frame window into a list of sox effects.
+//   frame_offset: first frame to keep; defaults to 0, must be non-negative.
+//   num_frames:   number of frames to keep; defaults to -1 (no limit), must
+//                 be -1 or greater than 0.
+// Returns an empty list when nothing has to be trimmed, otherwise a single
+// {"trim", ...} effect whose positions are written with an "s" suffix.
+// Throws std::runtime_error when an argument is out of range.
+std::vector<std::vector<std::string>> get_effects(
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames) {
+  const int64_t start = frame_offset.value_or(0);
+  if (start < 0) {
+    throw std::runtime_error(
+        "Invalid argument: frame_offset must be non-negative.");
+  }
+  const int64_t count = num_frames.value_or(-1);
+  if (count < -1 || count == 0) {
+    throw std::runtime_error(
+        "Invalid argument: num_frames must be -1 or greater than 0.");
+  }
+
+  // Formats `value` as "<sign><value>s".
+  const auto sample_position = [](const char* sign, int64_t value) {
+    std::ostringstream os;
+    os << sign << value << "s";
+    return os.str();
+  };
+
+  std::vector<std::vector<std::string>> effects;
+  const bool bounded = (count != -1);
+  if (bounded) {
+    // Both start and length are given: trim <start>s +<count>s
+    effects.push_back(
+        {"trim", sample_position("", start), sample_position("+", count)});
+  } else if (start != 0) {
+    // Only skip the leading frames: trim <start>s
+    effects.push_back({"trim", sample_position("", start)});
+  }
+  return effects;
+}
+// Reads the leading bytes of the Python object `fileobj` into memory and asks
+// libsox to parse them, then reports the audio metadata.
+// `format`, when set, is passed to sox_open_mem_read as the file type;
+// otherwise nullptr is passed.
+// Returns (sample rate, length / channels, number of channels,
+// bits per sample, encoding name).
+auto get_info_fileobj(py::object fileobj,
+                      const tl::optional<std::string> &format)
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
+    // Use the larger of get_buffer_size() and 4096 bytes.
+    const auto capacity = [&]() {
+        const auto bufsiz = get_buffer_size();
+        const int64_t kDefaultCapacityInBytes = 4096;
+        return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
+                                                  : kDefaultCapacityInBytes;
+    }();
+    std::string buffer(capacity, '\0');
+    // The string is never empty (capacity >= 4096), so &buffer[0] is a valid
+    // writable pointer. Before C++17, std::string::data() only returns a
+    // const pointer that must not be written through, so it is not used here.
+    auto *buf = &buffer[0];
+    auto num_read = read_fileobj(&fileobj, capacity, buf);
+    // If the file is shorter than 256, then libsox cannot read the header.
+    // The buffer is zero-filled and at least 4096 bytes long, so reporting
+    // 256 bytes stays inside the allocation.
+    auto buf_size = (num_read > 256) ? num_read : 256;
+    SoxFormat sf(sox_open_mem_read(
+        buf,
+        buf_size,
+        /*signal=*/nullptr,
+        /*encoding=*/nullptr,
+        /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
+    // In case of streamed data, length can be 0
+    validate_input_memfile(sf);
+    return std::make_tuple(
+        static_cast<int64_t>(sf->signal.rate),
+        static_cast<int64_t>(sf->signal.length / sf->signal.channels),
+        static_cast<int64_t>(sf->signal.channels),
+        static_cast<int64_t>(sf->encoding.bits_per_sample),
+        get_encoding(sf->encoding.encoding));
+}
+// Loads audio from the Python object `fileobj`.
+// `frame_offset` / `num_frames` are turned into a list of effects by
+// get_effects (which throws std::runtime_error on invalid values); the
+// remaining arguments are forwarded unchanged to
+// sox_effects::apply_effects_fileobj, whose result is returned as is.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
+    py::object fileobj,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format) {
+  auto effects = get_effects(frame_offset, num_frames);
+  // `format` is a const reference, so std::move on it would be a no-op; it is
+  // passed through directly. `fileobj` is owned by value and can be moved.
+  return paddleaudio::sox_effects::apply_effects_fileobj(
+      std::move(fileobj), effects, normalize, channels_first, format);
+}
+// Loads audio from the file at `path`.
+// The frame window is converted into a list of effects by get_effects (which
+// throws std::runtime_error on invalid values); everything else is handed to
+// sox_effects::apply_effects_file and its result is returned unchanged.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
+    const std::string& path,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format) {
+    auto trim_effects = get_effects(frame_offset, num_frames);
+    return paddleaudio::sox_effects::apply_effects_file(
+        path,
+        trim_effects,
+        normalize,
+        channels_first,
+        format);
+}
+// Writes `tensor` to the file at `path` through a sox effects chain.
+//   path:            destination file.
+//   tensor:          audio data; checked by validate_input_tensor.
+//   sample_rate:     sampling rate of `tensor`.
+//   channels_first:  true if axis 0 of `tensor` is the channel axis,
+//                    false if axis 1 is.
+//   compression, encoding, bits_per_sample:
+//                    forwarded to get_encodinginfo_for_save.
+//   format:          file type; when unset it is derived from `path` with
+//                    get_filetype.
+// Throws std::runtime_error when the format constraints below are violated
+// or when the output file cannot be opened.
+void save_audio_file(const std::string& path,
+                     py::array tensor,
+                     int64_t sample_rate,
+                     bool channels_first,
+                     tl::optional<double> compression,
+                     tl::optional<std::string> format,
+                     tl::optional<std::string> encoding,
+                     tl::optional<int64_t> bits_per_sample) {
+    validate_input_tensor(tensor);
+    const auto filetype = [&]() {
+        if (format.has_value()) return format.value();
+        return get_filetype(path);
+    }();
+    // These constraints are enforced with exceptions rather than assert() so
+    // that they are still checked in builds compiled with NDEBUG and reach the
+    // caller as an error instead of aborting the process. The behaviour
+    // matches save_audio_fileobj.
+    if (filetype == "amr-nb") {
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+        if (num_channels != 1) {
+            throw std::runtime_error(
+                "amr-nb format only supports single channel audio.");
+        }
+    } else if (filetype == "htk") {
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+        if (num_channels != 1) {
+            throw std::runtime_error(
+                "htk format only supports single channel audio.");
+        }
+    } else if (filetype == "gsm") {
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+        if (num_channels != 1) {
+            throw std::runtime_error(
+                "gsm format only supports single channel audio.");
+        }
+        if (sample_rate != 8000) {
+            throw std::runtime_error(
+                "gsm format only supports a sampling rate of 8kHz.");
+        }
+    }
+    const auto signal_info =
+        get_signalinfo(&tensor, sample_rate, filetype, channels_first);
+    const auto encoding_info = get_encodinginfo_for_save(
+        filetype, tensor.dtype(), compression, encoding, bits_per_sample);
+    SoxFormat sf(sox_open_write(path.c_str(),
+                                &signal_info,
+                                &encoding_info,
+                                /*filetype=*/filetype.c_str(),
+                                /*oob=*/nullptr,
+                                /*overwrite_permitted=*/nullptr));
+    if (static_cast<sox_format_t*>(sf) == nullptr) {
+        throw std::runtime_error(
+            "Error saving audio file: failed to open file " + path);
+    }
+    // Tensor -> output file; the output encoding is taken from the opened
+    // file handle.
+    paddleaudio::sox_effects_chain::SoxEffectsChain chain(
+        /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
+        /*output_encoding=*/sf->encoding);
+    chain.addInputTensor(&tensor, sample_rate, channels_first);
+    chain.addOutputFile(sf);
+    chain.run();
+}
+namespace {
+// Holder for the (pointer, size) pair that save_audio_fileobj passes to
+// sox_open_memstream_write. The pointer is released with free() when the
+// holder goes out of scope. Copying and moving are disabled.
+struct AutoReleaseBuffer {
+  char* ptr = nullptr;
+  size_t size = 0;
+  AutoReleaseBuffer() = default;
+  AutoReleaseBuffer(const AutoReleaseBuffer&) = delete;
+  AutoReleaseBuffer(AutoReleaseBuffer&&) = delete;
+  AutoReleaseBuffer& operator=(const AutoReleaseBuffer&) = delete;
+  AutoReleaseBuffer& operator=(AutoReleaseBuffer&&) = delete;
+  ~AutoReleaseBuffer() {
+    // free(nullptr) is a no-op, so no explicit null check is required.
+    free(ptr);
+  }
+};
+} // namespace
+// Encodes `tensor` into an in-memory stream and writes the resulting bytes to
+// the Python object `fileobj` through its "write" attribute.
+//   format is mandatory here; compression, encoding and bits_per_sample are
+//   forwarded to get_encodinginfo_for_save.
+// Throws std::runtime_error when `format` is missing, when the format
+// constraints below are violated, or when the memory stream cannot be opened.
+void save_audio_fileobj(
+    py::object fileobj,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample) {
+  if (!format.has_value()) {
+    throw std::runtime_error(
+        "`format` is required when saving to file object.");
+  }
+  const std::string filetype = format.value();
+
+  // Throws `message` unless the channel axis of the tensor has extent 1.
+  const auto require_mono = [&](const char* message) {
+    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+    if (num_channels != 1) {
+      throw std::runtime_error(message);
+    }
+  };
+  if (filetype == "amr-nb") {
+    require_mono("amr-nb format only supports single channel audio.");
+  } else if (filetype == "htk") {
+    require_mono("htk format only supports single channel audio.");
+  } else if (filetype == "gsm") {
+    require_mono("gsm format only supports single channel audio.");
+    if (sample_rate != 8000) {
+      throw std::runtime_error(
+          "gsm format only supports a sampling rate of 8kHz.");
+    }
+  }
+
+  const auto signal_info =
+      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
+  const auto encoding_info = get_encodinginfo_for_save(
+      filetype,
+      tensor.dtype(),
+      compression,
+      std::move(encoding),
+      bits_per_sample);
+
+  // `buffer` is declared before `sf`, so it is destroyed after it.
+  AutoReleaseBuffer buffer;
+  SoxFormat sf(sox_open_memstream_write(&buffer.ptr,
+                                        &buffer.size,
+                                        &signal_info,
+                                        &encoding_info,
+                                        filetype.c_str(),
+                                        /*oob=*/nullptr));
+  if (static_cast<sox_format_t*>(sf) == nullptr) {
+    throw std::runtime_error(
+        "Error saving audio file: failed to open memory stream.");
+  }
+
+  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
+      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
+      /*output_encoding=*/sf->encoding);
+  chain.addInputTensor(&tensor, sample_rate, channels_first);
+  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
+  chain.run();
+
+  // The sox_format_t has to be closed first: closing flushes the last chunk
+  // into the buffer, and only then are the bytes handed to Python.
+  sf.close();
+  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
+}
+}  // namespace sox_io
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/io.h
+++ b/audio/paddleaudio/src/pybind/sox/io.h
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.h with modification.
+#pragma once
+#include "paddleaudio/src/pybind/sox/utils.h"
+namespace py = pybind11;
+namespace paddleaudio {
+namespace sox_io {
+
+// Metadata of the audio file at `path`.
+// Returns (sample rate, length / channels, number of channels,
+// bits per sample, encoding name). `format` optionally names the file type.
+std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_file(
+    const std::string &path,
+    const tl::optional<std::string> &format);
+
+// Same as get_info_file, but the data is read from the Python object
+// `fileobj` instead of a path.
+std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_fileobj(
+    py::object fileobj,
+    const tl::optional<std::string> &format);
+
+// Loads audio from the Python object `fileobj`. `frame_offset` and
+// `num_frames` are converted with get_effects; the remaining arguments are
+// forwarded to sox_effects::apply_effects_fileobj.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
+    py::object fileobj,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format);
+
+// Encodes `tensor` and writes the bytes to the Python object `fileobj`.
+// `format` must be set; std::runtime_error is thrown otherwise.
+void save_audio_fileobj(
+    py::object fileobj,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample);
+
+// Builds the list of effects that selects `num_frames` frames starting at
+// `frame_offset`. Throws std::runtime_error if `frame_offset` is negative or
+// `num_frames` is neither -1 nor greater than 0.
+std::vector<std::vector<std::string>> get_effects(
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames);
+
+// Loads audio from the file at `path`. `frame_offset` and `num_frames` are
+// converted with get_effects; the remaining arguments are forwarded to
+// sox_effects::apply_effects_file.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
+    const std::string& path,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format);
+
+// Writes `tensor` to the file at `path`. When `format` is unset the file
+// type is derived from `path`.
+void save_audio_file(
+    const std::string& path,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample);
+
+}  // namespace sox_io
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/types.cpp
+++ b/audio/paddleaudio/src/pybind/sox/types.cpp
--- a/audio/paddleaudio/src/pybind/sox/types.h
+++ b/audio/paddleaudio/src/pybind/sox/types.h
--- a/audio/paddleaudio/src/pybind/sox/utils.cpp
+++ b/audio/paddleaudio/src/pybind/sox/utils.cpp
--- a/audio/paddleaudio/src/pybind/sox/utils.h
+++ b/audio/paddleaudio/src/pybind/sox/utils.h
--- a/audio/paddleaudio/src/utils.cpp
+++ b/audio/paddleaudio/src/utils.cpp
--- a/audio/paddleaudio/third_party/.gitignore
+++ b/audio/paddleaudio/third_party/.gitignore
+archives/
+install/
--- a/audio/paddleaudio/third_party/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/CMakeLists.txt
+# Compile the C++ sources configured from this directory (and the
+# subdirectories added below) with hidden default symbol visibility.
+# -fvisibility=hidden is GCC/Clang syntax; MSVC does not recognise it, so the
+# flag is only appended for other compilers.
+if(NOT MSVC)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
+endif()
+################################################################################
+# sox
+################################################################################
+# Only descend into sox/ when the BUILD_SOX option is enabled.
+if(BUILD_SOX)
+  add_subdirectory(sox)
+endif()
+################################################################################
+# kaldi
+################################################################################
+# Only descend into kaldi/ when BUILD_KALDI is enabled.
+if(BUILD_KALDI)
+  add_subdirectory(kaldi)
+endif()
\ No newline at end of file
--- a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
--- a/audio/paddleaudio/third_party/patches/config.guess
+++ b/audio/paddleaudio/third_party/patches/config.guess
--- a/audio/paddleaudio/third_party/patches/config.sub
+++ b/audio/paddleaudio/third_party/patches/config.sub
--- a/audio/paddleaudio/third_party/patches/libmad.patch
+++ b/audio/paddleaudio/third_party/patches/libmad.patch
--- a/audio/paddleaudio/third_party/patches/sox.patch
+++ b/audio/paddleaudio/third_party/patches/sox.patch
--- a/audio/paddleaudio/third_party/sox/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/sox/CMakeLists.txt
--- a/audio/paddleaudio/utils/sox_utils.py
+++ b/audio/paddleaudio/utils/sox_utils.py
--- a/audio/setup.py
+++ b/audio/setup.py
--- a/audio/tests/backends/__init__.py
+++ b/audio/tests/backends/__init__.py
--- a/audio/tests/backends/soundfile/__init__.py
+++ b/audio/tests/backends/soundfile/__init__.py
--- a/audio/tests/backends/soundfile/common.py
+++ b/audio/tests/backends/soundfile/common.py
--- a/audio/tests/backends/soundfile/common_utils
+++ b/audio/tests/backends/soundfile/common_utils
+../../common_utils
\ No newline at end of file
--- a/audio/tests/backends/soundfile/info_test.py
+++ b/audio/tests/backends/soundfile/info_test.py
--- a/audio/tests/backends/soundfile/load_test.py
+++ b/audio/tests/backends/soundfile/load_test.py
--- a/audio/tests/backends/soundfile/save_test.py
+++ b/audio/tests/backends/soundfile/save_test.py
--- a/audio/tests/backends/sox_io/common.py
+++ b/audio/tests/backends/sox_io/common.py
--- a/audio/tests/backends/sox_io/common_utils
+++ b/audio/tests/backends/sox_io/common_utils
+../../common_utils
\ No newline at end of file
--- a/audio/tests/backends/sox_io/info_test.py
+++ b/audio/tests/backends/sox_io/info_test.py
--- a/audio/tests/backends/sox_io/load_test.py
+++ b/audio/tests/backends/sox_io/load_test.py
--- a/audio/tests/backends/sox_io/save_test.py
+++ b/audio/tests/backends/sox_io/save_test.py
--- a/audio/tests/backends/sox_io/smoke_test.py
+++ b/audio/tests/backends/sox_io/smoke_test.py
--- a/audio/tests/backends/sox_io/sox_effect_test.py
+++ b/audio/tests/backends/sox_io/sox_effect_test.py
--- a/audio/tests/backends/sox_io/sox_effect_test_args.jsonl
+++ b/audio/tests/backends/sox_io/sox_effect_test_args.jsonl
--- a/audio/tests/common_utils/__init__.py
+++ b/audio/tests/common_utils/__init__.py
--- a/audio/tests/common_utils/data_utils.py
+++ b/audio/tests/common_utils/data_utils.py
--- a/audio/tests/common_utils/sox_utils.py
+++ b/audio/tests/common_utils/sox_utils.py
--- a/tests/unit/audio/features/test_istft.py
+++ b/tests/unit/audio/features/test_istft.py
--- a/audio/tests/features/test_kaldi_feat.py
+++ b/audio/tests/features/test_kaldi_feat.py
--- a/tests/unit/audio/features/test_log_melspectrogram.py
+++ b/tests/unit/audio/features/test_log_melspectrogram.py
--- a/tests/unit/audio/features/test_spectrogram.py
+++ b/tests/unit/audio/features/test_spectrogram.py
--- a/tests/unit/audio/features/test_stft.py
+++ b/tests/unit/audio/features/test_stft.py
--- a/audio/tests/features/testdata/fbank_feat.ark
+++ b/audio/tests/features/testdata/fbank_feat.ark
--- a/tests/unit/audio/features/testdata/fbank_feat.ark
+++ b/tests/unit/audio/features/testdata/fbank_feat.ark
--- a/audio/tests/features/testdata/pitch_feat.ark
+++ b/audio/tests/features/testdata/pitch_feat.ark
--- a/tests/unit/audio/features/testdata/pitch_feat.ark
+++ b/tests/unit/audio/features/testdata/pitch_feat.ark
--- a/tests/unit/audio/features/testdata/test.wav
+++ b/tests/unit/audio/features/testdata/test.wav
--- a/tools/setup_helpers/__init__.py
+++ b/tools/setup_helpers/__init__.py
--- a/tools/setup_helpers/extension.py
+++ b/tools/setup_helpers/extension.py
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
--- a/speechx/speechx/kaldi/matrix/kaldi-blas.h
+++ b/speechx/speechx/kaldi/matrix/kaldi-blas.h
--- a/tests/unit/audio/features/base.py
+++ b/tests/unit/audio/features/base.py
--- a/tests/unit/audio/features/testdata/wav.ark
+++ b/tests/unit/audio/features/testdata/wav.ark