[audio] mv paddlespeech/audio to paddleaudio (#2706)

* split paddlespeech/audio to paddleaudio. * add sox io ,sox effect, kaldi native fbank to paddleaudio.

[audio] mv paddlespeech/audio to paddleaudio (#2706)
* split paddlespeech/audio to paddleaudio. * add sox io ,sox effect, kaldi native fbank to paddleaudio.
42ff9460 · YangZhou · GitHub · 0cc54bb7 · 42ff9460 · 42ff9460
245 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,9 @@
 build
 *output/

+audio/dist/
+audio/fc_patch/
+
 docs/build/
 docs/topic/ctc/warp-ctc/

@@ -42,6 +45,7 @@ tools/python-soundfile/
 tools/onnx
 tools/onnxruntime
 tools/Paddle2ONNX
+tools/onnx-simplifier/

 speechx/fc_patch/


--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,8 +3,13 @@ repos:
    rev: v0.16.0
    hooks:
    -   id: yapf
-        files: \.py$
-        exclude: (?=third_party).*(\.py)$
+        name: yapf
+        language: python
+        entry: yapf
+        args: [-i, -vv]
+        types: [python]
+        # Skip vendored sources; `.h` and `.hpp` are separate alternatives.
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
+
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: a11d9314b22d8f8c7556443875b731ef05965464
    hooks:
@@ -30,7 +35,8 @@ repos:
        -  --ignore=E501,E228,E226,E261,E266,E128,E402,W503
        -  --builtins=G,request
        -  --jobs=1
-        exclude: (?=third_party).*(\.py)$
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
+
 -   repo : https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.0.1
    hooks:
@@ -42,6 +48,7 @@ repos:
        files: \.md$
    -   id: remove-tabs
        files: \.md$
+
 -   repo: local
    hooks:
    -   id: clang-format
@@ -49,23 +56,17 @@ repos:
        description: Format files with ClangFormat
        entry: bash .pre-commit-hooks/clang-format.hook -i
        language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
-        exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$
-    #-   id: copyright_checker
-    #    name: copyright_checker
-    #    entry: python .pre-commit-hooks/copyright-check.hook
-    #    language: system
-    #    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
-    #    exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$
+        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
    -   id: cpplint
        name: cpplint
        description: Static code analysis of C/C++ files
        language: python
        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-        exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$ 
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
        entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
 -   repo: https://github.com/asottile/reorder_python_imports
    rev: v2.4.0
    hooks:
      - id: reorder-python-imports
-        exclude: (?=third_party).*(\.py)$
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -23,4 +23,4 @@ python:
    - requirements: docs/requirements.txt
    - method: setuptools
      path: .
-  system_packages: true
\ No newline at end of file
+  system_packages: true
--- a/audio/CMakeLists.txt
+++ b/audio/CMakeLists.txt
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
+
+# CMP0025: use compiler ID "AppleClang" instead of "Clang" for XCode.
+# Not setting this sometimes makes the XCode C compiler get detected as
+# "Clang" even when the C++ one is detected as "AppleClang".
+# CMP0010 is set to NEW alongside it.
+foreach(paddleaudio_policy IN ITEMS CMP0010 CMP0025)
+  cmake_policy(SET ${paddleaudio_policy} NEW)
+endforeach()
+
+# CMP0092: suppress warning flags in the default MSVC configuration.
+# It is only applied when the running CMake knows the policy.
+if(POLICY CMP0092)
+  cmake_policy(SET CMP0092 NEW)
+endif()
+
+project(paddleaudio)
+
+# Check and set CMAKE_CXX_STANDARD.
+# Warn when CMAKE_CXX_FLAGS already carries a "-std=c++" switch, because it
+# would compete with the standard selected below.
+string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
+if(env_cxx_standard GREATER -1)
+  # message() joins its arguments without a separator, so the first string
+  # ends with a space to keep the two sentences apart.
+  message(
+      WARNING "C++ standard version definition detected in environment variable. "
+      "paddleaudio requires -std=c++14. Please remove -std=c++ settings in your environment.")
+endif()
+
+
+set(CMAKE_CXX_STANDARD 14)
+# Fail at configure time instead of silently decaying to an older standard.
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_C_STANDARD 11)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+# Options
+option(BUILD_SOX "Build libsox statically" ON)
+option(BUILD_MAD "Enable libmad" ON)
+option(BUILD_KALDI "Build kaldi statically" ON)
+option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON)
+
+
+# Make the project's own modules (FindGFortranLibs, openblas, ...) visible to
+# find_package() and include().
+list(APPEND CMAKE_MODULE_PATH
+     "${PROJECT_SOURCE_DIR}/cmake"
+     "${PROJECT_SOURCE_DIR}/cmake/external")
+
+if (NOT MSVC)
+    # find_package() already runs cmake/FindGFortranLibs.cmake in this scope,
+    # so the module is not include()d a second time.
+    find_package(GFortranLibs REQUIRED)
+    include(FortranCInterface)
+endif()
+
+# fc_patch dir: download/build area shared by FetchContent and the external
+# projects (THIRD_PARTY_PATH).
+set(FETCHCONTENT_QUIET off)
+get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
+set(FETCHCONTENT_BASE_DIR ${fc_patch})
+set(THIRD_PARTY_PATH ${fc_patch})
+
+include(openblas)
+
+# PY_VERSION and PYTHON_INCLUDE_DIR are not set in this file; presumably they
+# are passed on the cmake command line -- TODO confirm against the caller.
+set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
+include(cmake/pybind.cmake)
+include_directories(${PYTHON_INCLUDE_DIR})
+
+# packages
+find_package(Python3 COMPONENTS Interpreter Development)
+
+# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g")
+add_subdirectory(paddleaudio)
+
+# Summary
+include(cmake/summary.cmake)
+onnx_print_configuration_summary()
--- a/audio/README.md
+++ b/audio/README.md
+# PaddleAudio
+
+安装方式： pip install paddleaudio
+
+目前支持的平台：Linux
+
+## Environment
+
+## Build wheel
+
+Linux test build whl environment:
+* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2`
+* os - Ubuntu 16.04.7 LTS
+* gcc/g++/gfortran - 8.2.0
+* cmake - 3.18.0 (need install)
+
+* [How to Install Docker](https://docs.docker.com/engine/install/)
+* [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
+
+1. First to launch docker container.
+
+```
+docker run --privileged  --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash
+```
+2. python setup.py bdist_wheel
+
+macOS test build whl environment:
+* os 
+* gcc/g++/gfortran 12.2.0
+* cpu Intel Xeon E5 x86_64
+
+
+Windows：
+Not supported: the paddleaudio C++ extension lib (sox io, kaldi native fbank).
+python setup.py bdist_wheel
\ No newline at end of file
--- a/audio/cmake/FindGFortranLibs.cmake
+++ b/audio/cmake/FindGFortranLibs.cmake
+#.rst:
+# FindGFortranLibs
+# --------
+#  https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake
+#  https://enccs.github.io/cmake-workshop/cxx-fortran/
+#
+# Find gcc Fortran compiler & library paths
+#
+# The module defines the following variables:
+#
+# ::
+#
+#
+#   GFORTRANLIBS_FOUND - true if system has gfortran
+#   LIBGFORTRAN_LIBRARIES - path to libgfortran
+#   LIBQUADMATH_LIBRARIES - path to libquadmath
+#   GFORTRAN_LIBRARIES_DIR - directory containing libgfortran, libquadmath
+#   GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers
+#   LIBGOMP_LIBRARIES - path to libgomp
+#   LIBGOMP_INCLUDE_DIR - directory containing omp.h header
+#   GFORTRAN_VERSION_STRING - version of gfortran found
+#
+# Honour find_package(GFortranLibs QUIET): the variable is named after this
+# package, not after the module this file was derived from.
+set(CMAKE_REQUIRED_QUIET ${GFortranLibs_FIND_QUIETLY})
+
+if(NOT CMAKE_REQUIRED_QUIET)
+  message(STATUS "Looking for gfortran related libraries...")
+endif()
+
+enable_language(Fortran)
+if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
+
+  # Basically, call "gfortran -v" to dump compiler info to the string
+  # GFORTRAN_VERBOSE_STR, which will be used to get necessary paths.
+  # The output is captured from stderr (ERROR_VARIABLE).
+  message(STATUS "Extracting library and header information by calling 'gfortran -v'...")
+  execute_process(COMMAND "${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE
+    GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG)
+
+  # For debugging
+  message(STATUS "'gfortran -v' returned:")
+  message(STATUS "${GFORTRAN_VERBOSE_STR}")
+
+  # Detect gfortran version
+  string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}")
+  string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}")
+  message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}")
+  unset(GFORTRAN_VER_STR)
+
+  set(MATCH_REGEX "[^\t\n ]+[\t\n ]+")
+  set(REPLACE_REGEX "([^\t\n ]+)")
+
+  # Find architecture for compiler
+  string(REGEX MATCH "Target: [^\t\n ]+"
+    GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}")
+  message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}")
+  string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1"
+    GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}")
+  message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}")
+  unset(GFORTRAN_ARCH_STR)
+
+  # Find install prefix, if it exists; if not, use default.
+  # The pattern stops at the first whitespace so the extracted directory
+  # carries no trailing blanks.
+  string(REGEX MATCH "--prefix=[^\t\n ]+"
+    GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
+  if(NOT GFORTRAN_PREFIX_STR)
+    message(STATUS "Detected default gfortran prefix")
+    set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install
+  else()
+    string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1"
+      GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}")
+  endif()
+  message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}")
+  unset(GFORTRAN_PREFIX_STR)
+
+  # Find install exec-prefix, if it exists; if not, use default.
+  # string(REGEX MATCH <regex> <out-var> <input>...) takes no replacement
+  # expression: the output variable must directly follow the regex.
+  string(REGEX MATCH "--exec-prefix=[^\t\n ]+"
+    GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
+  if(NOT GFORTRAN_EXEC_PREFIX_STR)
+    message(STATUS "Detected default gfortran exec-prefix")
+    set(GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_PREFIX_DIR}")
+  else()
+    string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1"
+      GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}")
+  endif()
+  message(STATUS "Detected gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}")
+  unset(GFORTRAN_EXEC_PREFIX_STR)
+
+  # Find library directory and include directory, if library directory specified
+  string(REGEX MATCH "--libdir=[^\t\n ]+"
+    GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}")
+  if(NOT GFORTRAN_LIB_DIR_STR)
+    message(STATUS "Found --libdir flag -- not found")
+    message(STATUS "Using default gfortran library & include directory paths")
+    # Quoted so an empty or space-containing prefix stays a single argument.
+    string(STRIP "${GFORTRAN_PREFIX_DIR}" TMPLIBDIR)
+    set(GFORTRAN_LIBRARIES_DIR "${TMPLIBDIR}/lib64")
+    set(GFORTRAN_INCLUDE_DIR "${TMPLIBDIR}/include")
+  else()
+    message(STATUS "Found --libdir flag -- yes")
+    string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1"
+      GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}")
+    string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include")
+  endif()
+  message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}")
+  message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}")
+  unset(GFORTRAN_LIB_DIR_STR)
+
+  # There are lots of other build options for gcc & gfortran. For now, the
+  # options implemented above should cover a lot of common use cases.
+
+  # Clean up by deleting the output string from "gfortran -v"
+  unset(GFORTRAN_VERBOSE_STR)
+
+  # Find paths for libgfortran, libquadmath, libgomp
+  # libgomp needed for OpenMP support without Clang
+  find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran
+    HINTS ${GFORTRAN_LIBRARIES_DIR})
+  find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath
+    HINTS ${GFORTRAN_LIBRARIES_DIR})
+  find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp
+    HINTS ${GFORTRAN_LIBRARIES_DIR})
+
+  # Find OpenMP headers
+  find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR})
+
+else()
+  message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!")
+endif()
+
+include(FindPackageHandleStandardArgs)
+
+# Required: libgfortran, libquadmath, path for gfortran libraries
+# Optional: libgomp, path for OpenMP headers, path for gcc/gfortran headers
+find_package_handle_standard_args(
+  GFortranLibs
+  REQUIRED_VARS
+    LIBGFORTRAN_LIBRARIES
+    LIBQUADMATH_LIBRARIES
+    GFORTRAN_LIBRARIES_DIR
+  VERSION_VAR GFORTRAN_VERSION_STRING)
+
+if(NOT GFORTRANLIBS_FOUND)
+  message(STATUS "Looking for gfortran libraries -- not found")
+else()
+  message(STATUS "Looking for gfortran libraries -- found")
+  message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}")
+endif()
+
+mark_as_advanced(
+  LIBGFORTRAN_LIBRARIES
+  LIBQUADMATH_LIBRARIES
+  LIBGOMP_LIBRARIES
+  LIBGOMP_INCLUDE_DIR
+  GFORTRAN_LIBRARIES_DIR
+  GFORTRAN_INCLUDE_DIR)
+# FindGFortranLIBS.cmake ends here
+
+
+# Dump every result variable as "NAME=value" for build-log debugging.
+foreach(gfortran_result_var IN ITEMS
+    LIBGFORTRAN_LIBRARIES
+    LIBQUADMATH_LIBRARIES
+    LIBGOMP_LIBRARIES
+    LIBGOMP_INCLUDE_DIR
+    GFORTRAN_LIBRARIES_DIR
+    GFORTRAN_INCLUDE_DIR)
+  message(STATUS ${gfortran_result_var}= ${${gfortran_result_var}})
+endforeach()
--- a/audio/cmake/external/openblas.cmake
+++ b/audio/cmake/external/openblas.cmake
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+# Locations and upstream coordinates of the bundled OpenBLAS build.
+set(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas)
+set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
+set(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
+set(CBLAS_TAG v0.3.10)
+
+# `make -j${NPROC}` with an empty NPROC degrades to a bare `make -j`, which
+# spawns an unbounded number of jobs.  Fall back to the detected processor
+# count (or 1) when the caller did not provide a usable value.
+if(NOT NPROC)
+  include(ProcessorCount)
+  ProcessorCount(NPROC)
+  if(NPROC EQUAL 0)
+    set(NPROC 1)
+  endif()
+endif()
+
+if(NOT WIN32)
+  # Unix-like platforms: build the OpenBLAS source archive in place with make
+  # and expose the resulting static library.
+  set(CBLAS_LIBRARIES
+      "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "openblas library." FORCE)
+  set(CBLAS_INC_DIR
+      "${CBLAS_INSTALL_DIR}/include"
+      CACHE PATH "openblas include directory." FORCE)
+  set(OPENBLAS_CC
+      "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+
+  if(APPLE)
+    set(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+  endif()
+  set(OPTIONAL_ARGS "")
+  set(COMMON_ARGS "")
+
+  if(APPLE)
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
+      set(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
+    endif()
+    set(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1)
+  endif()
+
+  ExternalProject_Add(
+    OPENBLAS
+    URL "https://paddleaudio.bj.bcebos.com/build/OpenBLAS-0.3.10.zip"
+    GIT_SHALLOW YES
+    DOWNLOAD_DIR ${CBLAS_PREFIX_DIR}
+    SOURCE_DIR ${CBLAS_PREFIX_DIR}
+    INSTALL_DIR ${CBLAS_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND make -j${NPROC} ${COMMON_ARGS} ${OPTIONAL_ARGS}
+    INSTALL_COMMAND make install PREFIX=<INSTALL_DIR>
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
+
+  # Imported static target pointing at the library the external build installs.
+  ExternalProject_Get_Property(OPENBLAS INSTALL_DIR)
+  set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR})
+  add_library(openblas STATIC IMPORTED)
+  add_dependencies(openblas OPENBLAS)
+  set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran)
+  set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a)
+
+  link_directories(${OpenBLAS_INSTALL_PREFIX}/lib)
+  include_directories(${OpenBLAS_INSTALL_PREFIX}/include)
+
+  set(OPENBLAS_LIBRARIES
+      ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a
+  )
+
+  # Interface wrapper carrying the include directory and link line.
+  add_library(libopenblas INTERFACE)
+  add_dependencies(libopenblas openblas)
+  target_include_directories(libopenblas INTERFACE ${OpenBLAS_INSTALL_PREFIX}/include/openblas)
+  target_link_libraries(libopenblas INTERFACE ${OPENBLAS_LIBRARIES})
+else()
+  # Windows: build OpenBLAS from git with its CMake build, using clang-cl
+  # and flang.
+  set(CBLAS_LIBRARIES
+      "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "openblas library." FORCE)
+  set(CBLAS_INC_DIR
+      "${CBLAS_INSTALL_DIR}/include/openblas"
+      CACHE PATH "openblas include directory." FORCE)
+  ExternalProject_Add(
+    extern_openblas
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY ${CBLAS_REPOSITORY}
+    GIT_TAG ${CBLAS_TAG}
+    PREFIX ${CBLAS_PREFIX_DIR}
+    INSTALL_DIR ${CBLAS_INSTALL_DIR}
+    BUILD_IN_SOURCE 0
+    UPDATE_COMMAND ""
+    CMAKE_ARGS -DCMAKE_C_COMPILER=clang-cl
+               -DCMAKE_CXX_COMPILER=clang-cl
+               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+               -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
+               -DCMAKE_BUILD_TYPE=Release #${THIRD_PARTY_BUILD_TYPE}
+               -DCMAKE_MT=mt
+               -DUSE_THREAD=OFF
+               -DBUILD_WITHOUT_LAPACK=NO
+               -DCMAKE_Fortran_COMPILER=flang
+               -DNOFORTRAN=0
+               -DDYNAMIC_ARCH=ON
+               #${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS
+      -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
+      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+      -DCMAKE_BUILD_TYPE:STRING=Release #${THIRD_PARTY_BUILD_TYPE}
+    # ninja need to know where openblas.lib comes from
+    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
+  set(OPENBLAS_SHARED_LIB
+      ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
+
+  add_library(openblas INTERFACE)
+  add_dependencies(openblas extern_openblas)
+  include_directories(${CBLAS_INC_DIR})
+  link_libraries(${CBLAS_LIBRARIES})
+endif()
+
--- a/audio/cmake/pybind.cmake
+++ b/audio/cmake/pybind.cmake
+#the pybind11 is from:https://github.com/pybind/pybind11
+# Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+
+# Download (once) and unpack the pybind11 sources into FETCHCONTENT_BASE_DIR,
+# then add their include directory to the build.
+set(PYBIND_ZIP "v2.10.0.zip")
+set(LOCAL_PYBIND_ZIP ${FETCHCONTENT_BASE_DIR}/${PYBIND_ZIP})
+set(PYBIND_SRC ${FETCHCONTENT_BASE_DIR}/pybind11)
+set(DOWNLOAD_URL "https://paddleaudio.bj.bcebos.com/build/v2.10.0.zip")
+set(PYBIND_TIMEOUT 600 CACHE STRING "Timeout in seconds when downloading pybind.")
+
+if(NOT EXISTS "${LOCAL_PYBIND_ZIP}")
+  file(DOWNLOAD ${DOWNLOAD_URL}
+    "${LOCAL_PYBIND_ZIP}"
+    TIMEOUT ${PYBIND_TIMEOUT}
+    STATUS ERR
+    SHOW_PROGRESS
+  )
+
+  # STATUS yields a two-element list "<numeric code>;<message>"; compare the
+  # code only instead of relying on how the whole list parses as a number.
+  list(GET ERR 0 pybind_download_code)
+  list(GET ERR 1 pybind_download_message)
+  if(pybind_download_code EQUAL 0)
+    message(STATUS "download pybind success")
+  else()
+    # Remove the partial archive, otherwise the EXISTS guard above would skip
+    # the download on the next configure run and unpack a broken file.
+    file(REMOVE "${LOCAL_PYBIND_ZIP}")
+    message(FATAL_ERROR "download pybind fail: ${pybind_download_message}")
+  endif()
+endif()
+
+if(NOT EXISTS "${PYBIND_SRC}")
+  execute_process(
+    COMMAND ${CMAKE_COMMAND} -E tar xfz "${LOCAL_PYBIND_ZIP}"
+    WORKING_DIRECTORY "${FETCHCONTENT_BASE_DIR}"
+    RESULT_VARIABLE tar_result
+  )
+
+  # Check the extraction result before touching the extracted tree.  EQUAL is
+  # a numeric comparison; MATCHES 0 would also accept codes such as 10.
+  if(NOT tar_result EQUAL 0)
+    message(FATAL_ERROR "unzip pybind fail")
+  endif()
+  message(STATUS "unzip pybind success")
+
+  # The archive unpacks to pybind11-2.10.0; give it a version-neutral name.
+  file(RENAME "${FETCHCONTENT_BASE_DIR}/pybind11-2.10.0" "${PYBIND_SRC}")
+endif()
+
+include_directories(${PYBIND_SRC}/include)
--- a/audio/cmake/summary.cmake
+++ b/audio/cmake/summary.cmake
+# SPDX-License-Identifier: Apache-2.0
+
+# Prints the accumulated paddleaudio build configuration summary.
+#
+# Takes no arguments and only reads variables visible in the calling scope.
+# The `onnx_` prefix is kept so existing callers of this name keep working.
+function (onnx_print_configuration_summary)
+  message(STATUS "")
+  message(STATUS "******** Summary ********")
+  message(STATUS "  CMake version             : ${CMAKE_VERSION}")
+  message(STATUS "  CMake command             : ${CMAKE_COMMAND}")
+  message(STATUS "  System                    : ${CMAKE_SYSTEM_NAME}")
+  message(STATUS "  C++ compiler              : ${CMAKE_CXX_COMPILER}")
+  message(STATUS "  C++ compiler version      : ${CMAKE_CXX_COMPILER_VERSION}")
+  message(STATUS "  CXX flags                 : ${CMAKE_CXX_FLAGS}")
+  message(STATUS "  Build type                : ${CMAKE_BUILD_TYPE}")
+  get_directory_property(tmp DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)
+  message(STATUS "  Compile definitions       : ${tmp}")
+  message(STATUS "  CMAKE_PREFIX_PATH         : ${CMAKE_PREFIX_PATH}")
+  message(STATUS "  CMAKE_INSTALL_PREFIX      : ${CMAKE_INSTALL_PREFIX}")
+  message(STATUS "  CMAKE_MODULE_PATH         : ${CMAKE_MODULE_PATH}")
+  message(STATUS "")
+  # Project options declared in the top-level CMakeLists.txt.
+  message(STATUS "  BUILD_SOX                 : ${BUILD_SOX}")
+  message(STATUS "  BUILD_MAD                 : ${BUILD_MAD}")
+  message(STATUS "  BUILD_KALDI               : ${BUILD_KALDI}")
+  message(STATUS "  BUILD_PYTHON_EXTENSION    : ${BUILD_PADDLEAUDIO_PYTHON_EXTENSION}")
+  message(STATUS "")
+  message(STATUS "  OpenBLAS library          : ${CBLAS_LIBRARIES}")
+  message(STATUS "  OpenBLAS includes         : ${CBLAS_INC_DIR}")
+  message(STATUS "")
+  # find_package(Python3 ...) publishes Python3_* result variables; the
+  # unversioned Python_* names are only set by find_package(Python).
+  message(STATUS "  Python version            : ${Python3_VERSION}")
+  message(STATUS "  Python executable         : ${Python3_EXECUTABLE}")
+  message(STATUS "  Python includes           : ${Python3_INCLUDE_DIRS}")
+  message(STATUS "  Python libraries          : ${Python3_LIBRARIES}")
+  # pybind11 is unpacked from an archive rather than found as a package, so
+  # only its source directory is known.
+  message(STATUS "  Pybind11 source           : ${PYBIND_SRC}")
+endfunction()
\ No newline at end of file
--- a/audio/paddleaudio/CMakeLists.txt
+++ b/audio/paddleaudio/CMakeLists.txt
+
+add_subdirectory(third_party)
+add_subdirectory(src)
+
+# Copy the GCC/gfortran runtime libraries from GFORTRAN_LIBRARIES_DIR into
+# this directory's lib/ folder at configure time.
+if(APPLE)
+  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib
+       DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib)
+elseif(UNIX)
+  # Other Unix platforms: copy each shared object together with the symlink
+  # chain that leads to it.
+  foreach(runtime_lib IN ITEMS libgfortran.so.5 libquadmath.so.0 libgcc_s.so.1)
+    file(COPY ${GFORTRAN_LIBRARIES_DIR}/${runtime_lib}
+         DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
+  endforeach()
+endif()
--- a/paddlespeech/audio/backends/__init__.py
+++ b/paddlespeech/audio/backends/__init__.py
@@ -11,9 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .soundfile_backend import depth_convert
-from .soundfile_backend import load
-from .soundfile_backend import normalize
-from .soundfile_backend import resample
-from .soundfile_backend import save
-from .soundfile_backend import to_mono
+from . import _extension
+from . import backends
+from . import compliance
+from . import datasets
+from . import features
+from . import functional
+from . import metric
+from . import sox_effects
+from . import utils
--- a/audio/paddleaudio/_extension.py
+++ b/audio/paddleaudio/_extension.py
+import contextlib
+import ctypes
+import os
+import sys
+import types
+import warnings
+from pathlib import Path
+
+from ._internal import module_utils as _mod_utils  # noqa: F401
+
+# True when this interpreter exposes both sys.getdlopenflags and
+# sys.setdlopenflags. Queried with `hasattr` only once, at import time.
+_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
+                                                               'setdlopenflags')
+
+
+@contextlib.contextmanager
+def dl_open_guard():
+    """
+    # https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html
+    Context manager to set the RTLD_GLOBAL dynamic linker flag while we open a
+    shared library to load custom operators.
+    """
+    if _SET_GLOBAL_FLAGS:
+        old_flags = sys.getdlopenflags()
+        sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
+    yield
+    if _SET_GLOBAL_FLAGS:
+        sys.setdlopenflags(old_flags)
+
+
+def resolve_library_path(path: str) -> str:
+    return os.path.realpath(path)
+
+
+class _Ops(types.ModuleType):
+    #__file__ = '_ops.py'
+
+    def __init__(self):
+        super(_Ops, self).__init__('paddleaudio.ops')
+        self.loaded_libraries = set()
+
+    def load_library(self, path):
+        """
+        Loads a shared library from the given path into the current process.
+        This allows dynamically loading custom operators. For this, 
+        you should compile your operator and 
+        the static registration code into a shared library object, and then
+        call ``paddleaudio.ops.load_library('path/to/libcustom.so')`` to load the
+        shared object.
+        After the library is loaded, it is added to the
+        ``paddleaudio.ops.loaded_libraries`` attribute, a set that may be inspected
+        for the paths of all libraries loaded using this function.
+        Args:
+            path (str): A path to a shared library to load.
+        """
+        path = resolve_library_path(path)
+        with dl_open_guard():
+            # https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
+            # Import the shared library into the process, thus running its
+            # static (global) initialization code in order to register custom
+            # operators with the JIT.
+            ctypes.CDLL(path)
+        self.loaded_libraries.add(path)
+
+
+# Directory next to this file where extension libraries are looked up.
+_LIB_DIR = Path(__file__).parent / "lib"
+
+
+def _get_lib_path(lib: str):
+    suffix = "pyd" if os.name == "nt" else "so"
+    path = _LIB_DIR / f"{lib}.{suffix}"
+    return path
+
+
+def _load_lib(lib: str) -> bool:
+    """Load extension module
+    Note:
+        In case `paddleaudio` is deployed with `pex` format, the library file
+        is not in a standard location.
+        In this case, we expect that `libpaddlleaudio` is available somewhere
+        in the search path of dynamic loading mechanism, so that importing
+        `_paddlleaudio` will have library loader find and load `libpaddlleaudio`.
+        This is the reason why the function should not raising an error when the library
+        file is not found.
+    Returns:
+        bool:
+            True if the library file is found AND the library loaded without failure.
+            False if the library file is not found (like in the case where paddlleaudio
+            is deployed with pex format, thus the shared library file is
+            in a non-standard location.).
+            If the library file is found but there is an issue loading the library,
+            (such as missing dependency) then this function raises the exception as-is.
+    Raises:
+        Exception:
+            If the library file is found, but there is an issue loading the library file,
+            (when underlying `ctype.DLL` throws an exception), this function will pass
+            the exception as-is, instead of catching it and returning bool.
+            The expected case is `OSError` thrown by `ctype.DLL` when a dynamic dependency
+            is not found.
+            This behavior was chosen because the expected failure case is not recoverable.
+            If a dependency is missing, then users have to install it.
+    """
+    path = _get_lib_path(lib)
+    if not path.exists():
+        warnings.warn("lib path is not exists:" + str(path))
+        return False
+    ops.load_library(path)
+    return True
+
+
+# Set to True by `_init_ffmpeg` once FFmpeg initialization has completed, so
+# later calls return immediately.
+_FFMPEG_INITIALIZED = False
+
+
+def _init_ffmpeg():
+    global _FFMPEG_INITIALIZED
+    if _FFMPEG_INITIALIZED:
+        return
+
+    if not paddleaudio._paddlleaudio.is_ffmpeg_available():
+        raise RuntimeError(
+            "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
+        )
+
+    try:
+        _load_lib("libpaddlleaudio_ffmpeg")
+    except OSError as err:
+        raise ImportError(
+            "FFmpeg libraries are not found. Please install FFmpeg.") from err
+
+    import paddllespeech.audio._paddlleaudio_ffmpeg  # noqa
+
+    paddleaudio._paddlleaudio.ffmpeg_init()
+    if paddleaudio._paddlleaudio.ffmpeg_get_log_level() > 8:
+        paddleaudio._paddlleaudio.ffmpeg_set_log_level(8)
+
+    _FFMPEG_INITIALIZED = True
+
+
+def _init_extension():
+    if not _mod_utils.is_module_available("paddleaudio._paddleaudio"):
+        warnings.warn(
+            "paddleaudio C++ extension is not available. sox_io, sox_effect, kaldi raw feature is not supported!!!")
+        return
+
+    _load_lib("libpaddleaudio")
+    # This import is for initializing the methods registered via PyBind11
+    # This has to happen after the base library is loaded
+    try:
+        from paddleaudio import _paddleaudio  # noqa
+    except Exception:
+        warnings.warn(
+            "paddleaudio C++ extension is not available. sox_io, sox_effect, kaldi raw feature is not supported!!!")
+        return
+
+    # Because this part is executed as part of `import torchaudio`, we ignore the
+    # initialization failure.
+    # If the FFmpeg integration is not properly initialized, then detailed error
+    # will be raised when client code attempts to import the dedicated feature.
+    try:
+        _init_ffmpeg()
+    except Exception:
+        pass
+
+
+# `ops` must exist before `_init_extension()` runs, because `_load_lib` calls
+# `ops.load_library`.
+ops = _Ops()
+
+# Load the C++ extension as a side effect of importing this module.
+_init_extension()
--- a/audio/paddleaudio/_internal/__init__.py
+++ b/audio/paddleaudio/_internal/__init__.py
--- a/audio/paddleaudio/_internal/module_utils.py
+++ b/audio/paddleaudio/_internal/module_utils.py
+import importlib.util
+import platform
+import warnings
+from functools import wraps
+from typing import Optional
+
+#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py with modification.
+
+
+def is_module_available(*modules: str) -> bool:
+    r"""Returns if a top-level module with :attr:`name` exists *without**
+    importing it. This is generally safer than try-catch block around a
+    `import X`. It avoids third party libraries breaking assumptions of some of
+    our tests, e.g., setting multiprocessing start method when imported
+    (see librosa/#747, torchvision/#544).
+    """
+    return all(importlib.util.find_spec(m) is not None for m in modules)
+
+
+def requires_module(*modules: str):
+    """Build a decorator that reports missing optional modules on call.
+
+    Availability of ``modules`` is checked once, when this factory runs. If
+    nothing is missing the decorator hands the function back unchanged.
+    Otherwise it substitutes a stand-in (carrying the original metadata) that
+    raises ``RuntimeError`` naming the missing module(s), which is clearer for
+    users than a ``NameError:  name 'module' is not defined`` at a random place.
+    """
+    unavailable = [name for name in modules if not is_module_available(name)]
+
+    if len(unavailable) == 1:
+        requirement = f"module: {unavailable[0]}"
+    else:
+        # Only used in the error message when at least one module is missing.
+        requirement = f"modules: {unavailable}"
+
+    def decorator(func):
+        if not unavailable:
+            # Everything is importable: no wrapping needed.
+            return func
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            raise RuntimeError(
+                f"{func.__module__}.{func.__name__} requires {requirement}")
+
+        return wrapped
+
+    return decorator
+
+
+def deprecated(direction: str, version: Optional[str]=None):
+    """Decorator factory that emits a deprecation warning on every call.
+
+    Args:
+        direction (str): Migration steps to be given to users; appended to the
+            warning text.
+        version (str or int): The version when the object will be removed.
+            When ``None`` the message says "future" instead.
+    """
+
+    def decorator(func):
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            release = "future" if version is None else version
+            qualified = f"{func.__module__}.{func.__name__}"
+            message = (f"{qualified} has been deprecated "
+                       f"and will be removed from {release} release. "
+                       f"{direction}")
+            # stacklevel=2 attributes the warning to the caller of ``func``.
+            warnings.warn(message, stacklevel=2)
+            return func(*args, **kwargs)
+
+        return wrapped
+
+    return decorator
+
+
+def is_kaldi_available():
+    """Report whether the ``paddleaudio._paddleaudio`` module can be located."""
+    extension = "paddleaudio._paddleaudio"
+    return is_module_available(extension)
+
+
+def requires_kaldi():
+    """Build a decorator guarding functions that need the kaldi-enabled extension.
+
+    ``is_kaldi_available()`` is evaluated once, when this factory is called.
+    If it is true the decorator returns the function untouched; otherwise it
+    returns a stand-in with the same metadata that raises ``RuntimeError`` on
+    every call.
+    """
+    available = is_kaldi_available()
+
+    def decorator(func):
+        if available:
+            return func
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            raise RuntimeError(
+                f"{func.__module__}.{func.__name__} requires libpaddleaudio build with kaldi")
+
+        return wrapped
+
+    return decorator
+
+
+def _check_soundfile_importable():
+    """Check that ``soundfile`` is both installed and actually importable.
+
+    Returns:
+        bool: ``False`` when the module cannot be located, or when importing
+        it raises (a warning is emitted in that case); ``True`` otherwise.
+    """
+    if is_module_available("soundfile"):
+        try:
+            import soundfile  # noqa: F401
+        except Exception:
+            warnings.warn(
+                "Failed to import soundfile. 'soundfile' backend is not available.")
+        else:
+            return True
+    return False
+
+
+# Probe once at import time; later queries only read this cached flag.
+_is_soundfile_importable = _check_soundfile_importable()
+
+
+def is_soundfile_available():
+    """Return the cached result of the import-time ``soundfile`` probe."""
+    available = _is_soundfile_importable
+    return available
+
+
+def requires_soundfile():
+    """Build a decorator guarding functions that need the ``soundfile`` package.
+
+    ``is_soundfile_available()`` is evaluated once, when this factory is
+    called. If it is true the decorator returns the function untouched;
+    otherwise it returns a stand-in with the same metadata that raises
+    ``RuntimeError`` on every call.
+    """
+    available = is_soundfile_available()
+
+    def decorator(func):
+        if available:
+            return func
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            raise RuntimeError(
+                f"{func.__module__}.{func.__name__} requires soundfile")
+
+        return wrapped
+
+    return decorator
+
+
+def is_sox_available():
+    """Report whether the sox-backed extension can be used on this platform.
+
+    Always ``False`` on Windows (sox is not supported there); elsewhere it is
+    whether the ``paddleaudio._paddleaudio`` module can be located.
+    """
+    on_windows = platform.system() == "Windows"
+    return (not on_windows) and is_module_available("paddleaudio._paddleaudio")
+
+
+def requires_sox():
+    """Build a decorator guarding functions that need the sox-enabled extension.
+
+    ``is_sox_available()`` is evaluated once, when this factory is called. If
+    it is true the decorator returns the function untouched; otherwise it
+    returns a stand-in with the same metadata that raises ``RuntimeError`` on
+    every call.
+    """
+    available = is_sox_available()
+
+    def decorator(func):
+        if available:
+            return func
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            raise RuntimeError(
+                f"{func.__module__}.{func.__name__} requires libpaddleaudio build with sox")
+
+        return wrapped
+
+    return decorator
--- a/audio/paddleaudio/backends/__init__.py
+++ b/audio/paddleaudio/backends/__init__.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import utils
+from .soundfile_backend import depth_convert
+from .soundfile_backend import normalize
+from .soundfile_backend import resample
+from .soundfile_backend import soundfile_load
+from .soundfile_backend import soundfile_save
+from .soundfile_backend import to_mono
+from .utils import get_audio_backend
+from .utils import list_audio_backends
+from .utils import set_audio_backend
+
+utils._init_audio_backend()
--- a/audio/paddleaudio/backends/common.py
+++ b/audio/paddleaudio/backends/common.py
+# Taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification.
+
+class AudioInfo:
+    """Return type of the ``info`` function: basic metadata of an audio stream.
+
+    This class is used by :ref:`"sox_io" backend<sox_io_backend>` and
+    :ref:`"soundfile" backend with the new interface<soundfile_backend>`.
+
+    :ivar int sample_rate: Sample rate
+    :ivar int num_frames: The number of frames
+    :ivar int num_channels: The number of channels
+    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
+        or when it cannot be accurately inferred.
+    :ivar str encoding: Audio encoding
+        The values encoding can take are one of the following:
+
+            * ``PCM_S``: Signed integer linear PCM
+            * ``PCM_U``: Unsigned integer linear PCM
+            * ``PCM_F``: Floating point linear PCM
+            * ``FLAC``: Flac, Free Lossless Audio Codec
+            * ``ULAW``: Mu-law
+            * ``ALAW``: A-law
+            * ``MP3`` : MP3, MPEG-1 Audio Layer III
+            * ``VORBIS``: OGG Vorbis
+            * ``AMR_WB``: Adaptive Multi-Rate Wideband
+            * ``AMR_NB``: Adaptive Multi-Rate Narrowband
+            * ``OPUS``: Opus
+            * ``HTK``: Single channel 16-bit PCM
+            * ``UNKNOWN`` : None of above
+    """
+
+    def __init__(
+        self,
+        sample_rate: int,
+        num_frames: int,
+        num_channels: int,
+        bits_per_sample: int,
+        encoding: str,
+    ):
+        """Store the given metadata values as attributes, unchanged."""
+        self.sample_rate = sample_rate
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.bits_per_sample = bits_per_sample
+        self.encoding = encoding
+
+    def __str__(self):
+        """Render as ``<ClassName>(field=value, ...)`` in declaration order."""
+        # Use the real class name: the text previously said "AudioMetaData",
+        # a name that does not exist in this module.
+        return (
+            f"{type(self).__name__}("
+            f"sample_rate={self.sample_rate}, "
+            f"num_frames={self.num_frames}, "
+            f"num_channels={self.num_channels}, "
+            f"bits_per_sample={self.bits_per_sample}, "
+            f"encoding={self.encoding}"
+            f")"
+        )
--- a/audio/paddleaudio/backends/no_backend.py
+++ b/audio/paddleaudio/backends/no_backend.py
+from pathlib import Path
+from typing import Callable
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+from paddle import Tensor
+
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
+
+
+def load(
+        filepath: Union[str, Path],
+        out: Optional[Tensor]=None,
+        normalization: Union[bool, float, Callable]=True,
+        channels_first: bool=True,
+        num_frames: int=0,
+        offset: int=0,
+        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    """Placeholder ``load`` used when no audio backend is selected.
+
+    All arguments are ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    message = "No audio I/O backend is available."
+    raise RuntimeError(message)
+
+
+def save(filepath: str,
+         src: Tensor,
+         sample_rate: int,
+         precision: int=16,
+         channels_first: bool=True) -> None:
+    """Placeholder ``save`` used when no audio backend is selected.
+
+    All arguments are ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    message = "No audio I/O backend is available."
+    raise RuntimeError(message)
+
+
+def info(filepath: str) -> None:
+    """Placeholder ``info`` used when no audio backend is selected.
+
+    The argument is ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    message = "No audio I/O backend is available."
+    raise RuntimeError(message)
--- a/paddlespeech/audio/backends/soundfile_backend.py
+++ b/paddlespeech/audio/backends/soundfile_backend.py
--- a/audio/paddleaudio/backends/sox_io_backend.py
+++ b/audio/paddleaudio/backends/sox_io_backend.py
+import os
+from typing import Optional
+from typing import Tuple
+
+import paddle
+import paddleaudio
+from paddle import Tensor
+from paddleaudio._internal import module_utils as _mod_utils
+
+from .common import AudioInfo
+
+#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
+
+
+def _fail_info(filepath: str, format: Optional[str]) -> AudioInfo:
+    """Fallback for path inputs: always raise, naming the file. ``format`` is unused."""
+    message = "Failed to fetch metadata from {}".format(filepath)
+    raise RuntimeError(message)
+
+
+def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioInfo:
+    """Fallback for file-like inputs: always raise, naming the object. ``format`` is unused."""
+    message = "Failed to fetch metadata from {}".format(fileobj)
+    raise RuntimeError(message)
+
+
+# Signature mirrors ``load`` so it can be called with the same arguments.
+def _fail_load(
+        filepath: str,
+        frame_offset: int=0,
+        num_frames: int=-1,
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    """Fallback for path inputs: always raise, naming the file.
+
+    Every argument other than ``filepath`` is ignored.
+    """
+    message = "Failed to load audio from {}".format(filepath)
+    raise RuntimeError(message)
+
+
+def _fail_load_fileobj(fileobj, *args, **kwargs):
+    """Fallback for file-like inputs: always raise, naming the object.
+
+    Extra positional and keyword arguments are accepted and ignored.
+    """
+    message = f"Failed to load audio from {fileobj}"
+    raise RuntimeError(message)
+
+
+# Handlers invoked by ``load`` / ``info`` when the extension returns ``None``.
+_fallback_info = _fail_info
+_fallback_info_fileobj = _fail_info_fileobj
+_fallback_load = _fail_load
+# ``load`` refers to this handler as ``_fallback_load_fileobj``; the name was
+# previously misspelled here, which made that code path raise ``NameError``.
+_fallback_load_fileobj = _fail_load_fileobj
+# Misspelled alias kept so any existing reference to the old name still works.
+_fallback_load_filebj = _fallback_load_fileobj
+
+
+@_mod_utils.requires_sox()
+def load(
+        filepath: str,
+        frame_offset: int=0,
+        num_frames: int=-1,
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    """Load audio through the ``paddleaudio._paddleaudio`` extension.
+
+    Args:
+        filepath: A path-like object, or any object exposing ``read`` (treated
+            as a file-like source).
+        frame_offset, num_frames, normalize, channels_first, format: Forwarded
+            unchanged to the extension call.
+
+    Returns:
+        Tuple[Tensor, int]: The first element of the extension's result
+        converted with ``paddle.to_tensor``, and its second element as is.
+        If the extension returns ``None`` the matching module-level fallback
+        handler is called and its result returned instead.
+    """
+    options = (frame_offset, num_frames, normalize, channels_first, format)
+    is_fileobj = hasattr(filepath, "read")
+
+    if is_fileobj:
+        result = paddleaudio._paddleaudio.load_audio_fileobj(filepath,
+                                                             *options)
+    else:
+        filepath = os.fspath(filepath)
+        result = paddleaudio._paddleaudio.sox_io_load_audio_file(filepath,
+                                                                 *options)
+
+    if result is None:
+        # Fallback handlers are resolved only when actually needed.
+        if is_fileobj:
+            return _fallback_load_fileobj(filepath, *options)
+        return _fallback_load(filepath, *options)
+
+    waveform = paddle.to_tensor(result[0])
+    return (waveform, result[1])
+
+
+@_mod_utils.requires_sox()
+def save(
+        filepath: str,
+        src: Tensor,
+        sample_rate: int,
+        channels_first: bool=True,
+        compression: Optional[float]=None,
+        format: Optional[str]=None,
+        encoding: Optional[str]=None,
+        bits_per_sample: Optional[int]=None, ):
+    """Write ``src`` through the ``paddleaudio._paddleaudio`` extension.
+
+    Args:
+        filepath: A path-like object, or any object exposing ``write``
+            (treated as a file-like destination).
+        src: Tensor whose ``numpy()`` result is handed to the extension.
+        sample_rate, channels_first, compression, format, encoding,
+        bits_per_sample: Forwarded unchanged to the extension call.
+    """
+    waveform = src.numpy()
+    options = (sample_rate, channels_first, compression, format, encoding,
+               bits_per_sample)
+
+    if hasattr(filepath, "write"):
+        paddleaudio._paddleaudio.save_audio_fileobj(filepath, waveform,
+                                                    *options)
+    else:
+        target = os.fspath(filepath)
+        paddleaudio._paddleaudio.sox_io_save_audio_file(target, waveform,
+                                                        *options)
+
+
+@_mod_utils.requires_sox()
+def info(
+        filepath: str,
+        format: Optional[str]=None, ) -> AudioInfo:
+    """Fetch audio metadata through the ``paddleaudio._paddleaudio`` extension.
+
+    Args:
+        filepath: A path-like object, or any object exposing ``read`` (treated
+            as a file-like source).
+        format: Forwarded unchanged to the extension call.
+
+    Returns:
+        AudioInfo: Built by unpacking the extension's result. If the extension
+        returns ``None`` the matching module-level fallback handler is called
+        and its result returned instead.
+    """
+    is_fileobj = hasattr(filepath, "read")
+
+    if is_fileobj:
+        fields = paddleaudio._paddleaudio.get_info_fileobj(filepath, format)
+    else:
+        filepath = os.fspath(filepath)
+        fields = paddleaudio._paddleaudio.get_info_file(filepath, format)
+
+    if fields is None:
+        # Fallback handlers are resolved only when actually needed.
+        if is_fileobj:
+            return _fallback_info_fileobj(filepath, format)
+        return _fallback_info(filepath, format)
+
+    return AudioInfo(*fields)
--- a/audio/paddleaudio/backends/utils.py
+++ b/audio/paddleaudio/backends/utils.py
+"""Defines utilities for switching audio backends"""
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
+import warnings
+from typing import List
+from typing import Optional
+
+import paddleaudio
+from paddleaudio._internal import module_utils as _mod_utils
+
+from . import no_backend
+from . import soundfile_backend
+from . import sox_io_backend
+
+__all__ = [
+    "list_audio_backends",
+    "get_audio_backend",
+    "set_audio_backend",
+]
+
+
+def list_audio_backends() -> List[str]:
+    """List available backends
+
+    ``"soundfile"`` is listed only when ``_mod_utils.is_soundfile_available()``
+    reports that the package was actually imported successfully, not merely
+    that it is installed. ``"sox_io"`` is listed when
+    ``_mod_utils.is_sox_available()`` is true.
+
+    Returns:
+        List[str]: The list of available backends.
+    """
+    backends = []
+    # Use the import-verified check so a broken installation is not advertised.
+    if _mod_utils.is_soundfile_available():
+        backends.append("soundfile")
+    if _mod_utils.is_sox_available():
+        backends.append("sox_io")
+    return backends
+
+
+def set_audio_backend(backend: Optional[str]):
+    """Select the module whose ``save``/``load``/``info`` back ``paddleaudio``.
+
+    Args:
+        backend (str or None): Name of the backend.
+            One of ``"sox_io"`` or ``"soundfile"`` based on availability
+            of the system. If ``None`` is provided the current backend is
+            unassigned (the ``no_backend`` placeholders are installed).
+
+    Raises:
+        RuntimeError: ``backend`` is not ``None`` and is not listed by
+            ``list_audio_backends()``.
+        NotImplementedError: ``backend`` is listed but has no module here.
+    """
+    if backend is not None:
+        available = list_audio_backends()
+        if backend not in available:
+            raise RuntimeError(f'Backend "{backend}" is not one of '
+                               f"available backends: {available}.")
+
+    implementations = {
+        None: no_backend,
+        "sox_io": sox_io_backend,
+        "soundfile": soundfile_backend,
+    }
+    if backend not in implementations:
+        raise NotImplementedError(f'Unexpected backend "{backend}"')
+    module = implementations[backend]
+
+    # Rebind the package-level entry points to the chosen implementation.
+    for func in ("save", "load", "info"):
+        setattr(paddleaudio, func, getattr(module, func))
+
+
+def _init_audio_backend():
+    """Pick the default backend: ``soundfile`` first, then ``sox_io``.
+
+    When neither is listed by ``list_audio_backends()`` a warning is issued
+    and the backend is unassigned.
+    """
+    available = list_audio_backends()
+    for preferred in ("soundfile", "sox_io"):
+        if preferred in available:
+            set_audio_backend(preferred)
+            return
+    warnings.warn("No audio backend is available.")
+    set_audio_backend(None)
+
+
+def get_audio_backend() -> Optional[str]:
+    """Get the name of the current backend
+
+    The backend is identified by comparing ``paddleaudio.load`` with each
+    known module's ``load``.
+
+    Returns:
+        Optional[str]: The name of the current backend or ``None`` if no backend is assigned.
+
+    Raises:
+        ValueError: ``paddleaudio.load`` matches none of the known modules.
+    """
+    current = paddleaudio.load
+    candidates = (
+        (no_backend, None),
+        (sox_io_backend, "sox_io"),
+        (soundfile_backend, "soundfile"), )
+    for module, name in candidates:
+        if current == module.load:
+            return name
+    raise ValueError("Unknown backend.")
--- a/paddlespeech/audio/compliance/__init__.py
+++ b/paddlespeech/audio/compliance/__init__.py
--- a/paddlespeech/audio/compliance/kaldi.py
+++ b/paddlespeech/audio/compliance/kaldi.py
--- a/paddlespeech/audio/compliance/librosa.py
+++ b/paddlespeech/audio/compliance/librosa.py
--- a/paddlespeech/audio/datasets/__init__.py
+++ b/paddlespeech/audio/datasets/__init__.py
--- a/paddlespeech/audio/datasets/dataset.py
+++ b/paddlespeech/audio/datasets/dataset.py
@@ -16,7 +16,7 @@ from typing import List
 import numpy as np
 import paddle

-from ..backends import load as load_audio
+from ..backends.soundfile_backend import soundfile_load as load_audio
 from ..compliance.kaldi import fbank as kaldi_fbank
 from ..compliance.kaldi import mfcc as kaldi_mfcc
 from ..compliance.librosa import melspectrogram

--- a/paddlespeech/audio/datasets/esc50.py
+++ b/paddlespeech/audio/datasets/esc50.py
@@ -16,8 +16,8 @@ import os
 from typing import List
 from typing import Tuple

-from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset

 __all__ = ['ESC50']

--- a/paddlespeech/audio/datasets/gtzan.py
+++ b/paddlespeech/audio/datasets/gtzan.py
@@ -17,8 +17,8 @@ import random
 from typing import List
 from typing import Tuple

-from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset

 __all__ = ['GTZAN']

--- a/paddlespeech/audio/datasets/hey_snips.py
+++ b/paddlespeech/audio/datasets/hey_snips.py
--- a/paddlespeech/audio/datasets/rirs_noises.py
+++ b/paddlespeech/audio/datasets/rirs_noises.py
@@ -20,8 +20,8 @@ from typing import List
 from paddle.io import Dataset
 from tqdm import tqdm

-from ..backends import load as load_audio
-from ..backends import save as save_wav
+from ..backends.soundfile_backend import soundfile_load as load_audio
+from ..backends.soundfile_backend import soundfile_save as save_wav
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from .dataset import feat_funcs

--- a/paddlespeech/audio/datasets/tess.py
+++ b/paddlespeech/audio/datasets/tess.py
@@ -17,8 +17,8 @@ import random
 from typing import List
 from typing import Tuple

-from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset

 __all__ = ['TESS']

--- a/paddlespeech/audio/datasets/urban_sound.py
+++ b/paddlespeech/audio/datasets/urban_sound.py
@@ -16,8 +16,8 @@ import os
 from typing import List
 from typing import Tuple

-from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset

 __all__ = ['UrbanSound8K']

--- a/paddlespeech/audio/datasets/voxceleb.py
+++ b/paddlespeech/audio/datasets/voxceleb.py
@@ -23,7 +23,7 @@ from paddle.io import Dataset
 from pathos.multiprocessing import Pool
 from tqdm import tqdm

-from ..backends import load as load_audio
+from ..backends.soundfile_backend import soundfile_load as load_audio
 from ..utils import DATA_HOME
 from ..utils import decompress
 from ..utils.download import download_and_decompress

--- a/paddlespeech/audio/features/__init__.py
+++ b/paddlespeech/audio/features/__init__.py
--- a/paddlespeech/audio/features/layers.py
+++ b/paddlespeech/audio/features/layers.py
--- a/paddlespeech/audio/functional/__init__.py
+++ b/paddlespeech/audio/functional/__init__.py
--- a/paddlespeech/audio/functional/functional.py
+++ b/paddlespeech/audio/functional/functional.py
--- a/paddlespeech/audio/functional/window.py
+++ b/paddlespeech/audio/functional/window.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,127 +18,156 @@ from typing import Union
 import paddle
 from paddle import Tensor

-__all__ = [
-    'get_window',
-]

+class WindowFunctionRegister(object):
+    """Registry that maps a function's ``__name__`` to the function object."""
+
+    def __init__(self):
+        self._functions_dict = {}
 
+    def register(self):
+        """Return a decorator that records the function and hands it back unchanged."""
+
+        def add_subfunction(func):
+            self._functions_dict[func.__name__] = func
+            return func
+
+        return add_subfunction
+
+    def get(self, name):
+        """Look up a registered function; raises ``KeyError`` for unknown names."""
+        registry = self._functions_dict
+        return registry[name]
+
+
+window_function_register = WindowFunctionRegister()
+
+
+@window_function_register.register()
 def _cat(x: List[Tensor], data_type: str) -> Tensor:
    l = [paddle.to_tensor(_, data_type) for _ in x]
    return paddle.concat(l)


+@window_function_register.register()
 def _acosh(x: Union[Tensor, float]) -> Tensor:
    if isinstance(x, float):
        return math.log(x + math.sqrt(x**2 - 1))
    return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))


+@window_function_register.register()
 def _extend(M: int, sym: bool) -> bool:
-    """Extend window by 1 sample if needed for DFT-even symmetry. """
+    """Extend window by 1 sample if needed for DFT-even symmetry."""
    if not sym:
        return M + 1, True
    else:
        return M, False


+@window_function_register.register()
 def _len_guards(M: int) -> bool:
-    """Handle small or incorrect window lengths. """
+    """Handle small or incorrect window lengths."""
    if int(M) != M or M < 0:
        raise ValueError('Window length M must be a non-negative integer')

    return M <= 1


+@window_function_register.register()
 def _truncate(w: Tensor, needed: bool) -> Tensor:
-    """Truncate window by 1 sample if needed for DFT-even symmetry. """
+    """Truncate window by 1 sample if needed for DFT-even symmetry."""
    if needed:
        return w[:-1]
    else:
        return w


-def _general_gaussian(M: int, p, sig, sym: bool=True,
-                      dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _general_gaussian(
+    M: int, p, sig, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a window with a generalized Gaussian shape.
    This function is consistent with scipy.signal.windows.general_gaussian().
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
-    w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p))
+    w = paddle.exp(-0.5 * paddle.abs(n / sig) ** (2 * p))

    return _truncate(w, needs_trunc)


-def _general_cosine(M: int, a: float, sym: bool=True,
-                    dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _general_cosine(
+    M: int, a: float, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a generic weighted sum of cosine terms window.
    This function is consistent with scipy.signal.windows.general_cosine().
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
-    w = paddle.zeros((M, ), dtype=dtype)
+    w = paddle.zeros((M,), dtype=dtype)
    for k in range(len(a)):
        w += a[k] * paddle.cos(k * fac)
    return _truncate(w, needs_trunc)


-def _general_hamming(M: int, alpha: float, sym: bool=True,
-                     dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _general_hamming(
+    M: int, alpha: float, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a generalized Hamming window.
    This function is consistent with scipy.signal.windows.general_hamming()
    """
-    return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
+    return _general_cosine(M, [alpha, 1.0 - alpha], sym, dtype=dtype)


-def _taylor(M: int,
-            nbar=4,
-            sll=30,
-            norm=True,
-            sym: bool=True,
-            dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _taylor(
+    M: int, nbar=4, sll=30, norm=True, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a Taylor window.
    The Taylor window taper function approximates the Dolph-Chebyshev window's
    constant sidelobe level for a parameterized number of near-in sidelobes.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    # Original text uses a negative sidelobe level parameter and then negates
    # it in the calculation of B. To keep consistent with other methods we
    # assume the sidelobe level parameter to be positive.
-    B = 10**(sll / 20)
+    B = 10 ** (sll / 20)
    A = _acosh(B) / math.pi
-    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
+    s2 = nbar**2 / (A**2 + (nbar - 0.5) ** 2)
    ma = paddle.arange(1, nbar, dtype=dtype)

-    Fm = paddle.empty((nbar - 1, ), dtype=dtype)
+    Fm = paddle.empty((nbar - 1,), dtype=dtype)
    signs = paddle.empty_like(ma)
    signs[::2] = 1
    signs[1::2] = -1
    m2 = ma * ma
    for mi in range(len(ma)):
-        numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
-                                                           ))
+        numer = signs[mi] * paddle.prod(
+            1 - m2[mi] / s2 / (A**2 + (ma - 0.5) ** 2)
+        )
        if mi == 0:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
        elif mi == len(ma) - 1:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
        else:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
-                mi] / m2[mi + 1:])
+            denom = (
+                2
+                * paddle.prod(1 - m2[mi] / m2[:mi])
+                * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
+            )

        Fm[mi] = numer / denom

    def W(n):
        return 1 + 2 * paddle.matmul(
            Fm.unsqueeze(0),
-            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
+            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2.0 + 0.5) / M),
+        )

    w = W(paddle.arange(0, M, dtype=dtype))

@@ -150,7 +179,8 @@ def _taylor(M: int,
    return _truncate(w, needs_trunc)


-def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _hamming(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hamming window.
    The Hamming window is a taper formed by using a raised cosine with
    non-zero endpoints, optimized to minimize the nearest side lobe.
@@ -158,7 +188,8 @@ def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_hamming(M, 0.54, sym, dtype=dtype)


-def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _hann(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hann window.
    The Hann window is a taper formed by using a raised cosine or sine-squared
    with ends that touch zero.
@@ -166,15 +197,18 @@ def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_hamming(M, 0.5, sym, dtype=dtype)


-def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _tukey(
+    M: int, alpha=0.5, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a Tukey window.
    The Tukey window is also known as a tapered cosine window.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)

    if alpha <= 0:
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    elif alpha >= 1.0:
        return hann(M, sym=sym)

@@ -182,53 +216,48 @@ def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:

    n = paddle.arange(0, M, dtype=dtype)
    width = int(alpha * (M - 1) / 2.0)
-    n1 = n[0:width + 1]
-    n2 = n[width + 1:M - width - 1]
-    n3 = n[M - width - 1:]
+    n1 = n[0 : width + 1]
+    n2 = n[width + 1 : M - width - 1]
+    n3 = n[M - width - 1 :]

    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
    w2 = paddle.ones(n2.shape, dtype=dtype)
-    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
-                                          (M - 1))))
+    w3 = 0.5 * (
+        1
+        + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / (M - 1)))
+    )
    w = paddle.concat([w1, w2, w3])

    return _truncate(w, needs_trunc)


-def _kaiser(M: int, beta: float, sym: bool=True,
-            dtype: str='float64') -> Tensor:
-    """Compute a Kaiser window.
-    The Kaiser window is a taper formed by using a Bessel function.
-    """
-    raise NotImplementedError()
-
-
-def _gaussian(M: int, std: float, sym: bool=True,
-              dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _gaussian(
+    M: int, std: float, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a Gaussian window.
    The Gaussian widows has a Gaussian shape defined by the standard deviation(std).
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
    sig2 = 2 * std * std
-    w = paddle.exp(-n**2 / sig2)
+    w = paddle.exp(-(n**2) / sig2)

    return _truncate(w, needs_trunc)


-def _exponential(M: int,
-                 center=None,
-                 tau=1.,
-                 sym: bool=True,
-                 dtype: str='float64') -> Tensor:
-    """Compute an exponential (or Poisson) window. """
+@window_function_register.register()
+def _exponential(
+    M: int, center=None, tau=1.0, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
+    """Compute an exponential (or Poisson) window."""
    if sym and center is not None:
        raise ValueError("If sym==True, center must be None.")
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    if center is None:
@@ -240,11 +269,11 @@ def _exponential(M: int,
    return _truncate(w, needs_trunc)


-def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a triangular window.
-    """
+@window_function_register.register()
+def _triang(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
+    """Compute a triangular window."""
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
@@ -258,23 +287,26 @@ def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _truncate(w, needs_trunc)


-def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _bohman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Bohman window.
    The Bohman window is the autocorrelation of a cosine window.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
    w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
-        math.pi * fac)
+        math.pi * fac
+    )
    w = _cat([0, w, 0], dtype)

    return _truncate(w, needs_trunc)


-def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _blackman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Blackman window.
    The Blackman window is a taper formed by using the first three terms of
    a summation of cosines. It was designed to have close to the minimal
@@ -284,31 +316,44 @@ def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)


-def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a window with a simple cosine shape.
-    """
+@window_function_register.register()
+def _cosine(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
+    """Compute a window with a simple cosine shape."""
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
-    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5))
+    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + 0.5))

    return _truncate(w, needs_trunc)


-def get_window(window: Union[str, Tuple[str, float]],
-               win_length: int,
-               fftbins: bool=True,
-               dtype: str='float64') -> Tensor:
+def get_window(
+    window: Union[str, Tuple[str, float]],
+    win_length: int,
+    fftbins: bool = True,
+    dtype: str = 'float64',
+) -> Tensor:
    """Return a window of a given length and type.

    Args:
-        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
+        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
        win_length (int): Number of samples.
        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
        dtype (str, optional): The data type of the return window. Defaults to 'float64'.

    Returns:
        Tensor: The window represented as a tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            n_fft = 512
+            cosine_window = paddle.audio.functional.get_window('cosine', n_fft)
+
+            std = 7
+            gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft)
    """
    sym = not fftbins

@@ -319,19 +364,22 @@ def get_window(window: Union[str, Tuple[str, float]],
            args = window[1:]
    elif isinstance(window, str):
        if window in ['gaussian', 'exponential']:
-            raise ValueError("The '" + window + "' window needs one or "
-                             "more parameters -- pass a tuple.")
+            raise ValueError(
+                "The '" + window + "' window needs one or "
+                "more parameters -- pass a tuple."
+            )
        else:
            winstr = window
    else:
-        raise ValueError("%s as window type is not supported." %
-                         str(type(window)))
+        raise ValueError(
+            "%s as window type is not supported." % str(type(window))
+        )

    try:
-        winfunc = eval('_' + winstr)
+        winfunc = window_function_register.get('_' + winstr)
    except KeyError as e:
        raise ValueError("Unknown window type.") from e

-    params = (win_length, ) + args
+    params = (win_length,) + args
    kwargs = {'sym': sym}
    return winfunc(*params, dtype=dtype, **kwargs)
--- a/tests/unit/audio/backends/soundfile/__init__.py
+++ b/tests/unit/audio/backends/soundfile/__init__.py
@@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .kaldi import fbank
+from .kaldi import pitch
--- a/audio/paddleaudio/kaldi/kaldi.py
+++ b/audio/paddleaudio/kaldi/kaldi.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddleaudio
+from paddleaudio._internal import module_utils
+
+__all__ = [
+    'fbank',
+    'pitch',
+]
+
+
+@module_utils.requires_kaldi()
+def fbank(
+        wav,
+        samp_freq: int=16000,
+        frame_shift_ms: float=10.0,
+        frame_length_ms: float=25.0,
+        dither: float=0.0,
+        preemph_coeff: float=0.97,
+        remove_dc_offset: bool=True,
+        window_type: str='povey',
+        round_to_power_of_two: bool=True,
+        blackman_coeff: float=0.42,
+        snip_edges: bool=True,
+        allow_downsample: bool=False,
+        allow_upsample: bool=False,
+        max_feature_vectors: int=-1,
+        num_bins: int=23,
+        low_freq: float=20,
+        high_freq: float=0,
+        vtln_low: float=100,
+        vtln_high: float=-500,
+        debug_mel: bool=False,
+        htk_mode: bool=False,
+        use_energy: bool=False,  # fbank opts
+        energy_floor: float=0.0,
+        raw_energy: bool=True,
+        htk_compat: bool=False,
+        use_log_fbank: bool=True,
+        use_power: bool=True):
+    frame_opts = paddleaudio._paddleaudio.FrameExtractionOptions()
+    mel_opts = paddleaudio._paddleaudio.MelBanksOptions()
+    fbank_opts = paddleaudio._paddleaudio.FbankOptions()
+    frame_opts.samp_freq = samp_freq
+    frame_opts.frame_shift_ms = frame_shift_ms
+    frame_opts.frame_length_ms = frame_length_ms
+    frame_opts.dither = dither
+    frame_opts.preemph_coeff = preemph_coeff
+    frame_opts.remove_dc_offset = remove_dc_offset
+    frame_opts.window_type = window_type
+    frame_opts.round_to_power_of_two = round_to_power_of_two
+    frame_opts.blackman_coeff = blackman_coeff
+    frame_opts.snip_edges = snip_edges
+    frame_opts.allow_downsample = allow_downsample
+    frame_opts.allow_upsample = allow_upsample
+    frame_opts.max_feature_vectors = max_feature_vectors
+
+    mel_opts.num_bins = num_bins
+    mel_opts.low_freq = low_freq
+    mel_opts.high_freq = high_freq
+    mel_opts.vtln_low = vtln_low
+    mel_opts.vtln_high = vtln_high
+    mel_opts.debug_mel = debug_mel
+    mel_opts.htk_mode = htk_mode
+
+    fbank_opts.use_energy = use_energy
+    fbank_opts.energy_floor = energy_floor
+    fbank_opts.raw_energy = raw_energy
+    fbank_opts.htk_compat = htk_compat
+    fbank_opts.use_log_fbank = use_log_fbank
+    fbank_opts.use_power = use_power
+    feat = paddleaudio._paddleaudio.ComputeFbank(frame_opts, mel_opts,
+                                                 fbank_opts, wav)
+    return feat
+
+
+@module_utils.requires_kaldi()
+def pitch(wav,
+          samp_freq: int=16000,
+          frame_shift_ms: float=10.0,
+          frame_length_ms: float=25.0,
+          preemph_coeff: float=0.0,
+          min_f0: int=50,
+          max_f0: int=400,
+          soft_min_f0: float=10.0,
+          penalty_factor: float=0.1,
+          lowpass_cutoff: int=1000,
+          resample_freq: int=4000,
+          delta_pitch: float=0.005,
+          nccf_ballast: int=7000,
+          lowpass_filter_width: int=1,
+          upsample_filter_width: int=5,
+          max_frames_latency: int=0,
+          frames_per_chunk: int=0,
+          simulate_first_pass_online: bool=False,
+          recompute_frame: int=500,
+          nccf_ballast_online: bool=False,
+          snip_edges: bool=True):
+    pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
+    pitch_opts.samp_freq = samp_freq
+    pitch_opts.frame_shift_ms = frame_shift_ms
+    pitch_opts.frame_length_ms = frame_length_ms
+    pitch_opts.preemph_coeff = preemph_coeff
+    pitch_opts.min_f0 = min_f0
+    pitch_opts.max_f0 = max_f0
+    pitch_opts.soft_min_f0 = soft_min_f0
+    pitch_opts.penalty_factor = penalty_factor
+    pitch_opts.lowpass_cutoff = lowpass_cutoff
+    pitch_opts.resample_freq = resample_freq
+    pitch_opts.delta_pitch = delta_pitch
+    pitch_opts.nccf_ballast = nccf_ballast
+    pitch_opts.lowpass_filter_width = lowpass_filter_width
+    pitch_opts.upsample_filter_width = upsample_filter_width
+    pitch_opts.max_frames_latency = max_frames_latency
+    pitch_opts.frames_per_chunk = frames_per_chunk
+    pitch_opts.simulate_first_pass_online = simulate_first_pass_online
+    pitch_opts.recompute_frame = recompute_frame
+    pitch_opts.nccf_ballast_online = nccf_ballast_online
+    pitch_opts.snip_edges = snip_edges
+    pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
+    return pitch
--- a/paddlespeech/audio/metric/__init__.py
+++ b/paddlespeech/audio/metric/__init__.py
--- a/paddlespeech/audio/metric/eer.py
+++ b/paddlespeech/audio/metric/eer.py
--- a/audio/paddleaudio/sox_effects/__init__.py
+++ b/audio/paddleaudio/sox_effects/__init__.py
+from paddleaudio._internal import module_utils as _mod_utils
+
+from .sox_effects import apply_effects_file
+from .sox_effects import apply_effects_tensor
+from .sox_effects import effect_names
+from .sox_effects import init_sox_effects
+from .sox_effects import shutdown_sox_effects
+
+if _mod_utils.is_sox_available():  # skipped entirely when is_sox_available() is falsy
+    import atexit
+
+    init_sox_effects()  # runs once, when this package is first imported
+    atexit.register(shutdown_sox_effects)  # paired clean-up at interpreter exit
+
+__all__ = [  # names re-exported from .sox_effects above
+    "init_sox_effects",
+    "shutdown_sox_effects",
+    "effect_names",
+    "apply_effects_tensor",
+    "apply_effects_file",
+]
--- a/audio/paddleaudio/sox_effects/sox_effects.py
+++ b/audio/paddleaudio/sox_effects/sox_effects.py
+import os
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import paddle
+import paddleaudio
+from paddleaudio._internal import module_utils as _mod_utils
+from paddleaudio.utils.sox_utils import list_effects
+
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
+
+
+@_mod_utils.requires_sox()
+def init_sox_effects():
+    """Initialize the resources required to use sox effects.
+
+    Note:
+        You do not need to call this function manually. It is called automatically.
+
+    Once initialized, you do not need to call this function again across multiple uses of
+    sox effects, though it is safe to do so as long as :func:`shutdown_sox_effects` has not been called.
+    Once :func:`shutdown_sox_effects` has been called, you can no longer use SoX effects and initializing
+    again will result in an error.
+    """
+    paddleaudio._paddleaudio.sox_effects_initialize_sox_effects()
+
+
+@_mod_utils.requires_sox()
+def shutdown_sox_effects():
+    """Clean up the resources required to use sox effects.
+
+    Note:
+        You do not need to call this function manually. It is called automatically.
+
+    It is safe to call this function multiple times.
+    Once :py:func:`shutdown_sox_effects` has been called, you can no longer use SoX effects and
+    initializing again will result in an error.
+    """
+    paddleaudio._paddleaudio.sox_effects_shutdown_sox_effects()
+
+
+@_mod_utils.requires_sox()
+def effect_names() -> List[str]:
+    """Get the list of valid sox effect names.
+
+    Returns:
+        List[str]: The keys of the mapping returned by ``list_effects()``.
+
+    Example:
+        >>> paddleaudio.sox_effects.effect_names()
+        ['allpass', 'band', 'bandpass', ... ]
+    """
+    return list(list_effects().keys())
+
+
+@_mod_utils.requires_sox()
+def apply_effects_tensor(
+        tensor: paddle.Tensor,
+        sample_rate: int,
+        effects: List[List[str]],
+        channels_first: bool=True, ) -> Tuple[paddle.Tensor, int]:
+    """Apply sox effects to given Tensor
+
+    .. devices:: CPU
+
+    Note:
+        This function only works on CPU Tensors.
+        This function works in the way very similar to ``sox`` command, however there are slight
+        differences. For example, ``sox`` command adds certain effects automatically (such as
+        ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function does
+        only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also
+        need to give ``rate`` effect with desired sampling rate.).
+
+    Args:
+        tensor (paddle.Tensor): Input 2D CPU Tensor.
+        sample_rate (int): Sample rate
+        effects (List[List[str]]): List of effects.
+        channels_first (bool, optional): Indicates if the input Tensor's dimension is
+            `[channels, time]` or `[time, channels]`
+
+    Returns:
+        (Tensor, int): Resulting Tensor and sample rate.
+        The resulting Tensor has the same ``dtype`` as the input Tensor, and
+        the same channels order. The shape of the Tensor can be different based on the
+        effects applied. Sample rate can also be different based on the effects applied.
+
+    Example - Basic usage
+        >>>
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>>
+        >>> # Generate pseudo wave:
+        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
+        >>> sample_rate = 16000
+        >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
+        >>> waveform.shape
+        paddle.Size([2, 16000])
+        >>> waveform
+        tensor([[ 0.3138,  0.7620, -0.9019,  ..., -0.7495, -0.4935,  0.5442],
+                [-0.0832,  0.0061,  0.8233,  ..., -0.5176, -0.9140, -0.2434]])
+        >>>
+        >>> # Apply effects
+        >>> waveform, sample_rate = apply_effects_tensor(
+        ...     wave_form, sample_rate, effects, channels_first=True)
+        >>>
+        >>> # Check the result
+        >>> # The new waveform is sampling rate 8000, 1 second.
+        >>> # normalization and channel order are preserved
+        >>> waveform.shape
+        paddle.Size([2, 8000])
+        >>> waveform
+        tensor([[ 0.5054, -0.5518, -0.4800,  ..., -0.0076,  0.0096, -0.0110],
+                [ 0.1331,  0.0436, -0.3783,  ..., -0.0035,  0.0012,  0.0008]])
+        >>> sample_rate
+        8000
+
+    """
+    tensor_np = tensor.numpy()
+    ret = paddleaudio._paddleaudio.sox_effects_apply_effects_tensor(tensor_np, sample_rate,
+                                                       effects, channels_first)
+    if ret is not None:
+        return (paddle.to_tensor(ret[0]), ret[1])
+    raise RuntimeError("Failed to apply sox effect")
+
+
+@_mod_utils.requires_sox()
+def apply_effects_file(
+        path: str,
+        effects: List[List[str]],
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
+    """Apply sox effects to the audio file and load the resulting data as Tensor
+
+    Note:
+        This function works in the way very similar to ``sox`` command, however there are slight
+        differences. For example, ``sox`` commnad adds certain effects automatically (such as
+        ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given
+        effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate``
+        effect with desired sampling rate, because internally, ``speed`` effects only alter sampling
+        rate and leave samples untouched.
+
+    Args:
+        path (path-like object or file-like object):
+        effects (List[List[str]]): List of effects.
+        normalize (bool, optional):
+            When ``True``, this function always return ``float32``, and sample values are
+            normalized to ``[-1.0, 1.0]``.
+            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
+            integer type. This argument has no effect for formats other
+            than integer WAV type.
+        channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Override the format detection with the given format.
+            Providing the argument might help when libsox can not infer the format
+            from header or extension,
+
+    Returns:
+        (Tensor, int): Resulting Tensor and sample rate.
+        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
+        If ``normalize=False`` and the input audio file is of integer WAV file, then the
+        resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
+        If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
+        otherwise `[time, channel]`.
+
+    Example - Basic usage
+        >>>
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>>
+        >>> # Apply effects and load data with channels_first=True
+        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
+        >>>
+        >>> # Check the result
+        >>> waveform.shape
+        paddle.Size([2, 8000])
+        >>> waveform
+        tensor([[ 5.1151e-03,  1.8073e-02,  2.2188e-02,  ...,  1.0431e-07,
+                 -1.4761e-07,  1.8114e-07],
+                [-2.6924e-03,  2.1860e-03,  1.0650e-02,  ...,  6.4122e-07,
+                 -5.6159e-07,  4.8103e-07]])
+        >>> sample_rate
+        8000
+
+    Example - Apply random speed perturbation to dataset
+        >>>
+        >>> # Load data from file, apply random speed perturbation
+        >>> class RandomPerturbationFile(paddle.utils.data.Dataset):
+        ...     \"\"\"Given flist, apply random speed perturbation
+        ...
+        ...     Suppose all the input files are at least one second long.
+        ...     \"\"\"
+        ...     def __init__(self, flist: List[str], sample_rate: int):
+        ...         super().__init__()
+        ...         self.flist = flist
+        ...         self.sample_rate = sample_rate
+        ...
+        ...     def __getitem__(self, index):
+        ...         speed = 0.5 + 1.5 * random.randn()
+        ...         effects = [
+        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
+        ...             ['remix', '-'],  # merge all the channels
+        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
+        ...             ['rate', f'{self.sample_rate}'],
+        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
+        ...             ['trim', '0', '2'],  # get the first 2 seconds
+        ...         ]
+        ...         waveform, _ = paddleaudio.sox_effects.apply_effects_file(
+        ...             self.flist[index], effects)
+        ...         return waveform
+        ...
+        ...     def __len__(self):
+        ...         return len(self.flist)
+        ...
+        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
+        >>> loader = paddle.utils.data.DataLoader(dataset, batch_size=32)
+        >>> for batch in loader:
+        >>>     pass
+    """
+    if hasattr(path, "read"):
+        ret = paddleaudio._paddleaudio.apply_effects_fileobj(path, effects, normalize,
+                                                channels_first, format)
+        if ret is None:
+            raise RuntimeError("Failed to load audio from {}".format(path))
+        return (paddle.to_tensor(ret[0]), ret[1])
+    path = os.fspath(path)
+    ret = paddleaudio._paddleaudio.sox_effects_apply_effects_file(path, effects, normalize,
+                                                     channels_first, format)
+    if ret is not None:
+        return (paddle.to_tensor(ret[0]), ret[1])
+    raise RuntimeError("Failed to load audio from {}".format(path))
--- a/audio/paddleaudio/src/CMakeLists.txt
+++ b/audio/paddleaudio/src/CMakeLists.txt
+# MSVC: export all symbols of shared libraries automatically.
+if(MSVC)
+  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+
+# macOS: give shared libraries a ".so" suffix as well.
+if(APPLE)
+  set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
+endif()
+
+################################################################################
+# libpaddleaudio
+################################################################################
+# Inputs for the core shared library; the optional features below append to
+# these lists.
+set(LIBPADDLEAUDIO_SOURCES utils.cpp)
+set(LIBPADDLEAUDIO_INCLUDE_DIRS ${PROJECT_SOURCE_DIR})
+set(LIBPADDLEAUDIO_LINK_LIBRARIES)
+set(LIBPADDLEAUDIO_COMPILE_DEFINITIONS)
+
+#------------------------------------------------------------------------------#
+# START OF CUSTOMIZATION LOGICS
+#------------------------------------------------------------------------------#
+
+# SoX support: link libsox and define INCLUDE_SOX.
+if(BUILD_SOX)
+  list(APPEND LIBPADDLEAUDIO_LINK_LIBRARIES libsox)
+  list(APPEND LIBPADDLEAUDIO_COMPILE_DEFINITIONS INCLUDE_SOX)
+endif()
+
+# Kaldi support: link libkaldi and define INCLUDE_KALDI and
+# COMPILE_WITHOUT_OPENFST.
+if(BUILD_KALDI)
+  list(APPEND LIBPADDLEAUDIO_LINK_LIBRARIES libkaldi)
+  list(
+    APPEND
+    LIBPADDLEAUDIO_COMPILE_DEFINITIONS
+    INCLUDE_KALDI
+    COMPILE_WITHOUT_OPENFST
+  )
+endif()
+
+#------------------------------------------------------------------------------#
+# END OF CUSTOMIZATION LOGICS
+#------------------------------------------------------------------------------#
+
+# define_library(<name> <sources> <include_dirs> <link_libraries> <compile_defs>)
+#
+# Creates the SHARED library <name> with no file-name prefix (and a ".pyd"
+# suffix under MSVC), applies the given private include directories and
+# compile definitions, links <link_libraries>, and installs the target into
+# "lib". List arguments must be passed quoted, e.g. "${MY_SOURCES}".
+function(define_library name source include_dirs link_libraries compile_defs)
+  add_library(${name} SHARED ${source})
+  target_include_directories(${name} PRIVATE ${include_dirs})
+  # Plain (keyword-less) signature kept on purpose: other code may link this
+  # target with the same signature, and the two forms cannot be mixed.
+  target_link_libraries(${name} ${link_libraries})
+  target_compile_definitions(${name} PRIVATE ${compile_defs})
+
+  set_target_properties(${name} PROPERTIES PREFIX "")
+  if(MSVC)
+    set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
+  endif()
+
+  # RUNTIME covers the DLL on Windows.
+  install(TARGETS ${name} LIBRARY DESTINATION lib RUNTIME DESTINATION lib)
+endfunction()
+
+
+# Instantiate the core library from the lists assembled above.
+define_library(
+  libpaddleaudio
+  "${LIBPADDLEAUDIO_SOURCES}"
+  "${LIBPADDLEAUDIO_INCLUDE_DIRS}"
+  "${LIBPADDLEAUDIO_LINK_LIBRARIES}"
+  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
+)
+
+if(APPLE)
+  # Rewrite the libgcc_s reference so it is resolved next to the library at
+  # load time. $<TARGET_FILE:...> names the built artefact wherever the
+  # generator placed it (instead of assuming it sits in the working
+  # directory), and VERBATIM makes argument quoting platform-independent.
+  add_custom_command(
+    TARGET libpaddleaudio
+    POST_BUILD
+    COMMAND
+      install_name_tool -change
+      "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib"
+      "@loader_path/libgcc_s.1.1.dylib"
+      "$<TARGET_FILE:libpaddleaudio>"
+    VERBATIM
+  )
+endif()
+
+if(UNIX AND NOT APPLE)
+  # Look for dependent shared objects in the library's own directory.
+  set_target_properties(libpaddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN")
+endif()
+
+# Link item(s) other targets use to depend on libpaddleaudio. Outside macOS
+# the library is wrapped in --no-as-needed/--as-needed linker flags.
+if(APPLE)
+  set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
+else()
+  set(AUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
+endif()
+
+################################################################################
+# _paddleaudio.so
+################################################################################
+if(BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
+  if(WIN32)
+    find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
+    set(ADDITIONAL_ITEMS Python3::Python)
+  endif()
+
+  # define_extension(<name> <sources> <include_dirs> <libraries> <definitions>)
+  #
+  # Creates the SHARED library <name> with no file-name prefix (and a ".pyd"
+  # suffix under MSVC), adds the project, Python and pybind11 include
+  # directories plus <include_dirs>, links <libraries>, ${PYTHON_LIBRARY} and
+  # ${ADDITIONAL_ITEMS}, and installs the target into the install prefix root.
+  # List arguments must be passed quoted, e.g. "${MY_SOURCES}".
+  function(define_extension name sources include_dirs libraries definitions)
+    add_library(${name} SHARED ${sources})
+    target_compile_definitions(${name} PRIVATE "${definitions}")
+    target_include_directories(
+      ${name}
+      PRIVATE
+      ${PROJECT_SOURCE_DIR}
+      ${Python_INCLUDE_DIR}
+      ${pybind11_INCLUDE_DIR}
+      ${include_dirs}
+    )
+    target_link_libraries(
+      ${name}
+      ${libraries}
+      ${PYTHON_LIBRARY}
+      ${ADDITIONAL_ITEMS}
+    )
+
+    set_target_properties(${name} PROPERTIES PREFIX "")
+    if(MSVC)
+      set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
+    endif()
+    if(APPLE)
+      # https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
+      # https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
+      set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+    endif()
+
+    # RUNTIME covers the DLL on Windows.
+    install(TARGETS ${name} LIBRARY DESTINATION . RUNTIME DESTINATION .)
+  endfunction()
+
+  set(EXTENSION_SOURCES pybind/pybind.cpp)
+
+  #----------------------------------------------------------------------------#
+  # START OF CUSTOMIZATION LOGICS
+  #----------------------------------------------------------------------------#
+  if(BUILD_SOX)
+    list(
+      APPEND
+      EXTENSION_SOURCES
+      pybind/sox/effects.cpp
+      pybind/sox/effects_chain.cpp
+      pybind/sox/io.cpp
+      pybind/sox/types.cpp
+      pybind/sox/utils.cpp
+    )
+  endif()
+
+  if(BUILD_KALDI)
+    list(
+      APPEND
+      EXTENSION_SOURCES
+      pybind/kaldi/kaldi_feature_wrapper.cc
+      pybind/kaldi/kaldi_feature.cc
+    )
+  endif()
+  #----------------------------------------------------------------------------#
+  # END OF CUSTOMIZATION LOGICS
+  #----------------------------------------------------------------------------#
+
+  define_extension(
+    _paddleaudio
+    "${EXTENSION_SOURCES}"
+    ""
+    libpaddleaudio
+    "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
+  )
+
+  # if(BUILD_CTC_DECODER)
+  #   set(
+  #     DECODER_EXTENSION_SOURCES
+  #     decoder/bindings/pybind.cpp
+  #     )
+  #   define_extension(
+  #     _paddleaudio_decoder
+  #     "${DECODER_EXTENSION_SOURCES}"
+  #     ""
+  #     "libpaddleaudio_decoder"
+  #     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
+  #     )
+  # endif()
+  # if(USE_FFMPEG)
+  #   set(
+  #     FFMPEG_EXTENSION_SOURCES
+  #     ffmpeg/pybind/typedefs.cpp
+  #     ffmpeg/pybind/pybind.cpp
+  #     ffmpeg/pybind/stream_reader.cpp
+  #     )
+  #   define_extension(
+  #     _paddleaudio_ffmpeg
+  #     "${FFMPEG_EXTENSION_SOURCES}"
+  #     "${FFMPEG_INCLUDE_DIRS}"
+  #     "libpaddleaudio_ffmpeg"
+  #     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
+  #     )
+  # endif()
+endif()
+
+# The _paddleaudio target only exists when the Python extension was defined
+# above; touching it unconditionally makes configuration fail whenever the
+# extension is disabled, so everything here is guarded on the target.
+if(TARGET _paddleaudio)
+  if(APPLE)
+    # Rewrite the libgcc_s reference so it is resolved from the "lib"
+    # directory next to the extension at load time. $<TARGET_FILE:...> names
+    # the built artefact wherever the generator placed it, and VERBATIM makes
+    # argument quoting platform-independent.
+    add_custom_command(
+      TARGET _paddleaudio
+      POST_BUILD
+      COMMAND
+        install_name_tool -change
+        "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib"
+        "@loader_path/lib/libgcc_s.1.1.dylib"
+        "$<TARGET_FILE:_paddleaudio>"
+      VERBATIM
+    )
+  endif()
+
+  if(UNIX AND NOT APPLE)
+    # Look for dependent shared objects in the "lib" directory beside the
+    # extension module.
+    set_target_properties(_paddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN/lib")
+  endif()
+endif()
--- a/audio/paddleaudio/src/optional/COPYING
+++ b/audio/paddleaudio/src/optional/COPYING
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
--- a/audio/paddleaudio/src/optional/optional.hpp
+++ b/audio/paddleaudio/src/optional/optional.hpp
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+#include "pybind11/numpy.h"
+#include "feat/feature-window.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+namespace py = pybind11;
+
+// Streaming feature extractor built on top of a feature computer `F`.
+//
+// Each call to ComputeFeature() prepends the samples left over from the
+// previous call to the new chunk, computes features on the result and caches
+// the unconsumed tail in `remained_wav_` for the next call.
+//
+// `F` has to provide a nested `Options` type, be constructible from it, and
+// offer Dim(), GetFrameOptions(), NeedRawLogEnergy() and Compute(), which are
+// the members used by this template.
+template <class F>
+class StreamingFeatureTpl {
+  public:
+    typedef typename F::Options Options;
+    // Builds the feature computer and the window function from `opts`.
+    StreamingFeatureTpl(const Options& opts);
+    // Feeds one chunk of waveform. `feats` receives the frames as one flat
+    // vector (frame-major, Dim() values per frame). Returns false when `wav`
+    // is empty.
+    bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
+                        ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+    // Drops the cached left-over samples so the next chunk starts fresh.
+    void Reset() { remained_wav_.Resize(0); }
+
+    // Number of values per feature frame, as reported by the computer.
+    int Dim() { return computer_.Dim(); }
+
+  private:
+    // Computes features for every frame of `waves`; returns false when
+    // `waves` is shorter than one analysis window.
+    bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
+                 ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+    Options opts_;
+    // NOTE: declared before computer_, hence constructed before it.
+    ::kaldi::FeatureWindowFunction window_function_;
+    // Samples not yet consumed by a frame shift; carried to the next call.
+    ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
+    F computer_;
+};
+
+}  // namespace kaldi
+}  // namespace paddleaudio
+
+#include "feature_common_inl.h"
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+// Constructs the streaming extractor from the feature options.
+//
+// The mem-initializers are written in member declaration order (opts_,
+// window_function_, computer_), which is the order in which the members are
+// really initialized regardless of how the list is spelled.
+template <class F>
+StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
+    : opts_(opts), window_function_(opts.frame_opts), computer_(opts) {
+    // window_function_ is declared (and therefore constructed) before
+    // computer_, so it must be built from opts.frame_opts and not from
+    // computer_.GetFrameOptions(): the computer does not exist yet at that
+    // point.
+}
+
+// Consumes one chunk of waveform and computes the features it completes.
+//
+// wav:   new samples; an empty chunk is rejected.
+// feats: receives the computed frames as one flat vector.
+//
+// Returns false when `wav` is empty or when the buffered audio is still too
+// short for Compute() to produce features; true otherwise.
+template <class F>
+bool StreamingFeatureTpl<F>::ComputeFeature(
+    const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
+    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    const ::kaldi::int32 wav_len = wav.Dim();
+    if (wav_len == 0) return false;
+
+    // Prepend the samples remaining from the previous call.
+    const ::kaldi::int32 left_len = remained_wav_.Dim();
+    ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
+    waves.Range(0, left_len).CopyFromVec(remained_wav_);
+    waves.Range(left_len, wav_len).CopyFromVec(wav);
+
+    // Cache the samples that are not consumed by this call.
+    const ::kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    const ::kaldi::int32 num_frames =
+        ::kaldi::NumFrames(waves.Dim(), frame_opts);
+    const ::kaldi::int32 frame_shift = frame_opts.WindowShift();
+    // With snip_edges == false NumFrames() rounds to the nearest frame, so
+    // frame_shift * num_frames may exceed the buffer length. Clamp it so the
+    // remainder never becomes negative.
+    ::kaldi::int32 consumed = frame_shift * num_frames;
+    if (consumed > waves.Dim()) consumed = waves.Dim();
+    const ::kaldi::int32 left_samples = waves.Dim() - consumed;
+    remained_wav_.Resize(left_samples);
+    remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));
+
+    // Compute the features and report whether any could be produced.
+    return Compute(waves, feats);
+}
+
+// Computes the features of every frame contained in `waves`.
+//
+// waves: samples to analyse (left-over samples plus the new chunk).
+// feats: resized to num_frames * Dim() and filled frame by frame.
+//
+// Returns false, leaving `feats` untouched, when `waves` is shorter than one
+// analysis window; true otherwise.
+template <class F>
+bool StreamingFeatureTpl<F>::Compute(
+    const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
+    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    // No vocal tract length normalisation: the warp factor is fixed to 1.
+    ::kaldi::BaseFloat vtln_warp = 1.0;
+    const ::kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    ::kaldi::int32 num_samples = waves.Dim();
+    ::kaldi::int32 frame_length = frame_opts.WindowSize();
+    if (num_samples < frame_length) {
+        return false;
+    }
+
+    ::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts);
+    feats->Resize(num_frames * Dim());
+
+    ::kaldi::Vector<::kaldi::BaseFloat> window;
+    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
+    for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
+        ::kaldi::BaseFloat raw_log_energy = 0.0;
+        // Extract and window the samples of this frame; the raw log energy is
+        // only requested when the computer needs it.
+        ::kaldi::ExtractWindow(0,
+                               waves,
+                               frame,
+                               frame_opts,
+                               window_function_,
+                               &window,
+                               need_raw_log_energy ? &raw_log_energy : nullptr);
+
+        ::kaldi::Vector<::kaldi::BaseFloat> this_feature(computer_.Dim(),
+                                                         ::kaldi::kUndefined);
+        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
+        // Copy the frame into its slot of the flat output vector.
+        ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
+            feats->Data() + frame * Dim(), Dim());
+        output_row.CopyFromVec(this_feature);
+    }
+    return true;
+}
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
+#include "feat/pitch-functions.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+// (Re)creates the fbank extractor owned by the KaldiFeatureWrapper singleton.
+//
+// The three option groups are merged into one ::kaldi::FbankOptions, copying
+// every field of `fbank_opts` to the Kaldi field of the same name.
+// Always returns true.
+bool InitFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts) {
+    ::kaldi::FbankOptions kaldi_opts;
+
+    // Nested option groups.
+    kaldi_opts.frame_opts = frame_opts;
+    kaldi_opts.mel_opts = mel_opts;
+
+    // Scalar fbank switches.
+    kaldi_opts.use_energy = fbank_opts.use_energy;
+    kaldi_opts.energy_floor = fbank_opts.energy_floor;
+    kaldi_opts.raw_energy = fbank_opts.raw_energy;
+    kaldi_opts.htk_compat = fbank_opts.htk_compat;
+    kaldi_opts.use_log_fbank = fbank_opts.use_log_fbank;
+    kaldi_opts.use_power = fbank_opts.use_power;
+
+    KaldiFeatureWrapper* wrapper = KaldiFeatureWrapper::GetInstance();
+    wrapper->InitFbank(kaldi_opts);
+    return true;
+}
+
+// Feeds one chunk of waveform to the extractor held by the singleton and
+// returns whatever KaldiFeatureWrapper::ComputeFbank() produces for it.
+py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
+    KaldiFeatureWrapper* wrapper = KaldiFeatureWrapper::GetInstance();
+    return wrapper->ComputeFbank(wav);
+}
+
+// One-shot fbank computation: initialises the extractor with the given
+// options, runs it on `wav`, then resets the extractor's cached samples so
+// that nothing is carried over to a later call.
+py::array_t<float> ComputeFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts,
+    const py::array_t<float>& wav) {
+    InitFbank(frame_opts, mel_opts, fbank_opts);
+    py::array_t<float> fbank = ComputeFbankStreaming(wav);
+    KaldiFeatureWrapper* wrapper = KaldiFeatureWrapper::GetInstance();
+    wrapper->ResetFbank();
+    return fbank;
+}
+
+// Clears the samples cached by the singleton's fbank extractor.
+void ResetFbank() {
+    KaldiFeatureWrapper* wrapper = KaldiFeatureWrapper::GetInstance();
+    wrapper->ResetFbank();
+}
+
+// Computes Kaldi pitch features for a complete waveform.
+//
+// opts: pitch extraction options handed to ::kaldi::ComputeKaldiPitch.
+// wav:  samples, read as one flat run of `size` floats.
+//
+// Returns a 2-D float array of shape (features.NumRows(),
+// features.NumCols()). Throws py::value_error when `wav` cannot be converted
+// to a C-contiguous float array.
+py::array_t<float> ComputeKaldiPitch(
+  const ::kaldi::PitchExtractionOptions& opts,
+  const py::array_t<float>& wav) {
+    // The SubVector below reads consecutive floats from the raw pointer, so
+    // a strided (e.g. sliced) array must first be made C-contiguous.
+    auto contiguous_wav =
+        py::array_t<float, py::array::c_style | py::array::forcecast>::ensure(
+            wav);
+    if (!contiguous_wav) {
+        throw py::value_error(
+            "wav must be convertible to a contiguous float array");
+    }
+    py::buffer_info info = contiguous_wav.request();
+    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav(
+        static_cast<float*>(info.ptr), info.size);
+
+    ::kaldi::Matrix<::kaldi::BaseFloat> features;
+    ::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
+    auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
+    // Copy row by row instead of assuming the matrix rows are packed.
+    for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
+        std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
+                    sizeof(float)*features.NumCols());
+    }
+    return result;
+}
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <string>
+
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
+#include "feat/pitch-functions.h"
+
+namespace py = pybind11;
+
+namespace paddleaudio {
+namespace kaldi {
+
+// Plain option bundle exposed to Python. InitFbank() copies every field into
+// the ::kaldi::FbankOptions member of the same name.
+struct FbankOptions{
+  bool use_energy;  // append an extra dimension with energy to the filter banks
+  float energy_floor;  // copied to ::kaldi::FbankOptions::energy_floor
+  bool raw_energy;  // If true, compute energy before preemphasis and windowing
+  bool htk_compat;  // If true, put energy last (if using energy)
+  bool use_log_fbank;  // if true (default), produce log-filterbank, else linear
+  bool use_power;  // copied to ::kaldi::FbankOptions::use_power; presumably power vs. magnitude spectrum - confirm against Kaldi
+  // Defaults: no energy, energy floor 0, raw energy, no HTK layout,
+  // log filterbank, power spectrum.
+  FbankOptions(): use_energy(false),
+                 energy_floor(0.0),
+                 raw_energy(true),
+                 htk_compat(false),
+                 use_log_fbank(true),
+                 use_power(true) {}
+};
+
+// Builds ::kaldi::FbankOptions from the three option groups and (re)creates
+// the fbank extractor of the KaldiFeatureWrapper singleton. Returns true.
+bool InitFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts);
+
+// One-shot variant: InitFbank(), compute on `wav`, then reset the cached
+// samples of the extractor.
+py::array_t<float> ComputeFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts,
+    const py::array_t<float>& wav);
+
+// Feeds one chunk to the already initialised extractor; InitFbank() has to be
+// called beforehand.
+py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);
+
+// Drops the samples cached by the extractor between streaming calls.
+void ResetFbank();
+
+// Runs ::kaldi::ComputeKaldiPitch on `wav` and returns the feature matrix as
+// a 2-D float array.
+py::array_t<float> ComputeKaldiPitch(
+    const ::kaldi::PitchExtractionOptions& opts,
+    const py::array_t<float>& wav);
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+// Returns the address of the single wrapper object, a function-local static
+// created on first use.
+KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
+    static KaldiFeatureWrapper singleton;
+    KaldiFeatureWrapper* const self = &singleton;
+    return self;
+}
+
+// Replaces the current fbank extractor (if any) with a new one built from
+// `opts`. Always returns true.
+bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
+    Fbank* fresh_extractor = new Fbank(opts);
+    fbank_.reset(fresh_extractor);
+    return true;
+}
+
+// Feeds one chunk of waveform to the fbank extractor.
+//
+// wav: samples, read as one flat run of `size` floats. InitFbank() must have
+//      been called before, since fbank_ is dereferenced unconditionally.
+//
+// Returns a (num_frames, Dim()) float array, or an empty array when the
+// extractor reports failure or produces no feature. Throws py::value_error
+// when `wav` cannot be converted to a C-contiguous float array.
+py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
+    const py::array_t<float> wav) {
+    // The SubVector below reads consecutive floats from the raw pointer, so
+    // a strided (e.g. sliced) array must first be made C-contiguous.
+    auto contiguous_wav =
+        py::array_t<float, py::array::c_style | py::array::forcecast>::ensure(
+            wav);
+    if (!contiguous_wav) {
+        throw py::value_error(
+            "wav must be convertible to a contiguous float array");
+    }
+    py::buffer_info info = contiguous_wav.request();
+    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav(
+        static_cast<float*>(info.ptr), info.size);
+
+    ::kaldi::Vector<::kaldi::BaseFloat> feats;
+    const bool ok = fbank_->ComputeFeature(input_wav, &feats);
+    if (!ok || feats.Dim() == 0) return py::array_t<float>();
+
+    // Copy the flat feature vector into a new numpy buffer.
+    auto result = py::array_t<float>(feats.Dim());
+    py::buffer_info xs = result.request();
+    float* res_ptr = static_cast<float*>(xs.ptr);
+    for (int idx = 0; idx < feats.Dim(); ++idx) {
+        res_ptr[idx] = feats(idx);
+    }
+
+    // One row per frame, Dim() values per row.
+    return result.reshape({feats.Dim() / Dim(), Dim()});
+}
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/kaldi-common.h"
+#include "feat/feature-fbank.h"
+
+#include "paddleaudio/src/pybind/kaldi/feature_common.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+// Streaming fbank extractor: the streaming template instantiated with Kaldi's
+// FbankComputer.
+typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
+
+// Singleton that owns the fbank extractor used by the Python bindings.
+//
+// ComputeFbank(), Dim() and ResetFbank() dereference `fbank_` without a null
+// check, so InitFbank() has to be called before any of them.
+// NOTE(review): std::unique_ptr is used without a direct <memory> include in
+// this header; it presumably arrives through another header - confirm.
+class KaldiFeatureWrapper {
+  public:
+    // Returns the process-wide instance.
+    static KaldiFeatureWrapper* GetInstance();
+    // Creates (or replaces) the extractor from `opts`; returns true.
+    bool InitFbank(::kaldi::FbankOptions opts);
+    // Feeds one chunk of samples and returns the features as a 2-D array, or
+    // an empty array when no feature was produced.
+    py::array_t<float> ComputeFbank(const py::array_t<float> wav);
+    // Number of values per feature frame.
+    int Dim() { return fbank_->Dim(); }
+    // Drops the samples the extractor cached between chunks.
+    void ResetFbank() { fbank_->Reset(); }
+
+  private:
+    // Null until InitFbank() is called.
+    std::unique_ptr<paddleaudio::kaldi::Fbank> fbank_;
+};
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/pybind.cpp
+++ b/audio/paddleaudio/src/pybind/pybind.cpp
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
+#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
+
+#ifdef INCLUDE_SOX
+#include "paddleaudio/src/pybind/sox/io.h"
+#include "paddleaudio/src/pybind/sox/effects.h"
+#endif
+
+#include <pybind11/stl.h>
+#include <pybind11/pybind11.h>
+
+// Teach pybind11 how to convert `tl::optional<T>` by reusing its generic
+// optional_caster. Only compiled when the SoX bindings are enabled.
+#ifdef INCLUDE_SOX
+namespace pybind11 { namespace detail {
+   template <typename T>
+   struct type_caster<tl::optional<T>> : optional_caster<tl::optional<T>> {};
+}}
+#endif
+
+// Definition of the `_paddleaudio` extension module.
+//
+// The SoX bindings are registered only when INCLUDE_SOX is defined and the
+// Kaldi bindings only when INCLUDE_KALDI is defined.
+PYBIND11_MODULE(_paddleaudio, m) {
+#ifdef INCLUDE_SOX
+    // ---- file / file-object helpers --------------------------------------
+    m.def("get_info_file", &paddleaudio::sox_io::get_info_file,
+          "Get metadata of audio file.");
+    // support obj later
+    m.def("get_info_fileobj", &paddleaudio::sox_io::get_info_fileobj,
+          "Get metadata of audio in file object.");
+    m.def("load_audio_fileobj", &paddleaudio::sox_io::load_audio_fileobj,
+          "Load audio from file object.");
+    m.def("save_audio_fileobj", &paddleaudio::sox_io::save_audio_fileobj,
+          "Save audio to file obj.");
+
+    // ---- sox io ----------------------------------------------------------
+    m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file);
+    m.def("sox_io_load_audio_file", &paddleaudio::sox_io::load_audio_file);
+    m.def("sox_io_save_audio_file", &paddleaudio::sox_io::save_audio_file);
+
+    // ---- sox utils -------------------------------------------------------
+    m.def("sox_utils_set_seed", &paddleaudio::sox_utils::set_seed);
+    m.def("sox_utils_set_verbosity", &paddleaudio::sox_utils::set_verbosity);
+    m.def("sox_utils_set_use_threads",
+          &paddleaudio::sox_utils::set_use_threads);
+    m.def("sox_utils_set_buffer_size",
+          &paddleaudio::sox_utils::set_buffer_size);
+    m.def("sox_utils_list_effects", &paddleaudio::sox_utils::list_effects);
+    m.def("sox_utils_list_read_formats",
+          &paddleaudio::sox_utils::list_read_formats);
+    m.def("sox_utils_list_write_formats",
+          &paddleaudio::sox_utils::list_write_formats);
+    m.def("sox_utils_get_buffer_size",
+          &paddleaudio::sox_utils::get_buffer_size);
+
+    // ---- sox effects -----------------------------------------------------
+    m.def("apply_effects_fileobj",
+          &paddleaudio::sox_effects::apply_effects_fileobj,
+          "Decode audio data from file-like obj and apply effects.");
+    m.def("sox_effects_initialize_sox_effects",
+          &paddleaudio::sox_effects::initialize_sox_effects);
+    m.def("sox_effects_shutdown_sox_effects",
+          &paddleaudio::sox_effects::shutdown_sox_effects);
+    m.def("sox_effects_apply_effects_tensor",
+          &paddleaudio::sox_effects::apply_effects_tensor);
+    m.def("sox_effects_apply_effects_file",
+          &paddleaudio::sox_effects::apply_effects_file);
+#endif
+
+#ifdef INCLUDE_KALDI
+    // Short local names for the option structs bound below.
+    using PitchOpts = kaldi::PitchExtractionOptions;
+    using FrameOpts = kaldi::FrameExtractionOptions;
+    using MelOpts = kaldi::MelBanksOptions;
+    using FbankOpts = paddleaudio::kaldi::FbankOptions;
+
+    m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
+
+    // ---- pitch -----------------------------------------------------------
+    py::class_<PitchOpts>(m, "PitchExtractionOptions")
+        .def(py::init<>())
+        .def_readwrite("samp_freq", &PitchOpts::samp_freq)
+        .def_readwrite("frame_shift_ms", &PitchOpts::frame_shift_ms)
+        .def_readwrite("frame_length_ms", &PitchOpts::frame_length_ms)
+        .def_readwrite("preemph_coeff", &PitchOpts::preemph_coeff)
+        .def_readwrite("min_f0", &PitchOpts::min_f0)
+        .def_readwrite("max_f0", &PitchOpts::max_f0)
+        .def_readwrite("soft_min_f0", &PitchOpts::soft_min_f0)
+        .def_readwrite("penalty_factor", &PitchOpts::penalty_factor)
+        .def_readwrite("lowpass_cutoff", &PitchOpts::lowpass_cutoff)
+        .def_readwrite("resample_freq", &PitchOpts::resample_freq)
+        .def_readwrite("delta_pitch", &PitchOpts::delta_pitch)
+        .def_readwrite("nccf_ballast", &PitchOpts::nccf_ballast)
+        .def_readwrite("lowpass_filter_width",
+                       &PitchOpts::lowpass_filter_width)
+        .def_readwrite("upsample_filter_width",
+                       &PitchOpts::upsample_filter_width)
+        .def_readwrite("max_frames_latency", &PitchOpts::max_frames_latency)
+        .def_readwrite("frames_per_chunk", &PitchOpts::frames_per_chunk)
+        .def_readwrite("simulate_first_pass_online",
+                       &PitchOpts::simulate_first_pass_online)
+        .def_readwrite("recompute_frame", &PitchOpts::recompute_frame)
+        .def_readwrite("nccf_ballast_online",
+                       &PitchOpts::nccf_ballast_online)
+        .def_readwrite("snip_edges", &PitchOpts::snip_edges);
+    m.def("ComputeKaldiPitch",
+          &paddleaudio::kaldi::ComputeKaldiPitch,
+          "compute kaldi pitch");
+
+    // ---- framing ---------------------------------------------------------
+    py::class_<FrameOpts>(m, "FrameExtractionOptions")
+        .def(py::init<>())
+        .def_readwrite("samp_freq", &FrameOpts::samp_freq)
+        .def_readwrite("frame_shift_ms", &FrameOpts::frame_shift_ms)
+        .def_readwrite("frame_length_ms", &FrameOpts::frame_length_ms)
+        .def_readwrite("dither", &FrameOpts::dither)
+        .def_readwrite("preemph_coeff", &FrameOpts::preemph_coeff)
+        .def_readwrite("remove_dc_offset", &FrameOpts::remove_dc_offset)
+        .def_readwrite("window_type", &FrameOpts::window_type)
+        .def_readwrite("round_to_power_of_two",
+                       &FrameOpts::round_to_power_of_two)
+        .def_readwrite("blackman_coeff", &FrameOpts::blackman_coeff)
+        .def_readwrite("snip_edges", &FrameOpts::snip_edges)
+        .def_readwrite("allow_downsample", &FrameOpts::allow_downsample)
+        .def_readwrite("allow_upsample", &FrameOpts::allow_upsample)
+        .def_readwrite("max_feature_vectors",
+                       &FrameOpts::max_feature_vectors);
+
+    // ---- mel filter bank -------------------------------------------------
+    py::class_<MelOpts>(m, "MelBanksOptions")
+        .def(py::init<>())
+        .def_readwrite("num_bins", &MelOpts::num_bins)
+        .def_readwrite("low_freq", &MelOpts::low_freq)
+        .def_readwrite("high_freq", &MelOpts::high_freq)
+        .def_readwrite("vtln_low", &MelOpts::vtln_low)
+        .def_readwrite("vtln_high", &MelOpts::vtln_high)
+        .def_readwrite("debug_mel", &MelOpts::debug_mel)
+        .def_readwrite("htk_mode", &MelOpts::htk_mode);
+
+    // ---- fbank -----------------------------------------------------------
+    py::class_<FbankOpts>(m, "FbankOptions")
+        .def(py::init<>())
+        .def_readwrite("use_energy", &FbankOpts::use_energy)
+        .def_readwrite("energy_floor", &FbankOpts::energy_floor)
+        .def_readwrite("raw_energy", &FbankOpts::raw_energy)
+        .def_readwrite("htk_compat", &FbankOpts::htk_compat)
+        .def_readwrite("use_log_fbank", &FbankOpts::use_log_fbank)
+        .def_readwrite("use_power", &FbankOpts::use_power);
+#endif
+
+}
--- a/audio/paddleaudio/src/pybind/sox/effects.cpp
+++ b/audio/paddleaudio/src/pybind/sox/effects.cpp
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp  with modification.
+
+#include <mutex>
+#include <sox.h>
+
+#include "paddleaudio/src/pybind/sox/effects.h"
+#include "paddleaudio/src/pybind/sox/effects_chain.h"
+#include "paddleaudio/src/pybind/sox/utils.h"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio::sox_effects {
+
+// Streaming decoding over file-like object is tricky because libsox operates on
+// FILE pointer. The following is what `sox` and `play` commands do
+//  - file input -> FILE pointer
+//  - URL input -> call wget in subprocess and pipe the data -> FILE pointer
+//  - stdin -> FILE pointer
+//
+// We want to, instead, fetch byte strings chunk by chunk, consume them, and
+// discard.
+//
+// Here is the approach
+// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
+// chunk of byte string
+//    This will perform header-based format detection, if necessary, then fill
+//    the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
+//    which returns FILE* which points the buffer of the provided byte string.
+// 2. Each time sox reads a chunk from the FILE*, we update the underlying
+// buffer in a way that it
+//    starts with unseen data, and append the new data read from the given
+//    fileobj. This will trick libsox as if it keeps reading from the FILE*
+//    continuously.
+// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
+// Decodes audio from a Python file-like object and applies SoX effects.
+//
+// fileobj:        Python object the audio bytes are read from.
+// effects:        list of effects, each given as a list of strings.
+// normalize:      forwarded to convert_to_tensor; defaults to true.
+// channels_first: forwarded to convert_to_tensor; defaults to true.
+// format:         optional file type hint passed to libsox.
+//
+// Returns (array, sample_rate), or an empty optional when libsox cannot open
+// the data or cannot determine its encoding.
+auto apply_effects_fileobj(
+    py::object fileobj,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format)
+    -> tl::optional<std::tuple<py::array, int64_t>> {
+  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
+  //
+  // For certain format (such as FLAC), libsox keeps reading the content at
+  // the initialization unless it reaches EOF even when the header is properly
+  // parsed. (Making buffer size 8192, which is way bigger than the header,
+  // resulted in libsox consuming all the buffer content at the time it opens
+  // the file.) Therefore buffer has to always contain valid data, except after
+  // EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
+  // first check if there is enough data to fill the buffer. `read_fileobj`
+  // repeatedly calls `read`  method until it receives the requested length of
+  // bytes or it reaches EOF. If we get bytes shorter than requested, that means
+  // the whole audio data are fetched.
+  //
+  // * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
+  const auto capacity = [&]() {
+    // NOTE:
+    // Use the abstraction provided by `libpaddleaudio` to access the global
+    // config defined by libsox. Directly using `sox_get_globals` function will
+    // end up retrieving the static variable defined in `_paddleaudio`, which is
+    // not correct.
+    const auto bufsiz = get_buffer_size();
+    const int64_t kDefaultCapacityInBytes = 256;
+    return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
+                                              : kDefaultCapacityInBytes;
+  }();
+  // `buffer` owns the bytes that libsox reads through `in_buf`; it stays
+  // alive until the end of this function, i.e. for the whole chain run.
+  std::string buffer(capacity, '\0');
+  auto* in_buf = const_cast<char*>(buffer.data());
+  auto num_read = read_fileobj(&fileobj, capacity, in_buf);
+  // If the file is shorter than 256, then libsox cannot read the header.
+  auto in_buffer_size = (num_read > 256) ? num_read : 256;
+
+  // Open file (this starts reading the header)
+  // When opening a file there are two functions that can touch FILE*.
+  // * `auto_detect_format`
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
+  // * `startread` handler of detected format.
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
+  // To see the handler of a particular format, go to
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
+  // For example, vorbis can be found
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
+  SoxFormat sf(sox_open_mem_read(
+      in_buf,
+      in_buffer_size,
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
+
+  // In case of streamed data, length can be 0
+  // Bail out with an empty optional when the open failed or the encoding
+  // could not be detected.
+  if (static_cast<sox_format_t*>(sf) == nullptr ||
+      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    return {};
+  }
+
+  // Prepare output buffer
+  std::vector<sox_sample_t> out_buffer;
+  out_buffer.reserve(sf->signal.length);
+
+  // Create and run SoxEffectsChain
+  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
+  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
+      /*input_encoding=*/sf->encoding,
+      /*output_encoding=*/get_tensor_encodinginfo(dtype));
+  chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
+  for (const auto& effect : effects) {
+    chain.addEffect(effect);
+  }
+  chain.addOutputBuffer(&out_buffer);
+  chain.run();
+
+  // Create tensor from buffer
+  bool channels_first_ = channels_first.value_or(true);
+  auto tensor = convert_to_tensor(
+      /*buffer=*/out_buffer.data(),
+      /*num_samples=*/out_buffer.size(),
+      /*num_channels=*/chain.getOutputNumChannels(),
+      dtype,
+      normalize.value_or(true),
+      channels_first_);
+
+  return std::forward_as_tuple(
+      tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
+}
+
+// File-local bookkeeping for the SoX effects life cycle.
+namespace {
+
+// NotInitialized -> Initialized -> ShutDown; once shut down, initialization
+// is refused (see initialize_sox_effects).
+enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
+SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
+// Locked by initialize_sox_effects() and shutdown_sox_effects() around every
+// access to SOX_RESOURCE_STATE.
+std::mutex SOX_RESOUCE_STATE_MUTEX;
+
+} // namespace
+
+// Initializes the SoX effects library once.
+//
+// Calling it again while initialized does nothing. Throws std::runtime_error
+// when sox_init() fails or when the library has already been shut down.
+void initialize_sox_effects() {
+  const std::lock_guard<std::mutex> guard(SOX_RESOUCE_STATE_MUTEX);
+
+  if (SOX_RESOURCE_STATE == ShutDown) {
+    throw std::runtime_error(
+        "SoX Effects has been shut down. Cannot initialize again.");
+  }
+
+  if (SOX_RESOURCE_STATE == NotInitialized) {
+    const bool init_ok = (sox_init() == SOX_SUCCESS);
+    if (!init_ok) {
+      throw std::runtime_error("Failed to initialize sox effects.");
+    }
+    SOX_RESOURCE_STATE = Initialized;
+  }
+  // Already Initialized: nothing to do.
+}
+
+// Shuts the SoX effects library down.
+//
+// Calling it again after a shutdown does nothing. Throws std::runtime_error
+// when the library was never initialized or when sox_quit() fails.
+void shutdown_sox_effects() {
+  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
+
+  switch (SOX_RESOURCE_STATE) {
+    case NotInitialized:
+      throw std::runtime_error(
+          "SoX Effects is not initialized. Cannot shutdown.");
+    case Initialized:
+      // Report the shutdown failure as such; the message used to be a copy
+      // of the one raised by initialize_sox_effects().
+      if (sox_quit() != SOX_SUCCESS) {
+        throw std::runtime_error("Failed to shutdown sox effects.");
+      }
+      SOX_RESOURCE_STATE = ShutDown;
+      break;
+    case ShutDown:
+      break;
+  }
+}
+
+// Applies SoX effects to an in-memory waveform.
+//
+// waveform:       input array, checked by validate_input_tensor.
+// sample_rate:    sample rate of `waveform`.
+// effects:        list of effects, each given as a list of strings.
+// channels_first: layout flag used for both the input and the output.
+//
+// Returns (array, sample_rate) of the processed audio; the output keeps the
+// dtype of the input and is not normalized.
+auto apply_effects_tensor(
+    py::array waveform,
+    int64_t sample_rate,
+    const std::vector<std::vector<std::string>>& effects,
+    bool channels_first) -> std::tuple<py::array, int64_t> {
+  validate_input_tensor(waveform);
+
+  // The chain reads and writes samples in the dtype of the input.
+  const auto wave_dtype = waveform.dtype();
+  paddleaudio::sox_effects_chain::SoxEffectsChain effects_chain(
+      get_tensor_encodinginfo(wave_dtype),
+      get_tensor_encodinginfo(wave_dtype));
+
+  // Sink for the processed samples.
+  std::vector<sox_sample_t> samples;
+  samples.reserve(waveform.size());
+
+  // input -> effects... -> output, then run the whole chain.
+  effects_chain.addInputTensor(&waveform, sample_rate, channels_first);
+  for (auto it = effects.begin(); it != effects.end(); ++it) {
+    effects_chain.addEffect(*it);
+  }
+  effects_chain.addOutputBuffer(&samples);
+  effects_chain.run();
+
+  // Turn the raw samples back into an array.
+  const bool normalize_output = false;
+  auto result = convert_to_tensor(
+      samples.data(),
+      samples.size(),
+      effects_chain.getOutputNumChannels(),
+      wave_dtype,
+      normalize_output,
+      channels_first);
+
+  return std::tuple<py::array, int64_t>(
+      result, effects_chain.getOutputSampleRate());
+}
+
+// Loads an audio file through libsox and applies SoX effects to it.
+//
+// path:           file to open.
+// effects:        list of effects, each given as a list of strings.
+// normalize:      forwarded to convert_to_tensor; defaults to true.
+// channels_first: forwarded to convert_to_tensor; defaults to true.
+// format:         optional file type hint passed to libsox.
+//
+// Returns (array, sample_rate), or an empty optional when the file cannot be
+// opened or its encoding is unknown.
+auto apply_effects_file(
+    const std::string& path,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format)
+    -> tl::optional<std::tuple<py::array, int64_t>> {
+  // A null file type lets libsox pick the format on its own.
+  const char* filetype = nullptr;
+  if (format.has_value()) {
+    filetype = format.value().c_str();
+  }
+  SoxFormat sf(sox_open_read(path.c_str(), nullptr, nullptr, filetype));
+
+  const bool open_failed = static_cast<sox_format_t*>(sf) == nullptr;
+  if (open_failed || sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    return tl::nullopt;
+  }
+
+  const auto sample_dtype =
+      get_dtype(sf->encoding.encoding, sf->signal.precision);
+
+  // Sink for the decoded and processed samples.
+  std::vector<sox_sample_t> samples;
+  samples.reserve(sf->signal.length);
+
+  // file -> effects... -> output, then run the whole chain.
+  paddleaudio::sox_effects_chain::SoxEffectsChain effects_chain(
+      sf->encoding, get_tensor_encodinginfo(sample_dtype));
+  effects_chain.addInputFile(sf);
+  for (auto it = effects.begin(); it != effects.end(); ++it) {
+    effects_chain.addEffect(*it);
+  }
+  effects_chain.addOutputBuffer(&samples);
+  effects_chain.run();
+
+  // Turn the raw samples into an array.
+  const bool want_channels_first = channels_first.value_or(true);
+  auto result = convert_to_tensor(
+      samples.data(),
+      samples.size(),
+      effects_chain.getOutputNumChannels(),
+      sample_dtype,
+      normalize.value_or(true),
+      want_channels_first);
+
+  return std::tuple<py::array, int64_t>(
+      result, effects_chain.getOutputSampleRate());
+}
+
+} // namespace paddleaudio::sox_effects
--- a/audio/paddleaudio/src/pybind/sox/effects.h
+++ b/audio/paddleaudio/src/pybind/sox/effects.h
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h  with modification.
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+#include "paddleaudio/src/optional/optional.hpp"
+
+namespace py = pybind11;
+
+namespace paddleaudio::sox_effects {
+
+// Decodes audio from a Python file-like object and applies `effects`.
+// Returns (array, sample_rate) or an empty optional when the data cannot be
+// opened or its encoding is unknown.
+auto apply_effects_fileobj(
+    py::object fileobj,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format)
+    -> tl::optional<std::tuple<py::array, int64_t>>;
+
+// Initializes the SoX effects library; throws std::runtime_error on failure
+// or when the library has already been shut down.
+void initialize_sox_effects();
+
+// Shuts the SoX effects library down; throws std::runtime_error on failure
+// or when the library was never initialized.
+void shutdown_sox_effects();
+
+// Applies `effects` to an in-memory waveform and returns
+// (array, sample_rate).
+auto apply_effects_tensor(
+    py::array waveform,
+    int64_t sample_rate,
+    const std::vector<std::vector<std::string>>& effects,
+    bool channels_first) -> std::tuple<py::array, int64_t>;
+
+// Loads the file at `path`, applies `effects` and returns
+// (array, sample_rate), or an empty optional when the file cannot be opened
+// or its encoding is unknown.
+auto apply_effects_file(
+    const std::string& path,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format)
+    -> tl::optional<std::tuple<py::array, int64_t>>;
+
+} // namespace paddleaudio::sox_effects
--- a/audio/paddleaudio/src/pybind/sox/effects_chain.cpp
+++ b/audio/paddleaudio/src/pybind/sox/effects_chain.cpp
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp with modification.
+
+#include <sox.h>
+#include <iostream>
+#include <vector>
+#include "paddleaudio/src/pybind/sox/effects_chain.h"
+#include "paddleaudio/src/pybind/sox/utils.h"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio::sox_effects_chain {
+
+namespace {
+
+/// helper classes for passing the location of input tensor and output buffer
+///
+/// drain/flow callback functions require plain C style function signature and
+/// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
+/// The following structs will be assigned to sox_effect_t::priv pointer which
+/// gives sox_effect_t an access to input Tensor and output buffer object.
+
+/// State of the "input_tensor" source effect (filled in by addInputTensor).
+struct TensorInputPriv {
+  // Number of samples (counted across all channels) already handed to
+  // libsox; advanced by tensor_input_drain after every call.
+  size_t index;
+  // Borrowed pointer to the caller's array; not owned by this struct.
+  py::array* waveform;
+  int64_t sample_rate;
+  // true: array is indexed (channel, frame); false: (frame, channel).
+  bool channels_first;
+};
+
+/// State of the "output_tensor" sink effect: where processed samples are
+/// appended.
+struct TensorOutputPriv {
+  // Borrowed pointer to the destination vector.
+  std::vector<sox_sample_t>* buffer;
+};
+/// State of the "output_file" sink effect: the libsox handle that processed
+/// samples are written to.
+struct FileOutputPriv {
+  sox_format_t* sf;
+};
+
+/// Callback function to feed Tensor data to SoxEffectChain.
+///
+/// libsox calls this repeatedly. Each call converts the next run of samples of
+/// the input array to sox_sample_t (32-bit signed) and writes them,
+/// interleaved by channel, into `obuf`.
+///
+/// @param effp  effect instance; `effp->priv` holds a TensorInputPriv.
+/// @param obuf  destination buffer provided by libsox.
+/// @param osamp in: capacity of `obuf` in samples; out: samples written.
+/// @return SOX_EOF once every sample has been delivered, else SOX_SUCCESS.
+/// @throws std::runtime_error when the array dtype is not handled.
+///
+/// The dtype is identified by its NumPy type number:
+/// 11 = float32, 5 = int (32-bit), 3 = int16, 1 = 8-bit.
+int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
+  // Retrieve the input Tensor and current index
+  auto priv = static_cast<TensorInputPriv*>(effp->priv);
+  const size_t index = priv->index;
+  // Bind by reference; copying the handle on every callback is unnecessary.
+  const py::array& tensor = *(priv->waveform);
+  const size_t num_channels = effp->out_signal.channels;
+  const bool channels_first = priv->channels_first;
+
+  // Adjust the number of samples to read
+  const size_t num_samples = tensor.size();
+  if (index + *osamp > num_samples) {
+    *osamp = num_samples - index;
+  }
+
+  // Ensure that it's a multiple of the number of channels
+  *osamp -= *osamp % num_channels;
+  const size_t count = *osamp;
+
+  // Address of the `pos`-th sample of this chunk. Samples are consumed in
+  // interleaved order (frame-major, channel-minor) regardless of the array
+  // layout. size_t arithmetic avoids truncation on very long inputs.
+  const auto sample_at = [&](size_t pos) -> const void* {
+    const size_t flat = index + pos;
+    const size_t frame = flat / num_channels;
+    const size_t channel = flat % num_channels;
+    return channels_first ? tensor.data(channel, frame)
+                          : tensor.data(frame, channel);
+  };
+
+  // Convert to sox_sample_t (int32_t)
+  switch (tensor.dtype().num()) {
+    case 11: {  // float32
+      // Need to convert to 64-bit precision so that
+      // values around INT32_MIN/MAX are handled correctly.
+      for (size_t i = 0; i < count; ++i) {
+        const double scaled =
+            static_cast<double>(*static_cast<const float*>(sample_at(i))) *
+            2147483648.;
+        if (scaled > INT32_MAX) {
+          obuf[i] = INT32_MAX;
+        } else if (scaled < INT32_MIN) {
+          obuf[i] = INT32_MIN;
+        } else {
+          obuf[i] = static_cast<sox_sample_t>(scaled);
+        }
+      }
+      break;
+    }
+    case 5: {  // int (32-bit): already in sox sample range
+      for (size_t i = 0; i < count; ++i) {
+        obuf[i] = *static_cast<const int*>(sample_at(i));
+      }
+      break;
+    }
+    case 3: {  // int16: shift into the upper 16 bits
+      for (size_t i = 0; i < count; ++i) {
+        obuf[i] =
+            static_cast<sox_sample_t>(
+                *static_cast<const int16_t*>(sample_at(i))) *
+            65536;
+      }
+      break;
+    }
+    case 1: {  // 8-bit, treated as offset-binary (unsigned) PCM
+      // Read the raw byte as uint8_t. Reading it as int8_t made
+      // (value - 128) * 2^24 overflow int for negative values, which is
+      // undefined behaviour. The unsigned read yields the same bit pattern
+      // the overflow used to wrap to, without the overflow.
+      for (size_t i = 0; i < count; ++i) {
+        const sox_sample_t raw = *static_cast<const uint8_t*>(sample_at(i));
+        obuf[i] = (raw - 128) * 16777216;
+      }
+      break;
+    }
+    default:
+      throw std::runtime_error("Unexpected dtype.");
+  }
+
+  priv->index += count;
+  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
+}
+
+/// Flow callback of the "output_tensor" sink effect.
+///
+/// Appends the `*isamp` samples in `ibuf` to the vector referenced by the
+/// effect's TensorOutputPriv. Nothing is forwarded downstream, so `*osamp` is
+/// reset to zero. Always returns SOX_SUCCESS.
+int tensor_output_flow(
+    sox_effect_t* effp,
+    sox_sample_t const* ibuf,
+    sox_sample_t* obuf LSX_UNUSED,
+    size_t* isamp,
+    size_t* osamp) {
+  *osamp = 0;
+
+  auto* sink_state = static_cast<TensorOutputPriv*>(effp->priv);
+  std::vector<sox_sample_t>& sink = *(sink_state->buffer);
+
+  sox_sample_t const* const first = ibuf;
+  sox_sample_t const* const last = ibuf + *isamp;
+  sink.insert(sink.end(), first, last);
+
+  return SOX_SUCCESS;
+}
+
+/// Flow callback of the "output_file" sink effect.
+///
+/// Writes the `*isamp` samples in `ibuf` to the sox_format_t stored in the
+/// effect's FileOutputPriv. Returns SOX_SUCCESS when everything was written
+/// (or there was nothing to write) and SOX_EOF on a short write. Throws
+/// std::runtime_error when libsox reports an error on a short write.
+int file_output_flow(
+    sox_effect_t* effp,
+    sox_sample_t const* ibuf,
+    sox_sample_t* obuf LSX_UNUSED,
+    size_t* isamp,
+    size_t* osamp) {
+  *osamp = 0;
+
+  const size_t pending = *isamp;
+  if (pending == 0) {
+    return SOX_SUCCESS;
+  }
+
+  sox_format_t* sf = static_cast<FileOutputPriv*>(effp->priv)->sf;
+  if (sox_write(sf, ibuf, pending) == pending) {
+    return SOX_SUCCESS;
+  }
+
+  // Short write: surface the libsox error if there is one.
+  if (sf->sox_errno) {
+    std::ostringstream message;
+    message << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
+            << sf->filename;
+    throw std::runtime_error(message.str());
+  }
+  return SOX_EOF;
+}
+
+/// Handler table of the "input_tensor" source effect. Only `drain` is set,
+/// because the effect produces samples and never consumes any.
+sox_effect_handler_t* get_tensor_input_handler() {
+  static sox_effect_handler_t tensor_input_handler{
+      /*name=*/"input_tensor",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/nullptr,
+      /*drain=*/tensor_input_drain,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(TensorInputPriv)};
+  return &tensor_input_handler;
+}
+
+/// Handler table of the "output_tensor" sink effect. Only `flow` is set,
+/// because the effect consumes samples and never produces any.
+sox_effect_handler_t* get_tensor_output_handler() {
+  static sox_effect_handler_t tensor_output_handler{
+      /*name=*/"output_tensor",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/tensor_output_flow,
+      /*drain=*/nullptr,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(TensorOutputPriv)};
+  return &tensor_output_handler;
+}
+
+/// Handler table of the "output_file" sink effect. Only `flow` is set,
+/// because the effect consumes samples and never produces any.
+sox_effect_handler_t* get_file_output_handler() {
+  static sox_effect_handler_t file_output_handler{
+      /*name=*/"output_file",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/file_output_flow,
+      /*drain=*/nullptr,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(FileOutputPriv)};
+  return &file_output_handler;
+}
+
+} // namespace
+
+// Takes over the pointer returned by sox_create_effect.
+SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {}
+
+// Releases the effect. free() is a no-op on a null pointer, so no guard is
+// required.
+SoxEffect::~SoxEffect() {
+  free(se_);
+}
+
+// Implicit conversion so a SoxEffect can be passed directly to libsox calls.
+SoxEffect::operator sox_effect_t*() const {
+  return this->se_;
+}
+
+// Member access on the wrapped effect (e.g. `e->priv`).
+auto SoxEffect::operator->() noexcept -> sox_effect_t* {
+  return this->se_;
+}
+
+// Copies both encodings into the object and builds the libsox chain on top of
+// those copies. The chain is handed pointers to the members, not to the
+// by-value parameters.
+// Throws std::runtime_error when libsox cannot create the chain.
+SoxEffectsChain::SoxEffectsChain(
+    sox_encodinginfo_t input_encoding,
+    sox_encodinginfo_t output_encoding)
+    : in_enc_(input_encoding),
+      out_enc_(output_encoding),
+      in_sig_(),
+      interm_sig_(),
+      out_sig_(),
+      sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
+  if (sec_ == nullptr) {
+    throw std::runtime_error("Failed to create effect chain.");
+  }
+}
+
+// Destroys the libsox chain, if one was created.
+SoxEffectsChain::~SoxEffectsChain() {
+  if (sec_ == nullptr) {
+    return;
+  }
+  sox_delete_effects_chain(sec_);
+}
+
+// Runs the whole chain with no progress callback and no client data.
+// The status returned by sox_flow_effects is not inspected.
+void SoxEffectsChain::run() {
+  sox_flow_effects(sec_, /*callback=*/nullptr, /*client_data=*/nullptr);
+}
+
+// Installs the "input_tensor" source effect, which feeds `waveform` into the
+// chain. Only the pointer is stored, so the array must outlive run().
+// Throws std::runtime_error when libsox rejects the effect.
+void SoxEffectsChain::addInputTensor(
+    py::array* waveform,
+    int64_t sample_rate,
+    bool channels_first) {
+  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
+  interm_sig_ = in_sig_;
+
+  SoxEffect source(sox_create_effect(get_tensor_input_handler()));
+
+  // Hand the callback everything it needs through the priv area.
+  TensorInputPriv& state = *static_cast<TensorInputPriv*>(source->priv);
+  state.index = 0;
+  state.waveform = waveform;
+  state.sample_rate = sample_rate;
+  state.channels_first = channels_first;
+
+  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: input_tensor");
+  }
+}
+
+// Installs the "output_tensor" sink effect, which appends every processed
+// sample to `output_buffer`. Only the pointer is stored.
+// Throws std::runtime_error when libsox rejects the effect.
+void SoxEffectsChain::addOutputBuffer(
+    std::vector<sox_sample_t>* output_buffer) {
+  SoxEffect sink(sox_create_effect(get_tensor_output_handler()));
+
+  auto* state = static_cast<TensorOutputPriv*>(sink->priv);
+  state->buffer = output_buffer;
+
+  const int status = sox_add_effect(sec_, sink, &interm_sig_, &in_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: output_tensor");
+  }
+}
+
+// Installs libsox's built-in "input" effect so the chain reads from `sf`.
+//
+// The built-in effect receives the sox_format_t pointer disguised as its
+// single string option, which is the convention libsox itself uses.
+//
+// Throws std::runtime_error when the effect rejects the handle or when it
+// cannot be added to the chain.
+void SoxEffectsChain::addInputFile(sox_format_t* sf) {
+  in_sig_ = sf->signal;
+  interm_sig_ = in_sig_;
+  SoxEffect e(sox_create_effect(sox_find_effect("input")));
+  char* opts[] = {reinterpret_cast<char*>(sf)};
+  // Do not ignore the result: continuing with an effect whose options were
+  // rejected would only fail later, when the chain runs.
+  if (sox_effect_options(e, 1, opts) != SOX_SUCCESS) {
+    std::ostringstream stream;
+    stream << "Internal Error: Failed to set options for effect: input "
+           << sf->filename;
+    throw std::runtime_error(stream.str());
+  }
+  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
+    std::ostringstream stream;
+    stream << "Internal Error: Failed to add effect: input " << sf->filename;
+    throw std::runtime_error(stream.str());
+  }
+}
+
+// Installs the "output_file" sink effect, which writes processed samples to
+// `sf`. Throws std::runtime_error when libsox rejects the effect.
+void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
+  out_sig_ = sf->signal;
+
+  SoxEffect sink(sox_create_effect(get_file_output_handler()));
+  auto* state = static_cast<FileOutputPriv*>(sink->priv);
+  state->sf = sf;
+
+  const int status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
+  if (status != SOX_SUCCESS) {
+    std::ostringstream message;
+    message << "Internal Error: Failed to add effect: output " << sf->filename;
+    throw std::runtime_error(message.str());
+  }
+}
+
+// Appends one user-specified effect to the chain.
+//
+// `effect[0]` is the effect name; the remaining elements are its options.
+// Throws std::runtime_error when the list is empty, the effect is in
+// UNSUPPORTED_EFFECTS or unknown to libsox, its options are rejected, or it
+// cannot be added to the chain.
+void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
+  if (effect.size() == 0) {
+    throw std::runtime_error("Invalid argument: empty effect.");
+  }
+  const std::string effect_name = effect[0];
+
+  // Shared failure path for "blocked" and "unknown to libsox".
+  const auto reject_unsupported = [&effect_name]() {
+    std::ostringstream message;
+    message << "Unsupported effect: " << effect_name;
+    throw std::runtime_error(message.str());
+  };
+
+  if (UNSUPPORTED_EFFECTS.find(effect_name) != UNSUPPORTED_EFFECTS.end()) {
+    reject_unsupported();
+  }
+  const auto handler = sox_find_effect(effect_name.c_str());
+  if (!handler) {
+    reject_unsupported();
+  }
+
+  SoxEffect e(sox_create_effect(handler));
+
+  // libsox wants a mutable char* array; the strings live in `effect`.
+  std::vector<char*> option_ptrs;
+  for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
+    option_ptrs.push_back(const_cast<char*>(it->c_str()));
+  }
+  const auto option_count = effect.size() - 1;
+  char** option_argv = option_count ? option_ptrs.data() : nullptr;
+
+  if (sox_effect_options(e, option_count, option_argv) != SOX_SUCCESS) {
+    std::ostringstream message;
+    message << "Invalid effect option:";
+    for (const std::string& token : effect) {
+      message << " " << token;
+    }
+    throw std::runtime_error(message.str());
+  }
+
+  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
+    std::ostringstream message;
+    message << "Internal Error: Failed to add effect: \"" << effect_name;
+    for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
+      message << " " << *it;
+    }
+    message << "\"";
+    throw std::runtime_error(message.str());
+  }
+}
+
+// Channel count of the signal as it stands after the effects added so far.
+int64_t SoxEffectsChain::getOutputNumChannels() {
+  const int64_t channels = interm_sig_.channels;
+  return channels;
+}
+
+// Sample rate of the signal as it stands after the effects added so far,
+// converted to an integer.
+int64_t SoxEffectsChain::getOutputSampleRate() {
+  const int64_t rate = interm_sig_.rate;
+  return rate;
+}
+
+namespace {
+
+/// helper classes for passing file-like object to SoxEffectChain
+
+/// State of the file-object source effect (filled in by addInputFileObj).
+struct FileObjInputPriv {
+  // libsox handle that decodes from `buffer`.
+  sox_format_t* sf;
+  // Borrowed pointer to the Python file-like object that supplies raw bytes.
+  py::object* fileobj;
+  // Set once a read from `fileobj` returned fewer bytes than requested.
+  bool eof_reached;
+  // Staging buffer that is refilled from `fileobj` between decode calls.
+  char* buffer;
+  // Size of `buffer` in bytes.
+  uint64_t buffer_size;
+};
+
+/// State of the file-object sink effect (filled in by addOutputFileObj).
+struct FileObjOutputPriv {
+  // libsox handle that encodes into `*buffer`.
+  sox_format_t* sf;
+  // Borrowed pointer to the Python file-like object that receives the bytes.
+  py::object* fileobj;
+  // Pointer to the caller's buffer pointer holding the encoded bytes.
+  char** buffer;
+  // Pointer to the caller's size variable for `*buffer`; stored by
+  // addOutputFileObj but not read by fileobj_output_flow.
+  size_t* buffer_size;
+};
+
+/// Callback function to feed byte string
+/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
+///
+/// Each call (1) compacts the staging buffer and refills it from the Python
+/// file-like object, then (2) asks libsox to decode from it into `obuf`.
+///
+/// @param effp  effect instance; `effp->priv` holds a FileObjInputPriv.
+/// @param obuf  destination for decoded samples.
+/// @param osamp in: capacity hint from libsox; out: samples decoded.
+/// @return SOX_EOF once the file object is exhausted and no more samples can
+///         be decoded, else SOX_SUCCESS.
+/// @throws std::runtime_error when `ftell` fails or reports a position past
+///         the end of the staging buffer.
+auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
+    -> int {
+  auto priv = static_cast<FileObjInputPriv*>(effp->priv);
+  auto sf = priv->sf;
+  auto buffer = priv->buffer;
+
+  // 1. Refresh the buffer
+  //
+  // NOTE:
+  //   Since the underlying FILE* was opened with `fmemopen`, the only way
+  //   libsox detect EOF is reaching the end of the buffer. (null byte won't
+  //   help) Therefore we need to align the content at the end of buffer,
+  //   otherwise, libsox will keep reading the content beyond intended length.
+  //
+  // Before:
+  //
+  //     |<-------consumed------>|<---remaining--->|
+  //     |***********************|-----------------|
+  //                             ^ ftell
+  //
+  // After:
+  //
+  //     |<-offset->|<---remaining--->|<-new data->|
+  //     |**********|-----------------|++++++++++++|
+  //                ^ ftell
+
+  // NOTE:
+  //   Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
+  //   supposed to be in sync, but there are cases (Vorbis) they are not
+  //   in sync and `tell_off` has seemingly uninitialized value, which
+  //   leads num_remain to be negative and cause segmentation fault
+  //   in `memmove`.
+  const auto tell = ftell((FILE*)sf->fp);
+  if (tell < 0) {
+    throw std::runtime_error("Internal Error: ftell failed.");
+  }
+  const auto num_consumed = static_cast<size_t>(tell);
+  if (num_consumed > priv->buffer_size) {
+    throw std::runtime_error("Internal Error: buffer overrun.");
+  }
+
+  const auto num_remain = priv->buffer_size - num_consumed;
+
+  // 1.1. Fetch the data to see if there is data to fill the buffer
+  size_t num_refill = 0;
+  std::string chunk(num_consumed, '\0');
+  if (num_consumed && !priv->eof_reached) {
+    num_refill = read_fileobj(
+        priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
+    if (num_refill < num_consumed) {
+      priv->eof_reached = true;
+    }
+  }
+  const auto offset = num_consumed - num_refill;
+
+  // 1.2. Move the unconsumed data towards the beginning of buffer.
+  if (num_remain) {
+    auto src = static_cast<void*>(buffer + num_consumed);
+    auto dst = static_cast<void*>(buffer + offset);
+    memmove(dst, src, num_remain);
+  }
+
+  // 1.3. Refill the remaining buffer.
+  if (num_refill) {
+    auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
+    auto dst = buffer + offset + num_remain;
+    memcpy(dst, src, num_refill);
+  }
+
+  // 1.4. Set the file pointer to the new offset
+  sf->tell_off = offset;
+  fseek((FILE*)sf->fp, offset, SEEK_SET);
+
+  // 2. Perform decoding operation
+  // The following part is practically same as "input" effect
+  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
+
+  // At this point, osamp represents the buffer size in bytes,
+  // but sox_read expects the maximum number of samples ready to read.
+  // Normally, this is fine, but in case when the samples are not 4-byte
+  // aligned, (e.g. sample is 24bits), the resulting signal is not correct.
+  // https://github.com/pytorch/audio/issues/2083
+  //
+  // Guard on whole bytes per sample rather than on bits: encodings narrower
+  // than 8 bits would otherwise make the divisor zero and crash here.
+  const auto bytes_per_sample = sf->encoding.bits_per_sample / 8;
+  if (bytes_per_sample > 0)
+    *osamp /= bytes_per_sample;
+
+  // Ensure that it's a multiple of the number of channels
+  *osamp -= *osamp % effp->out_signal.channels;
+
+  // Read up to *osamp samples into obuf;
+  // store the actual number read back to *osamp
+  *osamp = sox_read(sf, obuf, *osamp);
+
+  // Decoding is finished when fileobject is exhausted and sox can no longer
+  // decode a sample.
+  return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
+}
+
+/// Flow callback of the file-object sink effect.
+///
+/// Encodes the incoming samples into the in-memory stream behind `sf`, hands
+/// the bytes produced so far to the Python object's `write`, then rewinds the
+/// stream so the next call starts from offset zero.
+/// Returns SOX_SUCCESS when all samples were written (or there were none) and
+/// SOX_EOF on a short write. Throws std::runtime_error when libsox reports an
+/// error on a short write.
+auto fileobj_output_flow(
+    sox_effect_t* effp,
+    sox_sample_t const* ibuf,
+    sox_sample_t* obuf LSX_UNUSED,
+    size_t* isamp,
+    size_t* osamp) -> int {
+  *osamp = 0;
+  if (*isamp == 0) {
+    return SOX_SUCCESS;
+  }
+
+  auto* state = static_cast<FileObjOutputPriv*>(effp->priv);
+  sox_format_t* sf = state->sf;
+  FILE* stream = static_cast<FILE*>(sf->fp);
+
+  // Encode chunk
+  const auto written = sox_write(sf, ibuf, *isamp);
+  fflush(stream);
+
+  // Copy the encoded chunk to python object.
+  state->fileobj->attr("write")(py::bytes(*(state->buffer), ftell(stream)));
+
+  // Reset FILE*
+  sf->tell_off = 0;
+  fseek(stream, 0, SEEK_SET);
+
+  if (written == *isamp) {
+    return SOX_SUCCESS;
+  }
+  if (sf->sox_errno) {
+    std::ostringstream message;
+    message << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
+            << sf->filename;
+    throw std::runtime_error(message.str());
+  }
+  return SOX_EOF;
+}
+
+/// Handler table of the file-object source effect. Only `drain` is set,
+/// because the effect produces samples and never consumes any.
+sox_effect_handler_t* get_fileobj_input_handler() {
+  static sox_effect_handler_t fileobj_input_handler{
+      /*name=*/"input_fileobj_object",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/nullptr,
+      /*drain=*/fileobj_input_drain,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(FileObjInputPriv)};
+  return &fileobj_input_handler;
+}
+
+/// Handler table of the file-object sink effect. Only `flow` is set, because
+/// the effect consumes samples and never produces any.
+sox_effect_handler_t* get_fileobj_output_handler() {
+  static sox_effect_handler_t fileobj_output_handler{
+      /*name=*/"output_fileobj_object",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/fileobj_output_flow,
+      /*drain=*/nullptr,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(FileObjOutputPriv)};
+  return &fileobj_output_handler;
+}
+
+} // namespace
+
+// Installs the file-object source effect. It decodes from `buffer` through
+// `sf` and refills `buffer` from `fileobj` as data is consumed.
+// All pointers are borrowed and must stay valid while the chain runs.
+// Throws std::runtime_error when libsox rejects the effect.
+void SoxEffectsChainPyBind::addInputFileObj(
+    sox_format_t* sf,
+    char* buffer,
+    uint64_t buffer_size,
+    py::object* fileobj) {
+  in_sig_ = sf->signal;
+  interm_sig_ = in_sig_;
+
+  SoxEffect source(sox_create_effect(get_fileobj_input_handler()));
+
+  FileObjInputPriv& state = *static_cast<FileObjInputPriv*>(source->priv);
+  state.sf = sf;
+  state.fileobj = fileobj;
+  state.eof_reached = false;
+  state.buffer = buffer;
+  state.buffer_size = buffer_size;
+
+  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: input fileobj");
+  }
+}
+
+// Installs the file-object sink effect. It encodes through `sf` into
+// `*buffer` and forwards the encoded bytes to `fileobj`.
+// All pointers are borrowed and must stay valid while the chain runs.
+// Throws std::runtime_error when libsox rejects the effect.
+void SoxEffectsChainPyBind::addOutputFileObj(
+    sox_format_t* sf,
+    char** buffer,
+    size_t* buffer_size,
+    py::object* fileobj) {
+  out_sig_ = sf->signal;
+
+  SoxEffect sink(sox_create_effect(get_fileobj_output_handler()));
+
+  FileObjOutputPriv& state = *static_cast<FileObjOutputPriv*>(sink->priv);
+  state.sf = sf;
+  state.fileobj = fileobj;
+  state.buffer = buffer;
+  state.buffer_size = buffer_size;
+
+  const int status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: output fileobj");
+  }
+}
+
+} // namespace paddleaudio::sox_effects_chain
--- a/audio/paddleaudio/src/pybind/sox/effects_chain.h
+++ b/audio/paddleaudio/src/pybind/sox/effects_chain.h
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h with modification.
+
+#pragma once
+
+#include <sox.h>
+#include "paddleaudio/src/pybind/sox/utils.h"
+
+namespace paddleaudio::sox_effects_chain {
+
+// Helper struct to safely close sox_effect_t* pointer returned by
+// sox_create_effect
+//
+// Owns the wrapped pointer: the destructor releases it. Copying and moving
+// are disabled so that exactly one object is responsible for the release.
+
+struct SoxEffect {
+  // Takes ownership of `se`.
+  explicit SoxEffect(sox_effect_t* se) noexcept;
+  SoxEffect(const SoxEffect& other) = delete;
+  SoxEffect(const SoxEffect&& other) = delete;
+  auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
+  auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
+  ~SoxEffect();
+  // Implicit conversion so the wrapper can be passed straight to libsox.
+  operator sox_effect_t*() const;
+  // Member access on the wrapped effect.
+  auto operator->() noexcept -> sox_effect_t*;
+
+ private:
+  sox_effect_t* se_;
+};
+
+// Helper struct to safely close sox_effects_chain_t with handy methods
+//
+// Typical use: construct, add exactly one input, add any number of effects,
+// add exactly one output, then call run().
+class SoxEffectsChain {
+  // Copies of the encodings; the libsox chain keeps pointers to these.
+  const sox_encodinginfo_t in_enc_;
+  const sox_encodinginfo_t out_enc_;
+
+ protected:
+  // Signal description of the input.
+  sox_signalinfo_t in_sig_;
+  // Signal description passed to sox_add_effect for each effect added; the
+  // output getters below read from it.
+  sox_signalinfo_t interm_sig_;
+  // Signal description of the output file, when one is used.
+  sox_signalinfo_t out_sig_;
+  // The underlying libsox chain; released by the destructor.
+  sox_effects_chain_t* sec_;
+
+ public:
+  // Throws std::runtime_error when the libsox chain cannot be created.
+  explicit SoxEffectsChain(
+      sox_encodinginfo_t input_encoding,
+      sox_encodinginfo_t output_encoding);
+  SoxEffectsChain(const SoxEffectsChain& other) = delete;
+  SoxEffectsChain(const SoxEffectsChain&& other) = delete;
+  SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
+  SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
+  ~SoxEffectsChain();
+  // Executes the chain from input to output.
+  void run();
+  // Uses `waveform` as the source. Only the pointer is stored.
+  void addInputTensor(
+      py::array* waveform,
+      int64_t sample_rate,
+      bool channels_first);
+  // Uses an already opened libsox reader as the source.
+  void addInputFile(sox_format_t* sf);
+  // Collects the processed samples into `output_buffer`.
+  void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
+  // Writes the processed samples to an already opened libsox writer.
+  void addOutputFile(sox_format_t* sf);
+  // Appends one effect; element 0 is the name, the rest are its options.
+  void addEffect(const std::vector<std::string> effect);
+  // Channel count / sample rate after the effects added so far.
+  int64_t getOutputNumChannels();
+  int64_t getOutputSampleRate();
+};
+
+// SoxEffectsChain extended with endpoints backed by Python file-like
+// objects. Inherits the base-class constructors.
+class SoxEffectsChainPyBind : public SoxEffectsChain {
+  using SoxEffectsChain::SoxEffectsChain;
+
+ public:
+  // Source: decode through `sf` from `buffer` (of `buffer_size` bytes),
+  // refilling it from `fileobj`. All pointers are borrowed.
+  void addInputFileObj(
+      sox_format_t* sf,
+      char* buffer,
+      uint64_t buffer_size,
+      py::object* fileobj);
+
+  // Sink: encode through `sf` into `*buffer` and forward the bytes to
+  // `fileobj`. All pointers are borrowed.
+  void addOutputFileObj(
+      sox_format_t* sf,
+      char** buffer,
+      size_t* buffer_size,
+      py::object* fileobj);
+};
+
+} // namespace paddleaudio::sox_effects_chain
+
--- a/audio/paddleaudio/src/pybind/sox/io.cpp
+++ b/audio/paddleaudio/src/pybind/sox/io.cpp
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp with modification.
+
+#include "paddleaudio/src/pybind/sox/io.h"
+#include "paddleaudio/src/pybind/sox/effects.h"
+#include "paddleaudio/src/pybind/sox/types.h"
+#include "paddleaudio/src/pybind/sox/effects_chain.h"
+#include "paddleaudio/src/pybind/sox/utils.h"
+#include "paddleaudio/src/optional/optional.hpp"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio {
+namespace sox_io {
+
+// Reads the metadata of the audio file at `path`.
+//
+// `format` optionally forces the file type; otherwise libsox decides.
+// Returns (sample rate, number of frames, number of channels,
+// bits per sample, encoding name).
+auto get_info_file(const std::string &path,
+                   const tl::optional<std::string> &format)
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
+    const char *filetype =
+        format.has_value() ? format.value().c_str() : nullptr;
+
+    SoxFormat sf(sox_open_read(path.data(),
+                               /*signal=*/nullptr,
+                               /*encoding=*/nullptr,
+                               /*filetype=*/filetype));
+
+    validate_input_file(sf, path);
+
+    const auto sample_rate = static_cast<int64_t>(sf->signal.rate);
+    const auto num_frames =
+        static_cast<int64_t>(sf->signal.length / sf->signal.channels);
+    const auto num_channels = static_cast<int64_t>(sf->signal.channels);
+    const auto bits_per_sample =
+        static_cast<int64_t>(sf->encoding.bits_per_sample);
+
+    return std::make_tuple(sample_rate,
+                           num_frames,
+                           num_channels,
+                           bits_per_sample,
+                           get_encoding(sf->encoding.encoding));
+}
+
+// Translates an optional (frame_offset, num_frames) window into the sox
+// "trim" effect that selects it.
+//
+// frame_offset defaults to 0 and must be non-negative. num_frames defaults to
+// -1 (read to the end) and must otherwise be positive.
+// Returns an empty list when the whole file is requested.
+// Throws std::runtime_error on an invalid argument.
+auto get_effects(
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames)
+    -> std::vector<std::vector<std::string>> {
+  const int64_t start = frame_offset.value_or(0);
+  if (start < 0) {
+    throw std::runtime_error(
+        "Invalid argument: frame_offset must be non-negative.");
+  }
+  const int64_t count = num_frames.value_or(-1);
+  if (count < -1 || count == 0) {
+    throw std::runtime_error(
+        "Invalid argument: num_frames must be -1 or greater than 0.");
+  }
+
+  // sox interprets a trailing "s" as "samples".
+  const std::string trim_start = std::to_string(start) + "s";
+
+  std::vector<std::vector<std::string>> effects;
+  if (count != -1) {
+    const std::string trim_length = "+" + std::to_string(count) + "s";
+    effects.push_back(
+        std::vector<std::string>{"trim", trim_start, trim_length});
+  } else if (start != 0) {
+    effects.push_back(std::vector<std::string>{"trim", trim_start});
+  }
+  return effects;
+}
+
+// Reads the metadata of audio supplied by a Python file-like object.
+//
+// Only the first chunk of the object is read: the larger of the configured
+// sox buffer size and 4096 bytes.
+// Returns (sample rate, number of frames, number of channels,
+// bits per sample, encoding name).
+auto get_info_fileobj(py::object fileobj,
+                      const tl::optional<std::string> &format)
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
+    const auto bufsiz = get_buffer_size();
+    const int64_t kDefaultCapacityInBytes = 4096;
+    const auto capacity =
+        (bufsiz > kDefaultCapacityInBytes) ? bufsiz : kDefaultCapacityInBytes;
+
+    std::string header(capacity, '\0');
+    char *raw = const_cast<char *>(header.data());
+    auto num_read = read_fileobj(&fileobj, capacity, raw);
+    // If the file is shorter than 256, then libsox cannot read the header.
+    auto mem_size = (num_read > 256) ? num_read : 256;
+
+    const char *filetype =
+        format.has_value() ? format.value().c_str() : nullptr;
+    SoxFormat sf(sox_open_mem_read(raw,
+                                   mem_size,
+                                   /*signal=*/nullptr,
+                                   /*encoding=*/nullptr,
+                                   /*filetype=*/filetype));
+
+    // In case of streamed data, length can be 0
+    validate_input_memfile(sf);
+
+    const auto sample_rate = static_cast<int64_t>(sf->signal.rate);
+    const auto num_frames =
+        static_cast<int64_t>(sf->signal.length / sf->signal.channels);
+    const auto num_channels = static_cast<int64_t>(sf->signal.channels);
+    const auto bits_per_sample =
+        static_cast<int64_t>(sf->encoding.bits_per_sample);
+
+    return std::make_tuple(sample_rate,
+                           num_frames,
+                           num_channels,
+                           bits_per_sample,
+                           get_encoding(sf->encoding.encoding));
+}
+
+// Loads audio from a Python file-like object, optionally restricted to the
+// window described by frame_offset / num_frames.
+// Implemented as apply_effects_fileobj with, at most, a single "trim" effect.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
+    py::object fileobj,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format) {
+  const std::vector<std::vector<std::string>> trim_effects =
+      get_effects(frame_offset, num_frames);
+  // `format` is a const reference, so it is copied into the callee either
+  // way; no std::move is needed on it.
+  return paddleaudio::sox_effects::apply_effects_fileobj(
+      std::move(fileobj), trim_effects, normalize, channels_first, format);
+}
+
+// Loads the audio file at `path`, optionally restricted to the window
+// described by frame_offset / num_frames.
+// Implemented as apply_effects_file with, at most, a single "trim" effect.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
+    const std::string& path,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format) {
+    const std::vector<std::vector<std::string>> trim_effects =
+        get_effects(frame_offset, num_frames);
+    return paddleaudio::sox_effects::apply_effects_file(
+        path, trim_effects, normalize, channels_first, format);
+}
+
+// Encodes `tensor` and writes it to the file at `path`.
+//
+// The file type is `format` when given, otherwise it is derived from `path`.
+// `compression`, `encoding` and `bits_per_sample` are forwarded to
+// get_encodinginfo_for_save to pick the on-disk encoding.
+//
+// Throws std::runtime_error when
+//   - the format only supports mono ("amr-nb", "htk", "gsm") and the tensor
+//     has more than one channel,
+//   - the format is "gsm" and sample_rate is not 8000,
+//   - the output file cannot be opened.
+void save_audio_file(const std::string& path,
+                     py::array tensor,
+                     int64_t sample_rate,
+                     bool channels_first,
+                     tl::optional<double> compression,
+                     tl::optional<std::string> format,
+                     tl::optional<std::string> encoding,
+                     tl::optional<int64_t> bits_per_sample) {
+    validate_input_tensor(tensor);
+
+    const auto filetype = [&]() {
+        if (format.has_value()) return format.value();
+        return get_filetype(path);
+    }();
+
+    // These constraints must hold in release builds too, and a violation
+    // must reach Python as an exception instead of aborting the interpreter.
+    // Hence exceptions rather than assert(), with the same messages as
+    // save_audio_fileobj.
+    if (filetype == "amr-nb" || filetype == "htk" || filetype == "gsm") {
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+        if (num_channels != 1) {
+            throw std::runtime_error(
+                filetype + " format only supports single channel audio.");
+        }
+        if (filetype == "gsm" && sample_rate != 8000) {
+            throw std::runtime_error(
+                "gsm format only supports a sampling rate of 8kHz.");
+        }
+    }
+
+    const auto signal_info =
+        get_signalinfo(&tensor, sample_rate, filetype, channels_first);
+    const auto encoding_info = get_encodinginfo_for_save(
+        filetype, tensor.dtype(), compression, encoding, bits_per_sample);
+
+    SoxFormat sf(sox_open_write(path.c_str(),
+                                &signal_info,
+                                &encoding_info,
+                                /*filetype=*/filetype.c_str(),
+                                /*oob=*/nullptr,
+                                /*overwrite_permitted=*/nullptr));
+
+    if (static_cast<sox_format_t*>(sf) == nullptr) {
+        throw std::runtime_error(
+            "Error saving audio file: failed to open file " + path);
+    }
+
+    // tensor -> (format conversion done by libsox) -> file
+    paddleaudio::sox_effects_chain::SoxEffectsChain chain(
+        /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
+        /*output_encoding=*/sf->encoding);
+    chain.addInputTensor(&tensor, sample_rate, channels_first);
+    chain.addOutputFile(sf);
+    chain.run();
+}
+
+namespace {
+// Owner of the malloc'ed buffer that libsox fills in save_audio_fileobj.
+// Starts out empty; whatever `ptr` points to at destruction is released with
+// free(). Neither copyable nor movable.
+struct AutoReleaseBuffer {
+  char* ptr = nullptr;
+  size_t size = 0;
+
+  AutoReleaseBuffer() = default;
+  AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
+  AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
+  auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete;
+  auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete;
+  ~AutoReleaseBuffer() {
+    // free() ignores a null pointer.
+    free(ptr);
+  }
+};
+
+} // namespace
+
+// Encodes `tensor` and writes the result to a Python file-like object.
+//
+// The audio is encoded into an in-memory stream. Encoded bytes are pushed to
+// `fileobj.write` chunk by chunk while the chain runs, and once more after
+// the stream is closed, to deliver whatever libsox flushes on close.
+//
+// `format` is mandatory because there is no file name to infer it from.
+//
+// Throws std::runtime_error when
+//   - `format` is missing,
+//   - the format only supports mono ("amr-nb", "htk", "gsm") and the tensor
+//     has more than one channel,
+//   - the format is "gsm" and sample_rate is not 8000,
+//   - the in-memory stream cannot be opened.
+void save_audio_fileobj(
+    py::object fileobj,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample) {
+  // Same up-front validation as save_audio_file, so both entry points reject
+  // the same inputs before any array indexing takes place.
+  validate_input_tensor(tensor);
+
+  if (!format.has_value()) {
+    throw std::runtime_error(
+        "`format` is required when saving to file object.");
+  }
+  const auto filetype = format.value();
+
+  if (filetype == "amr-nb" || filetype == "htk" || filetype == "gsm") {
+    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+    if (num_channels != 1) {
+      throw std::runtime_error(
+          filetype + " format only supports single channel audio.");
+    }
+    if (filetype == "gsm" && sample_rate != 8000) {
+      throw std::runtime_error(
+          "gsm format only supports a sampling rate of 8kHz.");
+    }
+  }
+
+  const auto signal_info =
+      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
+  const auto encoding_info = get_encodinginfo_for_save(
+      filetype,
+      tensor.dtype(),
+      compression,
+      std::move(encoding),
+      bits_per_sample);
+
+  // Declared before `sf` so that it is destroyed after it.
+  AutoReleaseBuffer buffer;
+
+  SoxFormat sf(sox_open_memstream_write(
+      &buffer.ptr,
+      &buffer.size,
+      &signal_info,
+      &encoding_info,
+      filetype.c_str(),
+      /*oob=*/nullptr));
+
+  if (static_cast<sox_format_t*>(sf) == nullptr) {
+    throw std::runtime_error(
+        "Error saving audio file: failed to open memory stream.");
+  }
+
+  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
+      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
+      /*output_encoding=*/sf->encoding);
+  chain.addInputTensor(&tensor, sample_rate, channels_first);
+  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
+  chain.run();
+
+  // Closing the sox_format_t is necessary for flushing the last chunk to the
+  // buffer
+  sf.close();
+  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
+}
+
+}  // namespace sox_io
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/io.h
+++ b/audio/paddleaudio/src/pybind/sox/io.h
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.h with modification.
+#pragma once
+
+#include "paddleaudio/src/pybind/sox/utils.h"
+
+namespace py = pybind11;
+
+namespace paddleaudio {
+namespace sox_io {
+
+// ---------------------------------------------------------------------------
+// Metadata queries.
+//
+// Both overloads return a 5-tuple of four integers and one string.
+// NOTE(review): presumably (sample rate, frame count, channel count,
+// bits per sample, encoding name) as in upstream torchaudio -- confirm
+// against io.cpp.
+// ---------------------------------------------------------------------------
+
+// Queries the audio stored at filesystem location `path`. `format` is
+// optional.
+std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_file(
+    const std::string& path,
+    const tl::optional<std::string>& format);
+
+// Same query, but the audio is supplied through a Python object instead of a
+// path.
+std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_fileobj(
+    py::object fileobj,
+    const tl::optional<std::string>& format);
+
+// ---------------------------------------------------------------------------
+// File-object based I/O.
+// ---------------------------------------------------------------------------
+
+// Loads audio from a Python object. The result is optional and, when present,
+// holds an array together with one integer.
+// NOTE(review): presumably (samples, sample rate) -- confirm against io.cpp.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
+    py::object fileobj,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format);
+
+// Saves `tensor` to a Python object. `compression`, `format`, `encoding` and
+// `bits_per_sample` are all optional.
+void save_audio_fileobj(
+    py::object fileobj,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample);
+
+// ---------------------------------------------------------------------------
+// Path based I/O.
+// ---------------------------------------------------------------------------
+
+// Produces a list of string lists from the optional `frame_offset` and
+// `num_frames`.
+// NOTE(review): presumably sox effect argument lists used by the loaders --
+// confirm against io.cpp.
+std::vector<std::vector<std::string>> get_effects(
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames);
+
+// Path counterpart of load_audio_fileobj: same optional arguments and same
+// result type, with the source named by `path`.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
+    const std::string& path,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format);
+
+// Path counterpart of save_audio_fileobj: same arguments, with the
+// destination named by `path`.
+void save_audio_file(
+    const std::string& path,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample);
+
+}  // namespace sox_io
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/types.cpp
+++ b/audio/paddleaudio/src/pybind/sox/types.cpp
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
+
+#include "paddleaudio/src/pybind/sox/types.h"
+#include <ostream>
+#include <sstream>
+
+namespace paddleaudio {
+namespace sox_utils {
+
+// Translates a format name into its Format enumerator.
+//
+// Recognised names are listed in the table below; "ogg" and "vorbis" are
+// aliases for the same enumerator. The comparison is exact (case-sensitive).
+// Any other name raises std::runtime_error whose message contains the
+// rejected value.
+Format get_format_from_string(const std::string& format) {
+  struct NamedFormat {
+    const char* name;
+    Format value;
+  };
+  static const NamedFormat kKnownFormats[] = {
+      {"wav", Format::WAV},
+      {"mp3", Format::MP3},
+      {"flac", Format::FLAC},
+      {"ogg", Format::VORBIS},
+      {"vorbis", Format::VORBIS},
+      {"amr-nb", Format::AMR_NB},
+      {"amr-wb", Format::AMR_WB},
+      {"amb", Format::AMB},
+      {"sph", Format::SPHERE},
+      {"htk", Format::HTK},
+      {"gsm", Format::GSM},
+  };
+
+  for (const auto& candidate : kKnownFormats) {
+    if (format == candidate.name) {
+      return candidate.value;
+    }
+  }
+
+  throw std::runtime_error("Internal Error: unexpected format value: " +
+                           format);
+}
+
+// Returns the short textual label for an Encoding enumerator.
+//
+// The three PCM variants use the abbreviated labels "PCM_S", "PCM_U" and
+// "PCM_F"; the remaining handled enumerators are spelled like their names.
+// An enumerator that has no case below (for example Encoding::NOT_PROVIDED)
+// raises std::runtime_error.
+std::string to_string(Encoding v) {
+  const char* label = nullptr;
+  switch (v) {
+    case Encoding::UNKNOWN:
+      label = "UNKNOWN";
+      break;
+    case Encoding::PCM_SIGNED:
+      label = "PCM_S";
+      break;
+    case Encoding::PCM_UNSIGNED:
+      label = "PCM_U";
+      break;
+    case Encoding::PCM_FLOAT:
+      label = "PCM_F";
+      break;
+    case Encoding::FLAC:
+      label = "FLAC";
+      break;
+    case Encoding::ULAW:
+      label = "ULAW";
+      break;
+    case Encoding::ALAW:
+      label = "ALAW";
+      break;
+    case Encoding::MP3:
+      label = "MP3";
+      break;
+    case Encoding::VORBIS:
+      label = "VORBIS";
+      break;
+    case Encoding::AMR_WB:
+      label = "AMR_WB";
+      break;
+    case Encoding::AMR_NB:
+      label = "AMR_NB";
+      break;
+    case Encoding::OPUS:
+      label = "OPUS";
+      break;
+    default:
+      break;
+  }
+
+  if (label == nullptr) {
+    throw std::runtime_error("Internal Error: unexpected encoding.");
+  }
+  return label;
+}
+
+// Converts an optional encoding label into an Encoding enumerator.
+//
+// An empty optional yields Encoding::NOT_PROVIDED. Otherwise the label must be
+// exactly one of "PCM_S", "PCM_U", "PCM_F", "ULAW" or "ALAW"; anything else
+// raises std::runtime_error whose message contains the rejected label.
+Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
+  if (!encoding.has_value()) {
+    return Encoding::NOT_PROVIDED;
+  }
+
+  const std::string label = encoding.value();
+  if (label == "PCM_S") {
+    return Encoding::PCM_SIGNED;
+  } else if (label == "PCM_U") {
+    return Encoding::PCM_UNSIGNED;
+  } else if (label == "PCM_F") {
+    return Encoding::PCM_FLOAT;
+  } else if (label == "ULAW") {
+    return Encoding::ULAW;
+  } else if (label == "ALAW") {
+    return Encoding::ALAW;
+  }
+
+  throw std::runtime_error("Internal Error: unexpected encoding value: " +
+                           label);
+}
+
+// Converts an optional bit count into a BitDepth enumerator.
+//
+// An empty optional yields BitDepth::NOT_PROVIDED. The accepted counts are
+// 8, 16, 24, 32 and 64; any other value raises std::runtime_error whose
+// message contains the rejected number.
+BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
+  if (!bit_depth.has_value()) {
+    return BitDepth::NOT_PROVIDED;
+  }
+
+  const int64_t bits = bit_depth.value();
+  if (bits == 8) {
+    return BitDepth::B8;
+  }
+  if (bits == 16) {
+    return BitDepth::B16;
+  }
+  if (bits == 24) {
+    return BitDepth::B24;
+  }
+  if (bits == 32) {
+    return BitDepth::B32;
+  }
+  if (bits == 64) {
+    return BitDepth::B64;
+  }
+
+  std::ostringstream message;
+  message << "Internal Error: unexpected bit depth value: " << bits;
+  throw std::runtime_error(message.str());
+}
+
+// Returns the textual label for a libsox encoding constant.
+//
+// This function never throws on an unrecognised value: every constant without
+// an explicit case below, SOX_ENCODING_UNKNOWN included, is reported as
+// "UNKNOWN".
+std::string get_encoding(sox_encoding_t encoding) {
+  const char* label = "UNKNOWN";
+  switch (encoding) {
+    case SOX_ENCODING_SIGN2:
+      label = "PCM_S";
+      break;
+    case SOX_ENCODING_UNSIGNED:
+      label = "PCM_U";
+      break;
+    case SOX_ENCODING_FLOAT:
+      label = "PCM_F";
+      break;
+    case SOX_ENCODING_FLAC:
+      label = "FLAC";
+      break;
+    case SOX_ENCODING_ULAW:
+      label = "ULAW";
+      break;
+    case SOX_ENCODING_ALAW:
+      label = "ALAW";
+      break;
+    case SOX_ENCODING_MP3:
+      label = "MP3";
+      break;
+    case SOX_ENCODING_VORBIS:
+      label = "VORBIS";
+      break;
+    case SOX_ENCODING_AMR_WB:
+      label = "AMR_WB";
+      break;
+    case SOX_ENCODING_AMR_NB:
+      label = "AMR_NB";
+      break;
+    case SOX_ENCODING_OPUS:
+      label = "OPUS";
+      break;
+    case SOX_ENCODING_GSM:
+      label = "GSM";
+      break;
+    default:
+      // Keeps the "UNKNOWN" label assigned above.
+      break;
+  }
+  return label;
+}
+
+} // namespace sox_utils
+} // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/types.h
+++ b/audio/paddleaudio/src/pybind/sox/types.h
--- a/audio/paddleaudio/src/pybind/sox/utils.cpp
+++ b/audio/paddleaudio/src/pybind/sox/utils.cpp
--- a/audio/paddleaudio/src/pybind/sox/utils.h
+++ b/audio/paddleaudio/src/pybind/sox/utils.h
--- a/audio/paddleaudio/src/utils.cpp
+++ b/audio/paddleaudio/src/utils.cpp
--- a/audio/paddleaudio/third_party/.gitignore
+++ b/audio/paddleaudio/third_party/.gitignore
+archives/
+install/
--- a/audio/paddleaudio/third_party/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/CMakeLists.txt
+# Third-party dependencies bundled with paddleaudio.
+#
+# Each dependency lives in its own subdirectory and is configured only when
+# the matching BUILD_<NAME> switch is true. The switches are not defined in
+# this file; they are expected to come from the including project.
+
+# Add -fvisibility=hidden to the directory-scope C++ flags so that it is
+# inherited by the subdirectories added below.
+string(APPEND CMAKE_CXX_FLAGS " -fvisibility=hidden")
+
+################################################################################
+# sox
+################################################################################
+if(BUILD_SOX)
+  add_subdirectory(sox)
+endif()
+
+################################################################################
+# kaldi
+################################################################################
+if(BUILD_KALDI)
+  add_subdirectory(kaldi)
+endif()
--- a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
--- a/audio/paddleaudio/third_party/patches/config.guess
+++ b/audio/paddleaudio/third_party/patches/config.guess
--- a/audio/paddleaudio/third_party/patches/config.sub
+++ b/audio/paddleaudio/third_party/patches/config.sub
--- a/audio/paddleaudio/third_party/patches/libmad.patch
+++ b/audio/paddleaudio/third_party/patches/libmad.patch
--- a/audio/paddleaudio/third_party/patches/sox.patch
+++ b/audio/paddleaudio/third_party/patches/sox.patch
--- a/audio/paddleaudio/third_party/sox/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/sox/CMakeLists.txt
--- a/audio/paddleaudio/utils/__init__.py
+++ b/audio/paddleaudio/utils/__init__.py
--- a/audio/paddleaudio/utils/download.py
+++ b/audio/paddleaudio/utils/download.py
--- a/audio/paddleaudio/utils/env.py
+++ b/audio/paddleaudio/utils/env.py
--- a/paddlespeech/audio/io/__init__.py
+++ b/paddlespeech/audio/io/__init__.py
@@ -11,3 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+__all__ = ['ParameterError']
+
+
+class ParameterError(Exception):
+    """Exception class for Parameter checking"""
+    pass
--- a/audio/paddleaudio/utils/log.py
+++ b/audio/paddleaudio/utils/log.py
--- a/audio/paddleaudio/utils/numeric.py
+++ b/audio/paddleaudio/utils/numeric.py
--- a/audio/paddleaudio/utils/sox_utils.py
+++ b/audio/paddleaudio/utils/sox_utils.py
--- a/audio/paddleaudio/utils/tensor_utils.py
+++ b/audio/paddleaudio/utils/tensor_utils.py
--- a/audio/paddleaudio/utils/time.py
+++ b/audio/paddleaudio/utils/time.py
--- a/audio/setup.py
+++ b/audio/setup.py
--- a/tests/unit/audio/backends/base.py
+++ b/tests/unit/audio/backends/base.py
--- a/audio/tests/backends/common.py
+++ b/audio/tests/backends/common.py
--- a/paddlespeech/audio/backends/sox_backend.py
+++ b/paddlespeech/audio/backends/sox_backend.py
--- a/audio/tests/backends/soundfile/common.py
+++ b/audio/tests/backends/soundfile/common.py
--- a/audio/tests/backends/soundfile/common_utils
+++ b/audio/tests/backends/soundfile/common_utils
--- a/audio/tests/backends/soundfile/info_test.py
+++ b/audio/tests/backends/soundfile/info_test.py
--- a/audio/tests/backends/soundfile/load_test.py
+++ b/audio/tests/backends/soundfile/load_test.py
--- a/audio/tests/backends/soundfile/save_test.py
+++ b/audio/tests/backends/soundfile/save_test.py
--- a/tests/unit/audio/backends/soundfile/test_io.py
+++ b/tests/unit/audio/backends/soundfile/test_io.py
--- a/audio/tests/backends/sox_io/common.py
+++ b/audio/tests/backends/sox_io/common.py
--- a/audio/tests/backends/sox_io/common_utils
+++ b/audio/tests/backends/sox_io/common_utils
--- a/audio/tests/backends/sox_io/info_test.py
+++ b/audio/tests/backends/sox_io/info_test.py
--- a/audio/tests/backends/sox_io/load_test.py
+++ b/audio/tests/backends/sox_io/load_test.py
--- a/audio/tests/backends/sox_io/save_test.py
+++ b/audio/tests/backends/sox_io/save_test.py
--- a/audio/tests/backends/sox_io/smoke_test.py
+++ b/audio/tests/backends/sox_io/smoke_test.py
--- a/audio/tests/backends/sox_io/sox_effect_test.py
+++ b/audio/tests/backends/sox_io/sox_effect_test.py
--- a/audio/tests/backends/sox_io/sox_effect_test_args.jsonl
+++ b/audio/tests/backends/sox_io/sox_effect_test_args.jsonl
--- a/tests/benchmark/audio/README.md
+++ b/tests/benchmark/audio/README.md
--- a/tests/benchmark/audio/log_melspectrogram.py
+++ b/tests/benchmark/audio/log_melspectrogram.py
--- a/tests/benchmark/audio/melspectrogram.py
+++ b/tests/benchmark/audio/melspectrogram.py
--- a/tests/benchmark/audio/mfcc.py
+++ b/tests/benchmark/audio/mfcc.py
--- a/audio/tests/common_utils/__init__.py
+++ b/audio/tests/common_utils/__init__.py
--- a/audio/tests/common_utils/case_utils.py
+++ b/audio/tests/common_utils/case_utils.py
--- a/audio/tests/common_utils/data_utils.py
+++ b/audio/tests/common_utils/data_utils.py
--- a/audio/tests/common_utils/parameterized_utils.py
+++ b/audio/tests/common_utils/parameterized_utils.py
--- a/audio/tests/common_utils/sox_utils.py
+++ b/audio/tests/common_utils/sox_utils.py
--- a/audio/tests/common_utils/wav_utils.py
+++ b/audio/tests/common_utils/wav_utils.py
--- a/paddlespeech/audio/sox_effects/__init__.py
+++ b/paddlespeech/audio/sox_effects/__init__.py
--- a/tests/unit/audio/features/base.py
+++ b/tests/unit/audio/features/base.py
--- a/tests/unit/audio/features/test_istft.py
+++ b/tests/unit/audio/features/test_istft.py
--- a/tests/unit/audio/features/test_kaldi.py
+++ b/tests/unit/audio/features/test_kaldi.py
--- a/audio/tests/features/test_kaldi_feat.py
+++ b/audio/tests/features/test_kaldi_feat.py
--- a/tests/unit/audio/features/test_librosa.py
+++ b/tests/unit/audio/features/test_librosa.py
--- a/tests/unit/audio/features/test_log_melspectrogram.py
+++ b/tests/unit/audio/features/test_log_melspectrogram.py
--- a/tests/unit/audio/features/test_spectrogram.py
+++ b/tests/unit/audio/features/test_spectrogram.py
--- a/tests/unit/audio/features/test_stft.py
+++ b/tests/unit/audio/features/test_stft.py
--- a/audio/tests/features/testdata/fbank_feat.ark
+++ b/audio/tests/features/testdata/fbank_feat.ark
--- a/audio/tests/features/testdata/fbank_feat_txt.ark
+++ b/audio/tests/features/testdata/fbank_feat_txt.ark
--- a/audio/tests/features/testdata/pitch_feat.ark
+++ b/audio/tests/features/testdata/pitch_feat.ark
--- a/audio/tests/features/testdata/pitch_feat_txt.ark
+++ b/audio/tests/features/testdata/pitch_feat_txt.ark
--- a/audio/tests/features/testdata/test.wav
+++ b/audio/tests/features/testdata/test.wav
--- a/audio/tools/setup_helpers/__init__.py
+++ b/audio/tools/setup_helpers/__init__.py
--- a/audio/tools/setup_helpers/extension.py
+++ b/audio/tools/setup_helpers/extension.py
--- a/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst
+++ b/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst
--- a/docs/source/api/paddlespeech.audio.backends.sox_backend.rst
+++ b/docs/source/api/paddlespeech.audio.backends.sox_backend.rst
--- a/docs/source/api/paddlespeech.audio.compliance.kaldi.rst
+++ b/docs/source/api/paddlespeech.audio.compliance.kaldi.rst
--- a/docs/source/api/paddlespeech.audio.datasets.dataset.rst
+++ b/docs/source/api/paddlespeech.audio.datasets.dataset.rst
--- a/docs/source/api/paddlespeech.audio.datasets.hey_snips.rst
+++ b/docs/source/api/paddlespeech.audio.datasets.hey_snips.rst
--- a/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst
+++ b/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst
--- a/docs/source/api/paddlespeech.audio.datasets.rst
+++ b/docs/source/api/paddlespeech.audio.datasets.rst
--- a/docs/source/api/paddlespeech.audio.datasets.tess.rst
+++ b/docs/source/api/paddlespeech.audio.datasets.tess.rst
--- a/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst
+++ b/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst
--- a/docs/source/api/paddlespeech.audio.datasets.voxceleb.rst
+++ b/docs/source/api/paddlespeech.audio.datasets.voxceleb.rst
--- a/docs/source/api/paddlespeech.audio.functional.functional.rst
+++ b/docs/source/api/paddlespeech.audio.functional.functional.rst
--- a/docs/source/api/paddlespeech.audio.functional.window.rst
+++ b/docs/source/api/paddlespeech.audio.functional.window.rst
--- a/docs/source/api/paddlespeech.audio.rst
+++ b/docs/source/api/paddlespeech.audio.rst
--- a/docs/source/api/paddlespeech.audio.utils.rst
+++ b/docs/source/api/paddlespeech.audio.utils.rst
--- a/docs/source/api/paddlespeech.cls.exps.panns.rst
+++ b/docs/source/api/paddlespeech.cls.exps.panns.rst
--- a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst
+++ b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst
--- a/docs/source/api/paddlespeech.rst
+++ b/docs/source/api/paddlespeech.rst
--- a/docs/source/api/paddlespeech.t2s.exps.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.rst
--- a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
+++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
+++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
+++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
--- a/docs/source/api/paddlespeech.version.rst
+++ b/docs/source/api/paddlespeech.version.rst
--- a/docs/source/audio_api/modules.rst
+++ b/docs/source/audio_api/modules.rst
--- a/docs/source/audio_api/paddleaudio.backends.common.rst
+++ b/docs/source/audio_api/paddleaudio.backends.common.rst
--- a/docs/source/api/paddlespeech.audio.sox_effects.rst
+++ b/docs/source/api/paddlespeech.audio.sox_effects.rst
--- a/docs/source/audio_api/paddleaudio.backends.rst
+++ b/docs/source/audio_api/paddleaudio.backends.rst
--- a/docs/source/audio_api/paddleaudio.backends.soundfile_backend.rst
+++ b/docs/source/audio_api/paddleaudio.backends.soundfile_backend.rst
--- a/docs/source/api/paddlespeech.audio.compliance.librosa.rst
+++ b/docs/source/api/paddlespeech.audio.compliance.librosa.rst
--- a/docs/source/audio_api/paddleaudio.backends.utils.rst
+++ b/docs/source/audio_api/paddleaudio.backends.utils.rst
--- a/docs/source/audio_api/paddleaudio.compliance.kaldi.rst
+++ b/docs/source/audio_api/paddleaudio.compliance.kaldi.rst
--- a/docs/source/audio_api/paddleaudio.compliance.librosa.rst
+++ b/docs/source/audio_api/paddleaudio.compliance.librosa.rst
--- a/docs/source/api/paddlespeech.audio.compliance.rst
+++ b/docs/source/api/paddlespeech.audio.compliance.rst
--- a/docs/source/audio_api/paddleaudio.datasets.dataset.rst
+++ b/docs/source/audio_api/paddleaudio.datasets.dataset.rst
--- a/docs/source/audio_api/paddleaudio.datasets.esc50.rst
+++ b/docs/source/audio_api/paddleaudio.datasets.esc50.rst
--- a/docs/source/audio_api/paddleaudio.datasets.gtzan.rst
+++ b/docs/source/audio_api/paddleaudio.datasets.gtzan.rst
--- a/docs/source/audio_api/paddleaudio.datasets.hey_snips.rst
+++ b/docs/source/audio_api/paddleaudio.datasets.hey_snips.rst
--- a/docs/source/api/paddlespeech.audio.datasets.gtzan.rst
+++ b/docs/source/api/paddlespeech.audio.datasets.gtzan.rst
--- a/docs/source/audio_api/paddleaudio.datasets.rst
+++ b/docs/source/audio_api/paddleaudio.datasets.rst
--- a/docs/source/audio_api/paddleaudio.datasets.tess.rst
+++ b/docs/source/audio_api/paddleaudio.datasets.tess.rst
--- a/docs/source/api/paddlespeech.audio.datasets.esc50.rst
+++ b/docs/source/api/paddlespeech.audio.datasets.esc50.rst
--- a/docs/source/api/paddlespeech.audio.metric.eer.rst
+++ b/docs/source/api/paddlespeech.audio.metric.eer.rst
--- a/docs/source/audio_api/paddleaudio.features.layers.rst
+++ b/docs/source/audio_api/paddleaudio.features.layers.rst
--- a/docs/source/api/paddlespeech.audio.metric.rst
+++ b/docs/source/api/paddlespeech.audio.metric.rst
--- a/docs/source/audio_api/paddleaudio.functional.functional.rst
+++ b/docs/source/audio_api/paddleaudio.functional.functional.rst
--- a/docs/source/api/paddlespeech.audio.functional.rst
+++ b/docs/source/api/paddlespeech.audio.functional.rst
--- a/docs/source/audio_api/paddleaudio.functional.window.rst
+++ b/docs/source/audio_api/paddleaudio.functional.window.rst
--- a/docs/source/audio_api/paddleaudio.kaldi.kaldi.rst
+++ b/docs/source/audio_api/paddleaudio.kaldi.kaldi.rst
--- a/docs/source/api/paddlespeech.audio.backends.rst
+++ b/docs/source/api/paddlespeech.audio.backends.rst
--- a/docs/source/audio_api/paddleaudio.metric.eer.rst
+++ b/docs/source/audio_api/paddleaudio.metric.eer.rst
--- a/docs/source/audio_api/paddleaudio.metric.rst
+++ b/docs/source/audio_api/paddleaudio.metric.rst
--- a/docs/source/audio_api/paddleaudio.rst
+++ b/docs/source/audio_api/paddleaudio.rst
--- a/docs/source/audio_api/paddleaudio.sox_effects.rst
+++ b/docs/source/audio_api/paddleaudio.sox_effects.rst
--- a/docs/source/audio_api/paddleaudio.sox_effects.sox_effects.rst
+++ b/docs/source/audio_api/paddleaudio.sox_effects.sox_effects.rst
--- a/docs/source/audio_api/paddleaudio.utils.download.rst
+++ b/docs/source/audio_api/paddleaudio.utils.download.rst
--- a/docs/source/audio_api/paddleaudio.utils.env.rst
+++ b/docs/source/audio_api/paddleaudio.utils.env.rst
--- a/docs/source/audio_api/paddleaudio.utils.error.rst
+++ b/docs/source/audio_api/paddleaudio.utils.error.rst
--- a/docs/source/audio_api/paddleaudio.utils.log.rst
+++ b/docs/source/audio_api/paddleaudio.utils.log.rst
--- a/docs/source/audio_api/paddleaudio.utils.numeric.rst
+++ b/docs/source/audio_api/paddleaudio.utils.numeric.rst
--- a/docs/source/audio_api/paddleaudio.utils.rst
+++ b/docs/source/audio_api/paddleaudio.utils.rst
--- a/docs/source/audio_api/paddleaudio.utils.sox_utils.rst
+++ b/docs/source/audio_api/paddleaudio.utils.sox_utils.rst
--- a/docs/source/audio_api/paddleaudio.utils.tensor_utils.rst
+++ b/docs/source/audio_api/paddleaudio.utils.tensor_utils.rst
--- a/docs/source/audio_api/paddleaudio.utils.time.rst
+++ b/docs/source/audio_api/paddleaudio.utils.time.rst
--- a/docs/source/cls/custom_dataset.md
+++ b/docs/source/cls/custom_dataset.md
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
--- a/docs/source/install.md
+++ b/docs/source/install.md
--- a/examples/esc50/cls0/conf/panns.yaml
+++ b/examples/esc50/cls0/conf/panns.yaml
--- a/examples/hey_snips/kws0/conf/mdtc.yaml
+++ b/examples/hey_snips/kws0/conf/mdtc.yaml
--- a/examples/tess/README.md
+++ b/examples/tess/README.md
--- a/examples/tess/cls0/conf/panns_logmelspectrogram.yaml
+++ b/examples/tess/cls0/conf/panns_logmelspectrogram.yaml
--- a/examples/tess/cls0/conf/panns_melspectrogram.yaml
+++ b/examples/tess/cls0/conf/panns_melspectrogram.yaml
--- a/examples/tess/cls0/conf/panns_mfcc.yaml
+++ b/examples/tess/cls0/conf/panns_mfcc.yaml
--- a/examples/tess/cls0/conf/panns_spectrogram.yaml
+++ b/examples/tess/cls0/conf/panns_spectrogram.yaml
--- a/examples/tess/cls0/local/train.py
+++ b/examples/tess/cls0/local/train.py
--- a/examples/tess/cls0/local/train.sh
+++ b/examples/tess/cls0/local/train.sh
--- a/examples/tess/cls0/path.sh
+++ b/examples/tess/cls0/path.sh
--- a/examples/tess/cls0/run.sh
+++ b/examples/tess/cls0/run.sh
--- a/examples/voxceleb/sv0/local/data_prepare.py
+++ b/examples/voxceleb/sv0/local/data_prepare.py
--- a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py
+++ b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py
--- a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py
+++ b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py
--- a/paddlespeech/__init__.py
+++ b/paddlespeech/__init__.py
--- a/paddlespeech/audio/.gitignore
+++ b/paddlespeech/audio/.gitignore
--- a/paddlespeech/audio/__init__.py
+++ b/paddlespeech/audio/__init__.py
--- a/paddlespeech/audio/streamdata/autodecode.py
+++ b/paddlespeech/audio/streamdata/autodecode.py
--- a/paddlespeech/audio/streamdata/filters.py
+++ b/paddlespeech/audio/streamdata/filters.py
--- a/paddlespeech/audio/streamdata/tariterators.py
+++ b/paddlespeech/audio/streamdata/tariterators.py
--- a/paddlespeech/audio/transform/spectrogram.py
+++ b/paddlespeech/audio/transform/spectrogram.py
--- a/paddlespeech/audio/utils/__init__.py
+++ b/paddlespeech/audio/utils/__init__.py
--- a/paddlespeech/audio/utils/numeric.py
+++ b/paddlespeech/audio/utils/numeric.py
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
--- a/paddlespeech/cli/kws/infer.py
+++ b/paddlespeech/cli/kws/infer.py
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
--- a/paddlespeech/cls/exps/panns/deploy/predict.py
+++ b/paddlespeech/cls/exps/panns/deploy/predict.py
--- a/paddlespeech/cls/exps/panns/export_model.py
+++ b/paddlespeech/cls/exps/panns/export_model.py
--- a/paddlespeech/cls/exps/panns/predict.py
+++ b/paddlespeech/cls/exps/panns/predict.py
--- a/paddlespeech/cls/exps/panns/train.py
+++ b/paddlespeech/cls/exps/panns/train.py
--- a/paddlespeech/cls/models/panns/panns.py
+++ b/paddlespeech/cls/models/panns/panns.py
--- a/paddlespeech/kws/exps/mdtc/train.py
+++ b/paddlespeech/kws/exps/mdtc/train.py
--- a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
--- a/paddlespeech/s2t/modules/fbank.py
+++ b/paddlespeech/s2t/modules/fbank.py
--- a/paddlespeech/server/engine/vector/python/vector_engine.py
+++ b/paddlespeech/server/engine/vector/python/vector_engine.py
--- a/paddlespeech/server/util.py
+++ b/paddlespeech/server/util.py
--- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
--- a/paddlespeech/vector/exps/ecapa_tdnn/test.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/test.py
--- a/paddlespeech/vector/exps/ecapa_tdnn/train.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py
--- a/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py
+++ b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py
--- a/paddlespeech/vector/io/dataset.py
+++ b/paddlespeech/vector/io/dataset.py
--- a/paddlespeech/vector/io/dataset_from_json.py
+++ b/paddlespeech/vector/io/dataset_from_json.py
--- a/setup.py
+++ b/setup.py
--- a/speechx/speechx/kaldi/base/kaldi-types.h
+++ b/speechx/speechx/kaldi/base/kaldi-types.h
--- a/speechx/speechx/kaldi/feat/feature-plp.h
+++ b/speechx/speechx/kaldi/feat/feature-plp.h
--- a/speechx/speechx/kaldi/feat/online-feature-itf.h
+++ b/speechx/speechx/kaldi/feat/online-feature-itf.h
--- a/speechx/speechx/kaldi/feat/online-feature.h
+++ b/speechx/speechx/kaldi/feat/online-feature.h
--- a/speechx/speechx/kaldi/feat/pitch-functions.h
+++ b/speechx/speechx/kaldi/feat/pitch-functions.h
--- a/speechx/speechx/kaldi/matrix/kaldi-blas.h
+++ b/speechx/speechx/kaldi/matrix/kaldi-blas.h
--- a/tests/unit/audio/backends/__init__.py
+++ b/tests/unit/audio/backends/__init__.py
--- a/tests/unit/audio/features/__init__.py
+++ b/tests/unit/audio/features/__init__.py
--- a/tools/extras/install_openblas.sh
+++ b/tools/extras/install_openblas.sh