未验证 提交 0ffcd477 编写于 作者: Y YangZhou 提交者: GitHub

add sox, kaldi feature to paddleaudio && make it work on mac, windows (#2663)

* add paddleaudio sox && kaldifeature

* rrm redundant file which compile audio

* mv audio test into paddleaudio

* fix test bug

* add paddleaudio test

* rm redundant comment

* unsupport sox in windows

* rm io in __init__

* fix windows cblas compile error
上级 62fe3d44
...@@ -20,6 +20,10 @@ paddlespeech/audio/_paddleaudio.so ...@@ -20,6 +20,10 @@ paddlespeech/audio/_paddleaudio.so
paddlespeech/audio/lib/libpaddleaudio.so paddlespeech/audio/lib/libpaddleaudio.so
paddlespeech/version.py paddlespeech/version.py
audio/dist/
audio/fc_patch/
audio/paddleaudio/version.py
docs/build/ docs/build/
docs/topic/ctc/warp-ctc/ docs/topic/ctc/warp-ctc/
......
...@@ -13,16 +13,14 @@ if(NOT CMAKE_VERSION VERSION_LESS 3.15.0) ...@@ -13,16 +13,14 @@ if(NOT CMAKE_VERSION VERSION_LESS 3.15.0)
cmake_policy(SET CMP0092 NEW) cmake_policy(SET CMP0092 NEW)
endif() endif()
project(paddleaudio)
project(paddlespeech)
# check and set CMAKE_CXX_STANDARD # check and set CMAKE_CXX_STANDARD
string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard) string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
if(env_cxx_standard GREATER -1) if(env_cxx_standard GREATER -1)
message( message(
WARNING "C++ standard version definition detected in environment variable." WARNING "C++ standard version definition detected in environment variable."
"paddlespeech requires -std=c++14. Please remove -std=c++ settings in your environment.") "paddleaudio requires -std=c++14. Please remove -std=c++ settings in your environment.")
endif() endif()
...@@ -33,8 +31,6 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) ...@@ -33,8 +31,6 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_VERBOSE_MAKEFILE ON)
# Options # Options
option(BUILD_SOX "Build libsox statically" ON) option(BUILD_SOX "Build libsox statically" ON)
option(BUILD_MAD "Enable libmad" ON) option(BUILD_MAD "Enable libmad" ON)
...@@ -50,18 +46,21 @@ set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake;${PROJEC ...@@ -50,18 +46,21 @@ set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake;${PROJEC
set(FETCHCONTENT_QUIET off) set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch}) set(FETCHCONTENT_BASE_DIR ${fc_patch})
set(THIRD_PARTY_PATH ${fc_patch})
include(openblas) include(openblas)
if (NOT PY_VERSION)
set(PY_VERSION 3.7)
endif()
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
include(pybind) include(pybind)
# packages # packages
find_package(Python3 COMPONENTS Interpreter Development) find_package(Python3 COMPONENTS Interpreter Development)
#find_package(pybind11 CONFIG REQUIRED)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g") # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g")
add_subdirectory(paddlespeech/audio) add_subdirectory(paddleaudio)
# Summary # Summary
include(cmake/summary.cmake) include(cmake/summary.cmake)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)
set(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas)
set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
set(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
set(CBLAS_TAG v0.3.10)
if(NOT WIN32)
set(CBLAS_LIBRARIES
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
set(CBLAS_INC_DIR
"${CBLAS_INSTALL_DIR}/include"
CACHE PATH "openblas include directory." FORCE)
set(OPENBLAS_CC
"${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
if(APPLE)
set(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
endif()
set(OPTIONAL_ARGS "")
set(COMMON_ARGS "")
if(APPLE)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
set(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
endif()
set(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1)
endif()
ExternalProject_Add(
OPENBLAS
GIT_REPOSITORY ${CBLAS_REPOSITORY}
GIT_TAG ${CBLAS_TAG}
GIT_SHALLOW YES
PREFIX ${CBLAS_PREFIX_DIR}
INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 1
BUILD_COMMAND make -j${NPROC} ${COMMON_ARGS} ${OPTIONAL_ARGS}
INSTALL_COMMAND make install PREFIX=<INSTALL_DIR>
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
ExternalProject_Get_Property(OPENBLAS INSTALL_DIR)
set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR})
add_library(openblas STATIC IMPORTED)
add_dependencies(openblas OPENBLAS)
set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran)
set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a)
link_directories(${OpenBLAS_INSTALL_PREFIX}/lib)
include_directories(${OpenBLAS_INSTALL_PREFIX}/include)
set(OPENBLAS_LIBRARIES
${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a
)
add_library(libopenblas INTERFACE)
add_dependencies(libopenblas openblas)
target_include_directories(libopenblas INTERFACE ${OpenBLAS_INSTALL_PREFIX}/include/openblas)
target_link_libraries(libopenblas INTERFACE ${OPENBLAS_LIBRARIES})
else()
set(CBLAS_LIBRARIES
"${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
set(CBLAS_INC_DIR
"${CBLAS_INSTALL_DIR}/include/openblas"
CACHE PATH "openblas include directory." FORCE)
ExternalProject_Add(
extern_openblas
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY ${CBLAS_REPOSITORY}
GIT_TAG ${CBLAS_TAG}
PREFIX ${CBLAS_PREFIX_DIR}
INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 0
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_C_COMPILER=clang-cl
-DCMAKE_CXX_COMPILER=clang-cl
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
-DCMAKE_BUILD_TYPE=Release #${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_MT=mt
-DUSE_THREAD=OFF
-DBUILD_WITHOUT_LAPACK=NO
-DCMAKE_Fortran_COMPILER=flang
-DNOFORTRAN=0
-DDYNAMIC_ARCH=ON
#${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release #${THIRD_PARTY_BUILD_TYPE}
# ninja need to know where openblas.lib comes from
BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
set(OPENBLAS_SHARED_LIB
${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
add_library(openblas INTERFACE)
add_dependencies(openblas extern_openblas)
include_directories(${CBLAS_INC_DIR})
link_libraries(${CBLAS_LIBRARIES})
endif()
add_subdirectory(third_party)
add_subdirectory(src)
...@@ -11,12 +11,12 @@ ...@@ -11,12 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from . import _extension
from . import backends from . import backends
from . import compliance from . import compliance
from . import datasets from . import datasets
from . import features from . import features
from . import functional from . import functional
from . import io
from . import metric from . import metric
from . import sox_effects from . import sox_effects
from . import utils from . import utils
import os
import warnings
from pathlib import Path
from ._internal import module_utils as _mod_utils # noqa: F401
import contextlib
import ctypes
import os
import sys
import types
# Query `hasattr` only once.
_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
'setdlopenflags')
@contextlib.contextmanager
def dl_open_guard():
"""
# https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html
Context manager to set the RTLD_GLOBAL dynamic linker flag while we open a
shared library to load custom operators.
"""
if _SET_GLOBAL_FLAGS:
old_flags = sys.getdlopenflags()
sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
yield
if _SET_GLOBAL_FLAGS:
sys.setdlopenflags(old_flags)
def resolve_library_path(path: str) -> str:
return os.path.realpath(path)
class _Ops(types.ModuleType):
#__file__ = '_ops.py'
def __init__(self):
super(_Ops, self).__init__('paddleaudio.ops')
self.loaded_libraries = set()
def load_library(self, path):
"""
Loads a shared library from the given path into the current process.
This allows dynamically loading custom operators. For this,
you should compile your operator and
the static registration code into a shared library object, and then
call ``paddleaudio.ops.load_library('path/to/libcustom.so')`` to load the
shared object.
After the library is loaded, it is added to the
``paddleaudio.ops.loaded_libraries`` attribute, a set that may be inspected
for the paths of all libraries loaded using this function.
Args:
path (str): A path to a shared library to load.
"""
path = resolve_library_path(path)
with dl_open_guard():
# https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
# Import the shared library into the process, thus running its
# static (global) initialization code in order to register custom
# operators with the JIT.
ctypes.CDLL(path)
self.loaded_libraries.add(path)
_LIB_DIR = Path(__file__).parent / "lib"
def _get_lib_path(lib: str):
suffix = "pyd" if os.name == "nt" else "so"
path = _LIB_DIR / f"{lib}.{suffix}"
return path
def _load_lib(lib: str) -> bool:
"""Load extension module
Note:
In case `paddleaudio` is deployed with `pex` format, the library file
is not in a standard location.
In this case, we expect that `libpaddlleaudio` is available somewhere
in the search path of dynamic loading mechanism, so that importing
`_paddlleaudio` will have library loader find and load `libpaddlleaudio`.
This is the reason why the function should not raising an error when the library
file is not found.
Returns:
bool:
True if the library file is found AND the library loaded without failure.
False if the library file is not found (like in the case where paddlleaudio
is deployed with pex format, thus the shared library file is
in a non-standard location.).
If the library file is found but there is an issue loading the library,
(such as missing dependency) then this function raises the exception as-is.
Raises:
Exception:
If the library file is found, but there is an issue loading the library file,
(when underlying `ctype.DLL` throws an exception), this function will pass
the exception as-is, instead of catching it and returning bool.
The expected case is `OSError` thrown by `ctype.DLL` when a dynamic dependency
is not found.
This behavior was chosen because the expected failure case is not recoverable.
If a dependency is missing, then users have to install it.
"""
path = _get_lib_path(lib)
if not path.exists():
warnings.warn("lib path is not exists:" + str(path))
return False
ops.load_library(path)
return True
_FFMPEG_INITIALIZED = False
def _init_ffmpeg():
global _FFMPEG_INITIALIZED
if _FFMPEG_INITIALIZED:
return
if not paddleaudio._paddlleaudio.is_ffmpeg_available():
raise RuntimeError(
"paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
)
try:
_load_lib("libpaddlleaudio_ffmpeg")
except OSError as err:
raise ImportError(
"FFmpeg libraries are not found. Please install FFmpeg.") from err
import paddllespeech.audio._paddlleaudio_ffmpeg # noqa
paddleaudio._paddlleaudio.ffmpeg_init()
if paddleaudio._paddlleaudio.ffmpeg_get_log_level() > 8:
paddleaudio._paddlleaudio.ffmpeg_set_log_level(8)
_FFMPEG_INITIALIZED = True
def _init_extension():
if not _mod_utils.is_module_available("paddleaudio._paddleaudio"):
warnings.warn("paddleaudio C++ extension is not available.")
return
_load_lib("libpaddleaudio")
# This import is for initializing the methods registered via PyBind11
# This has to happen after the base library is loaded
from paddleaudio import _paddleaudio # noqa
# Because this part is executed as part of `import torchaudio`, we ignore the
# initialization failure.
# If the FFmpeg integration is not properly initialized, then detailed error
# will be raised when client code attempts to import the dedicated feature.
try:
_init_ffmpeg()
except Exception:
pass
ops = _Ops()
_init_extension()
...@@ -2,8 +2,9 @@ import importlib.util ...@@ -2,8 +2,9 @@ import importlib.util
import warnings import warnings
from functools import wraps from functools import wraps
from typing import Optional from typing import Optional
import platform
#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py #code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py with modification.
def is_module_available(*modules: str) -> bool: def is_module_available(*modules: str) -> bool:
...@@ -127,6 +128,8 @@ def requires_soundfile(): ...@@ -127,6 +128,8 @@ def requires_soundfile():
def is_sox_available(): def is_sox_available():
if platform.system() == "Windows": # not support sox in windows
return False
return is_module_available("paddleaudio._paddleaudio") return is_module_available("paddleaudio._paddleaudio")
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -18,127 +18,156 @@ from typing import Union ...@@ -18,127 +18,156 @@ from typing import Union
import paddle import paddle
from paddle import Tensor from paddle import Tensor
__all__ = [
'get_window',
]
class WindowFunctionRegister(object):
def __init__(self):
self._functions_dict = dict()
def register(self):
def add_subfunction(func):
name = func.__name__
self._functions_dict[name] = func
return func
return add_subfunction
def get(self, name):
return self._functions_dict[name]
window_function_register = WindowFunctionRegister()
@window_function_register.register()
def _cat(x: List[Tensor], data_type: str) -> Tensor: def _cat(x: List[Tensor], data_type: str) -> Tensor:
l = [paddle.to_tensor(_, data_type) for _ in x] l = [paddle.to_tensor(_, data_type) for _ in x]
return paddle.concat(l) return paddle.concat(l)
@window_function_register.register()
def _acosh(x: Union[Tensor, float]) -> Tensor: def _acosh(x: Union[Tensor, float]) -> Tensor:
if isinstance(x, float): if isinstance(x, float):
return math.log(x + math.sqrt(x**2 - 1)) return math.log(x + math.sqrt(x**2 - 1))
return paddle.log(x + paddle.sqrt(paddle.square(x) - 1)) return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
@window_function_register.register()
def _extend(M: int, sym: bool) -> bool: def _extend(M: int, sym: bool) -> bool:
"""Extend window by 1 sample if needed for DFT-even symmetry. """ """Extend window by 1 sample if needed for DFT-even symmetry."""
if not sym: if not sym:
return M + 1, True return M + 1, True
else: else:
return M, False return M, False
@window_function_register.register()
def _len_guards(M: int) -> bool: def _len_guards(M: int) -> bool:
"""Handle small or incorrect window lengths. """ """Handle small or incorrect window lengths."""
if int(M) != M or M < 0: if int(M) != M or M < 0:
raise ValueError('Window length M must be a non-negative integer') raise ValueError('Window length M must be a non-negative integer')
return M <= 1 return M <= 1
@window_function_register.register()
def _truncate(w: Tensor, needed: bool) -> Tensor: def _truncate(w: Tensor, needed: bool) -> Tensor:
"""Truncate window by 1 sample if needed for DFT-even symmetry. """ """Truncate window by 1 sample if needed for DFT-even symmetry."""
if needed: if needed:
return w[:-1] return w[:-1]
else: else:
return w return w
def _general_gaussian(M: int, p, sig, sym: bool=True, @window_function_register.register()
dtype: str='float64') -> Tensor: def _general_gaussian(
M: int, p, sig, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
"""Compute a window with a generalized Gaussian shape. """Compute a window with a generalized Gaussian shape.
This function is consistent with scipy.signal.windows.general_gaussian(). This function is consistent with scipy.signal.windows.general_gaussian().
""" """
if _len_guards(M): if _len_guards(M):
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
M, needs_trunc = _extend(M, sym) M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p)) w = paddle.exp(-0.5 * paddle.abs(n / sig) ** (2 * p))
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def _general_cosine(M: int, a: float, sym: bool=True, @window_function_register.register()
dtype: str='float64') -> Tensor: def _general_cosine(
M: int, a: float, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
"""Compute a generic weighted sum of cosine terms window. """Compute a generic weighted sum of cosine terms window.
This function is consistent with scipy.signal.windows.general_cosine(). This function is consistent with scipy.signal.windows.general_cosine().
""" """
if _len_guards(M): if _len_guards(M):
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
M, needs_trunc = _extend(M, sym) M, needs_trunc = _extend(M, sym)
fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
w = paddle.zeros((M, ), dtype=dtype) w = paddle.zeros((M,), dtype=dtype)
for k in range(len(a)): for k in range(len(a)):
w += a[k] * paddle.cos(k * fac) w += a[k] * paddle.cos(k * fac)
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def _general_hamming(M: int, alpha: float, sym: bool=True, @window_function_register.register()
dtype: str='float64') -> Tensor: def _general_hamming(
M: int, alpha: float, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
"""Compute a generalized Hamming window. """Compute a generalized Hamming window.
This function is consistent with scipy.signal.windows.general_hamming() This function is consistent with scipy.signal.windows.general_hamming()
""" """
return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) return _general_cosine(M, [alpha, 1.0 - alpha], sym, dtype=dtype)
def _taylor(M: int, @window_function_register.register()
nbar=4, def _taylor(
sll=30, M: int, nbar=4, sll=30, norm=True, sym: bool = True, dtype: str = 'float64'
norm=True, ) -> Tensor:
sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a Taylor window. """Compute a Taylor window.
The Taylor window taper function approximates the Dolph-Chebyshev window's The Taylor window taper function approximates the Dolph-Chebyshev window's
constant sidelobe level for a parameterized number of near-in sidelobes. constant sidelobe level for a parameterized number of near-in sidelobes.
""" """
if _len_guards(M): if _len_guards(M):
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
M, needs_trunc = _extend(M, sym) M, needs_trunc = _extend(M, sym)
# Original text uses a negative sidelobe level parameter and then negates # Original text uses a negative sidelobe level parameter and then negates
# it in the calculation of B. To keep consistent with other methods we # it in the calculation of B. To keep consistent with other methods we
# assume the sidelobe level parameter to be positive. # assume the sidelobe level parameter to be positive.
B = 10**(sll / 20) B = 10 ** (sll / 20)
A = _acosh(B) / math.pi A = _acosh(B) / math.pi
s2 = nbar**2 / (A**2 + (nbar - 0.5)**2) s2 = nbar**2 / (A**2 + (nbar - 0.5) ** 2)
ma = paddle.arange(1, nbar, dtype=dtype) ma = paddle.arange(1, nbar, dtype=dtype)
Fm = paddle.empty((nbar - 1, ), dtype=dtype) Fm = paddle.empty((nbar - 1,), dtype=dtype)
signs = paddle.empty_like(ma) signs = paddle.empty_like(ma)
signs[::2] = 1 signs[::2] = 1
signs[1::2] = -1 signs[1::2] = -1
m2 = ma * ma m2 = ma * ma
for mi in range(len(ma)): for mi in range(len(ma)):
numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2 numer = signs[mi] * paddle.prod(
)) 1 - m2[mi] / s2 / (A**2 + (ma - 0.5) ** 2)
)
if mi == 0: if mi == 0:
denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:]) denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
elif mi == len(ma) - 1: elif mi == len(ma) - 1:
denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
else: else:
denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[ denom = (
mi] / m2[mi + 1:]) 2
* paddle.prod(1 - m2[mi] / m2[:mi])
* paddle.prod(1 - m2[mi] / m2[mi + 1 :])
)
Fm[mi] = numer / denom Fm[mi] = numer / denom
def W(n): def W(n):
return 1 + 2 * paddle.matmul( return 1 + 2 * paddle.matmul(
Fm.unsqueeze(0), Fm.unsqueeze(0),
paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M)) paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2.0 + 0.5) / M),
)
w = W(paddle.arange(0, M, dtype=dtype)) w = W(paddle.arange(0, M, dtype=dtype))
...@@ -150,7 +179,8 @@ def _taylor(M: int, ...@@ -150,7 +179,8 @@ def _taylor(M: int,
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: @window_function_register.register()
def _hamming(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
"""Compute a Hamming window. """Compute a Hamming window.
The Hamming window is a taper formed by using a raised cosine with The Hamming window is a taper formed by using a raised cosine with
non-zero endpoints, optimized to minimize the nearest side lobe. non-zero endpoints, optimized to minimize the nearest side lobe.
...@@ -158,7 +188,8 @@ def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: ...@@ -158,7 +188,8 @@ def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
return _general_hamming(M, 0.54, sym, dtype=dtype) return _general_hamming(M, 0.54, sym, dtype=dtype)
def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: @window_function_register.register()
def _hann(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
"""Compute a Hann window. """Compute a Hann window.
The Hann window is a taper formed by using a raised cosine or sine-squared The Hann window is a taper formed by using a raised cosine or sine-squared
with ends that touch zero. with ends that touch zero.
...@@ -166,15 +197,18 @@ def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: ...@@ -166,15 +197,18 @@ def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
return _general_hamming(M, 0.5, sym, dtype=dtype) return _general_hamming(M, 0.5, sym, dtype=dtype)
def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: @window_function_register.register()
def _tukey(
M: int, alpha=0.5, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
"""Compute a Tukey window. """Compute a Tukey window.
The Tukey window is also known as a tapered cosine window. The Tukey window is also known as a tapered cosine window.
""" """
if _len_guards(M): if _len_guards(M):
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
if alpha <= 0: if alpha <= 0:
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
elif alpha >= 1.0: elif alpha >= 1.0:
return hann(M, sym=sym) return hann(M, sym=sym)
...@@ -182,53 +216,48 @@ def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: ...@@ -182,53 +216,48 @@ def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
n = paddle.arange(0, M, dtype=dtype) n = paddle.arange(0, M, dtype=dtype)
width = int(alpha * (M - 1) / 2.0) width = int(alpha * (M - 1) / 2.0)
n1 = n[0:width + 1] n1 = n[0 : width + 1]
n2 = n[width + 1:M - width - 1] n2 = n[width + 1 : M - width - 1]
n3 = n[M - width - 1:] n3 = n[M - width - 1 :]
w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1)))) w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
w2 = paddle.ones(n2.shape, dtype=dtype) w2 = paddle.ones(n2.shape, dtype=dtype)
w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / w3 = 0.5 * (
(M - 1)))) 1
+ paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / (M - 1)))
)
w = paddle.concat([w1, w2, w3]) w = paddle.concat([w1, w2, w3])
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def _kaiser(M: int, beta: float, sym: bool=True, @window_function_register.register()
dtype: str='float64') -> Tensor: def _gaussian(
"""Compute a Kaiser window. M: int, std: float, sym: bool = True, dtype: str = 'float64'
The Kaiser window is a taper formed by using a Bessel function. ) -> Tensor:
"""
raise NotImplementedError()
def _gaussian(M: int, std: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a Gaussian window. """Compute a Gaussian window.
The Gaussian widows has a Gaussian shape defined by the standard deviation(std). The Gaussian widows has a Gaussian shape defined by the standard deviation(std).
""" """
if _len_guards(M): if _len_guards(M):
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
M, needs_trunc = _extend(M, sym) M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
sig2 = 2 * std * std sig2 = 2 * std * std
w = paddle.exp(-n**2 / sig2) w = paddle.exp(-(n**2) / sig2)
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def _exponential(M: int, @window_function_register.register()
center=None, def _exponential(
tau=1., M: int, center=None, tau=1.0, sym: bool = True, dtype: str = 'float64'
sym: bool=True, ) -> Tensor:
dtype: str='float64') -> Tensor: """Compute an exponential (or Poisson) window."""
"""Compute an exponential (or Poisson) window. """
if sym and center is not None: if sym and center is not None:
raise ValueError("If sym==True, center must be None.") raise ValueError("If sym==True, center must be None.")
if _len_guards(M): if _len_guards(M):
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
M, needs_trunc = _extend(M, sym) M, needs_trunc = _extend(M, sym)
if center is None: if center is None:
...@@ -240,11 +269,11 @@ def _exponential(M: int, ...@@ -240,11 +269,11 @@ def _exponential(M: int,
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: @window_function_register.register()
"""Compute a triangular window. def _triang(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
""" """Compute a triangular window."""
if _len_guards(M): if _len_guards(M):
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
M, needs_trunc = _extend(M, sym) M, needs_trunc = _extend(M, sym)
n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype) n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
...@@ -258,23 +287,26 @@ def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: ...@@ -258,23 +287,26 @@ def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: @window_function_register.register()
def _bohman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
"""Compute a Bohman window. """Compute a Bohman window.
The Bohman window is the autocorrelation of a cosine window. The Bohman window is the autocorrelation of a cosine window.
""" """
if _len_guards(M): if _len_guards(M):
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
M, needs_trunc = _extend(M, sym) M, needs_trunc = _extend(M, sym)
fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1]) fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin( w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
math.pi * fac) math.pi * fac
)
w = _cat([0, w, 0], dtype) w = _cat([0, w, 0], dtype)
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: @window_function_register.register()
def _blackman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
"""Compute a Blackman window. """Compute a Blackman window.
The Blackman window is a taper formed by using the first three terms of The Blackman window is a taper formed by using the first three terms of
a summation of cosines. It was designed to have close to the minimal a summation of cosines. It was designed to have close to the minimal
...@@ -284,31 +316,44 @@ def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: ...@@ -284,31 +316,44 @@ def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: @window_function_register.register()
"""Compute a window with a simple cosine shape. def _cosine(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
""" """Compute a window with a simple cosine shape."""
if _len_guards(M): if _len_guards(M):
return paddle.ones((M, ), dtype=dtype) return paddle.ones((M,), dtype=dtype)
M, needs_trunc = _extend(M, sym) M, needs_trunc = _extend(M, sym)
w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5)) w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + 0.5))
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def get_window(window: Union[str, Tuple[str, float]], def get_window(
window: Union[str, Tuple[str, float]],
win_length: int, win_length: int,
fftbins: bool=True, fftbins: bool = True,
dtype: str='float64') -> Tensor: dtype: str = 'float64',
) -> Tensor:
"""Return a window of a given length and type. """Return a window of a given length and type.
Args: Args:
window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
win_length (int): Number of samples. win_length (int): Number of samples.
fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
dtype (str, optional): The data type of the return window. Defaults to 'float64'. dtype (str, optional): The data type of the return window. Defaults to 'float64'.
Returns: Returns:
Tensor: The window represented as a tensor. Tensor: The window represented as a tensor.
Examples:
.. code-block:: python
import paddle
n_fft = 512
cosine_window = paddle.audio.functional.get_window('cosine', n_fft)
std = 7
gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft)
""" """
sym = not fftbins sym = not fftbins
...@@ -319,19 +364,22 @@ def get_window(window: Union[str, Tuple[str, float]], ...@@ -319,19 +364,22 @@ def get_window(window: Union[str, Tuple[str, float]],
args = window[1:] args = window[1:]
elif isinstance(window, str): elif isinstance(window, str):
if window in ['gaussian', 'exponential']: if window in ['gaussian', 'exponential']:
raise ValueError("The '" + window + "' window needs one or " raise ValueError(
"more parameters -- pass a tuple.") "The '" + window + "' window needs one or "
"more parameters -- pass a tuple."
)
else: else:
winstr = window winstr = window
else: else:
raise ValueError("%s as window type is not supported." % raise ValueError(
str(type(window))) "%s as window type is not supported." % str(type(window))
)
try: try:
winfunc = eval('_' + winstr) winfunc = window_function_register.get('_' + winstr)
except KeyError as e: except KeyError as e:
raise ValueError("Unknown window type.") from e raise ValueError("Unknown window type.") from e
params = (win_length, ) + args params = (win_length,) + args
kwargs = {'sym': sym} kwargs = {'sym': sym}
return winfunc(*params, dtype=dtype, **kwargs) return winfunc(*params, dtype=dtype, **kwargs)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -11,3 +11,5 @@ ...@@ -11,3 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .kaldi import fbank
from .kaldi import pitch
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddleaudio
from paddleaudio._internal import module_utils
__all__ = [
'fbank',
'pitch',
]
@module_utils.requires_kaldi()
def fbank(
wav,
samp_freq: int=16000,
frame_shift_ms: float=10.0,
frame_length_ms: float=25.0,
dither: float=0.0,
preemph_coeff: float=0.97,
remove_dc_offset: bool=True,
window_type: str='povey',
round_to_power_of_two: bool=True,
blackman_coeff: float=0.42,
snip_edges: bool=True,
allow_downsample: bool=False,
allow_upsample: bool=False,
max_feature_vectors: int=-1,
num_bins: int=23,
low_freq: float=20,
high_freq: float=0,
vtln_low: float=100,
vtln_high: float=-500,
debug_mel: bool=False,
htk_mode: bool=False,
use_energy: bool=False, # fbank opts
energy_floor: float=0.0,
raw_energy: bool=True,
htk_compat: bool=False,
use_log_fbank: bool=True,
use_power: bool=True):
frame_opts = paddleaudio._paddleaudio.FrameExtractionOptions()
mel_opts = paddleaudio._paddleaudio.MelBanksOptions()
fbank_opts = paddleaudio._paddleaudio.FbankOptions()
frame_opts.samp_freq = samp_freq
frame_opts.frame_shift_ms = frame_shift_ms
frame_opts.frame_length_ms = frame_length_ms
frame_opts.dither = dither
frame_opts.preemph_coeff = preemph_coeff
frame_opts.remove_dc_offset = remove_dc_offset
frame_opts.window_type = window_type
frame_opts.round_to_power_of_two = round_to_power_of_two
frame_opts.blackman_coeff = blackman_coeff
frame_opts.snip_edges = snip_edges
frame_opts.allow_downsample = allow_downsample
frame_opts.allow_upsample = allow_upsample
frame_opts.max_feature_vectors = max_feature_vectors
mel_opts.num_bins = num_bins
mel_opts.low_freq = low_freq
mel_opts.high_freq = high_freq
mel_opts.vtln_low = vtln_low
mel_opts.vtln_high = vtln_high
mel_opts.debug_mel = debug_mel
mel_opts.htk_mode = htk_mode
fbank_opts.use_energy = use_energy
fbank_opts.energy_floor = energy_floor
fbank_opts.raw_energy = raw_energy
fbank_opts.htk_compat = htk_compat
fbank_opts.use_log_fbank = use_log_fbank
fbank_opts.use_power = use_power
feat = paddleaudio._paddleaudio.ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
return feat
@module_utils.requires_kaldi()
def pitch(wav,
samp_freq: int=16000,
frame_shift_ms: float=10.0,
frame_length_ms: float=25.0,
preemph_coeff: float=0.0,
min_f0: int=50,
max_f0: int=400,
soft_min_f0: float=10.0,
penalty_factor: float=0.1,
lowpass_cutoff: int=1000,
resample_freq: int=4000,
delta_pitch: float=0.005,
nccf_ballast: int=7000,
lowpass_filter_width: int=1,
upsample_filter_width: int=5,
max_frames_latency: int=0,
frames_per_chunk: int=0,
simulate_first_pass_online: bool=False,
recompute_frame: int=500,
nccf_ballast_online: bool=False,
snip_edges: bool=True):
pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
pitch_opts.samp_freq = samp_freq
pitch_opts.frame_shift_ms = frame_shift_ms
pitch_opts.frame_length_ms = frame_length_ms
pitch_opts.preemph_coeff = preemph_coeff
pitch_opts.min_f0 = min_f0
pitch_opts.max_f0 = max_f0
pitch_opts.soft_min_f0 = soft_min_f0
pitch_opts.penalty_factor = penalty_factor
pitch_opts.lowpass_cutoff = lowpass_cutoff
pitch_opts.resample_freq = resample_freq
pitch_opts.delta_pitch = delta_pitch
pitch_opts.nccf_ballast = nccf_ballast
pitch_opts.lowpass_filter_width = lowpass_filter_width
pitch_opts.upsample_filter_width = upsample_filter_width
pitch_opts.max_frames_latency = max_frames_latency
pitch_opts.frames_per_chunk = frames_per_chunk
pitch_opts.simulate_first_pass_online = simulate_first_pass_online
pitch_opts.recompute_frame = recompute_frame
pitch_opts.nccf_ballast_online = nccf_ballast_online
pitch_opts.snip_edges = snip_edges
pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
return pitch
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. from paddleaudio._internal import module_utils as _mod_utils
#
# Licensed under the Apache License, Version 2.0 (the "License"); from .sox_effects import (
# you may not use this file except in compliance with the License. apply_effects_file,
# You may obtain a copy of the License at apply_effects_tensor,
# effect_names,
# http://www.apache.org/licenses/LICENSE-2.0 init_sox_effects,
# shutdown_sox_effects,
# Unless required by applicable law or agreed to in writing, software )
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and if _mod_utils.is_sox_available():
# limitations under the License. import atexit
init_sox_effects()
atexit.register(shutdown_sox_effects)
__all__ = [
"init_sox_effects",
"shutdown_sox_effects",
"effect_names",
"apply_effects_tensor",
"apply_effects_file",
]
import os
from typing import List, Optional, Tuple
import paddle
import numpy
from paddleaudio._internal import module_utils as _mod_utils
from paddleaudio.utils.sox_utils import list_effects
from paddleaudio import _paddleaudio as paddleaudio
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
@_mod_utils.requires_sox()
def init_sox_effects():
"""Initialize resources required to use sox effects.
Note:
You do not need to call this function manually. It is called automatically.
Once initialized, you do not need to call this function again across the multiple uses of
sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet.
Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing
again will result in error.
"""
paddleaudio.sox_effects_initialize_sox_effects()
@_mod_utils.requires_sox()
def shutdown_sox_effects():
"""Clean up resources required to use sox effects.
Note:
You do not need to call this function manually. It is called automatically.
It is safe to call this function multiple times.
Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and
initializing again will result in error.
"""
paddleaudio.sox_effects_shutdown_sox_effects()
@_mod_utils.requires_sox()
def effect_names() -> List[str]:
"""Gets list of valid sox effect names
Returns:
List[str]: list of available effect names.
Example
>>> paddleaudio.sox_effects.effect_names()
['allpass', 'band', 'bandpass', ... ]
"""
return list(list_effects().keys())
@_mod_utils.requires_sox()
def apply_effects_tensor(
tensor: paddle.Tensor,
sample_rate: int,
effects: List[List[str]],
channels_first: bool = True,
) -> Tuple[paddle.Tensor, int]:
"""Apply sox effects to given Tensor
.. devices:: CPU
Note:
This function only works on CPU Tensors.
This function works in the way very similar to ``sox`` command, however there are slight
differences. For example, ``sox`` command adds certain effects automatically (such as
``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function does
only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also
need to give ``rate`` effect with desired sampling rate.).
Args:
tensor (paddle.Tensor): Input 2D CPU Tensor.
sample_rate (int): Sample rate
effects (List[List[str]]): List of effects.
channels_first (bool, optional): Indicates if the input Tensor's dimension is
`[channels, time]` or `[time, channels]`
Returns:
(Tensor, int): Resulting Tensor and sample rate.
The resulting Tensor has the same ``dtype`` as the input Tensor, and
the same channels order. The shape of the Tensor can be different based on the
effects applied. Sample rate can also be different based on the effects applied.
Example - Basic usage
>>>
>>> # Defines the effects to apply
>>> effects = [
... ['gain', '-n'], # normalises to 0dB
... ['pitch', '5'], # 5 cent pitch shift
... ['rate', '8000'], # resample to 8000 Hz
... ]
>>>
>>> # Generate pseudo wave:
>>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
>>> sample_rate = 16000
>>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
>>> waveform.shape
paddle.Size([2, 16000])
>>> waveform
tensor([[ 0.3138, 0.7620, -0.9019, ..., -0.7495, -0.4935, 0.5442],
[-0.0832, 0.0061, 0.8233, ..., -0.5176, -0.9140, -0.2434]])
>>>
>>> # Apply effects
>>> waveform, sample_rate = apply_effects_tensor(
... wave_form, sample_rate, effects, channels_first=True)
>>>
>>> # Check the result
>>> # The new waveform is sampling rate 8000, 1 second.
>>> # normalization and channel order are preserved
>>> waveform.shape
paddle.Size([2, 8000])
>>> waveform
tensor([[ 0.5054, -0.5518, -0.4800, ..., -0.0076, 0.0096, -0.0110],
[ 0.1331, 0.0436, -0.3783, ..., -0.0035, 0.0012, 0.0008]])
>>> sample_rate
8000
"""
tensor_np = tensor.numpy()
ret = paddleaudio.sox_effects_apply_effects_tensor(tensor_np, sample_rate, effects, channels_first)
if ret is not None:
return (paddle.to_tensor(ret[0]), ret[1])
raise RuntimeError("Failed to apply sox effect")
@_mod_utils.requires_sox()
def apply_effects_file(
path: str,
effects: List[List[str]],
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
) -> Tuple[paddle.Tensor, int]:
"""Apply sox effects to the audio file and load the resulting data as Tensor
Note:
This function works in the way very similar to ``sox`` command, however there are slight
differences. For example, ``sox`` commnad adds certain effects automatically (such as
``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given
effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate``
effect with desired sampling rate, because internally, ``speed`` effects only alter sampling
rate and leave samples untouched.
Args:
path (path-like object or file-like object):
effects (List[List[str]]): List of effects.
normalize (bool, optional):
When ``True``, this function always return ``float32``, and sample values are
normalized to ``[-1.0, 1.0]``.
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
integer type. This argument has no effect for formats other
than integer WAV type.
channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
Otherwise, the returned Tensor's dimension is `[time, channel]`.
format (str or None, optional):
Override the format detection with the given format.
Providing the argument might help when libsox can not infer the format
from header or extension,
Returns:
(Tensor, int): Resulting Tensor and sample rate.
If ``normalize=True``, the resulting Tensor is always ``float32`` type.
If ``normalize=False`` and the input audio file is of integer WAV file, then the
resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
otherwise `[time, channel]`.
Example - Basic usage
>>>
>>> # Defines the effects to apply
>>> effects = [
... ['gain', '-n'], # normalises to 0dB
... ['pitch', '5'], # 5 cent pitch shift
... ['rate', '8000'], # resample to 8000 Hz
... ]
>>>
>>> # Apply effects and load data with channels_first=True
>>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
>>>
>>> # Check the result
>>> waveform.shape
paddle.Size([2, 8000])
>>> waveform
tensor([[ 5.1151e-03, 1.8073e-02, 2.2188e-02, ..., 1.0431e-07,
-1.4761e-07, 1.8114e-07],
[-2.6924e-03, 2.1860e-03, 1.0650e-02, ..., 6.4122e-07,
-5.6159e-07, 4.8103e-07]])
>>> sample_rate
8000
Example - Apply random speed perturbation to dataset
>>>
>>> # Load data from file, apply random speed perturbation
>>> class RandomPerturbationFile(paddle.utils.data.Dataset):
... \"\"\"Given flist, apply random speed perturbation
...
... Suppose all the input files are at least one second long.
... \"\"\"
... def __init__(self, flist: List[str], sample_rate: int):
... super().__init__()
... self.flist = flist
... self.sample_rate = sample_rate
...
... def __getitem__(self, index):
... speed = 0.5 + 1.5 * random.randn()
... effects = [
... ['gain', '-n', '-10'], # apply 10 db attenuation
... ['remix', '-'], # merge all the channels
... ['speed', f'{speed:.5f}'], # duration is now 0.5 ~ 2.0 seconds.
... ['rate', f'{self.sample_rate}'],
... ['pad', '0', '1.5'], # add 1.5 seconds silence at the end
... ['trim', '0', '2'], # get the first 2 seconds
... ]
... waveform, _ = paddleaudio.sox_effects.apply_effects_file(
... self.flist[index], effects)
... return waveform
...
... def __len__(self):
... return len(self.flist)
...
>>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
>>> loader = paddle.utils.data.DataLoader(dataset, batch_size=32)
>>> for batch in loader:
>>> pass
"""
if hasattr(path, "read"):
ret = paddleaudio.apply_effects_fileobj(path, effects, normalize, channels_first, format)
if ret is None:
raise RuntimeError("Failed to load audio from {}".format(path))
return (paddle.to_tensor(ret[0]), ret[1])
path = os.fspath(path)
ret = paddleaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)
if ret is not None:
return (paddle.to_tensor(ret[0]), ret[1])
raise RuntimeError("Failed to load audio from {}".format(path))
if (MSVC)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
if(APPLE)
set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
endif(APPLE)
################################################################################
# libpaddleaudio
################################################################################
set(
LIBPADDLEAUDIO_SOURCES
utils.cpp
)
set(
LIBPADDLEAUDIO_INCLUDE_DIRS
${PROJECT_SOURCE_DIR}
)
set(
LIBPADDLEAUDIO_LINK_LIBRARIES
)
set(
LIBPADDLEAUDIO_COMPILE_DEFINITIONS)
#------------------------------------------------------------------------------#
# START OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
if(BUILD_SOX)
list(
APPEND
LIBPADDLEAUDIO_LINK_LIBRARIES
libsox
)
list(
APPEND
LIBPADDLEAUDIO_SOURCES
#sox/io.cpp
#sox/utils.cpp
#sox/effects.cpp
#sox/effects_chain.cpp
#sox/types.cpp
)
list(
APPEND
LIBPADDLEAUDIO_COMPILE_DEFINITIONS
INCLUDE_SOX
)
endif()
if(BUILD_KALDI)
list(
APPEND
LIBPADDLEAUDIO_LINK_LIBRARIES
libkaldi
)
list(
APPEND
LIBPADDLEAUDIO_COMPILE_DEFINITIONS
INCLUDE_KALDI
COMPILE_WITHOUT_OPENFST
)
endif()
#------------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
function (define_library name source include_dirs link_libraries compile_defs)
add_library(${name} SHARED ${source})
target_include_directories(${name} PRIVATE ${include_dirs})
target_link_libraries(${name} ${link_libraries})
target_compile_definitions(${name} PRIVATE ${compile_defs})
set_target_properties(${name} PROPERTIES PREFIX "")
if (MSVC)
set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
endif(MSVC)
install(
TARGETS ${name}
LIBRARY DESTINATION lib
RUNTIME DESTINATION lib # For Windows
)
endfunction()
define_library(
libpaddleaudio
"${LIBPADDLEAUDIO_SOURCES}"
"${LIBPADDLEAUDIO_INCLUDE_DIRS}"
"${LIBPADDLEAUDIO_LINK_LIBRARIES}"
"${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
)
if (APPLE)
set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
else()
set(AUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
endif()
################################################################################
# _paddleaudio.so
################################################################################
if (BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
if (WIN32)
find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
set(ADDITIONAL_ITEMS Python3::Python)
endif()
function(define_extension name sources include_dirs libraries definitions)
add_library(${name} SHARED ${sources})
target_compile_definitions(${name} PRIVATE "${definitions}")
target_include_directories(
${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${pybind11_INCLUDE_DIR} ${include_dirs})
target_link_libraries(
${name}
${libraries}
${PYTHON_LIBRARY}
${ADDITIONAL_ITEMS}
)
set_target_properties(${name} PROPERTIES PREFIX "")
if (MSVC)
set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
endif(MSVC)
if (APPLE)
# https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
# https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
endif()
install(
TARGETS ${name}
LIBRARY DESTINATION .
RUNTIME DESTINATION . # For Windows
)
endfunction()
set(
EXTENSION_SOURCES
pybind/pybind.cpp
)
#----------------------------------------------------------------------------#
# START OF CUSTOMIZATION LOGICS
#----------------------------------------------------------------------------#
if(BUILD_SOX)
list(
APPEND
EXTENSION_SOURCES
pybind/sox/effects.cpp
pybind/sox/effects_chain.cpp
pybind/sox/io.cpp
pybind/sox/types.cpp
pybind/sox/utils.cpp
)
endif()
if(BUILD_KALDI)
list(
APPEND
EXTENSION_SOURCES
pybind/kaldi/kaldi_feature_wrapper.cc
pybind/kaldi/kaldi_feature.cc
)
endif()
#----------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#----------------------------------------------------------------------------#
define_extension(
_paddleaudio
"${EXTENSION_SOURCES}"
""
libpaddleaudio
"${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
)
# if(BUILD_CTC_DECODER)
# set(
# DECODER_EXTENSION_SOURCES
# decoder/bindings/pybind.cpp
# )
# define_extension(
# _paddleaudio_decoder
# "${DECODER_EXTENSION_SOURCES}"
# ""
# "libpaddleaudio_decoder"
# "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
# )
# endif()
# if(USE_FFMPEG)
# set(
# FFMPEG_EXTENSION_SOURCES
# ffmpeg/pybind/typedefs.cpp
# ffmpeg/pybind/pybind.cpp
# ffmpeg/pybind/stream_reader.cpp
# )
# define_extension(
# _paddleaudio_ffmpeg
# "${FFMPEG_EXTENSION_SOURCES}"
# "${FFMPEG_INCLUDE_DIRS}"
# "libpaddleaudio_ffmpeg"
# "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
# )
# endif()
endif()
Creative Commons Legal Code
CC0 1.0 Universal
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.
Statement of Purpose
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.
For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation
thereof, including any amended or successor version of such
directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national
implementations thereof.
2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.
3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied,
statutory or otherwise, including without limitation warranties of
title, merchantability, fitness for a particular purpose, non
infringement, or the absence of latent or other defects, accuracy, or
the present or absence of errors, whether or not discoverable, all to
the greatest extent permissible under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without
limitation any person's Copyright and Related Rights in the Work.
Further, Affirmer disclaims responsibility for obtaining any necessary
consents, permissions or other rights required for any use of the
Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to
this CC0 or use of the Work.
此差异已折叠。
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/numpy.h"
#include "feat/feature-window.h"
namespace paddleaudio {
namespace kaldi {
namespace py = pybind11;
template <class F>
class StreamingFeatureTpl {
public:
typedef typename F::Options Options;
StreamingFeatureTpl(const Options& opts);
bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
::kaldi::Vector<::kaldi::BaseFloat>* feats);
void Reset() { remained_wav_.Resize(0); }
int Dim() { return computer_.Dim(); }
private:
bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
::kaldi::Vector<::kaldi::BaseFloat>* feats);
Options opts_;
::kaldi::FeatureWindowFunction window_function_;
::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
F computer_;
};
} // namespace kaldi
} // namespace ppspeech
#include "feature_common_inl.h"
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
namespace paddleaudio {
namespace kaldi {
template <class F>
StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
: opts_(opts), computer_(opts), window_function_(opts.frame_opts) {
// window_function_(computer_.GetFrameOptions()) { the opt set to zero
}
template <class F>
bool StreamingFeatureTpl<F>::ComputeFeature(
const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
::kaldi::Vector<::kaldi::BaseFloat>* feats) {
// append remaned waves
::kaldi::int32 wav_len = wav.Dim();
if (wav_len == 0) return false;
::kaldi::int32 left_len = remained_wav_.Dim();
::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
waves.Range(0, left_len).CopyFromVec(remained_wav_);
waves.Range(left_len, wav_len).CopyFromVec(wav);
// cache remaned waves
::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
::kaldi::int32 frame_shift = frame_opts.WindowShift();
::kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames;
remained_wav_.Resize(left_samples);
remained_wav_.CopyFromVec(
waves.Range(frame_shift * num_frames, left_samples));
// compute speech feature
Compute(waves, feats);
return true;
}
// Compute feat
template <class F>
bool StreamingFeatureTpl<F>::Compute(
const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
::kaldi::Vector<::kaldi::BaseFloat>* feats) {
::kaldi::BaseFloat vtln_warp = 1.0;
const ::kaldi::FrameExtractionOptions& frame_opts =
computer_.GetFrameOptions();
::kaldi::int32 num_samples = waves.Dim();
::kaldi::int32 frame_length = frame_opts.WindowSize();
::kaldi::int32 sample_rate = frame_opts.samp_freq;
if (num_samples < frame_length) {
return false;
}
::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts);
feats->Resize(num_frames * Dim());
::kaldi::Vector<::kaldi::BaseFloat> window;
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
::kaldi::BaseFloat raw_log_energy = 0.0;
::kaldi::ExtractWindow(0,
waves,
frame,
frame_opts,
window_function_,
&window,
need_raw_log_energy ? &raw_log_energy : NULL);
::kaldi::Vector<::kaldi::BaseFloat> this_feature(computer_.Dim(),
::kaldi::kUndefined);
computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
::kaldi::SubVector<::kaldi::BaseFloat> output_row(
feats->Data() + frame * Dim(), Dim());
output_row.CopyFromVec(this_feature);
}
return true;
}
} // namespace kaldi
} // namespace paddleaudio
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
#include "feat/pitch-functions.h"
namespace paddleaudio {
namespace kaldi {
bool InitFbank(
::kaldi::FrameExtractionOptions frame_opts,
::kaldi::MelBanksOptions mel_opts,
FbankOptions fbank_opts) {
::kaldi::FbankOptions opts;
opts.frame_opts = frame_opts;
opts.mel_opts = mel_opts;
opts.use_energy = fbank_opts.use_energy;
opts.energy_floor = fbank_opts.energy_floor;
opts.raw_energy = fbank_opts.raw_energy;
opts.htk_compat = fbank_opts.htk_compat;
opts.use_log_fbank = fbank_opts.use_log_fbank;
opts.use_power = fbank_opts.use_power;
paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->InitFbank(opts);
return true;
}
py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
return paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ComputeFbank(
wav);
}
py::array_t<float> ComputeFbank(
::kaldi::FrameExtractionOptions frame_opts,
::kaldi::MelBanksOptions mel_opts,
FbankOptions fbank_opts,
const py::array_t<float>& wav) {
InitFbank(frame_opts, mel_opts, fbank_opts);
py::array_t<float> result = ComputeFbankStreaming(wav);
paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ResetFbank();
return result;
}
void ResetFbank() {
paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ResetFbank();
}
py::array_t<float> ComputeKaldiPitch(
const ::kaldi::PitchExtractionOptions& opts,
const py::array_t<float>& wav) {
py::buffer_info info = wav.request();
::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
::kaldi::Matrix<::kaldi::BaseFloat> features;
::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
sizeof(float)*features.NumCols());
}
return result;
}
} // namespace kaldi
} // namespace paddleaudio
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <string>
#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
#include "feat/pitch-functions.h"
namespace py = pybind11;
namespace paddleaudio {
namespace kaldi {
struct FbankOptions{
bool use_energy; // append an extra dimension with energy to the filter banks
float energy_floor;
bool raw_energy; // If true, compute energy before preemphasis and windowing
bool htk_compat; // If true, put energy last (if using energy)
bool use_log_fbank; // if true (default), produce log-filterbank, else linear
bool use_power;
FbankOptions(): use_energy(false),
energy_floor(0.0),
raw_energy(true),
htk_compat(false),
use_log_fbank(true),
use_power(true) {}
};
bool InitFbank(
::kaldi::FrameExtractionOptions frame_opts,
::kaldi::MelBanksOptions mel_opts,
FbankOptions fbank_opts);
py::array_t<float> ComputeFbank(
::kaldi::FrameExtractionOptions frame_opts,
::kaldi::MelBanksOptions mel_opts,
FbankOptions fbank_opts,
const py::array_t<float>& wav);
py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);
void ResetFbank();
py::array_t<float> ComputeKaldiPitch(
const ::kaldi::PitchExtractionOptions& opts,
const py::array_t<float>& wav);
} // namespace kaldi
} // namespace paddleaudio
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
namespace paddleaudio {
namespace kaldi {
KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
static KaldiFeatureWrapper instance;
return &instance;
}
bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
fbank_.reset(new Fbank(opts));
return true;
}
py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
const py::array_t<float> wav) {
py::buffer_info info = wav.request();
::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
::kaldi::Vector<::kaldi::BaseFloat> feats;
bool flag = fbank_->ComputeFeature(input_wav, &feats);
if (flag == false || feats.Dim() == 0) return py::array_t<float>();
auto result = py::array_t<float>(feats.Dim());
py::buffer_info xs = result.request();
std::cout << std::endl;
float* res_ptr = (float*)xs.ptr;
for (int idx = 0; idx < feats.Dim(); ++idx) {
*res_ptr = feats(idx);
res_ptr++;
}
return result.reshape({feats.Dim() / Dim(), Dim()});
}
} // namesapce kaldi
} // namespace paddleaudio
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "base/kaldi-common.h"
#include "feat/feature-fbank.h"
#include "paddleaudio/src/pybind/kaldi/feature_common.h"
namespace paddleaudio {
namespace kaldi {
typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
class KaldiFeatureWrapper {
public:
static KaldiFeatureWrapper* GetInstance();
bool InitFbank(::kaldi::FbankOptions opts);
py::array_t<float> ComputeFbank(const py::array_t<float> wav);
int Dim() { return fbank_->Dim(); }
void ResetFbank() { fbank_->Reset(); }
private:
std::unique_ptr<paddleaudio::kaldi::Fbank> fbank_;
};
} // namespace kaldi
} // namespace paddleaudio
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
#ifdef INCLUDE_SOX
#include "paddleaudio/src/pybind/sox/io.h"
#include "paddleaudio/src/pybind/sox/effects.h"
#endif
#include <pybind11/stl.h>
#include <pybind11/pybind11.h>
// `tl::optional`
#ifdef INCLUDE_SOX
namespace pybind11 { namespace detail {
template <typename T>
struct type_caster<tl::optional<T>> : optional_caster<tl::optional<T>> {};
}}
#endif
PYBIND11_MODULE(_paddleaudio, m) {
#ifdef INCLUDE_SOX
m.def("get_info_file",
&paddleaudio::sox_io::get_info_file,
"Get metadata of audio file.");
// support obj later
m.def("get_info_fileobj",
&paddleaudio::sox_io::get_info_fileobj,
"Get metadata of audio in file object.");
m.def("load_audio_fileobj",
&paddleaudio::sox_io::load_audio_fileobj,
"Load audio from file object.");
m.def("save_audio_fileobj",
&paddleaudio::sox_io::save_audio_fileobj,
"Save audio to file obj.");
// sox io
m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file);
m.def(
"sox_io_load_audio_file",
&paddleaudio::sox_io::load_audio_file);
m.def(
"sox_io_save_audio_file",
&paddleaudio::sox_io::save_audio_file);
// sox utils
m.def("sox_utils_set_seed", &paddleaudio::sox_utils::set_seed);
m.def(
"sox_utils_set_verbosity",
&paddleaudio::sox_utils::set_verbosity);
m.def(
"sox_utils_set_use_threads",
&paddleaudio::sox_utils::set_use_threads);
m.def(
"sox_utils_set_buffer_size",
&paddleaudio::sox_utils::set_buffer_size);
m.def(
"sox_utils_list_effects",
&paddleaudio::sox_utils::list_effects);
m.def(
"sox_utils_list_read_formats",
&paddleaudio::sox_utils::list_read_formats);
m.def(
"sox_utils_list_write_formats",
&paddleaudio::sox_utils::list_write_formats);
m.def(
"sox_utils_get_buffer_size",
&paddleaudio::sox_utils::get_buffer_size);
// effect
m.def("apply_effects_fileobj",
&paddleaudio::sox_effects::apply_effects_fileobj,
"Decode audio data from file-like obj and apply effects.");
m.def("sox_effects_initialize_sox_effects",
&paddleaudio::sox_effects::initialize_sox_effects);
m.def(
"sox_effects_shutdown_sox_effects",
&paddleaudio::sox_effects::shutdown_sox_effects);
m.def(
"sox_effects_apply_effects_tensor",
&paddleaudio::sox_effects::apply_effects_tensor);
m.def(
"sox_effects_apply_effects_file",
&paddleaudio::sox_effects::apply_effects_file);
#endif
#ifdef INCLUDE_KALDI
m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
.def(py::init<>())
.def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
.def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
.def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
.def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
.def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
.def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
.def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
.def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
.def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
.def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
.def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
.def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
.def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
.def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
.def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
.def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
.def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
.def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
.def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
.def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
py::class_<kaldi::FrameExtractionOptions>(m, "FrameExtractionOptions")
.def(py::init<>())
.def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
.def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)
.def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
.def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)
.def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)
.def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)
.def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
.def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)
.def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)
.def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
.def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
.def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
.def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
py::class_<kaldi::MelBanksOptions>(m, "MelBanksOptions")
.def(py::init<>())
.def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
.def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
.def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
.def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
.def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
.def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
.def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);
py::class_<paddleaudio::kaldi::FbankOptions>(m, "FbankOptions")
.def(py::init<>())
.def_readwrite("use_energy", &paddleaudio::kaldi::FbankOptions::use_energy)
.def_readwrite("energy_floor", &paddleaudio::kaldi::FbankOptions::energy_floor)
.def_readwrite("raw_energy", &paddleaudio::kaldi::FbankOptions::raw_energy)
.def_readwrite("htk_compat", &paddleaudio::kaldi::FbankOptions::htk_compat)
.def_readwrite("use_log_fbank", &paddleaudio::kaldi::FbankOptions::use_log_fbank)
.def_readwrite("use_power", &paddleaudio::kaldi::FbankOptions::use_power);
#endif
}
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp with modification.
#include <mutex>
#include <sox.h>
#include "paddleaudio/src/pybind/sox/effects.h"
#include "paddleaudio/src/pybind/sox/effects_chain.h"
#include "paddleaudio/src/pybind/sox/utils.h"
using namespace paddleaudio::sox_utils;
namespace paddleaudio::sox_effects {
// Streaming decoding over file-like object is tricky because libsox operates on
// FILE pointer. The folloing is what `sox` and `play` commands do
// - file input -> FILE pointer
// - URL input -> call wget in suprocess and pipe the data -> FILE pointer
// - stdin -> FILE pointer
//
// We want to, instead, fetch byte strings chunk by chunk, consume them, and
// discard.
//
// Here is the approach
// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
// chunk of byte string
// This will perform header-based format detection, if necessary, then fill
// the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
// which returns FILE* which points the buffer of the provided byte string.
// 2. Each time sox reads a chunk from the FILE*, we update the underlying
// buffer in a way that it
// starts with unseen data, and append the new data read from the given
// fileobj. This will trick libsox as if it keeps reading from the FILE*
// continuously.
// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
auto apply_effects_fileobj(
py::object fileobj,
const std::vector<std::vector<std::string>>& effects,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
tl::optional<std::string> format)
-> tl::optional<std::tuple<py::array, int64_t>> {
// Prepare the buffer used throughout the lifecycle of SoxEffectChain.
//
// For certain format (such as FLAC), libsox keeps reading the content at
// the initialization unless it reaches EOF even when the header is properly
// parsed. (Making buffer size 8192, which is way bigger than the header,
// resulted in libsox consuming all the buffer content at the time it opens
// the file.) Therefore buffer has to always contain valid data, except after
// EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
// first check if there is enough data to fill the buffer. `read_fileobj`
// repeatedly calls `read` method until it receives the requested length of
// bytes or it reaches EOF. If we get bytes shorter than requested, that means
// the whole audio data are fetched.
//
// * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
const auto capacity = [&]() {
// NOTE:
// Use the abstraction provided by `libpaddleaudio` to access the global
// config defined by libsox. Directly using `sox_get_globals` function will
// end up retrieving the static variable defined in `_paddleaudio`, which is
// not correct.
const auto bufsiz = get_buffer_size();
const int64_t kDefaultCapacityInBytes = 256;
return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
: kDefaultCapacityInBytes;
}();
std::string buffer(capacity, '\0');
auto* in_buf = const_cast<char*>(buffer.data());
auto num_read = read_fileobj(&fileobj, capacity, in_buf);
// If the file is shorter than 256, then libsox cannot read the header.
auto in_buffer_size = (num_read > 256) ? num_read : 256;
// Open file (this starts reading the header)
// When opening a file there are two functions that can touches FILE*.
// * `auto_detect_format`
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
// * `startread` handler of detected format.
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
// To see the handler of a particular format, go to
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
// For example, voribs can be found
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
SoxFormat sf(sox_open_mem_read(
in_buf,
in_buffer_size,
/*signal=*/nullptr,
/*encoding=*/nullptr,
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
// In case of streamed data, length can be 0
if (static_cast<sox_format_t*>(sf) == nullptr ||
sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
return {};
}
// Prepare output buffer
std::vector<sox_sample_t> out_buffer;
out_buffer.reserve(sf->signal.length);
// Create and run SoxEffectsChain
const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
/*input_encoding=*/sf->encoding,
/*output_encoding=*/get_tensor_encodinginfo(dtype));
chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
for (const auto& effect : effects) {
chain.addEffect(effect);
}
chain.addOutputBuffer(&out_buffer);
chain.run();
// Create tensor from buffer
bool channels_first_ = channels_first.value_or(true);
auto tensor = convert_to_tensor(
/*buffer=*/out_buffer.data(),
/*num_samples=*/out_buffer.size(),
/*num_channels=*/chain.getOutputNumChannels(),
dtype,
normalize.value_or(true),
channels_first_);
return std::forward_as_tuple(
tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
}
namespace {
enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
std::mutex SOX_RESOUCE_STATE_MUTEX;
} // namespace
void initialize_sox_effects() {
const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
switch (SOX_RESOURCE_STATE) {
case NotInitialized:
if (sox_init() != SOX_SUCCESS) {
throw std::runtime_error("Failed to initialize sox effects.");
};
SOX_RESOURCE_STATE = Initialized;
break;
case Initialized:
break;
case ShutDown:
throw std::runtime_error(
"SoX Effects has been shut down. Cannot initialize again.");
}
};
void shutdown_sox_effects() {
const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
switch (SOX_RESOURCE_STATE) {
case NotInitialized:
throw std::runtime_error(
"SoX Effects is not initialized. Cannot shutdown.");
case Initialized:
if (sox_quit() != SOX_SUCCESS) {
throw std::runtime_error("Failed to initialize sox effects.");
};
SOX_RESOURCE_STATE = ShutDown;
break;
case ShutDown:
break;
}
}
auto apply_effects_tensor(
py::array waveform,
int64_t sample_rate,
const std::vector<std::vector<std::string>>& effects,
bool channels_first) -> std::tuple<py::array, int64_t> {
validate_input_tensor(waveform);
// Create SoxEffectsChain
const auto dtype = waveform.dtype();
paddleaudio::sox_effects_chain::SoxEffectsChain chain(
/*input_encoding=*/get_tensor_encodinginfo(dtype),
/*output_encoding=*/get_tensor_encodinginfo(dtype));
// Prepare output buffer
std::vector<sox_sample_t> out_buffer;
out_buffer.reserve(waveform.size());
// Build and run effects chain
chain.addInputTensor(&waveform, sample_rate, channels_first);
for (const auto& effect : effects) {
chain.addEffect(effect);
}
chain.addOutputBuffer(&out_buffer);
chain.run();
// Create tensor from buffer
auto out_tensor = convert_to_tensor(
/*buffer=*/out_buffer.data(),
/*num_samples=*/out_buffer.size(),
/*num_channels=*/chain.getOutputNumChannels(),
dtype,
/*normalize=*/false,
channels_first);
return std::tuple<py::array, int64_t>(
out_tensor, chain.getOutputSampleRate());
}
auto apply_effects_file(
const std::string& path,
const std::vector<std::vector<std::string>>& effects,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
const tl::optional<std::string>& format)
-> tl::optional<std::tuple<py::array, int64_t>> {
// Open input file
SoxFormat sf(sox_open_read(
path.c_str(),
/*signal=*/nullptr,
/*encoding=*/nullptr,
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
if (static_cast<sox_format_t*>(sf) == nullptr ||
sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
return {};
}
const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
// Prepare output
std::vector<sox_sample_t> out_buffer;
out_buffer.reserve(sf->signal.length);
// Create and run SoxEffectsChain
paddleaudio::sox_effects_chain::SoxEffectsChain chain(
/*input_encoding=*/sf->encoding,
/*output_encoding=*/get_tensor_encodinginfo(dtype));
chain.addInputFile(sf);
for (const auto& effect : effects) {
chain.addEffect(effect);
}
chain.addOutputBuffer(&out_buffer);
chain.run();
// Create tensor from buffer
bool channels_first_ = channels_first.value_or(true);
auto tensor = convert_to_tensor(
/*buffer=*/out_buffer.data(),
/*num_samples=*/out_buffer.size(),
/*num_channels=*/chain.getOutputNumChannels(),
dtype,
normalize.value_or(true),
channels_first_);
return std::tuple<py::array, int64_t>(
tensor, chain.getOutputSampleRate());
}
} // namespace paddleaudio::sox_effects
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h with modification.
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "paddleaudio/src/optional/optional.hpp"
namespace py = pybind11;
namespace paddleaudio::sox_effects {
auto apply_effects_fileobj(
py::object fileobj,
const std::vector<std::vector<std::string>>& effects,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
tl::optional<std::string> format)
-> tl::optional<std::tuple<py::array, int64_t>>;
void initialize_sox_effects();
void shutdown_sox_effects();
auto apply_effects_tensor(
py::array waveform,
int64_t sample_rate,
const std::vector<std::vector<std::string>>& effects,
bool channels_first) -> std::tuple<py::array, int64_t>;
auto apply_effects_file(
const std::string& path,
const std::vector<std::vector<std::string>>& effects,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
const tl::optional<std::string>& format)
-> tl::optional<std::tuple<py::array, int64_t>>;
} // namespace paddleaudio::sox_effects
此差异已折叠。
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h with modification.
#pragma once
#include <sox.h>
#include "paddleaudio/src/pybind/sox/utils.h"
namespace paddleaudio::sox_effects_chain {
// Helper struct to safely close sox_effect_t* pointer returned by
// sox_create_effect
struct SoxEffect {
explicit SoxEffect(sox_effect_t* se) noexcept;
SoxEffect(const SoxEffect& other) = delete;
SoxEffect(const SoxEffect&& other) = delete;
auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
~SoxEffect();
operator sox_effect_t*() const;
auto operator->() noexcept -> sox_effect_t*;
private:
sox_effect_t* se_;
};
// Helper struct to safely close sox_effects_chain_t with handy methods
class SoxEffectsChain {
const sox_encodinginfo_t in_enc_;
const sox_encodinginfo_t out_enc_;
protected:
sox_signalinfo_t in_sig_;
sox_signalinfo_t interm_sig_;
sox_signalinfo_t out_sig_;
sox_effects_chain_t* sec_;
public:
explicit SoxEffectsChain(
sox_encodinginfo_t input_encoding,
sox_encodinginfo_t output_encoding);
SoxEffectsChain(const SoxEffectsChain& other) = delete;
SoxEffectsChain(const SoxEffectsChain&& other) = delete;
SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
~SoxEffectsChain();
void run();
void addInputTensor(
py::array* waveform,
int64_t sample_rate,
bool channels_first);
void addInputFile(sox_format_t* sf);
void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
void addOutputFile(sox_format_t* sf);
void addEffect(const std::vector<std::string> effect);
int64_t getOutputNumChannels();
int64_t getOutputSampleRate();
};
class SoxEffectsChainPyBind : public SoxEffectsChain {
using SoxEffectsChain::SoxEffectsChain;
public:
void addInputFileObj(
sox_format_t* sf,
char* buffer,
uint64_t buffer_size,
py::object* fileobj);
void addOutputFileObj(
sox_format_t* sf,
char** buffer,
size_t* buffer_size,
py::object* fileobj);
};
} // namespace paddleaudio::sox_effects_chain
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp with modification.
#include "paddleaudio/src/pybind/sox/io.h"
#include "paddleaudio/src/pybind/sox/effects.h"
#include "paddleaudio/src/pybind/sox/types.h"
#include "paddleaudio/src/pybind/sox/effects_chain.h"
#include "paddleaudio/src/pybind/sox/utils.h"
#include "paddleaudio/src/optional/optional.hpp"
using namespace paddleaudio::sox_utils;
namespace paddleaudio {
namespace sox_io {
auto get_info_file(const std::string &path,
const tl::optional<std::string> &format)
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
SoxFormat sf(
sox_open_read(path.data(),
/*signal=*/nullptr,
/*encoding=*/nullptr,
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
validate_input_file(sf, path);
return std::make_tuple(
static_cast<int64_t>(sf->signal.rate),
static_cast<int64_t>(sf->signal.length / sf->signal.channels),
static_cast<int64_t>(sf->signal.channels),
static_cast<int64_t>(sf->encoding.bits_per_sample),
get_encoding(sf->encoding.encoding));
}
std::vector<std::vector<std::string>> get_effects(
const tl::optional<int64_t>& frame_offset,
const tl::optional<int64_t>& num_frames) {
const auto offset = frame_offset.value_or(0);
if (offset < 0) {
throw std::runtime_error(
"Invalid argument: frame_offset must be non-negative.");
}
const auto frames = num_frames.value_or(-1);
if (frames == 0 || frames < -1) {
throw std::runtime_error(
"Invalid argument: num_frames must be -1 or greater than 0.");
}
std::vector<std::vector<std::string>> effects;
if (frames != -1) {
std::ostringstream os_offset, os_frames;
os_offset << offset << "s";
os_frames << "+" << frames << "s";
effects.emplace_back(
std::vector<std::string>{"trim", os_offset.str(), os_frames.str()});
} else if (offset != 0) {
std::ostringstream os_offset;
os_offset << offset << "s";
effects.emplace_back(std::vector<std::string>{"trim", os_offset.str()});
}
return effects;
}
auto get_info_fileobj(py::object fileobj,
const tl::optional<std::string> &format)
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
const auto capacity = [&]() {
const auto bufsiz = get_buffer_size();
const int64_t kDefaultCapacityInBytes = 4096;
return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
: kDefaultCapacityInBytes;
}();
std::string buffer(capacity, '\0');
auto *buf = const_cast<char *>(buffer.data());
auto num_read = read_fileobj(&fileobj, capacity, buf);
// If the file is shorter than 256, then libsox cannot read the header.
auto buf_size = (num_read > 256) ? num_read : 256;
SoxFormat sf(sox_open_mem_read(
buf,
buf_size,
/*signal=*/nullptr,
/*encoding=*/nullptr,
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
// In case of streamed data, length can be 0
validate_input_memfile(sf);
return std::make_tuple(
static_cast<int64_t>(sf->signal.rate),
static_cast<int64_t>(sf->signal.length / sf->signal.channels),
static_cast<int64_t>(sf->signal.channels),
static_cast<int64_t>(sf->encoding.bits_per_sample),
get_encoding(sf->encoding.encoding));
}
tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
py::object fileobj,
const tl::optional<int64_t>& frame_offset,
const tl::optional<int64_t>& num_frames,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
const tl::optional<std::string>& format) {
auto effects = get_effects(frame_offset, num_frames);
return paddleaudio::sox_effects::apply_effects_fileobj(
std::move(fileobj), effects, normalize, channels_first, std::move(format));
}
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
const std::string& path,
const tl::optional<int64_t>& frame_offset,
const tl::optional<int64_t>& num_frames,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
const tl::optional<std::string>& format) {
auto effects = get_effects(frame_offset, num_frames);
return paddleaudio::sox_effects::apply_effects_file(
path, effects, normalize, channels_first, format);
}
void save_audio_file(const std::string& path,
py::array tensor,
int64_t sample_rate,
bool channels_first,
tl::optional<double> compression,
tl::optional<std::string> format,
tl::optional<std::string> encoding,
tl::optional<int64_t> bits_per_sample) {
validate_input_tensor(tensor);
const auto filetype = [&]() {
if (format.has_value()) return format.value();
return get_filetype(path);
}();
if (filetype == "amr-nb") {
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
//TORCH_CHECK(num_channels == 1,
// "amr-nb format only supports single channel audio.");
assert(num_channels == 1);
} else if (filetype == "htk") {
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
// TORCH_CHECK(num_channels == 1,
// "htk format only supports single channel audio.");
assert(num_channels == 1);
} else if (filetype == "gsm") {
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
assert(num_channels == 1);
assert(sample_rate == 8000);
//TORCH_CHECK(num_channels == 1,
// "gsm format only supports single channel audio.");
//TORCH_CHECK(sample_rate == 8000,
// "gsm format only supports a sampling rate of 8kHz.");
}
const auto signal_info =
get_signalinfo(&tensor, sample_rate, filetype, channels_first);
const auto encoding_info = get_encodinginfo_for_save(
filetype, tensor.dtype(), compression, encoding, bits_per_sample);
SoxFormat sf(sox_open_write(path.c_str(),
&signal_info,
&encoding_info,
/*filetype=*/filetype.c_str(),
/*oob=*/nullptr,
/*overwrite_permitted=*/nullptr));
if (static_cast<sox_format_t*>(sf) == nullptr) {
throw std::runtime_error(
"Error saving audio file: failed to open file " + path);
}
paddleaudio::sox_effects_chain::SoxEffectsChain chain(
/*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
/*output_encoding=*/sf->encoding);
chain.addInputTensor(&tensor, sample_rate, channels_first);
chain.addOutputFile(sf);
chain.run();
}
namespace {
// helper class to automatically release buffer, to be used by
// save_audio_fileobj
struct AutoReleaseBuffer {
char* ptr;
size_t size;
AutoReleaseBuffer() : ptr(nullptr), size(0) {}
AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete;
auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete;
~AutoReleaseBuffer() {
if (ptr) {
free(ptr);
}
}
};
} // namespace
void save_audio_fileobj(
py::object fileobj,
py::array tensor,
int64_t sample_rate,
bool channels_first,
tl::optional<double> compression,
tl::optional<std::string> format,
tl::optional<std::string> encoding,
tl::optional<int64_t> bits_per_sample) {
if (!format.has_value()) {
throw std::runtime_error(
"`format` is required when saving to file object.");
}
const auto filetype = format.value();
if (filetype == "amr-nb") {
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
if (num_channels != 1) {
throw std::runtime_error(
"amr-nb format only supports single channel audio.");
}
} else if (filetype == "htk") {
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
if (num_channels != 1) {
throw std::runtime_error(
"htk format only supports single channel audio.");
}
} else if (filetype == "gsm") {
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
if (num_channels != 1) {
throw std::runtime_error(
"gsm format only supports single channel audio.");
}
if (sample_rate != 8000) {
throw std::runtime_error(
"gsm format only supports a sampling rate of 8kHz.");
}
}
const auto signal_info =
get_signalinfo(&tensor, sample_rate, filetype, channels_first);
const auto encoding_info = get_encodinginfo_for_save(
filetype,
tensor.dtype(),
compression,
std::move(encoding),
bits_per_sample);
AutoReleaseBuffer buffer;
SoxFormat sf(sox_open_memstream_write(
&buffer.ptr,
&buffer.size,
&signal_info,
&encoding_info,
filetype.c_str(),
/*oob=*/nullptr));
if (static_cast<sox_format_t*>(sf) == nullptr) {
throw std::runtime_error(
"Error saving audio file: failed to open memory stream.");
}
paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
/*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
/*output_encoding=*/sf->encoding);
chain.addInputTensor(&tensor, sample_rate, channels_first);
chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
chain.run();
// Closing the sox_format_t is necessary for flushing the last chunk to the
// buffer
sf.close();
fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
}
} // namespace paddleaudio
} // namespace sox_io
// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.h with modification.
#pragma once
#include "paddleaudio/src/pybind/sox/utils.h"
namespace py = pybind11;
namespace paddleaudio {
namespace sox_io {
auto get_info_file(const std::string &path,
const tl::optional<std::string> &format)
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
auto get_info_fileobj(py::object fileobj,
const tl::optional<std::string> &format)
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
py::object fileobj,
const tl::optional<int64_t>& frame_offset,
const tl::optional<int64_t>& num_frames,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
const tl::optional<std::string>& format);
void save_audio_fileobj(
py::object fileobj,
py::array tensor,
int64_t sample_rate,
bool channels_first,
tl::optional<double> compression,
tl::optional<std::string> format,
tl::optional<std::string> encoding,
tl::optional<int64_t> bits_per_sample);
auto get_effects(const tl::optional<int64_t>& frame_offset,
const tl::optional<int64_t>& num_frames)
-> std::vector<std::vector<std::string>>;
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
const std::string& path,
const tl::optional<int64_t>& frame_offset,
const tl::optional<int64_t>& num_frames,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
const tl::optional<std::string>& format);
void save_audio_file(const std::string& path,
py::array tensor,
int64_t sample_rate,
bool channels_first,
tl::optional<double> compression,
tl::optional<std::string> format,
tl::optional<std::string> encoding,
tl::optional<int64_t> bits_per_sample);
} // namespace paddleaudio
} // namespace sox_io
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
################################################################################
# sox
################################################################################
if (BUILD_SOX)
add_subdirectory(sox)
endif()
################################################################################
# kaldi
################################################################################
if (BUILD_KALDI)
add_subdirectory(kaldi)
endif()
\ No newline at end of file
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
../../common_utils
\ No newline at end of file
此差异已折叠。
../../common_utils
\ No newline at end of file
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册