Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
98300b86
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
98300b86
编写于
7月 27, 2022
作者:
Y
YangZhou
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add sox load_audio&&effets
上级
76b7616f
变更
24
隐藏空白更改
内联
并排
Showing
24 changed file
with
2039 addition
and
73 deletion
+2039
-73
cmake/external/pybind.cmake
cmake/external/pybind.cmake
+2
-2
paddlespeech/audio/_internal/module_utils.py
paddlespeech/audio/_internal/module_utils.py
+1
-1
paddlespeech/audio/backends/sox_io_backend.py
paddlespeech/audio/backends/sox_io_backend.py
+3
-1
paddlespeech/audio/src/CMakeLists.txt
paddlespeech/audio/src/CMakeLists.txt
+8
-3
paddlespeech/audio/src/pybind/pybind.cpp
paddlespeech/audio/src/pybind/pybind.cpp
+10
-0
paddlespeech/audio/src/pybind/sox/effects.cpp
paddlespeech/audio/src/pybind/sox/effects.cpp
+121
-0
paddlespeech/audio/src/pybind/sox/effects.h
paddlespeech/audio/src/pybind/sox/effects.h
+18
-0
paddlespeech/audio/src/pybind/sox/effects_chain.cpp
paddlespeech/audio/src/pybind/sox/effects_chain.cpp
+236
-0
paddlespeech/audio/src/pybind/sox/effects_chain.h
paddlespeech/audio/src/pybind/sox/effects_chain.h
+25
-0
paddlespeech/audio/src/pybind/sox/io.cpp
paddlespeech/audio/src/pybind/sox/io.cpp
+146
-0
paddlespeech/audio/src/pybind/sox/io.h
paddlespeech/audio/src/pybind/sox/io.h
+22
-4
paddlespeech/audio/src/pybind/sox/utils.cpp
paddlespeech/audio/src/pybind/sox/utils.cpp
+29
-1
paddlespeech/audio/src/pybind/sox/utils.h
paddlespeech/audio/src/pybind/sox/utils.h
+4
-25
paddlespeech/audio/src/sox/effects.cpp
paddlespeech/audio/src/sox/effects.cpp
+147
-0
paddlespeech/audio/src/sox/effects.h
paddlespeech/audio/src/sox/effects.h
+29
-0
paddlespeech/audio/src/sox/effects_chain.cpp
paddlespeech/audio/src/sox/effects_chain.cpp
+342
-0
paddlespeech/audio/src/sox/effects_chain.h
paddlespeech/audio/src/sox/effects_chain.h
+62
-0
paddlespeech/audio/src/sox/io.cpp
paddlespeech/audio/src/sox/io.cpp
+19
-27
paddlespeech/audio/src/sox/io.h
paddlespeech/audio/src/sox/io.h
+5
-8
paddlespeech/audio/src/sox/types.cpp
paddlespeech/audio/src/sox/types.cpp
+143
-0
paddlespeech/audio/src/sox/types.h
paddlespeech/audio/src/sox/types.h
+58
-0
paddlespeech/audio/src/sox/utils.cpp
paddlespeech/audio/src/sox/utils.cpp
+488
-0
paddlespeech/audio/src/sox/utils.h
paddlespeech/audio/src/sox/utils.h
+120
-0
setup.py
setup.py
+1
-1
未找到文件。
cmake/external/pybind.cmake
浏览文件 @
98300b86
...
@@ -3,8 +3,8 @@ include(ExternalProject)
...
@@ -3,8 +3,8 @@ include(ExternalProject)
FetchContent_Declare
(
FetchContent_Declare
(
pybind
pybind
URL https://github.com/pybind/pybind11/archive/refs/tags/v2.
9.0.zip
URL https://github.com/pybind/pybind11/archive/refs/tags/v2.
10.0.zip
URL_HASH SHA256=
1c6e0141f7092867c5bf388bc3acdb2689ed49f59c3977651394c6c87ae88232
URL_HASH SHA256=
225df6e6dea7cea7c5754d4ed954e9ca7c43947b849b3795f87cb56437f1bd19
)
)
FetchContent_MakeAvailable
(
pybind
)
FetchContent_MakeAvailable
(
pybind
)
include_directories
(
${
pybind_SOURCE_DIR
}
/include
)
include_directories
(
${
pybind_SOURCE_DIR
}
/include
)
...
...
paddlespeech/audio/_internal/module_utils.py
浏览文件 @
98300b86
...
@@ -145,4 +145,4 @@ def requires_sox():
...
@@ -145,4 +145,4 @@ def requires_sox():
return
wrapped
return
wrapped
return
return
decorator
paddlespeech/audio/backends/sox_io_backend.py
浏览文件 @
98300b86
...
@@ -29,7 +29,7 @@ def _fail_load(
...
@@ -29,7 +29,7 @@ def _fail_load(
normalize
:
bool
=
True
,
normalize
:
bool
=
True
,
channels_first
:
bool
=
True
,
channels_first
:
bool
=
True
,
format
:
Optional
[
str
]
=
None
,
format
:
Optional
[
str
]
=
None
,
)
->
Tuple
[
paddle
.
Tensor
,
int
]:
)
->
Tuple
[
Tensor
,
int
]:
raise
RuntimeError
(
"Failed to load audio from {}"
.
format
(
filepath
))
raise
RuntimeError
(
"Failed to load audio from {}"
.
format
(
filepath
))
...
@@ -41,6 +41,7 @@ _fallback_info_fileobj = _fail_info_fileobj
...
@@ -41,6 +41,7 @@ _fallback_info_fileobj = _fail_info_fileobj
_fallback_load
=
_fail_load
_fallback_load
=
_fail_load
_fallback_load_filebj
=
_fail_load_fileobj
_fallback_load_filebj
=
_fail_load_fileobj
@
_mod_utils
.
requires_sox
()
def
load
(
def
load
(
filepath
:
Union
[
str
,
Path
],
filepath
:
Union
[
str
,
Path
],
out
:
Optional
[
Tensor
]
=
None
,
out
:
Optional
[
Tensor
]
=
None
,
...
@@ -51,6 +52,7 @@ def load(
...
@@ -51,6 +52,7 @@ def load(
filetype
:
Optional
[
str
]
=
None
,
)
->
Tuple
[
Tensor
,
int
]:
filetype
:
Optional
[
str
]
=
None
,
)
->
Tuple
[
Tensor
,
int
]:
raise
RuntimeError
(
"No audio I/O backend is available."
)
raise
RuntimeError
(
"No audio I/O backend is available."
)
@
_mod_utils
.
requires_sox
()
def
save
(
filepath
:
str
,
def
save
(
filepath
:
str
,
src
:
Tensor
,
src
:
Tensor
,
sample_rate
:
int
,
sample_rate
:
int
,
...
...
paddlespeech/audio/src/CMakeLists.txt
浏览文件 @
98300b86
...
@@ -35,6 +35,11 @@ if(BUILD_SOX)
...
@@ -35,6 +35,11 @@ if(BUILD_SOX)
list
(
list
(
APPEND
APPEND
LIBPADDLEAUDIO_SOURCES
LIBPADDLEAUDIO_SOURCES
sox/io.cpp
sox/utils.cpp
sox/effects.cpp
sox/effects_chain.cpp
sox/types.cpp
)
)
list
(
list
(
APPEND
APPEND
...
@@ -139,8 +144,8 @@ if(BUILD_SOX)
...
@@ -139,8 +144,8 @@ if(BUILD_SOX)
list
(
list
(
APPEND
APPEND
EXTENSION_SOURCES
EXTENSION_SOURCES
#
pybind/sox/effects.cpp
pybind/sox/effects.cpp
#
pybind/sox/effects_chain.cpp
pybind/sox/effects_chain.cpp
pybind/sox/io.cpp
pybind/sox/io.cpp
pybind/sox/utils.cpp
pybind/sox/utils.cpp
)
)
...
@@ -192,4 +197,4 @@ define_extension(
...
@@ -192,4 +197,4 @@ define_extension(
# "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
# "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
# )
# )
# endif()
# endif()
endif
()
endif
()
\ No newline at end of file
paddlespeech/audio/src/pybind/pybind.cpp
浏览文件 @
98300b86
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
#include "paddlespeech/audio/src/pybind/sox/io.h"
#include "paddlespeech/audio/src/pybind/sox/io.h"
#include "paddlespeech/audio/src/pybind/sox/effects.h"
#include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h"
#include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h"
PYBIND11_MODULE
(
_paddleaudio
,
m
)
{
PYBIND11_MODULE
(
_paddleaudio
,
m
)
{
...
@@ -13,6 +14,15 @@ PYBIND11_MODULE(_paddleaudio, m) {
...
@@ -13,6 +14,15 @@ PYBIND11_MODULE(_paddleaudio, m) {
m
.
def
(
"get_info_fileobj"
,
m
.
def
(
"get_info_fileobj"
,
&
paddleaudio
::
sox_io
::
get_info_fileobj
,
&
paddleaudio
::
sox_io
::
get_info_fileobj
,
"Get metadata of audio in file object."
);
"Get metadata of audio in file object."
);
m
.
def
(
"load_audio_fileobj"
,
&
paddleaudio
::
sox_io
::
load_audio_fileobj
,
"Load audio from file object."
);
m
.
def
(
"save_audio_fileobj"
,
&
paddleaudio
::
sox_io
::
save_audio_fileobj
,
"Save audio to file obj."
);
m
.
def
(
"apply_effects_fileobj"
,
&
paddleaudio
::
sox_effects
::
apply_effects_fileobj
,
"Decode audio data from file-like obj and apply effects."
);
#endif
#endif
#ifdef INCLUDE_KALDI
#ifdef INCLUDE_KALDI
...
...
paddlespeech/audio/src/pybind/sox/effects.cpp
0 → 100644
浏览文件 @
98300b86
#include "paddlespeech/audio/src/pybind/sox/effects.h"
#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
#include "paddlespeech/audio/src/pybind/sox/utils.h"
using
namespace
paddleaudio
::
sox_utils
;
namespace
paddleaudio
::
sox_effects
{
// Streaming decoding over file-like object is tricky because libsox operates on
// FILE pointer. The following is what the `sox` and `play` commands do:
// - file input -> FILE pointer
// - URL input -> call wget in a subprocess and pipe the data -> FILE pointer
// - stdin -> FILE pointer
//
// We want to, instead, fetch byte strings chunk by chunk, consume them, and
// discard.
//
// Here is the approach
// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
// chunk of byte string
// This will perform header-based format detection, if necessary, then fill
// the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
// which returns FILE* which points the buffer of the provided byte string.
// 2. Each time sox reads a chunk from the FILE*, we update the underlying
// buffer in a way that it
// starts with unseen data, and append the new data read from the given
// fileobj. This will trick libsox as if it keeps reading from the FILE*
// continuously.
// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
// Decode audio from a Python file-like object, run the decoded signal through
// the given sox effects, and return (array, sample_rate).
//
// Returns an empty optional when libsox cannot open the stream or cannot
// determine its encoding. `normalize` and `channels_first` default to true
// when not provided; `format` is forwarded to libsox as the file type hint.
auto apply_effects_fileobj(
    py::object fileobj,
    const std::vector<std::vector<std::string>>& effects,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    tl::optional<std::string> format)
    -> tl::optional<std::tuple<py::array, int64_t>> {
  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
  //
  // For certain formats (such as FLAC), libsox keeps reading the content at
  // initialization until it reaches EOF, even when the header has been parsed
  // properly. (Making the buffer size 8192, which is way bigger than the
  // header, resulted in libsox consuming all of the buffer content at the time
  // it opens the file.) Therefore the buffer has to always contain valid data,
  // except after EOF. We default to `sox_get_globals()->bufsiz`* for the
  // buffer size and first check whether there is enough data to fill it.
  // `read_fileobj` repeatedly calls the `read` method until it receives the
  // requested number of bytes or reaches EOF. If we get fewer bytes than
  // requested, the whole audio data has been fetched.
  //
  // * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
  //
  // NOTE:
  // Use the abstraction provided by `libpaddleaudio` to access the global
  // config defined by libsox. Directly using the `sox_get_globals` function
  // would retrieve the static variable defined in `_paddleaudio`, which is
  // not correct.
  const auto bufsiz = get_buffer_size();
  const int64_t kDefaultCapacityInBytes = 256;
  const auto capacity =
      (bufsiz > kDefaultCapacityInBytes) ? bufsiz : kDefaultCapacityInBytes;

  std::string buffer(capacity, '\0');
  auto* in_buf = const_cast<char*>(buffer.data());
  auto num_read = read_fileobj(&fileobj, capacity, in_buf);

  // If the file is shorter than 256 bytes, libsox cannot read the header, so
  // never report less than 256 bytes to it.
  auto in_buffer_size = (num_read > 256) ? num_read : 256;

  // Open file (this starts reading the header).
  // When opening a file there are two functions that can touch FILE*.
  // * `auto_detect_format`
  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
  // * `startread` handler of the detected format.
  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
  // To see the handler of a particular format, go to
  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
  // For example, vorbis can be found at
  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
  const char* filetype =
      format.has_value() ? format.value().c_str() : nullptr;
  SoxFormat sf(sox_open_mem_read(in_buf,
                                 in_buffer_size,
                                 /*signal=*/nullptr,
                                 /*encoding=*/nullptr,
                                 /*filetype=*/filetype));

  // In case of streamed data, length can be 0, so only the handle and the
  // detected encoding are validated here.
  const bool opened = static_cast<sox_format_t*>(sf) != nullptr;
  if (!opened || sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
    return {};
  }

  // Prepare output buffer
  std::vector<sox_sample_t> out_buffer;
  out_buffer.reserve(sf->signal.length);

  // Create and run SoxEffectsChain
  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
      /*input_encoding=*/sf->encoding,
      /*output_encoding=*/get_tensor_encodinginfo(dtype));
  chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
  for (const auto& effect : effects) {
    chain.addEffect(effect);
  }
  chain.addOutputBuffer(&out_buffer);
  chain.run();

  // Create tensor from buffer
  bool channels_first_ = channels_first.value_or(true);
  auto tensor = convert_to_tensor(
      /*buffer=*/out_buffer.data(),
      /*num_samples=*/out_buffer.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      normalize.value_or(true),
      channels_first_);

  return std::forward_as_tuple(
      tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
}
}
// namespace paddleaudio::sox_effects
paddlespeech/audio/src/pybind/sox/effects.h
0 → 100644
浏览文件 @
98300b86
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "paddlespeech/audio/src/optional/optional.hpp"
namespace
py
=
pybind11
;
namespace
paddleaudio
::
sox_effects
{
// Decode audio from a Python file-like object and apply the given sox effects.
// Each entry of `effects` is one effect given as a list of strings. The result
// is an optional (array, sample_rate) pair.
auto apply_effects_fileobj(
    py::object fileobj,
    const std::vector<std::vector<std::string>>& effects,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    tl::optional<std::string> format)
    -> tl::optional<std::tuple<py::array, int64_t>>;
}
// namespace paddleaudio::sox_effects
paddlespeech/audio/src/pybind/sox/effects_chain.cpp
0 → 100644
浏览文件 @
98300b86
#include <sox.h>
#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
#include "paddlespeech/audio/src/pybind/sox/utils.h"
using
namespace
paddleaudio
::
sox_utils
;
namespace
paddleaudio
::
sox_effects_chain
{
namespace
{
/// Helper classes for passing a file-like object to SoxEffectChain.

/// Private state of the "input_fileobj_object" effect.
struct FileObjInputPriv {
  sox_format_t* sf;      // sox handle the samples are decoded from
  py::object* fileobj;   // Python object whose `read` supplies more bytes
  bool eof_reached;      // set once `fileobj` returned fewer bytes than asked
  char* buffer;          // start of the byte buffer that `sf` reads from
  uint64_t buffer_size;  // size of `buffer` in bytes
};
/// Private state of the "output_fileobj_object" effect.
struct FileObjOutputPriv {
  sox_format_t* sf;     // sox handle the samples are encoded with
  py::object* fileobj;  // Python object whose `write` receives encoded bytes
  char** buffer;        // address of the pointer to the encoded byte buffer
  size_t* buffer_size;  // address of the size of that buffer
};
/// Callback function to feed byte string.
/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
///
/// Drain handler of the file-object input effect. It first refills the byte
/// buffer behind `sf->fp` with fresh data read from the Python file object,
/// then decodes up to `*osamp` samples into `obuf` and stores the number of
/// decoded samples back into `*osamp`.
///
/// Returns SOX_EOF once the file object is exhausted and no sample could be
/// decoded, SOX_SUCCESS otherwise. Throws std::runtime_error when `ftell`
/// fails or reports a position past the end of the buffer.
auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
    -> int {
  auto priv = static_cast<FileObjInputPriv*>(effp->priv);
  auto sf = priv->sf;
  auto buffer = priv->buffer;

  // 1. Refresh the buffer
  //
  // NOTE:
  //   Since the underlying FILE* was opened with `fmemopen`, the only way
  //   libsox detects EOF is by reaching the end of the buffer. (A null byte
  //   won't help.) Therefore we need to align the content with the end of the
  //   buffer, otherwise libsox will keep reading content beyond the intended
  //   length.
  //
  // Before:
  //
  //     |<-------consumed------>|<---remaining--->|
  //     |***********************|-----------------|
  //                             ^ ftell
  //
  // After:
  //
  //     |<-offset->|<---remaining--->|<-new data->|
  //     |**********|-----------------|++++++++++++|
  //                ^ ftell
  //
  // NOTE:
  //   Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
  //   supposed to be in sync, but there are cases (Vorbis) where they are not
  //   and `tell_off` has a seemingly uninitialized value, which makes
  //   num_remain negative and causes a segmentation fault in `memmove`.
  const auto tell = ftell((FILE*)sf->fp);
  if (tell < 0) {
    throw std::runtime_error("Internal Error: ftell failed.");
  }
  const auto num_consumed = static_cast<size_t>(tell);
  if (num_consumed > priv->buffer_size) {
    throw std::runtime_error("Internal Error: buffer overrun.");
  }
  const auto num_remain = priv->buffer_size - num_consumed;

  // 1.1. Fetch the data to see if there is data to fill the buffer
  size_t num_refill = 0;
  std::string chunk(num_consumed, '\0');
  if (num_consumed && !priv->eof_reached) {
    num_refill = read_fileobj(
        priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
    // A short read means the file object has no more data.
    if (num_refill < num_consumed) {
      priv->eof_reached = true;
    }
  }
  const auto offset = num_consumed - num_refill;

  // 1.2. Move the unconsumed data towards the beginning of buffer.
  if (num_remain) {
    auto src = static_cast<void*>(buffer + num_consumed);
    auto dst = static_cast<void*>(buffer + offset);
    memmove(dst, src, num_remain);
  }

  // 1.3. Refill the remaining buffer.
  if (num_refill) {
    auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
    auto dst = buffer + offset + num_remain;
    memcpy(dst, src, num_refill);
  }

  // 1.4. Set the file pointer to the new offset
  sf->tell_off = offset;
  fseek((FILE*)sf->fp, offset, SEEK_SET);

  // 2. Perform decoding operation
  // The following part is practically the same as the "input" effect
  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48

  // At this point, osamp represents the buffer size in bytes,
  // but sox_read expects the maximum number of samples ready to read.
  // Normally, this is fine, but in case the samples are not 4-byte
  // aligned (e.g. a sample is 24 bits), the resulting signal is not correct.
  // https://github.com/pytorch/audio/issues/2083
  //
  // Only divide when a sample occupies at least one whole byte: for encodings
  // with fewer than 8 bits per sample the integer division below yields 0 and
  // dividing by it would be undefined behavior.
  const auto bytes_per_sample = sf->encoding.bits_per_sample / 8;
  if (bytes_per_sample > 0) {
    *osamp /= bytes_per_sample;
  }

  // Ensure that it's a multiple of the number of channels
  *osamp -= *osamp % effp->out_signal.channels;

  // Read up to *osamp samples into obuf;
  // store the actual number read back to *osamp
  *osamp = sox_read(sf, obuf, *osamp);

  // Decoding is finished when fileobject is exhausted and sox can no longer
  // decode a sample.
  return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
}
/// Flow handler of the file-object output effect.
///
/// Encodes the `*isamp` samples in `ibuf` through `sf`, hands the encoded
/// bytes to the Python file object's `write`, then rewinds the stream so the
/// next chunk starts at the beginning of the buffer again. `*osamp` is always
/// set to 0.
///
/// Returns SOX_EOF when fewer samples than requested were written (throwing
/// std::runtime_error instead when sox reports an error), SOX_SUCCESS
/// otherwise.
auto fileobj_output_flow(sox_effect_t* effp,
                         sox_sample_t const* ibuf,
                         sox_sample_t* obuf LSX_UNUSED,
                         size_t* isamp,
                         size_t* osamp) -> int {
  *osamp = 0;

  // Nothing to encode.
  if (!*isamp) {
    return SOX_SUCCESS;
  }

  auto state = static_cast<FileObjOutputPriv*>(effp->priv);
  auto sf = state->sf;
  auto fp = static_cast<FILE*>(sf->fp);

  // Encode chunk
  auto num_samples_written = sox_write(sf, ibuf, *isamp);
  fflush(fp);

  // Copy the encoded chunk to python object.
  state->fileobj->attr("write")(py::bytes(*state->buffer, ftell(fp)));

  // Reset FILE*
  sf->tell_off = 0;
  fseek(fp, 0, SEEK_SET);

  if (num_samples_written == *isamp) {
    return SOX_SUCCESS;
  }

  // Short write: surface the sox error if there is one.
  if (sf->sox_errno) {
    std::ostringstream stream;
    stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
           << sf->filename;
    throw std::runtime_error(stream.str());
  }
  return SOX_EOF;
}
/// Return the handler describing the file-object input effect. Only the
/// `drain` callback is provided; the handler object is a function-local
/// static, so every call returns the same pointer.
auto get_fileobj_input_handler() -> sox_effect_handler_t* {
  static sox_effect_handler_t handler = {
      "input_fileobj_object",     // name
      nullptr,                    // usage
      SOX_EFF_MCHAN,              // flags
      nullptr,                    // getopts
      nullptr,                    // start
      nullptr,                    // flow
      fileobj_input_drain,        // drain
      nullptr,                    // stop
      nullptr,                    // kill
      sizeof(FileObjInputPriv)};  // priv_size
  return &handler;
}
/// Return the handler describing the file-object output effect. Only the
/// `flow` callback is provided; the handler object is a function-local
/// static, so every call returns the same pointer.
auto get_fileobj_output_handler() -> sox_effect_handler_t* {
  static sox_effect_handler_t handler = {
      "output_fileobj_object",     // name
      nullptr,                     // usage
      SOX_EFF_MCHAN,               // flags
      nullptr,                     // getopts
      nullptr,                     // start
      fileobj_output_flow,         // flow
      nullptr,                     // drain
      nullptr,                     // stop
      nullptr,                     // kill
      sizeof(FileObjOutputPriv)};  // priv_size
  return &handler;
}
}
// namespace
// Register a Python file-like object as the input of this effects chain.
//
// `sf` is the already opened sox handle, `buffer`/`buffer_size` describe the
// byte buffer that handle reads from, and `fileobj` supplies further bytes.
// Throws std::runtime_error when sox refuses to add the effect.
void SoxEffectsChainPyBind::addInputFileObj(sox_format_t* sf,
                                            char* buffer,
                                            uint64_t buffer_size,
                                            py::object* fileobj) {
  // The chain starts out with the signal description of the input stream.
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  SoxEffect e(sox_create_effect(get_fileobj_input_handler()));

  // Fill in the private state consumed by the drain callback.
  auto& state = *static_cast<FileObjInputPriv*>(e->priv);
  state.sf = sf;
  state.fileobj = fileobj;
  state.eof_reached = false;
  state.buffer = buffer;
  state.buffer_size = buffer_size;

  const auto status = sox_add_effect(sec_, e, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input fileobj");
  }
}
// Register a Python file-like object as the output of this effects chain.
//
// `sf` is the sox handle used for encoding, `buffer`/`buffer_size` point at
// the variables describing the encoded byte buffer, and `fileobj` receives
// the encoded bytes. Throws std::runtime_error when sox refuses to add the
// effect.
void SoxEffectsChainPyBind::addOutputFileObj(sox_format_t* sf,
                                             char** buffer,
                                             size_t* buffer_size,
                                             py::object* fileobj) {
  out_sig_ = sf->signal;

  SoxEffect e(sox_create_effect(get_fileobj_output_handler()));

  // Fill in the private state consumed by the flow callback.
  auto& state = *static_cast<FileObjOutputPriv*>(e->priv);
  state.sf = sf;
  state.fileobj = fileobj;
  state.buffer = buffer;
  state.buffer_size = buffer_size;

  const auto status = sox_add_effect(sec_, e, &interm_sig_, &out_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: output fileobj");
  }
}
}
// namespace paddleaudio::sox_effects_chain
paddlespeech/audio/src/pybind/sox/effects_chain.h
0 → 100644
浏览文件 @
98300b86
#pragma once
#include "paddlespeech/audio/src/sox/effects_chain.h"
namespace
paddleaudio
::
sox_effects_chain
{
// SoxEffectsChain that can additionally use Python file-like objects as its
// input source and output sink.
class SoxEffectsChainPyBind : public SoxEffectsChain {
  // Reuse the constructors of the base class.
  using SoxEffectsChain::SoxEffectsChain;

 public:
  // Add an input effect that decodes from `sf`, refilling `buffer` (of
  // `buffer_size` bytes) with data read from `fileobj`.
  void addInputFileObj(sox_format_t* sf,
                       char* buffer,
                       uint64_t buffer_size,
                       py::object* fileobj);

  // Add an output effect that encodes through `sf` and writes the encoded
  // bytes found at `*buffer` to `fileobj`.
  void addOutputFileObj(sox_format_t* sf,
                        char** buffer,
                        size_t* buffer_size,
                        py::object* fileobj);
};
}
// namespace paddleaudio::sox_effects_chain
paddlespeech/audio/src/pybind/sox/io.cpp
浏览文件 @
98300b86
...
@@ -2,7 +2,14 @@
...
@@ -2,7 +2,14 @@
// All rights reserved.
// All rights reserved.
#include "paddlespeech/audio/src/pybind/sox/io.h"
#include "paddlespeech/audio/src/pybind/sox/io.h"
#include "paddlespeech/audio/src/pybind/sox/effects.h"
#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
#include "paddlespeech/audio/src/pybind/sox/utils.h"
#include "paddlespeech/audio/src/pybind/sox/utils.h"
#include "paddlespeech/audio/src/optional/optional.hpp"
#include "paddlespeech/audio/src/sox/io.h"
#include "paddlespeech/audio/src/sox/types.h"
#include "paddlespeech/audio/src/sox/utils.h"
using
namespace
paddleaudio
::
sox_utils
;
using
namespace
paddleaudio
::
sox_utils
;
...
@@ -28,6 +35,35 @@ auto get_info_file(const std::string &path, const std::string &format)
...
@@ -28,6 +35,35 @@ auto get_info_file(const std::string &path, const std::string &format)
get_encoding
(
sf
->
encoding
.
encoding
));
get_encoding
(
sf
->
encoding
.
encoding
));
}
}
// Translate an optional frame offset / frame count into the equivalent sox
// "trim" effect.
//
// Returns an empty list when neither a non-zero offset nor a frame count is
// requested; otherwise a single `{"trim", "<offset>s"[, "+<frames>s"]}` entry.
// Throws std::runtime_error when `frame_offset` is negative or `num_frames`
// is neither -1 nor positive.
std::vector<std::vector<std::string>> get_effects(
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames) {
  const auto offset = frame_offset.value_or(0);
  if (offset < 0) {
    throw std::runtime_error(
        "Invalid argument: frame_offset must be non-negative.");
  }

  // -1 means "until the end of the stream".
  const auto frames = num_frames.value_or(-1);
  if (frames == 0 || frames < -1) {
    throw std::runtime_error(
        "Invalid argument: num_frames must be -1 or greater than 0.");
  }

  std::vector<std::vector<std::string>> effects;
  const bool limited = (frames != -1);
  if (!limited && offset == 0) {
    // Whole stream requested: no trimming needed.
    return effects;
  }

  // The "s" suffix tells sox the value is a sample count.
  std::ostringstream os_offset;
  os_offset << offset << "s";
  std::vector<std::string> trim{"trim", os_offset.str()};

  if (limited) {
    std::ostringstream os_frames;
    os_frames << "+" << frames << "s";
    trim.push_back(os_frames.str());
  }

  effects.emplace_back(trim);
  return effects;
}
auto
get_info_fileobj
(
py
::
object
fileobj
,
const
std
::
string
&
format
)
auto
get_info_fileobj
(
py
::
object
fileobj
,
const
std
::
string
&
format
)
->
std
::
tuple
<
int64_t
,
int64_t
,
int64_t
,
int64_t
,
std
::
string
>
{
->
std
::
tuple
<
int64_t
,
int64_t
,
int64_t
,
int64_t
,
std
::
string
>
{
const
auto
capacity
=
[
&
]()
{
const
auto
capacity
=
[
&
]()
{
...
@@ -60,5 +96,115 @@ auto get_info_fileobj(py::object fileobj, const std::string &format)
...
@@ -60,5 +96,115 @@ auto get_info_fileobj(py::object fileobj, const std::string &format)
get_encoding
(
sf
->
encoding
.
encoding
));
get_encoding
(
sf
->
encoding
.
encoding
));
}
}
// Load audio from a Python file-like object.
//
// `frame_offset` / `num_frames` select the region to load and are translated
// into a sox "trim" effect by `get_effects`; the remaining arguments are
// forwarded to `apply_effects_fileobj`, whose result is returned unchanged.
//
// The parameters are taken by value so that this definition matches the
// declaration in io.h. Taking them by const reference would define a
// different overload and leave the declared function (the one bound to
// Python) without a definition. It also makes `std::move(format)` a real
// move instead of a copy.
tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
    py::object fileobj,
    tl::optional<int64_t> frame_offset,
    tl::optional<int64_t> num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    tl::optional<std::string> format) {
  auto effects = get_effects(frame_offset, num_frames);
  return paddleaudio::sox_effects::apply_effects_fileobj(
      std::move(fileobj),
      effects,
      normalize,
      channels_first,
      std::move(format));
}
namespace
{
// Owner of a malloc-style buffer that is handed out as raw `ptr`/`size`
// fields and released with `free` on destruction. Used by
// save_audio_fileobj. Neither copyable nor movable.
struct AutoReleaseBuffer {
  char* ptr;
  size_t size;

  AutoReleaseBuffer() : ptr(nullptr), size(0) {}
  ~AutoReleaseBuffer() {
    // free(nullptr) is a no-op, so no explicit null check is required.
    free(ptr);
  }

  AutoReleaseBuffer(const AutoReleaseBuffer&) = delete;
  AutoReleaseBuffer(AutoReleaseBuffer&&) = delete;
  AutoReleaseBuffer& operator=(const AutoReleaseBuffer&) = delete;
  AutoReleaseBuffer& operator=(AutoReleaseBuffer&&) = delete;
};
}
// namespace
// Encode `tensor` in the given `format` and write the encoded bytes to the
// Python file-like object `fileobj`.
//
// Throws std::runtime_error when `format` is missing, when the audio violates
// the channel / sample-rate restrictions of the amr-nb, htk or gsm formats,
// or when the in-memory output stream cannot be opened.
void save_audio_fileobj(py::object fileobj,
                        py::array tensor,
                        int64_t sample_rate,
                        bool channels_first,
                        tl::optional<double> compression,
                        tl::optional<std::string> format,
                        tl::optional<std::string> encoding,
                        tl::optional<int64_t> bits_per_sample) {
  if (!format.has_value()) {
    throw std::runtime_error(
        "`format` is required when saving to file object.");
  }
  const auto filetype = format.value();

  // Evaluated lazily so the shape is only inspected for the formats that
  // actually restrict the channel count.
  const auto is_mono = [&]() {
    return tensor.shape(channels_first ? 0 : 1) == 1;
  };

  if (filetype == "amr-nb") {
    if (!is_mono()) {
      throw std::runtime_error(
          "amr-nb format only supports single channel audio.");
    }
  } else if (filetype == "htk") {
    if (!is_mono()) {
      throw std::runtime_error(
          "htk format only supports single channel audio.");
    }
  } else if (filetype == "gsm") {
    if (!is_mono()) {
      throw std::runtime_error(
          "gsm format only supports single channel audio.");
    }
    if (sample_rate != 8000) {
      throw std::runtime_error(
          "gsm format only supports a sampling rate of 8kHz.");
    }
  }

  const auto signal_info =
      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
  const auto encoding_info = get_encodinginfo_for_save(filetype,
                                                       tensor.dtype(),
                                                       compression,
                                                       std::move(encoding),
                                                       bits_per_sample);

  // sox writes the encoded bytes into this buffer; it is released when
  // `buffer` goes out of scope.
  AutoReleaseBuffer buffer;
  SoxFormat sf(sox_open_memstream_write(&buffer.ptr,
                                        &buffer.size,
                                        &signal_info,
                                        &encoding_info,
                                        filetype.c_str(),
                                        /*oob=*/nullptr));
  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error(
        "Error saving audio file: failed to open memory stream.");
  }

  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
      /*output_encoding=*/sf->encoding);
  chain.addInputTensor(&tensor, sample_rate, channels_first);
  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
  chain.run();

  // Closing the sox_format_t is necessary for flushing the last chunk to the
  // buffer
  sf.close();
  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
}
}
// namespace paddleaudio
}
// namespace paddleaudio
}
// namespace sox_io
}
// namespace sox_io
paddlespeech/audio/src/pybind/sox/io.h
浏览文件 @
98300b86
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
// All rights reserved.
// All rights reserved.
#ifndef PADDLEAUDIO_PYBIND_SOX_IO_H
#pragma once
#define PADDLEAUDIO_PYBIND_SOX_IO_H
#include "paddlespeech/audio/src/pybind/sox/utils.h"
#include "paddlespeech/audio/src/pybind/sox/utils.h"
namespace
py
=
pybind11
;
namespace
paddleaudio
{
namespace
paddleaudio
{
namespace
sox_io
{
namespace
sox_io
{
...
@@ -15,7 +16,24 @@ auto get_info_file(const std::string &path, const std::string &format)
...
@@ -15,7 +16,24 @@ auto get_info_file(const std::string &path, const std::string &format)
auto
get_info_fileobj
(
py
::
object
fileobj
,
const
std
::
string
&
format
)
auto
get_info_fileobj
(
py
::
object
fileobj
,
const
std
::
string
&
format
)
->
std
::
tuple
<
int64_t
,
int64_t
,
int64_t
,
int64_t
,
std
::
string
>
;
->
std
::
tuple
<
int64_t
,
int64_t
,
int64_t
,
int64_t
,
std
::
string
>
;
auto
load_audio_fileobj
(
py
::
object
fileobj
,
tl
::
optional
<
int64_t
>
frame_offset
,
tl
::
optional
<
int64_t
>
num_frames
,
tl
::
optional
<
bool
>
normalize
,
tl
::
optional
<
bool
>
channels_first
,
tl
::
optional
<
std
::
string
>
format
)
->
tl
::
optional
<
std
::
tuple
<
py
::
array
,
int64_t
>>
;
void
save_audio_fileobj
(
py
::
object
fileobj
,
py
::
array
tensor
,
int64_t
sample_rate
,
bool
channels_first
,
tl
::
optional
<
double
>
compression
,
tl
::
optional
<
std
::
string
>
format
,
tl
::
optional
<
std
::
string
>
encoding
,
tl
::
optional
<
int64_t
>
bits_per_sample
);
}
// namespace paddleaudio
}
// namespace paddleaudio
}
// namespace sox_io
}
// namespace sox_io
#endif
paddlespeech/audio/src/pybind/sox/utils.cpp
浏览文件 @
98300b86
...
@@ -8,6 +8,34 @@
...
@@ -8,6 +8,34 @@
namespace
paddleaudio
{
namespace
paddleaudio
{
namespace
sox_utils
{
namespace
sox_utils
{
// Read up to `size` bytes from the Python file-like object into `buffer`.
//
// `read` is called repeatedly until `size` bytes have been collected or it
// returns an empty chunk. Returns the number of bytes stored in `buffer`.
// Throws std::runtime_error when `read` returns more bytes than requested.
auto read_fileobj(py::object* fileobj, const uint64_t size, char* buffer)
    -> uint64_t {
  uint64_t num_read = 0;
  while (num_read < size) {
    const auto request = size - num_read;
    const auto received =
        static_cast<py::bytes>(fileobj->attr("read")(request));
    const auto chunk = static_cast<std::string>(received);
    const auto chunk_len = chunk.length();

    // An empty chunk signals the end of the stream.
    if (chunk_len == 0) {
      break;
    }
    if (chunk_len > request) {
      std::ostringstream message;
      message << "Requested up to " << request << " bytes but, "
              << "received " << chunk_len << " bytes. "
              << "The given object does not confirm to read protocol of file "
                 "object.";
      throw std::runtime_error(message.str());
    }

    // Append the chunk right after the bytes collected so far.
    memcpy(buffer + num_read, chunk.data(), chunk_len);
    num_read += chunk_len;
  }
  return num_read;
}
/*
SoxFormat::SoxFormat(sox_format_t *fd) noexcept : fd_(fd) {}
SoxFormat::SoxFormat(sox_format_t *fd) noexcept : fd_(fd) {}
SoxFormat::~SoxFormat() { close(); }
SoxFormat::~SoxFormat() { close(); }
...
@@ -96,6 +124,6 @@ std::string get_encoding(sox_encoding_t encoding) {
...
@@ -96,6 +124,6 @@ std::string get_encoding(sox_encoding_t encoding) {
return "UNKNOWN";
return "UNKNOWN";
}
}
}
}
*/
}
// namespace paddleaudio
}
// namespace paddleaudio
}
// namespace sox_utils
}
// namespace sox_utils
paddlespeech/audio/src/pybind/sox/utils.h
浏览文件 @
98300b86
...
@@ -4,39 +4,18 @@
...
@@ -4,39 +4,18 @@
#pragma once
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <sox.h>
#include <sox.h>
#include "paddlespeech/audio/src/optional/optional.hpp"
#include "paddlespeech/audio/src/sox/utils.h"
#include "paddlespeech/audio/src/sox/types.h"
namespace
py
=
pybind11
;
namespace
py
=
pybind11
;
namespace
paddleaudio
{
namespace
paddleaudio
{
namespace
sox_utils
{
namespace
sox_utils
{
/// helper class to automatically close sox_format_t*
struct
SoxFormat
{
explicit
SoxFormat
(
sox_format_t
*
fd
)
noexcept
;
SoxFormat
(
const
SoxFormat
&
other
)
=
delete
;
SoxFormat
(
SoxFormat
&&
other
)
=
delete
;
SoxFormat
&
operator
=
(
const
SoxFormat
&
other
)
=
delete
;
SoxFormat
&
operator
=
(
SoxFormat
&&
other
)
=
delete
;
~
SoxFormat
();
sox_format_t
*
operator
->
()
const
noexcept
;
operator
sox_format_t
*
()
const
noexcept
;
void
close
();
private:
sox_format_t
*
fd_
;
};
auto
read_fileobj
(
py
::
object
*
fileobj
,
uint64_t
size
,
char
*
buffer
)
->
uint64_t
;
auto
read_fileobj
(
py
::
object
*
fileobj
,
uint64_t
size
,
char
*
buffer
)
->
uint64_t
;
int64_t
get_buffer_size
();
void
validate_input_file
(
const
SoxFormat
&
sf
,
const
std
::
string
&
path
);
void
validate_input_memfile
(
const
SoxFormat
&
sf
);
std
::
string
get_encoding
(
sox_encoding_t
encoding
);
}
// namespace paddleaudio
}
// namespace paddleaudio
}
// namespace sox_utils
}
// namespace sox_utils
paddlespeech/audio/src/sox/effects.cpp
0 → 100644
浏览文件 @
98300b86
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp
#include <sox.h>
#include <mutex>
#include "paddlespeech/audio/src/sox/effects.h"
#include "paddlespeech/audio/src/sox/effects_chain.h"
#include "paddlespeech/audio/src/sox/utils.h"
using
namespace
paddleaudio
::
sox_utils
;
namespace
paddleaudio
::
sox_effects
{
namespace {

/// Lifecycle states of the libsox effects resource tracked by this file.
enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };

/// Current lifecycle state; starts out as NotInitialized.
SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;

/// Mutex locked by initialize_sox_effects() and shutdown_sox_effects()
/// while they read and update SOX_RESOURCE_STATE.
std::mutex SOX_RESOUCE_STATE_MUTEX;

}  // namespace
/// Initialize libsox (sox_init) the first time this is called.
///
/// Calling it again while already initialized is a no-op. Calling it after
/// shutdown_sox_effects() throws std::runtime_error, as does a failing
/// sox_init().
void initialize_sox_effects() {
  const std::lock_guard<std::mutex> guard(SOX_RESOUCE_STATE_MUTEX);

  if (SOX_RESOURCE_STATE == Initialized) {
    return;
  }
  if (SOX_RESOURCE_STATE == ShutDown) {
    throw std::runtime_error(
        "SoX Effects has been shut down. Cannot initialize again.");
  }

  // Remaining state: NotInitialized.
  if (sox_init() != SOX_SUCCESS) {
    throw std::runtime_error("Failed to initialize sox effects.");
  }
  SOX_RESOURCE_STATE = Initialized;
}
/// Shut down libsox (sox_quit) if it is currently initialized.
///
/// Calling it again after a successful shutdown is a no-op.
///
/// Throws std::runtime_error when called before initialize_sox_effects(),
/// or when sox_quit() reports a failure.
void shutdown_sox_effects() {
  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);

  switch (SOX_RESOURCE_STATE) {
    case NotInitialized:
      throw std::runtime_error(
          "SoX Effects is not initialized. Cannot shutdown.");
    case Initialized:
      if (sox_quit() != SOX_SUCCESS) {
        // This path previously reported an *initialization* failure,
        // which pointed callers at the wrong operation.
        throw std::runtime_error("Failed to shut down sox effects.");
      }
      SOX_RESOURCE_STATE = ShutDown;
      break;
    case ShutDown:
      break;
  }
}
/// Run a chain of SoX effects over an in-memory waveform.
///
/// The waveform is validated with validate_input_tensor(), fed through a
/// SoxEffectsChain together with every entry of `effects`, and the collected
/// samples are turned back into an array with convert_to_tensor()
/// (normalize = false, same dtype as the input).
///
/// Returns the resulting array and the chain's output sample rate.
auto apply_effects_tensor(
    py::array waveform,
    int64_t sample_rate,
    const std::vector<std::vector<std::string>>& effects,
    bool channels_first) -> std::tuple<py::array, int64_t> {
  validate_input_tensor(waveform);

  // Input and output of the chain share the waveform's dtype.
  const auto dtype = waveform.dtype();
  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/get_tensor_encodinginfo(dtype),
      /*output_encoding=*/get_tensor_encodinginfo(dtype));

  // Sink for the samples produced by the chain.
  std::vector<sox_sample_t> out_buffer;
  out_buffer.reserve(waveform.size());

  // input -> user effects -> output buffer
  chain.addInputTensor(&waveform, sample_rate, channels_first);
  for (auto it = effects.begin(); it != effects.end(); ++it) {
    chain.addEffect(*it);
  }
  chain.addOutputBuffer(&out_buffer);
  chain.run();

  auto out_tensor = convert_to_tensor(
      /*buffer=*/out_buffer.data(),
      /*num_samples=*/out_buffer.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      /*normalize=*/false,
      channels_first);

  const int64_t out_rate = chain.getOutputSampleRate();
  return std::tuple<py::array, int64_t>(out_tensor, out_rate);
}
/// Open an audio file with libsox, run it through a chain of SoX effects and
/// return the result as an array together with the output sample rate.
///
/// `format` (when set) is passed to sox_open_read as the file type.
/// `normalize` and `channels_first` both default to true when not provided.
///
/// Returns an empty optional when the file cannot be opened or its encoding
/// is SOX_ENCODING_UNKNOWN.
auto apply_effects_file(
    const std::string& path,
    const std::vector<std::vector<std::string>>& effects,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format)
    -> tl::optional<std::tuple<py::array, int64_t>> {
  // Open input file
  const char* filetype = format.has_value() ? format.value().c_str() : nullptr;
  SoxFormat sf(sox_open_read(
      path.c_str(),
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      /*filetype=*/filetype));

  const bool opened = static_cast<sox_format_t*>(sf) != nullptr;
  if (!opened || sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
    return {};
  }

  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);

  // Sink for the samples produced by the chain.
  std::vector<sox_sample_t> out_buffer;
  out_buffer.reserve(sf->signal.length);

  // file input -> user effects -> output buffer
  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/sf->encoding,
      /*output_encoding=*/get_tensor_encodinginfo(dtype));
  chain.addInputFile(sf);
  for (auto it = effects.begin(); it != effects.end(); ++it) {
    chain.addEffect(*it);
  }
  chain.addOutputBuffer(&out_buffer);
  chain.run();

  // Build the result array from the collected samples.
  bool channels_first_ = channels_first.value_or(true);
  auto tensor = convert_to_tensor(
      /*buffer=*/out_buffer.data(),
      /*num_samples=*/out_buffer.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      normalize.value_or(true),
      channels_first_);

  const int64_t out_rate = chain.getOutputSampleRate();
  return std::tuple<py::array, int64_t>(tensor, out_rate);
}
}
// namespace paddleaudio::sox_effects
paddlespeech/audio/src/sox/effects.h
0 → 100644
浏览文件 @
98300b86
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h
#pragma once
#include <pybind11/pybind11.h>
#include "paddlespeech/audio/src/sox/utils.h"
namespace
py
=
pybind11
;
namespace paddleaudio::sox_effects {

/// Initialize the libsox effects resource (defined in effects.cpp).
void initialize_sox_effects();

/// Shut down the libsox effects resource (defined in effects.cpp).
void shutdown_sox_effects();

/// Apply the SoX effects listed in `effects` to an in-memory waveform.
/// Each inner vector holds an effect name followed by its arguments.
/// Returns the processed array and the output sample rate.
auto apply_effects_tensor(
    py::array waveform,
    int64_t sample_rate,
    const std::vector<std::vector<std::string>>& effects,
    bool channels_first) -> std::tuple<py::array, int64_t>;

/// Load the audio file at `path` and apply the SoX effects listed in
/// `effects`. Returns the processed array and the output sample rate, or an
/// empty optional when the file cannot be read.
auto apply_effects_file(
    const std::string& path,
    const std::vector<std::vector<std::string>>& effects,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format)
    -> tl::optional<std::tuple<py::array, int64_t>>;

}  // namespace paddleaudio::sox_effects
paddlespeech/audio/src/sox/effects_chain.cpp
0 → 100644
浏览文件 @
98300b86
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp
#include "paddlespeech/audio/src/sox/effects_chain.h"
#include "paddlespeech/audio/src/sox/utils.h"
using
namespace
paddleaudio
::
sox_utils
;
namespace
paddleaudio
{
namespace
sox_effects_chain
{
namespace
{
/// helper classes for passing the location of input tensor and output buffer
///
/// drain/flow callback functions require a plain C-style function signature and
/// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
/// The following structs will be assigned to sox_effect_t::priv pointer which
/// gives sox_effect_t an access to input Tensor and output buffer object.
/// Private state of the "input_tensor" effect (stored in sox_effect_t::priv).
struct TensorInputPriv {
  // Number of samples already handed to SoX; set to 0 by addInputTensor and
  // advanced by tensor_input_drain.
  size_t index;
  // Non-owning pointer to the caller's input array.
  py::array* waveform;
  // Sample rate passed to addInputTensor.
  int64_t sample_rate;
  // Layout flag passed to addInputTensor.
  bool channels_first;
};
/// Private state of the "output_tensor" effect (stored in sox_effect_t::priv).
struct TensorOutputPriv {
  // Non-owning pointer to the vector that tensor_output_flow appends to.
  std::vector<sox_sample_t>* buffer;
};
/// Private state of the "output_file" effect (stored in sox_effect_t::priv).
struct FileOutputPriv {
  // Non-owning handle of the file that file_output_flow writes to.
  sox_format_t* sf;
};
/// Callback function to feed Tensor data to SoxEffectChain.
///
/// SoX calls this repeatedly, asking for up to `*osamp` interleaved samples
/// in `obuf`. On return `*osamp` holds the number of samples actually
/// written (always a multiple of the channel count).
///
/// Supported input dtypes: float32, int32, int16, int8 and uint8. Each
/// element is converted to a 32-bit sox_sample_t:
///   - float32: scaled by 2^31 and clamped to [INT32_MIN, INT32_MAX]
///   - int32:   copied as is
///   - int16:   scaled by 2^16
///   - int8:    scaled by 2^24
///   - uint8:   re-centred around zero (value - 128), then scaled by 2^24
///
/// Elements are read through the array's own indexing, so both the
/// (channel, frame) layout used when `channels_first` is set and the
/// (frame, channel) layout are emitted to SoX in interleaved order.
///
/// Returns SOX_EOF once all samples have been consumed, SOX_SUCCESS otherwise.
/// Throws std::runtime_error("Unexpected dtype.") for any other dtype.
int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
  // Retrieve the input Tensor and current index
  auto priv = static_cast<TensorInputPriv*>(effp->priv);
  const py::array& tensor = *(priv->waveform);
  const size_t index = priv->index;
  const size_t num_channels = effp->out_signal.channels;
  const size_t num_samples = tensor.size();

  // Without channels there is nothing meaningful to emit (and the modulo
  // and division below would be undefined).
  if (num_channels == 0) {
    *osamp = 0;
    return SOX_EOF;
  }

  // Adjust the number of samples to read
  if (index + *osamp > num_samples) {
    *osamp = num_samples - index;
  }
  // Ensure that it's a multiple of the number of channels
  *osamp -= *osamp % num_channels;
  const size_t count = *osamp;

  // Address of the k-th requested sample. Sample (index + k) of the
  // interleaved stream belongs to frame (index + k) / C, channel (index + k) % C.
  const bool channels_first = priv->channels_first;
  auto at = [&](size_t k) -> const void* {
    const auto frame = static_cast<py::ssize_t>((index + k) / num_channels);
    const auto channel = static_cast<py::ssize_t>((index + k) % num_channels);
    return channels_first ? tensor.data(channel, frame)
                          : tensor.data(frame, channel);
  };

  // Convert to sox_sample_t (int32_t), writing straight into SoX's buffer.
  const py::dtype dtype = tensor.dtype();
  const char kind = dtype.kind();
  const auto width = dtype.itemsize();

  if (kind == 'f' && width == 4) {
    for (size_t k = 0; k < count; ++k) {
      // Need to convert to 64-bit precision so that
      // values around INT32_MIN/MAX are handled correctly.
      const double elem =
          static_cast<double>(*static_cast<const float*>(at(k))) * 2147483648.;
      if (elem > INT32_MAX) {
        obuf[k] = INT32_MAX;
      } else if (elem < INT32_MIN) {
        obuf[k] = INT32_MIN;
      } else {
        obuf[k] = static_cast<sox_sample_t>(elem);
      }
    }
  } else if (kind == 'i' && width == 4) {
    for (size_t k = 0; k < count; ++k) {
      obuf[k] = *static_cast<const int32_t*>(at(k));
    }
  } else if (kind == 'i' && width == 2) {
    for (size_t k = 0; k < count; ++k) {
      obuf[k] =
          static_cast<sox_sample_t>(*static_cast<const int16_t*>(at(k))) * 65536;
    }
  } else if (kind == 'i' && width == 1) {
    for (size_t k = 0; k < count; ++k) {
      obuf[k] = static_cast<sox_sample_t>(*static_cast<const int8_t*>(at(k))) *
                16777216;
    }
  } else if (kind == 'u' && width == 1) {
    for (size_t k = 0; k < count; ++k) {
      const sox_sample_t centred =
          static_cast<sox_sample_t>(*static_cast<const uint8_t*>(at(k))) - 128;
      obuf[k] = centred * 16777216;
    }
  } else {
    throw std::runtime_error("Unexpected dtype.");
  }

  priv->index += count;
  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
}
/// Callback function to fetch data from SoxEffectChain.
///
/// Appends the `*isamp` samples in `ibuf` to the std::vector stored in the
/// effect's TensorOutputPriv. Nothing is produced downstream, so `*osamp`
/// is set to 0. Always returns SOX_SUCCESS.
int tensor_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) {
  *osamp = 0;

  // Get output buffer
  auto* priv = static_cast<TensorOutputPriv*>(effp->priv);
  std::vector<sox_sample_t>& sink = *(priv->buffer);

  // Append at the end
  sox_sample_t const* const first = ibuf;
  sox_sample_t const* const last = ibuf + *isamp;
  sink.insert(sink.end(), first, last);
  return SOX_SUCCESS;
}
/// Flow callback of the "output_file" effect: writes the incoming samples to
/// the sox_format_t stored in the effect's FileOutputPriv.
///
/// `*osamp` is always set to 0. Returns SOX_SUCCESS when every sample was
/// written (or there was nothing to write). On a short write it throws
/// std::runtime_error if the file carries a SoX error code, and returns
/// SOX_EOF otherwise.
int file_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) {
  *osamp = 0;

  const size_t pending = *isamp;
  if (pending == 0) {
    return SOX_SUCCESS;
  }

  sox_format_t* sf = static_cast<FileOutputPriv*>(effp->priv)->sf;
  if (sox_write(sf, ibuf, pending) == pending) {
    return SOX_SUCCESS;
  }

  // Short write: surface SoX's own diagnostics when it recorded any.
  if (sf->sox_errno) {
    std::ostringstream stream;
    stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
           << sf->filename;
    throw std::runtime_error(stream.str());
  }
  return SOX_EOF;
}
/// Return the (function-static) handler describing the "input_tensor"
/// effect: multi-channel, with tensor_input_drain as its only callback and
/// TensorInputPriv as its private state.
sox_effect_handler_t* get_tensor_input_handler() {
  static sox_effect_handler_t handler = [] {
    sox_effect_handler_t h{};
    h.name = "input_tensor";
    h.usage = NULL;
    h.flags = SOX_EFF_MCHAN;
    h.getopts = NULL;
    h.start = NULL;
    h.flow = NULL;
    h.drain = tensor_input_drain;
    h.stop = NULL;
    h.kill = NULL;
    h.priv_size = sizeof(TensorInputPriv);
    return h;
  }();
  return &handler;
}
/// Return the (function-static) handler describing the "output_tensor"
/// effect: multi-channel, with tensor_output_flow as its only callback and
/// TensorOutputPriv as its private state.
sox_effect_handler_t* get_tensor_output_handler() {
  static sox_effect_handler_t handler = [] {
    sox_effect_handler_t h{};
    h.name = "output_tensor";
    h.usage = NULL;
    h.flags = SOX_EFF_MCHAN;
    h.getopts = NULL;
    h.start = NULL;
    h.flow = tensor_output_flow;
    h.drain = NULL;
    h.stop = NULL;
    h.kill = NULL;
    h.priv_size = sizeof(TensorOutputPriv);
    return h;
  }();
  return &handler;
}
/// Return the (function-static) handler describing the "output_file"
/// effect: multi-channel, with file_output_flow as its only callback and
/// FileOutputPriv as its private state.
sox_effect_handler_t* get_file_output_handler() {
  static sox_effect_handler_t handler = [] {
    sox_effect_handler_t h{};
    h.name = "output_file";
    h.usage = NULL;
    h.flags = SOX_EFF_MCHAN;
    h.getopts = NULL;
    h.start = NULL;
    h.flow = file_output_flow;
    h.drain = NULL;
    h.stop = NULL;
    h.kill = NULL;
    h.priv_size = sizeof(FileOutputPriv);
    return h;
  }();
  return &handler;
}
}
// namespace
/// Wrap the given effect pointer; it is released with free() in the destructor.
SoxEffect::SoxEffect(sox_effect_t* se) noexcept
    : se_(se) {
}
/// Release the wrapped effect with free(), if there is one.
SoxEffect::~SoxEffect() {
  if (se_ == nullptr) {
    return;
  }
  free(se_);
}
/// Implicit conversion to the wrapped raw pointer, so a SoxEffect can be
/// passed directly to libsox functions.
SoxEffect::operator sox_effect_t*() const {
  sox_effect_t* const raw = se_;
  return raw;
}
/// Member access on the wrapped effect.
auto SoxEffect::operator->() noexcept -> sox_effect_t* {
  sox_effect_t* const raw = se_;
  return raw;
}
/// Create an empty effects chain for the given input/output encodings.
///
/// The encodings are copied into members first because
/// sox_create_effects_chain receives pointers to them.
/// Throws std::runtime_error when libsox fails to create the chain.
SoxEffectsChain::SoxEffectsChain(
    sox_encodinginfo_t input_encoding,
    sox_encodinginfo_t output_encoding)
    : in_enc_(input_encoding),
      out_enc_(output_encoding),
      in_sig_(),
      interm_sig_(),
      out_sig_(),
      sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
  if (sec_ == nullptr) {
    throw std::runtime_error("Failed to create effect chain.");
  }
}
/// Delete the underlying libsox effects chain, if one was created.
SoxEffectsChain::~SoxEffectsChain() {
  if (sec_ == nullptr) {
    return;
  }
  sox_delete_effects_chain(sec_);
}
/// Run the whole chain via sox_flow_effects, without a progress callback or
/// client data. The return value of sox_flow_effects is not inspected.
void SoxEffectsChain::run() {
  sox_flow_effects(sec_, /*callback=*/NULL, /*client_data=*/NULL);
}
/// Add the "input_tensor" source effect that reads from `waveform`.
///
/// The signal info of the chain is derived from the waveform through
/// get_signalinfo(..., "wav", ...). Only the pointer is stored, so the array
/// must stay alive until the chain has run.
/// Throws std::runtime_error when libsox refuses the effect.
void SoxEffectsChain::addInputTensor(
    py::array* waveform,
    int64_t sample_rate,
    bool channels_first) {
  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
  interm_sig_ = in_sig_;

  SoxEffect source(sox_create_effect(get_tensor_input_handler()));

  // Hand the drain callback everything it needs through the priv pointer.
  auto* state = static_cast<TensorInputPriv*>(source->priv);
  state->waveform = waveform;
  state->sample_rate = sample_rate;
  state->channels_first = channels_first;
  state->index = 0;

  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input_tensor");
  }
}
void
SoxEffectsChain
::
addOutputBuffer
(
std
::
vector
<
sox_sample_t
>*
output_buffer
)
{
SoxEffect
e
(
sox_create_effect
(
get_tensor_output_handler
()));
static_cast
<
TensorOutputPriv
*>
(
e
->
priv
)
->
buffer
=
output_buffer
;
if
(
sox_add_effect
(
sec_
,
e
,
&
interm_sig_
,
&
in_sig_
)
!=
SOX_SUCCESS
)
{
throw
std
::
runtime_error
(
"Internal Error: Failed to add effect: output_tensor"
);
}
}
/// Add libsox's built-in "input" effect reading from the opened file `sf`.
///
/// The chain's signal info is taken from `sf->signal`.
/// Throws std::runtime_error (mentioning the file name) when libsox refuses
/// the effect.
void SoxEffectsChain::addInputFile(sox_format_t* sf) {
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  SoxEffect source(sox_create_effect(sox_find_effect("input")));

  // The "input" effect takes the sox_format_t* itself as its single option.
  char* opts[] = {(char*)sf};
  sox_effect_options(source, 1, opts);

  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    std::ostringstream stream;
    stream << "Internal Error: Failed to add effect: input " << sf->filename;
    throw std::runtime_error(stream.str());
  }
}
/// Add the "output_file" sink effect that writes every produced sample to
/// the opened file `sf`. The chain's output signal info is taken from
/// `sf->signal`.
/// Throws std::runtime_error (mentioning the file name) when libsox refuses
/// the effect.
void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
  out_sig_ = sf->signal;

  SoxEffect sink(sox_create_effect(get_file_output_handler()));
  auto* state = static_cast<FileOutputPriv*>(sink->priv);
  state->sf = sf;

  const int status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
  if (status != SOX_SUCCESS) {
    std::ostringstream stream;
    stream << "Internal Error: Failed to add effect: output " << sf->filename;
    throw std::runtime_error(stream.str());
  }
}
/// Append one SoX effect to the chain.
///
/// `effect[0]` is the effect name, the remaining entries are its options.
/// Throws std::runtime_error when the vector is empty, when the effect is in
/// UNSUPPORTED_EFFECTS or unknown to libsox, when libsox rejects the
/// options, or when libsox refuses to add the effect.
void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
  const auto num_args = effect.size();
  if (num_args == 0) {
    throw std::runtime_error("Invalid argument: empty effect.");
  }

  // Resolve the effect; blocked and unknown names report the same error.
  const auto name = effect[0];
  const bool blocked =
      UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end();
  auto returned_effect = blocked ? nullptr : sox_find_effect(name.c_str());
  if (!returned_effect) {
    std::ostringstream stream;
    stream << "Unsupported effect: " << name;
    throw std::runtime_error(stream.str());
  }
  SoxEffect e(sox_create_effect(returned_effect));

  // libsox wants the options as an argv-style array of mutable char*.
  const auto num_options = num_args - 1;
  std::vector<char*> opts;
  for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
    opts.push_back((char*)it->c_str());
  }
  char** argv = num_options ? opts.data() : nullptr;
  if (sox_effect_options(e, num_options, argv) != SOX_SUCCESS) {
    std::ostringstream stream;
    stream << "Invalid effect option:";
    for (const auto& v : effect) {
      stream << " " << v;
    }
    throw std::runtime_error(stream.str());
  }

  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
    std::ostringstream stream;
    stream << "Internal Error: Failed to add effect: \"" << name;
    for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
      stream << " " << *it;
    }
    stream << "\"";
    throw std::runtime_error(stream.str());
  }
}
/// Channel count recorded in the chain's intermediate signal info.
int64_t SoxEffectsChain::getOutputNumChannels() {
  const int64_t channels = interm_sig_.channels;
  return channels;
}
/// Sample rate recorded in the chain's intermediate signal info, converted
/// to int64_t.
int64_t SoxEffectsChain::getOutputSampleRate() {
  const int64_t rate = interm_sig_.rate;
  return rate;
}
}
// namespace sox_effects_chain
}
// namespace paddleaudio
paddlespeech/audio/src/sox/effects_chain.h
0 → 100644
浏览文件 @
98300b86
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h
#pragma once
#include <sox.h>
#include "paddlespeech/audio/src/sox/utils.h"
namespace paddleaudio {
namespace sox_effects_chain {

// Helper struct to safely close sox_effect_t* pointer returned by
// sox_create_effect. Not copyable and not movable.
struct SoxEffect {
  explicit SoxEffect(sox_effect_t* se) noexcept;
  SoxEffect(const SoxEffect& other) = delete;
  SoxEffect(const SoxEffect&& other) = delete;
  auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
  auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
  ~SoxEffect();
  // Implicit conversion so the wrapper can be passed to libsox functions.
  operator sox_effect_t*() const;
  // Member access on the wrapped effect.
  auto operator->() noexcept -> sox_effect_t*;

 private:
  sox_effect_t* se_;
};

// Helper struct to safely close sox_effects_chain_t with handy methods.
// Not copyable and not movable.
class SoxEffectsChain {
  // Encodings handed (by address) to sox_create_effects_chain.
  const sox_encodinginfo_t in_enc_;
  const sox_encodinginfo_t out_enc_;

 protected:
  // Signal descriptions passed to sox_add_effect as effects are added.
  sox_signalinfo_t in_sig_;
  sox_signalinfo_t interm_sig_;
  sox_signalinfo_t out_sig_;
  // The owned libsox chain; deleted in the destructor.
  sox_effects_chain_t* sec_;

 public:
  explicit SoxEffectsChain(
      sox_encodinginfo_t input_encoding,
      sox_encodinginfo_t output_encoding);
  SoxEffectsChain(const SoxEffectsChain& other) = delete;
  SoxEffectsChain(const SoxEffectsChain&& other) = delete;
  SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
  SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
  ~SoxEffectsChain();
  // Run all effects that were added to the chain.
  void run();
  // Source: read samples from an in-memory array.
  void addInputTensor(
      py::array* waveform,
      int64_t sample_rate,
      bool channels_first);
  // Source: read samples from an opened file.
  void addInputFile(sox_format_t* sf);
  // Sink: append produced samples to a vector.
  void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
  // Sink: write produced samples to an opened file.
  void addOutputFile(sox_format_t* sf);
  // Add one effect; effect[0] is its name, the rest are its options.
  void addEffect(const std::vector<std::string> effect);
  // Channel count / sample rate of the chain's intermediate signal.
  int64_t getOutputNumChannels();
  int64_t getOutputSampleRate();
};

}  // namespace sox_effects_chain
}  // namespace paddleaudio
paddlespeech/audio/src/sox/io.cpp
浏览文件 @
98300b86
// #include "sox/effects.h"
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp
// #include "sox/effects_chain.h"
#include "paddlespeech/audio/src/sox/effects.h"
#include "sox/io.h"
#include "paddlespeech/audio/src/sox/effects_chain.h"
#include "sox/types.h"
#include "paddlespeech/audio/src/sox/io.h"
#include "sox/utils.h"
#include "paddlespeech/audio/src/sox/types.h"
#include "paddlespeech/audio/src/sox/utils.h"
using
namespace
torch
::
indexing
;
using
namespace
paddleaudio
::
sox_utils
;
using
namespace
paddleaudio
::
sox_utils
;
namespace
paddleaudio
{
namespace
paddleaudio
{
...
@@ -60,7 +60,7 @@ std::vector<std::vector<std::string>> get_effects(
...
@@ -60,7 +60,7 @@ std::vector<std::vector<std::string>> get_effects(
return
effects
;
return
effects
;
}
}
tl
::
optional
<
std
::
tuple
<
torch
::
Tensor
,
int64_t
>>
load_audio_file
(
tl
::
optional
<
std
::
tuple
<
py
::
array
,
int64_t
>>
load_audio_file
(
const
std
::
string
&
path
,
const
std
::
string
&
path
,
const
tl
::
optional
<
int64_t
>&
frame_offset
,
const
tl
::
optional
<
int64_t
>&
frame_offset
,
const
tl
::
optional
<
int64_t
>&
num_frames
,
const
tl
::
optional
<
int64_t
>&
num_frames
,
...
@@ -73,7 +73,7 @@ tl::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
...
@@ -73,7 +73,7 @@ tl::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
}
}
void
save_audio_file
(
const
std
::
string
&
path
,
void
save_audio_file
(
const
std
::
string
&
path
,
torch
::
Tensor
tensor
,
py
::
array
tensor
,
int64_t
sample_rate
,
int64_t
sample_rate
,
bool
channels_first
,
bool
channels_first
,
tl
::
optional
<
double
>
compression
,
tl
::
optional
<
double
>
compression
,
...
@@ -88,19 +88,19 @@ void save_audio_file(const std::string& path,
...
@@ -88,19 +88,19 @@ void save_audio_file(const std::string& path,
}();
}();
if
(
filetype
==
"amr-nb"
)
{
if
(
filetype
==
"amr-nb"
)
{
const
auto
num_channels
=
tensor
.
s
iz
e
(
channels_first
?
0
:
1
);
const
auto
num_channels
=
tensor
.
s
hap
e
(
channels_first
?
0
:
1
);
TORCH_CHECK
(
num_channels
==
1
,
//
TORCH_CHECK(num_channels == 1,
"amr-nb format only supports single channel audio."
);
//
"amr-nb format only supports single channel audio.");
}
else
if
(
filetype
==
"htk"
)
{
}
else
if
(
filetype
==
"htk"
)
{
const
auto
num_channels
=
tensor
.
s
iz
e
(
channels_first
?
0
:
1
);
const
auto
num_channels
=
tensor
.
s
hap
e
(
channels_first
?
0
:
1
);
TORCH_CHECK
(
num_channels
==
1
,
//
TORCH_CHECK(num_channels == 1,
"htk format only supports single channel audio."
);
//
"htk format only supports single channel audio.");
}
else
if
(
filetype
==
"gsm"
)
{
}
else
if
(
filetype
==
"gsm"
)
{
const
auto
num_channels
=
tensor
.
s
iz
e
(
channels_first
?
0
:
1
);
const
auto
num_channels
=
tensor
.
s
hap
e
(
channels_first
?
0
:
1
);
TORCH_CHECK
(
num_channels
==
1
,
//
TORCH_CHECK(num_channels == 1,
"gsm format only supports single channel audio."
);
//
"gsm format only supports single channel audio.");
TORCH_CHECK
(
sample_rate
==
8000
,
//
TORCH_CHECK(sample_rate == 8000,
"gsm format only supports a sampling rate of 8kHz."
);
//
"gsm format only supports a sampling rate of 8kHz.");
}
}
const
auto
signal_info
=
const
auto
signal_info
=
get_signalinfo
(
&
tensor
,
sample_rate
,
filetype
,
channels_first
);
get_signalinfo
(
&
tensor
,
sample_rate
,
filetype
,
channels_first
);
...
@@ -127,13 +127,5 @@ void save_audio_file(const std::string& path,
...
@@ -127,13 +127,5 @@ void save_audio_file(const std::string& path,
chain
.
run
();
chain
.
run
();
}
}
TORCH_LIBRARY_FRAGMENT
(
paddleaudio
,
m
)
{
m
.
def
(
"paddleaudio::sox_io_get_info"
,
&
paddleaudio
::
sox_io
::
get_info_file
);
m
.
def
(
"paddleaudio::sox_io_load_audio_file"
,
&
paddleaudio
::
sox_io
::
load_audio_file
);
m
.
def
(
"paddleaudio::sox_io_save_audio_file"
,
&
paddleaudio
::
sox_io
::
save_audio_file
);
}
}
// namespace sox_io
}
// namespace sox_io
}
// namespace paddleaudio
}
// namespace paddleaudio
\ No newline at end of file
paddlespeech/audio/src/sox/io.h
浏览文件 @
98300b86
...
@@ -2,11 +2,10 @@
...
@@ -2,11 +2,10 @@
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
// All rights reserved.
// All rights reserved.
#ifndef PADDLEAUDIO_SOX_IO_H
#pragma once
#define PADDLEAUDIO_SOX_IO_H
// #include "sox/utils.h
"
#include "paddlespeech/audio/src/optional/optional.hpp
"
#include "
optional/optional.hpp
"
#include "
paddlespeech/audio/src/sox/utils.h
"
namespace
paddleaudio
{
namespace
paddleaudio
{
namespace
sox_io
{
namespace
sox_io
{
...
@@ -21,7 +20,7 @@ using MetaDataTuple =
...
@@ -21,7 +20,7 @@ using MetaDataTuple =
tl
::
optional
<
MetaDataTuple
>
get_info_file
(
tl
::
optional
<
MetaDataTuple
>
get_info_file
(
const
std
::
string
&
path
,
const
tl
::
optional
<
std
::
string
>&
format
);
const
std
::
string
&
path
,
const
tl
::
optional
<
std
::
string
>&
format
);
tl
::
optional
<
std
::
tuple
<
torch
::
Tensor
,
int64_t
>>
load_audio_file
(
tl
::
optional
<
std
::
tuple
<
py
::
array
,
int64_t
>>
load_audio_file
(
const
std
::
string
&
path
,
const
std
::
string
&
path
,
const
tl
::
optional
<
int64_t
>&
frame_offset
,
const
tl
::
optional
<
int64_t
>&
frame_offset
,
const
tl
::
optional
<
int64_t
>&
num_frames
,
const
tl
::
optional
<
int64_t
>&
num_frames
,
...
@@ -30,7 +29,7 @@ tl::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
...
@@ -30,7 +29,7 @@ tl::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
const
tl
::
optional
<
std
::
string
>&
format
);
const
tl
::
optional
<
std
::
string
>&
format
);
void
save_audio_file
(
const
std
::
string
&
path
,
void
save_audio_file
(
const
std
::
string
&
path
,
torch
::
Tensor
tensor
,
py
::
array
tensor
,
int64_t
sample_rate
,
int64_t
sample_rate
,
bool
channels_first
,
bool
channels_first
,
tl
::
optional
<
double
>
compression
,
tl
::
optional
<
double
>
compression
,
...
@@ -40,5 +39,3 @@ void save_audio_file(const std::string& path,
...
@@ -40,5 +39,3 @@ void save_audio_file(const std::string& path,
}
// namespace sox_io
}
// namespace sox_io
}
// namespace paddleaudio
}
// namespace paddleaudio
#endif
\ No newline at end of file
paddlespeech/audio/src/sox/types.cpp
0 → 100644
浏览文件 @
98300b86
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
#include "paddlespeech/audio/src/sox/types.h"
#include <ostream>
#include <sstream>
namespace
paddleaudio
{
namespace
sox_utils
{
/// Map a format name ("wav", "mp3", "flac", "ogg"/"vorbis", "amr-nb",
/// "amr-wb", "amb", "sph", "htk", "gsm") to the matching Format value.
/// The comparison is exact (case-sensitive).
/// Throws std::runtime_error for any other string.
Format get_format_from_string(const std::string& format) {
  struct Entry {
    const char* name;
    Format value;
  };
  static const Entry kFormats[] = {
      {"wav", Format::WAV},
      {"mp3", Format::MP3},
      {"flac", Format::FLAC},
      {"ogg", Format::VORBIS},
      {"vorbis", Format::VORBIS},
      {"amr-nb", Format::AMR_NB},
      {"amr-wb", Format::AMR_WB},
      {"amb", Format::AMB},
      {"sph", Format::SPHERE},
      {"htk", Format::HTK},
      {"gsm", Format::GSM},
  };

  for (const Entry& entry : kFormats) {
    if (format == entry.name) {
      return entry.value;
    }
  }

  std::ostringstream stream;
  stream << "Internal Error: unexpected format value: " << format;
  throw std::runtime_error(stream.str());
}
/// Return the short textual name of an Encoding value (e.g. "PCM_S").
/// Throws std::runtime_error for values without a name, which includes
/// Encoding::NOT_PROVIDED.
std::string to_string(Encoding v) {
  const char* label = nullptr;
  switch (v) {
    case Encoding::UNKNOWN:
      label = "UNKNOWN";
      break;
    case Encoding::PCM_SIGNED:
      label = "PCM_S";
      break;
    case Encoding::PCM_UNSIGNED:
      label = "PCM_U";
      break;
    case Encoding::PCM_FLOAT:
      label = "PCM_F";
      break;
    case Encoding::FLAC:
      label = "FLAC";
      break;
    case Encoding::ULAW:
      label = "ULAW";
      break;
    case Encoding::ALAW:
      label = "ALAW";
      break;
    case Encoding::MP3:
      label = "MP3";
      break;
    case Encoding::VORBIS:
      label = "VORBIS";
      break;
    case Encoding::AMR_WB:
      label = "AMR_WB";
      break;
    case Encoding::AMR_NB:
      label = "AMR_NB";
      break;
    case Encoding::OPUS:
      label = "OPUS";
      break;
    default:
      break;
  }
  if (label == nullptr) {
    throw std::runtime_error("Internal Error: unexpected encoding.");
  }
  return label;
}
/// Translate an optional encoding name into an Encoding value.
///
/// An empty optional yields Encoding::NOT_PROVIDED. Recognised names are
/// "PCM_S", "PCM_U", "PCM_F", "ULAW" and "ALAW".
/// Throws std::runtime_error for any other string.
Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
  if (!encoding.has_value()) {
    return Encoding::NOT_PROVIDED;
  }

  struct Entry {
    const char* name;
    Encoding value;
  };
  static const Entry kEncodings[] = {
      {"PCM_S", Encoding::PCM_SIGNED},
      {"PCM_U", Encoding::PCM_UNSIGNED},
      {"PCM_F", Encoding::PCM_FLOAT},
      {"ULAW", Encoding::ULAW},
      {"ALAW", Encoding::ALAW},
  };

  std::string v = encoding.value();
  for (const Entry& entry : kEncodings) {
    if (v == entry.name) {
      return entry.value;
    }
  }

  std::ostringstream stream;
  stream << "Internal Error: unexpected encoding value: " << v;
  throw std::runtime_error(stream.str());
}
/// Translate an optional bit depth into a BitDepth value.
///
/// An empty optional yields BitDepth::NOT_PROVIDED. Accepted values are
/// 8, 16, 24, 32 and 64.
/// Throws std::runtime_error for any other number.
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
  if (!bit_depth.has_value()) {
    return BitDepth::NOT_PROVIDED;
  }

  const int64_t v = bit_depth.value();
  if (v == 8) {
    return BitDepth::B8;
  }
  if (v == 16) {
    return BitDepth::B16;
  }
  if (v == 24) {
    return BitDepth::B24;
  }
  if (v == 32) {
    return BitDepth::B32;
  }
  if (v == 64) {
    return BitDepth::B64;
  }

  std::ostringstream s;
  s << "Internal Error: unexpected bit depth value: " << v;
  throw std::runtime_error(s.str());
}
/// Return the short textual name for a libsox encoding constant.
/// Encodings without a dedicated name are reported as "UNKNOWN".
std::string get_encoding(sox_encoding_t encoding) {
  const char* label = "UNKNOWN";
  switch (encoding) {
    case SOX_ENCODING_SIGN2:
      label = "PCM_S";
      break;
    case SOX_ENCODING_UNSIGNED:
      label = "PCM_U";
      break;
    case SOX_ENCODING_FLOAT:
      label = "PCM_F";
      break;
    case SOX_ENCODING_FLAC:
      label = "FLAC";
      break;
    case SOX_ENCODING_ULAW:
      label = "ULAW";
      break;
    case SOX_ENCODING_ALAW:
      label = "ALAW";
      break;
    case SOX_ENCODING_MP3:
      label = "MP3";
      break;
    case SOX_ENCODING_VORBIS:
      label = "VORBIS";
      break;
    case SOX_ENCODING_AMR_WB:
      label = "AMR_WB";
      break;
    case SOX_ENCODING_AMR_NB:
      label = "AMR_NB";
      break;
    case SOX_ENCODING_OPUS:
      label = "OPUS";
      break;
    case SOX_ENCODING_GSM:
      label = "GSM";
      break;
    case SOX_ENCODING_UNKNOWN:
    default:
      break;
  }
  return label;
}
}
// namespace sox_utils
}
// namespace paddleaudio
paddlespeech/audio/src/sox/types.h
0 → 100644
浏览文件 @
98300b86
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
#pragma once
#include <sox.h>
#include "paddlespeech/audio/src/optional/optional.hpp"
namespace paddleaudio {
namespace sox_utils {

/// Audio container/format identifiers understood by get_format_from_string.
enum class Format {
  WAV,
  MP3,
  FLAC,
  VORBIS,
  AMR_NB,
  AMR_WB,
  AMB,
  SPHERE,
  GSM,
  HTK,
};

/// Parse a format name into a Format value (defined in types.cpp).
Format get_format_from_string(const std::string& format);

/// Sample encodings. NOT_PROVIDED marks "the caller gave no encoding".
enum class Encoding {
  NOT_PROVIDED,
  UNKNOWN,
  PCM_SIGNED,
  PCM_UNSIGNED,
  PCM_FLOAT,
  FLAC,
  ULAW,
  ALAW,
  MP3,
  VORBIS,
  AMR_WB,
  AMR_NB,
  OPUS,
};

/// Short textual name of an Encoding value (defined in types.cpp).
std::string to_string(Encoding v);

/// Parse an optional encoding name into an Encoding value
/// (defined in types.cpp).
Encoding get_encoding_from_option(const tl::optional<std::string> encoding);

/// Bit depths; each enumerator's value equals the number of bits.
/// NOT_PROVIDED (0) marks "the caller gave no bit depth".
enum class BitDepth : unsigned {
  NOT_PROVIDED = 0,
  B8 = 8,
  B16 = 16,
  B24 = 24,
  B32 = 32,
  B64 = 64,
};

/// Parse an optional bit count into a BitDepth value (defined in types.cpp).
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);

/// Short textual name of a libsox encoding constant (defined in types.cpp).
std::string get_encoding(sox_encoding_t encoding);

}  // namespace sox_utils
}  // namespace paddleaudio
\ No newline at end of file
paddlespeech/audio/src/sox/utils.cpp
0 → 100644
浏览文件 @
98300b86
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.cpp
#include <sox.h>
#include "paddlespeech/audio/src/sox/types.h"
#include "paddlespeech/audio/src/sox/utils.h"
namespace
paddleaudio
{
namespace
sox_utils
{
/// Store `seed` (narrowed to sox_int32_t) in libsox's global `ranqd1` field.
void set_seed(const int64_t seed) {
  auto* globals = sox_get_globals();
  globals->ranqd1 = static_cast<sox_int32_t>(seed);
}
/// Store `verbosity` (converted to unsigned) in libsox's global `verbosity`
/// field.
void set_verbosity(const int64_t verbosity) {
  auto* globals = sox_get_globals();
  globals->verbosity = static_cast<unsigned>(verbosity);
}
/// Store `use_threads` (converted to sox_bool) in libsox's global
/// `use_threads` field.
void set_use_threads(const bool use_threads) {
  auto* globals = sox_get_globals();
  globals->use_threads = static_cast<sox_bool>(use_threads);
}
/// Store `buffer_size` (converted to size_t) in libsox's global `bufsiz`
/// field.
void set_buffer_size(const int64_t buffer_size) {
  auto* globals = sox_get_globals();
  globals->bufsiz = static_cast<size_t>(buffer_size);
}
/// Read libsox's global `bufsiz` field.
int64_t get_buffer_size() {
  const auto* globals = sox_get_globals();
  return globals->bufsiz;
}
/// List the effects known to libsox as {name, usage} pairs.
///
/// Handlers without a name and effects listed in UNSUPPORTED_EFFECTS are
/// skipped; a missing usage string is reported as "".
std::vector<std::vector<std::string>> list_effects() {
  std::vector<std::vector<std::string>> effects;

  const sox_effect_fn_t* fns = sox_get_effect_fns();
  while (*fns) {
    const sox_effect_handler_t* handler = (*fns)();
    ++fns;

    if (!handler || !handler->name) {
      continue;
    }
    if (UNSUPPORTED_EFFECTS.find(handler->name) != UNSUPPORTED_EFFECTS.end()) {
      continue;
    }

    std::string usage;
    if (handler->usage) {
      usage = std::string(handler->usage);
    } else {
      usage = std::string("");
    }
    effects.emplace_back(std::vector<std::string>{handler->name, usage});
  }
  return effects;
}
std
::
vector
<
std
::
string
>
list_write_formats
()
{
std
::
vector
<
std
::
string
>
formats
;
for
(
const
sox_format_tab_t
*
fns
=
sox_get_format_fns
();
fns
->
fn
;
++
fns
)
{
const
sox_format_handler_t
*
handler
=
fns
->
fn
();
for
(
const
char
*
const
*
names
=
handler
->
names
;
*
names
;
++
names
)
{
if
(
!
strchr
(
*
names
,
'/'
)
&&
handler
->
write
)
formats
.
emplace_back
(
*
names
);
}
}
return
formats
;
}
std
::
vector
<
std
::
string
>
list_read_formats
()
{
std
::
vector
<
std
::
string
>
formats
;
for
(
const
sox_format_tab_t
*
fns
=
sox_get_format_fns
();
fns
->
fn
;
++
fns
)
{
const
sox_format_handler_t
*
handler
=
fns
->
fn
();
for
(
const
char
*
const
*
names
=
handler
->
names
;
*
names
;
++
names
)
{
if
(
!
strchr
(
*
names
,
'/'
)
&&
handler
->
read
)
formats
.
emplace_back
(
*
names
);
}
}
return
formats
;
}
/// Wrap the given sox_format_t*; it is closed by close() / the destructor.
SoxFormat::SoxFormat(sox_format_t* fd) noexcept
    : fd_(fd) {
}
/// Close the wrapped handle (if it is still open) on destruction.
SoxFormat::~SoxFormat() {
  this->close();
}
/// Member access on the wrapped sox_format_t.
sox_format_t* SoxFormat::operator->() const noexcept {
  sox_format_t* const raw = fd_;
  return raw;
}
/// Implicit conversion to the wrapped raw pointer (nullptr once closed or
/// when opening failed).
SoxFormat::operator sox_format_t*() const noexcept {
  sox_format_t* const raw = fd_;
  return raw;
}
/// Close the wrapped handle with sox_close and reset it to nullptr.
/// Calling it again (or on a null handle) does nothing.
void SoxFormat::close() {
  if (fd_ == nullptr) {
    return;
  }
  sox_close(fd_);
  fd_ = nullptr;
}
void
validate_input_file
(
const
SoxFormat
&
sf
,
const
std
::
string
&
path
)
{
if
(
static_cast
<
sox_format_t
*>
(
sf
)
==
nullptr
)
{
throw
std
::
runtime_error
(
"Error loading audio file: failed to open file "
+
path
);
}
if
(
sf
->
encoding
.
encoding
==
SOX_ENCODING_UNKNOWN
)
{
throw
std
::
runtime_error
(
"Error loading audio file: unknown encoding."
);
}
}
/// Same checks as validate_input_file, using "<in memory buffer>" as the
/// name shown in error messages.
void validate_input_memfile(const SoxFormat& sf) {
  const std::string label = "<in memory buffer>";
  validate_input_file(sf, label);
}
/// Check that `tensor` is something the SoX pipeline can consume:
/// a 2-D array whose dtype is float32, int32, int16 or uint8.
///
/// The dtype is matched by kind and item size rather than by NumPy type
/// character, so the accepted set is exactly the one named in the error
/// message. (The previous character test let float64 and C long through
/// while refusing int16 and uint8.)
///
/// Throws std::runtime_error when either requirement is not met.
void validate_input_tensor(const py::array tensor) {
  if (tensor.ndim() != 2) {
    throw std::runtime_error("Input tensor has to be 2D.");
  }

  const py::dtype dtype = tensor.dtype();
  const char kind = dtype.kind();
  const auto width = dtype.itemsize();

  const bool is_float32 = (kind == 'f' && width == 4);
  const bool is_int32 = (kind == 'i' && width == 4);
  const bool is_int16 = (kind == 'i' && width == 2);
  const bool is_uint8 = (kind == 'u' && width == 1);

  if (!(is_float32 || is_int32 || is_int16 || is_uint8)) {
    throw std::runtime_error(
        "Input tensor has to be one of float32, int32, int16 or uint8 type.");
  }
}
/// Choose the NumPy dtype used to hold samples decoded from a file with the
/// given libsox encoding and precision (bits per sample).
///
///   - SOX_ENCODING_UNSIGNED        -> uint8  ("u1")
///   - SOX_ENCODING_SIGN2, 16 bit   -> int16  ("i2")
///   - SOX_ENCODING_SIGN2, 24/32 bit -> int32 ("i")
///   - everything else              -> float32 ("f")
///
/// Throws std::runtime_error for signed PCM with any other precision.
py::dtype get_dtype(const sox_encoding_t encoding, const unsigned precision) {
  switch (encoding) {
    case SOX_ENCODING_UNSIGNED:  // 8-bit PCM WAV
      // Must be a string: 'u1' is a multi-character char literal whose
      // integer value is not a valid NumPy type code.
      return py::dtype("u1");
    case SOX_ENCODING_SIGN2:  // 16-bit, 24-bit, or 32-bit PCM WAV
      switch (precision) {
        case 16:
          return py::dtype("i2");
        case 24:  // Cast 24-bit to 32-bit.
        case 32:
          return py::dtype("i");
        default:
          throw std::runtime_error(
              "Only 16, 24, and 32 bits are supported for signed PCM.");
      }
    default:
      // default to float32 for the other formats, including
      // 32-bit floating-point WAV,
      // MP3,
      // FLAC,
      // VORBIS etc...
      return py::dtype("f");
  }
}
/// Copy a buffer of interleaved sox samples into a freshly allocated 2-D
/// array of shape {num_samples / num_channels, num_channels}.
///
/// @param buffer         Source samples; `num_samples` entries are read.
/// @param num_samples    Total number of samples (all channels together).
/// @param num_channels   Number of channels; must be positive.
/// @param dtype          Requested output dtype, selected by character code:
///                       'f' float32, 'i' int32, 'h' int16, 'B'/'b' 8-bit.
/// @param normalize      When true the output is float32 regardless of
///                       `dtype`.
/// @param channels_first Currently unused, see the note in the body.
/// @return The populated array.
/// @throws std::runtime_error if `num_channels` is not positive or `dtype`
///         is not one of the supported codes.
py::array convert_to_tensor(
    sox_sample_t* buffer,
    const int32_t num_samples,
    const int32_t num_channels,
    const py::dtype dtype,
    const bool normalize,
    const bool channels_first) {
  // NOTE(review): `channels_first` is never read in this function, so the
  // result is always laid out as {frames, channels}. Callers presumably
  // transpose themselves; confirm before changing the layout here.
  (void)channels_first;

  // Guard the integer division below (division by zero is undefined).
  if (num_channels <= 0) {
    throw std::runtime_error("Number of channels has to be positive.");
  }
  const int32_t num_frames = num_samples / num_channels;

  // The sox conversion macros need these locals and a clip counter.
  uint64_t dummy = 0;
  SOX_SAMPLE_LOCALS;

  const char code = dtype.char_();
  py::array t;

  if (normalize || code == 'f') {
    // Always allocate float32 here: the loop writes `float` values, so
    // allocating with a narrower requested dtype (e.g. int16 when
    // normalize == true) would write past the end of the array.
    t = py::array(py::dtype("f"), {num_frames, num_channels});
    auto* out = static_cast<float*>(t.mutable_data(0, 0));
    for (int32_t i = 0; i < num_samples; ++i) {
      out[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
    }
  } else if (code == 'i') {
    // int32: sox samples are copied through unchanged.
    t = py::array(dtype, {num_frames, num_channels});
    auto* out = static_cast<int*>(t.mutable_data(0, 0));
    for (int32_t i = 0; i < num_samples; ++i) {
      out[i] = buffer[i];
    }
  } else if (code == 'h') {
    // int16
    t = py::array(dtype, {num_frames, num_channels});
    auto* out = static_cast<int16_t*>(t.mutable_data(0, 0));
    for (int32_t i = 0; i < num_samples; ++i) {
      out[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
    }
  } else if (code == 'B' || code == 'b') {
    // 8-bit. 'B' is the uint8 character code; 'b' is still accepted because
    // the original branch tested it. The array is now actually allocated:
    // before, the writes went into a default-constructed array.
    t = py::array(dtype, {num_frames, num_channels});
    auto* out = static_cast<uint8_t*>(t.mutable_data(0, 0));
    for (int32_t i = 0; i < num_samples; ++i) {
      out[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
    }
  } else {
    throw std::runtime_error("Unsupported dtype.");
  }
  return t;
}
/// Extract the lower-cased extension from a file path.
///
/// Everything after the last '.' is returned. When the path contains no '.',
/// `find_last_of` yields npos, npos + 1 wraps to 0, and the whole path is
/// returned lower-cased (unchanged historical behaviour).
///
/// @param path File path or name.
/// @return Lower-cased text following the last '.'.
const std::string get_filetype(const std::string path) {
  std::string ext = path.substr(path.find_last_of(".") + 1);
  // tolower() has undefined behaviour for negative arguments other than EOF,
  // so go through unsigned char for bytes >= 0x80 (e.g. UTF-8 file names).
  std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  });
  return ext;
}
namespace
{
std
::
tuple
<
sox_encoding_t
,
unsigned
>
get_save_encoding_for_wav
(
const
std
::
string
format
,
py
::
dtype
dtype
,
const
Encoding
&
encoding
,
const
BitDepth
&
bits_per_sample
)
{
switch
(
encoding
)
{
case
Encoding
::
NOT_PROVIDED
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
switch
(
dtype
.
num
())
{
case
11
:
// float32 numpy dtype num
return
std
::
make_tuple
<>
(
SOX_ENCODING_FLOAT
,
32
);
case
5
:
// int numpy dtype num
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
32
);
case
3
:
// int16 numpy
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
16
);
case
1
:
// byte numpy
return
std
::
make_tuple
<>
(
SOX_ENCODING_UNSIGNED
,
8
);
default:
throw
std
::
runtime_error
(
"Internal Error: Unexpected dtype."
);
}
case
BitDepth
::
B8
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_UNSIGNED
,
8
);
default:
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
static_cast
<
unsigned
>
(
bits_per_sample
));
}
case
Encoding
::
PCM_SIGNED
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
32
);
case
BitDepth
::
B8
:
throw
std
::
runtime_error
(
format
+
" does not support 8-bit signed PCM encoding."
);
default:
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
static_cast
<
unsigned
>
(
bits_per_sample
));
}
case
Encoding
::
PCM_UNSIGNED
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
case
BitDepth
::
B8
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_UNSIGNED
,
8
);
default:
throw
std
::
runtime_error
(
format
+
" only supports 8-bit for unsigned PCM encoding."
);
}
case
Encoding
::
PCM_FLOAT
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
case
BitDepth
::
B32
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_FLOAT
,
32
);
case
BitDepth
::
B64
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_FLOAT
,
64
);
default:
throw
std
::
runtime_error
(
format
+
" only supports 32-bit or 64-bit for floating-point PCM encoding."
);
}
case
Encoding
::
ULAW
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
case
BitDepth
::
B8
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_ULAW
,
8
);
default:
throw
std
::
runtime_error
(
format
+
" only supports 8-bit for mu-law encoding."
);
}
case
Encoding
::
ALAW
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
case
BitDepth
::
B8
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_ALAW
,
8
);
default:
throw
std
::
runtime_error
(
format
+
" only supports 8-bit for a-law encoding."
);
}
default:
throw
std
::
runtime_error
(
format
+
" does not support encoding: "
+
to_string
(
encoding
));
}
}
/// Resolve the sox encoding and bit depth to use when saving in `format`.
///
/// @param format          Format name, parsed with get_format_from_string().
/// @param dtype           dtype of the data; forwarded for WAV/AMB only.
/// @param encoding        Optional encoding name requested by the caller.
/// @param bits_per_sample Optional bit depth requested by the caller.
/// @return (sox encoding, bits per sample).
/// @throws std::runtime_error when the format is unsupported or does not
///         accept the requested encoding / bit depth.
std::tuple<sox_encoding_t, unsigned> get_save_encoding(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  using Result = std::tuple<sox_encoding_t, unsigned>;

  const Format fmt = get_format_from_string(format);
  const Encoding enc = get_encoding_from_option(encoding);
  const BitDepth bps = get_bit_depth_from_option(bits_per_sample);

  const bool bps_unset = bps == BitDepth::NOT_PROVIDED;
  const bool bps_is_8 = bps == BitDepth::B8;
  const unsigned bps_value = static_cast<unsigned>(bps);

  // Formats with a fixed encoding reject an explicit `encoding` option.
  const auto reject_encoding = [&](const std::string& name) {
    if (enc != Encoding::NOT_PROVIDED) {
      throw std::runtime_error(
          name + " does not support `encoding` option.");
    }
  };
  // Formats with a fixed bit depth reject an explicit `bits_per_sample`.
  const auto reject_bit_depth = [&](const std::string& name) {
    if (!bps_unset) {
      throw std::runtime_error(
          name + " does not support `bits_per_sample` option.");
    }
  };

  switch (fmt) {
    case Format::WAV:
    case Format::AMB:
      return get_save_encoding_for_wav(format, dtype, enc, bps);

    case Format::MP3:
      reject_encoding("mp3");
      reject_bit_depth("mp3");
      return Result(SOX_ENCODING_MP3, 16u);

    case Format::HTK:
      reject_encoding("htk");
      reject_bit_depth("htk");
      return Result(SOX_ENCODING_SIGN2, 16u);

    case Format::VORBIS:
      reject_encoding("vorbis");
      reject_bit_depth("vorbis");
      return Result(SOX_ENCODING_VORBIS, 16u);

    case Format::AMR_NB:
      reject_encoding("amr-nb");
      reject_bit_depth("amr-nb");
      return Result(SOX_ENCODING_AMR_NB, 16u);

    case Format::FLAC:
      reject_encoding("flac");
      if (bps == BitDepth::B32 || bps == BitDepth::B64) {
        throw std::runtime_error(
            "flac does not support `bits_per_sample` larger than 24.");
      }
      return Result(SOX_ENCODING_FLAC, bps_value);

    case Format::SPHERE:
      switch (enc) {
        case Encoding::NOT_PROVIDED:
        case Encoding::PCM_SIGNED:
          return Result(SOX_ENCODING_SIGN2, bps_unset ? 32u : bps_value);
        case Encoding::PCM_UNSIGNED:
          throw std::runtime_error(
              "sph does not support unsigned integer PCM.");
        case Encoding::PCM_FLOAT:
          throw std::runtime_error("sph does not support floating point PCM.");
        case Encoding::ULAW:
          if (bps_unset || bps_is_8) {
            return Result(SOX_ENCODING_ULAW, 8u);
          }
          throw std::runtime_error(
              "sph only supports 8-bit for mu-law encoding.");
        case Encoding::ALAW:
          // Unlike mu-law, other bit depths are passed through, not rejected.
          if (bps_unset || bps_is_8) {
            return Result(SOX_ENCODING_ALAW, 8u);
          }
          return Result(SOX_ENCODING_ALAW, bps_value);
        default:
          throw std::runtime_error(
              "sph does not support encoding: " + encoding.value());
      }

    case Format::GSM:
      reject_encoding("gsm");
      reject_bit_depth("gsm");
      return Result(SOX_ENCODING_GSM, 16u);

    default:
      throw std::runtime_error("Unsupported format: " + format);
  }
}
/// Return the sample precision (in bits) to declare when writing `filetype`.
///
/// @param filetype Lower-case format name such as "wav" or "flac".
/// @param dtype    dtype of the data; only consulted for "wav" / "amb".
/// @return SOX_UNSPEC for mp3/ogg/vorbis, otherwise a bit count.
/// @throws std::runtime_error for an unknown file type, or an unsupported
///         dtype with "wav" / "amb".
unsigned get_precision(const std::string filetype, py::dtype dtype) {
  const auto is = [&filetype](const char* name) { return filetype == name; };

  // Compressed formats: precision is left for sox to decide.
  if (is("mp3") || is("ogg") || is("vorbis")) {
    return SOX_UNSPEC;
  }
  if (is("flac")) {
    return 24;
  }
  if (is("wav") || is("amb")) {
    switch (dtype.num()) {
      case 1:  // byte in numpy dtype num
        return 8;
      case 3:  // short, in numpy dtype num
        return 16;
      case 5:  // int, numpy dtype
      case 11:  // float, numpy dtype
        return 32;
      default:
        throw std::runtime_error("Unsupported dtype.");
    }
  }
  if (is("sph")) {
    return 32;
  }
  if (is("amr-nb") || is("gsm") || is("htk")) {
    return 16;
  }
  throw std::runtime_error("Unsupported file type: " + filetype);
}
}
// namespace
/// Build the sox_signalinfo_t describing `waveform` for writing.
///
/// @param waveform       Array to describe; its shape, dtype and size are read.
/// @param sample_rate    Sample rate, converted to sox_rate_t.
/// @param filetype       Target format, used to pick the precision.
/// @param channels_first Selects which axis holds the channel count
///                       (axis 0 when true, axis 1 otherwise).
/// @return Signal info with rate, channels, precision and length filled in.
sox_signalinfo_t get_signalinfo(
    const py::array* waveform,
    const int64_t sample_rate,
    const std::string filetype,
    const bool channels_first) {
  const sox_rate_t rate = static_cast<sox_rate_t>(sample_rate);

  const int channel_axis = channels_first ? 0 : 1;
  const unsigned channels =
      static_cast<unsigned>(waveform->shape(channel_axis));

  const unsigned precision = get_precision(filetype, waveform->dtype());

  // Length is the total element count of the array.
  const uint64_t length = static_cast<uint64_t>(waveform->size());

  return sox_signalinfo_t{
      /*rate=*/rate,
      /*channels=*/channels,
      /*precision=*/precision,
      /*length=*/length};
}
/// Build the sox_encodinginfo_t that describes raw array data of `dtype`.
///
/// @param dtype dtype of the array; selected by its numpy type number.
/// @return Encoding info with default compression and byte-order options.
/// @throws std::runtime_error for a dtype number other than 1, 3, 5 or 11.
sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
  sox_encoding_t encoding;
  unsigned bits_per_sample;

  // One pass over the dtype number decides both fields.
  switch (dtype.num()) {
    case 1:  // byte
      encoding = SOX_ENCODING_UNSIGNED;
      bits_per_sample = 8;
      break;
    case 3:  // short
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 16;
      break;
    case 5:  // int32
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 32;
      break;
    case 11:  // float
      encoding = SOX_ENCODING_FLOAT;
      bits_per_sample = 32;
      break;
    default:
      throw std::runtime_error("Unsupported dtype.");
  }

  return sox_encodinginfo_t{
      /*encoding=*/encoding,
      /*bits_per_sample=*/bits_per_sample,
      /*compression=*/HUGE_VAL,
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
}
/// Build the sox_encodinginfo_t used when saving to a file or file object.
///
/// @param format          Target format name.
/// @param dtype           dtype of the data being saved.
/// @param compression     Optional compression value; HUGE_VAL when absent.
/// @param encoding        Optional encoding name.
/// @param bits_per_sample Optional bit depth.
/// @return Encoding info whose encoding and bit depth come from
///         get_save_encoding(); byte-order options are left at defaults.
sox_encodinginfo_t get_encodinginfo_for_save(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<double> compression,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  sox_encoding_t sox_encoding;
  unsigned sox_bits;
  std::tie(sox_encoding, sox_bits) =
      get_save_encoding(format, dtype, encoding, bits_per_sample);

  const double compression_value = compression.value_or(HUGE_VAL);

  return sox_encodinginfo_t{
      /*encoding=*/sox_encoding,
      /*bits_per_sample=*/sox_bits,
      /*compression=*/compression_value,
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
}
}
// namespace sox_utils
}
// namespace torchaudio
paddlespeech/audio/src/sox/utils.h
0 → 100644
浏览文件 @
98300b86
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.h
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <sox.h>
#include "paddlespeech/audio/src/optional/optional.hpp"
namespace
py
=
pybind11
;
namespace
paddleaudio
{
namespace
sox_utils
{
////////////////////////////////////////////////////////////////////////////////
// APIs for Python interaction
////////////////////////////////////////////////////////////////////////////////
/// Set sox global options
void
set_seed
(
const
int64_t
seed
);
void
set_verbosity
(
const
int64_t
verbosity
);
void
set_use_threads
(
const
bool
use_threads
);
void
set_buffer_size
(
const
int64_t
buffer_size
);
int64_t
get_buffer_size
();
std
::
vector
<
std
::
vector
<
std
::
string
>>
list_effects
();
std
::
vector
<
std
::
string
>
list_read_formats
();
std
::
vector
<
std
::
string
>
list_write_formats
();
////////////////////////////////////////////////////////////////////////////////
// Utilities for sox_io / sox_effects implementations
////////////////////////////////////////////////////////////////////////////////
/// Names of sox effects that this module lists as unsupported.
const std::unordered_set<std::string> UNSUPPORTED_EFFECTS{
    "input",
    "output",
    "spectrogram",
    "noiseprof",
    "noisered",
    "splice",
};
/// Owning wrapper around a `sox_format_t*` that closes the handle
/// automatically when the wrapper is destroyed.
struct SoxFormat {
  /// Wrap `fd`; the wrapper is responsible for closing it from now on.
  explicit SoxFormat(sox_format_t* fd) noexcept;
  ~SoxFormat();

  // Neither copyable nor movable: there is exactly one owner per handle.
  SoxFormat(const SoxFormat& other) = delete;
  SoxFormat& operator=(const SoxFormat& other) = delete;
  SoxFormat(SoxFormat&& other) = delete;
  SoxFormat& operator=(SoxFormat&& other) = delete;

  /// Access members of the wrapped handle.
  sox_format_t* operator->() const noexcept;
  /// Implicit conversion to the wrapped raw handle.
  operator sox_format_t*() const noexcept;

  /// Close the wrapped handle ahead of destruction.
  void close();

 private:
  sox_format_t* fd_;  // wrapped handle
};
///
/// Verify that input Tensor is 2D, CPU and either uint8, int16, int32 or float32
void
validate_input_tensor
(
const
py
::
array
);
void
validate_input_file
(
const
SoxFormat
&
sf
,
const
std
::
string
&
path
);
void
validate_input_memfile
(
const
SoxFormat
&
sf
);
///
/// Get target dtype for the given encoding and precision.
py
::
dtype
get_dtype
(
const
sox_encoding_t
encoding
,
const
unsigned
precision
);
///
/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
/// NOTE: This function might modify the values in the input buffer to
/// reduce the number of memory copy.
/// @param buffer Pointer to buffer that contains audio data.
/// @param num_samples The number of samples to read.
/// @param num_channels The number of channels. Used to reshape the resulting
/// Tensor.
/// @param dtype Target dtype. Determines the output dtype and value range in
/// conjunction with normalization.
/// @param normalize Perform normalization. Only effective when dtype is not
/// kFloat32. When effective, the output tensor is kFloat32 type and value range
/// is [-1.0, 1.0]
/// @param channels_first When True, output Tensor has shape of [num_channels,
/// num_frames].
py
::
array
convert_to_tensor
(
sox_sample_t
*
buffer
,
const
int32_t
num_samples
,
const
int32_t
num_channels
,
const
py
::
dtype
dtype
,
const
bool
normalize
,
const
bool
channels_first
);
/// Extract extension from file path
const
std
::
string
get_filetype
(
const
std
::
string
path
);
/// Get sox_signalinfo_t for passing a py::array object.
sox_signalinfo_t
get_signalinfo
(
const
py
::
array
*
waveform
,
const
int64_t
sample_rate
,
const
std
::
string
filetype
,
const
bool
channels_first
);
/// Get sox_encodinginfo_t for Tensor I/O
sox_encodinginfo_t
get_tensor_encodinginfo
(
const
py
::
dtype
dtype
);
/// Get sox_encodinginfo_t for saving to file/file object
sox_encodinginfo_t
get_encodinginfo_for_save
(
const
std
::
string
&
format
,
const
py
::
dtype
dtype
,
const
tl
::
optional
<
double
>
compression
,
const
tl
::
optional
<
std
::
string
>
encoding
,
const
tl
::
optional
<
int64_t
>
bits_per_sample
);
}
// namespace sox_utils
}
// namespace paddleaudio
setup.py
浏览文件 @
98300b86
...
@@ -43,7 +43,7 @@ base = [
...
@@ -43,7 +43,7 @@ base = [
"pypinyin"
,
"pypinyin-dict"
,
"python-dateutil"
,
"pyworld"
,
"resampy==0.2.2"
,
"pypinyin"
,
"pypinyin-dict"
,
"python-dateutil"
,
"pyworld"
,
"resampy==0.2.2"
,
"sacrebleu"
,
"scipy"
,
"sentencepiece~=0.1.96"
,
"soundfile~=0.10"
,
"sacrebleu"
,
"scipy"
,
"sentencepiece~=0.1.96"
,
"soundfile~=0.10"
,
"textgrid"
,
"timer"
,
"tqdm"
,
"typeguard"
,
"visualdl"
,
"webrtcvad"
,
"textgrid"
,
"timer"
,
"tqdm"
,
"typeguard"
,
"visualdl"
,
"webrtcvad"
,
"yacs~=0.1.8"
,
"prettytable"
,
"zhon"
,
"colorlog"
,
"pathos == 0.2.8"
"yacs~=0.1.8"
,
"prettytable"
,
"zhon"
,
"colorlog"
,
"pathos == 0.2.8"
,
"Ninja"
]
]
server
=
[
server
=
[
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录