Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
5e30f925
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5e30f925
编写于
7月 29, 2022
作者:
Y
YangZhou
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
clean code
上级
c938a468
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
0 addition
and
1561 deletion
+0
-1561
paddlespeech/audio/src/sox/effects.cpp
paddlespeech/audio/src/sox/effects.cpp
+0
-147
paddlespeech/audio/src/sox/effects.h
paddlespeech/audio/src/sox/effects.h
+0
-29
paddlespeech/audio/src/sox/effects_chain.cpp
paddlespeech/audio/src/sox/effects_chain.cpp
+0
-342
paddlespeech/audio/src/sox/effects_chain.h
paddlespeech/audio/src/sox/effects_chain.h
+0
-62
paddlespeech/audio/src/sox/io.cpp
paddlespeech/audio/src/sox/io.cpp
+0
-131
paddlespeech/audio/src/sox/io.h
paddlespeech/audio/src/sox/io.h
+0
-41
paddlespeech/audio/src/sox/types.cpp
paddlespeech/audio/src/sox/types.cpp
+0
-143
paddlespeech/audio/src/sox/types.h
paddlespeech/audio/src/sox/types.h
+0
-58
paddlespeech/audio/src/sox/utils.cpp
paddlespeech/audio/src/sox/utils.cpp
+0
-488
paddlespeech/audio/src/sox/utils.h
paddlespeech/audio/src/sox/utils.h
+0
-120
未找到文件。
paddlespeech/audio/src/sox/effects.cpp
已删除
100644 → 0
浏览文件 @
c938a468
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp
#include <sox.h>
#include <mutex>
#include "paddlespeech/audio/src/sox/effects.h"
#include "paddlespeech/audio/src/sox/effects_chain.h"
#include "paddlespeech/audio/src/sox/utils.h"
using
namespace
paddleaudio
::
sox_utils
;
namespace
paddleaudio
::
sox_effects
{
namespace {

/// Lifecycle of the process-wide libsox effects resources.
enum SoxEffectsResourceState {
  NotInitialized = 0,  // sox_init() has not been called yet
  Initialized = 1,     // sox_init() succeeded
  ShutDown = 2,        // sox_quit() succeeded
};

/// Current lifecycle state; starts out as NotInitialized.
SoxEffectsResourceState SOX_RESOURCE_STATE{NotInitialized};

/// Mutex taken by initialize_sox_effects() and shutdown_sox_effects()
/// while they inspect or change SOX_RESOURCE_STATE.
std::mutex SOX_RESOUCE_STATE_MUTEX{};

}  // namespace
/// Initializes libsox effects once per process.
///
/// Calling it again while already initialized is a no-op.
///
/// @throws std::runtime_error if sox_init() fails, or if the effects were
///         already shut down.
void initialize_sox_effects() {
  const std::lock_guard<std::mutex> guard(SOX_RESOUCE_STATE_MUTEX);

  if (SOX_RESOURCE_STATE == Initialized) {
    return;
  }
  if (SOX_RESOURCE_STATE == ShutDown) {
    throw std::runtime_error(
        "SoX Effects has been shut down. Cannot initialize again.");
  }
  if (SOX_RESOURCE_STATE == NotInitialized) {
    if (sox_init() != SOX_SUCCESS) {
      throw std::runtime_error("Failed to initialize sox effects.");
    }
    SOX_RESOURCE_STATE = Initialized;
  }
}
/// Shuts libsox effects down.
///
/// Calling it again after a successful shutdown is a no-op.
///
/// @throws std::runtime_error if the effects were never initialized, or if
///         sox_quit() reports a failure.
void shutdown_sox_effects() {
  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);

  switch (SOX_RESOURCE_STATE) {
    case NotInitialized:
      throw std::runtime_error(
          "SoX Effects is not initialized. Cannot shutdown.");
    case Initialized:
      if (sox_quit() != SOX_SUCCESS) {
        // This is the shutdown path; the message used to claim an
        // initialization failure.
        throw std::runtime_error("Failed to shut down sox effects.");
      }
      SOX_RESOURCE_STATE = ShutDown;
      break;
    case ShutDown:
      break;
  }
}
/// Runs a chain of SoX effects over an in-memory waveform.
///
/// @param waveform        Input array; checked by validate_input_tensor().
/// @param sample_rate     Sample rate handed to the chain's input effect.
/// @param effects         One entry per effect: its name followed by options.
/// @param channels_first  Layout flag forwarded to the chain input and to
///                        convert_to_tensor().
/// @return The array built by convert_to_tensor() (using the input dtype,
///         normalize=false) and the sample rate reported by the chain.
auto apply_effects_tensor(
    py::array waveform,
    int64_t sample_rate,
    const std::vector<std::vector<std::string>>& effects,
    bool channels_first) -> std::tuple<py::array, int64_t> {
  using Chain = paddleaudio::sox_effects_chain::SoxEffectsChain;

  validate_input_tensor(waveform);

  // Both ends of the chain use the encoding derived from the input dtype.
  const auto dtype = waveform.dtype();
  Chain chain(
      /*input_encoding=*/get_tensor_encodinginfo(dtype),
      /*output_encoding=*/get_tensor_encodinginfo(dtype));

  // Samples emitted by the chain are collected here.
  std::vector<sox_sample_t> samples;
  samples.reserve(waveform.size());

  // input -> requested effects -> output buffer, then run.
  chain.addInputTensor(&waveform, sample_rate, channels_first);
  for (size_t i = 0; i < effects.size(); ++i) {
    chain.addEffect(effects[i]);
  }
  chain.addOutputBuffer(&samples);
  chain.run();

  auto result = convert_to_tensor(
      /*buffer=*/samples.data(),
      /*num_samples=*/samples.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      /*normalize=*/false,
      channels_first);

  return std::tuple<py::array, int64_t>(result, chain.getOutputSampleRate());
}
/// Opens an audio file, runs a chain of SoX effects over it and returns the
/// result as an array.
///
/// @param path            File to read.
/// @param effects         One entry per effect: its name followed by options.
/// @param normalize       Forwarded to convert_to_tensor(); true when absent.
/// @param channels_first  Forwarded to convert_to_tensor(); true when absent.
/// @param format          Optional file type passed to sox_open_read().
/// @return An empty optional when the file cannot be opened or its encoding
///         is unknown; otherwise the array and the chain's output sample rate.
auto apply_effects_file(
    const std::string& path,
    const std::vector<std::vector<std::string>>& effects,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format)
    -> tl::optional<std::tuple<py::array, int64_t>> {
  using Chain = paddleaudio::sox_effects_chain::SoxEffectsChain;

  // Open the input file; a null filetype is passed when none was given.
  const char* filetype = format ? format->c_str() : nullptr;
  SoxFormat sf(sox_open_read(
      path.c_str(),
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      filetype));

  if (static_cast<sox_format_t*>(sf) == nullptr) {
    return {};
  }
  if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
    return {};
  }

  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);

  // Samples emitted by the chain are collected here.
  std::vector<sox_sample_t> samples;
  samples.reserve(sf->signal.length);

  // input file -> requested effects -> output buffer, then run.
  Chain chain(
      /*input_encoding=*/sf->encoding,
      /*output_encoding=*/get_tensor_encodinginfo(dtype));
  chain.addInputFile(sf);
  for (size_t i = 0; i < effects.size(); ++i) {
    chain.addEffect(effects[i]);
  }
  chain.addOutputBuffer(&samples);
  chain.run();

  // Both optional flags fall back to true.
  const bool layout_channels_first = channels_first.value_or(true);
  auto result = convert_to_tensor(
      /*buffer=*/samples.data(),
      /*num_samples=*/samples.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      normalize.value_or(true),
      layout_channels_first);

  return std::tuple<py::array, int64_t>(result, chain.getOutputSampleRate());
}
}
// namespace paddleaudio::sox_effects
paddlespeech/audio/src/sox/effects.h
已删除
100644 → 0
浏览文件 @
c938a468
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h
#pragma once
#include <pybind11/pybind11.h>
#include "paddlespeech/audio/src/sox/utils.h"
namespace
py
=
pybind11
;
namespace
paddleaudio
::
sox_effects
{
void
initialize_sox_effects
();
void
shutdown_sox_effects
();
auto
apply_effects_tensor
(
py
::
array
waveform
,
int64_t
sample_rate
,
const
std
::
vector
<
std
::
vector
<
std
::
string
>>&
effects
,
bool
channels_first
)
->
std
::
tuple
<
py
::
array
,
int64_t
>
;
auto
apply_effects_file
(
const
std
::
string
&
path
,
const
std
::
vector
<
std
::
vector
<
std
::
string
>>&
effects
,
tl
::
optional
<
bool
>
normalize
,
tl
::
optional
<
bool
>
channels_first
,
const
tl
::
optional
<
std
::
string
>&
format
)
->
tl
::
optional
<
std
::
tuple
<
py
::
array
,
int64_t
>>
;
}
// namespace torchaudio::sox_effects
paddlespeech/audio/src/sox/effects_chain.cpp
已删除
100644 → 0
浏览文件 @
c938a468
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp
#include "paddlespeech/audio/src/sox/effects_chain.h"
#include "paddlespeech/audio/src/sox/utils.h"
using
namespace
paddleaudio
::
sox_utils
;
namespace
paddleaudio
{
namespace
sox_effects_chain
{
namespace
{
/// helper classes for passing the location of input tensor and output buffer
///
/// drain/flow callback functions require a plain C-style function signature and
/// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
/// The following structs will be assigned to sox_effect_t::priv pointer which
/// gives sox_effect_t an access to input Tensor and output buffer object.
/// Private data of the "input_tensor" effect (stored in sox_effect_t::priv).
struct TensorInputPriv {
  size_t index;          // number of samples already handed to the chain
  py::array* waveform;   // input array supplied by the caller (not owned)
  int64_t sample_rate;   // sample rate given to addInputTensor()
  bool channels_first;   // layout flag given to addInputTensor()
};
/// Private data of the "output_tensor" effect (stored in sox_effect_t::priv).
struct TensorOutputPriv {
  // Destination the flow callback appends samples to (not owned).
  std::vector<sox_sample_t>* buffer;
};
/// Private data of the "output_file" effect (stored in sox_effect_t::priv).
struct FileOutputPriv {
  // Open output file the flow callback writes samples to (not owned).
  sox_format_t* sf;
};
/// Drain callback that feeds samples from the input array to the effects
/// chain.
///
/// Reads up to `*osamp` elements starting at the position stored in the
/// effect's TensorInputPriv, converts them to sox_sample_t (int32) and writes
/// them to `obuf`.
///
/// @param effp   Effect whose `priv` points to a TensorInputPriv.
/// @param obuf   Destination buffer for the converted samples.
/// @param osamp  In: capacity of `obuf`; out: number of samples written
///               (always a multiple of the channel count).
/// @return SOX_EOF once every element has been consumed, else SOX_SUCCESS.
/// @throws std::runtime_error for element types other than the NumPy type
///         numbers 11, 5, 3 and 1.
///
/// NOTE(review): elements are read in flat memory order; `channels_first` is
/// not consulted here — confirm whether a transpose is expected.
int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
  // Retrieve the input array and the current read position.
  auto priv = static_cast<TensorInputPriv*>(effp->priv);
  const auto index = priv->index;
  const py::array& tensor = *(priv->waveform);
  const auto num_channels = effp->out_signal.channels;

  // Clamp the request to what is left in the input.
  const size_t num_samples = tensor.size();
  if (index + *osamp > num_samples) {
    *osamp = num_samples - index;
  }
  // Ensure that it's a multiple of the number of channels.
  *osamp -= *osamp % num_channels;

  const size_t count = *osamp;
  const py::buffer_info src_info = tensor.request();
  // First unread element, addressed in bytes so it works for any item size.
  const char* src =
      static_cast<const char*>(src_info.ptr) + index * tensor.itemsize();

  // Convert to sox_sample_t (int32_t). Every element is converted; earlier
  // code re-read and re-wrote element 0 on each iteration.
  switch (tensor.dtype().num()) {
    case 11: {  // NumPy type number of float32
      // Scale in 64-bit precision so that values around INT32_MIN/MAX are
      // handled correctly, then clamp to the int32 range.
      const float* in = reinterpret_cast<const float*>(src);
      for (size_t i = 0; i < count; ++i) {
        const double scaled = in[i] * 2147483648.;
        if (scaled > INT32_MAX) {
          obuf[i] = INT32_MAX;
        } else if (scaled < INT32_MIN) {
          obuf[i] = INT32_MIN;
        } else {
          obuf[i] = scaled;
        }
      }
      break;
    }
    case 5: {  // NumPy type number of int32: already in sample format
      std::memcpy(obuf, src, count * sizeof(sox_sample_t));
      break;
    }
    case 3: {  // NumPy type number of int16
      const int16_t* in = reinterpret_cast<const int16_t*>(src);
      for (size_t i = 0; i < count; ++i) {
        obuf[i] = in[i] * 65536;
      }
      break;
    }
    case 1: {  // NumPy type number of int8
      // NOTE(review): the -128 offset looks like unsigned 8-bit PCM handling,
      // but the data is read as int8_t, so negative inputs overflow int32.
      // Confirm the intended element type before relying on this branch.
      const int8_t* in = reinterpret_cast<const int8_t*>(src);
      for (size_t i = 0; i < count; ++i) {
        obuf[i] = (in[i] - 128) * 16777216;
      }
      break;
    }
    default:
      throw std::runtime_error("Unexpected dtype.");
  }

  priv->index += count;
  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
}
/// Flow callback that collects the chain's output samples.
///
/// Appends the `*isamp` samples in `ibuf` to the vector referenced by the
/// effect's TensorOutputPriv. Nothing is written to `obuf`, so `*osamp` is
/// set to 0. Always returns SOX_SUCCESS.
int tensor_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) {
  *osamp = 0;

  auto* priv = static_cast<TensorOutputPriv*>(effp->priv);
  std::vector<sox_sample_t>& sink = *(priv->buffer);

  // Append at the end.
  sink.insert(sink.end(), ibuf, ibuf + *isamp);
  return SOX_SUCCESS;
}
/// Flow callback that writes the chain's output samples to a file.
///
/// Writes the `*isamp` samples in `ibuf` with sox_write() to the file stored
/// in the effect's FileOutputPriv. Nothing is written to `obuf`, so `*osamp`
/// is set to 0.
///
/// @return SOX_SUCCESS when there was nothing to write or everything was
///         written; SOX_EOF on a short write without an error code.
/// @throws std::runtime_error on a short write with `sox_errno` set.
int file_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) {
  *osamp = 0;

  if (!*isamp) {
    return SOX_SUCCESS;
  }

  sox_format_t* sf = static_cast<FileOutputPriv*>(effp->priv)->sf;
  if (sox_write(sf, ibuf, *isamp) == *isamp) {
    return SOX_SUCCESS;
  }

  // Short write: report the libsox error if there is one.
  if (sf->sox_errno) {
    std::ostringstream stream;
    stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
           << sf->filename;
    throw std::runtime_error(stream.str());
  }
  return SOX_EOF;
}
/// Returns the handler of the "input_tensor" effect: only a drain callback
/// (tensor_input_drain) and a TensorInputPriv-sized private area.
sox_effect_handler_t* get_tensor_input_handler() {
  static sox_effect_handler_t input_handler{
      /*name=*/"input_tensor",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/nullptr,
      /*drain=*/tensor_input_drain,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(TensorInputPriv)};
  return &input_handler;
}
/// Returns the handler of the "output_tensor" effect: only a flow callback
/// (tensor_output_flow) and a TensorOutputPriv-sized private area.
sox_effect_handler_t* get_tensor_output_handler() {
  static sox_effect_handler_t output_handler{
      /*name=*/"output_tensor",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/tensor_output_flow,
      /*drain=*/nullptr,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(TensorOutputPriv)};
  return &output_handler;
}
/// Returns the handler of the "output_file" effect: only a flow callback
/// (file_output_flow) and a FileOutputPriv-sized private area.
sox_effect_handler_t* get_file_output_handler() {
  static sox_effect_handler_t file_handler{
      /*name=*/"output_file",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/file_output_flow,
      /*drain=*/nullptr,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(FileOutputPriv)};
  return &file_handler;
}
}
// namespace
SoxEffect
::
SoxEffect
(
sox_effect_t
*
se
)
noexcept
:
se_
(
se
)
{}
/// Releases the wrapped effect with free().
SoxEffect::~SoxEffect() {
  // free() ignores a null pointer, so no explicit check is needed.
  free(se_);
}
/// Implicit conversion to the wrapped pointer, so the wrapper can be passed
/// directly to libsox functions.
SoxEffect::operator sox_effect_t*() const { return se_; }
/// Member access on the wrapped sox_effect_t.
auto SoxEffect::operator->() noexcept -> sox_effect_t* { return se_; }
/// Creates an empty effects chain for the given input/output encodings.
///
/// The signal descriptors are value-initialized; they are filled in when an
/// input is added.
///
/// @throws std::runtime_error if sox_create_effects_chain() returns null.
SoxEffectsChain::SoxEffectsChain(
    sox_encodinginfo_t input_encoding,
    sox_encodinginfo_t output_encoding)
    : in_enc_(input_encoding),
      out_enc_(output_encoding),
      in_sig_(),
      interm_sig_(),
      out_sig_(),
      sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
  if (sec_ == nullptr) {
    throw std::runtime_error("Failed to create effect chain.");
  }
}
/// Deletes the underlying chain, if one was created.
SoxEffectsChain::~SoxEffectsChain() {
  if (sec_ == nullptr) {
    return;
  }
  sox_delete_effects_chain(sec_);
}
/// Runs the assembled chain with sox_flow_effects() (no callback, no client
/// data). The return value of sox_flow_effects() is not inspected.
void SoxEffectsChain::run() {
  sox_flow_effects(sec_, nullptr, nullptr);
}
/// Adds the "input_tensor" effect, which feeds `waveform` into the chain.
///
/// The input signal info is derived with get_signalinfo() (file type "wav")
/// and also becomes the initial intermediate signal info. The pointer to
/// `waveform` is stored in the effect's private data, not copied.
///
/// @throws std::runtime_error if the effect cannot be added.
void SoxEffectsChain::addInputTensor(
    py::array* waveform,
    int64_t sample_rate,
    bool channels_first) {
  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
  interm_sig_ = in_sig_;

  SoxEffect e(sox_create_effect(get_tensor_input_handler()));

  // Hand the callback everything it needs through the private area.
  auto* state = static_cast<TensorInputPriv*>(e->priv);
  state->index = 0;
  state->waveform = waveform;
  state->sample_rate = sample_rate;
  state->channels_first = channels_first;

  const int status = sox_add_effect(sec_, e, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input_tensor");
  }
}
/// Adds the "output_tensor" effect, which appends the chain's output to
/// `output_buffer`. Only the pointer is stored.
///
/// @throws std::runtime_error if the effect cannot be added.
void SoxEffectsChain::addOutputBuffer(
    std::vector<sox_sample_t>* output_buffer) {
  SoxEffect e(sox_create_effect(get_tensor_output_handler()));

  auto* state = static_cast<TensorOutputPriv*>(e->priv);
  state->buffer = output_buffer;

  const int status = sox_add_effect(sec_, e, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: output_tensor");
  }
}
/// Adds libsox's built-in "input" effect reading from the open file `sf`.
///
/// The file's signal info becomes both the input and the initial
/// intermediate signal info. The return value of sox_effect_options() is not
/// inspected.
///
/// @throws std::runtime_error if the effect cannot be added.
void SoxEffectsChain::addInputFile(sox_format_t* sf) {
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  SoxEffect e(sox_create_effect(sox_find_effect("input")));

  // The "input" effect receives the file handle as its single option.
  char* opts[] = {reinterpret_cast<char*>(sf)};
  sox_effect_options(e, 1, opts);

  const int status = sox_add_effect(sec_, e, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    std::ostringstream stream;
    stream << "Internal Error: Failed to add effect: input " << sf->filename;
    throw std::runtime_error(stream.str());
  }
}
/// Adds the "output_file" effect, which writes the chain's output to the
/// open file `sf`. The file's signal info becomes the output signal info.
///
/// @throws std::runtime_error if the effect cannot be added.
void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
  out_sig_ = sf->signal;

  SoxEffect e(sox_create_effect(get_file_output_handler()));

  auto* state = static_cast<FileOutputPriv*>(e->priv);
  state->sf = sf;

  const int status = sox_add_effect(sec_, e, &interm_sig_, &out_sig_);
  if (status != SOX_SUCCESS) {
    std::ostringstream stream;
    stream << "Internal Error: Failed to add effect: output " << sf->filename;
    throw std::runtime_error(stream.str());
  }
}
void
SoxEffectsChain
::
addEffect
(
const
std
::
vector
<
std
::
string
>
effect
)
{
const
auto
num_args
=
effect
.
size
();
if
(
num_args
==
0
)
{
throw
std
::
runtime_error
(
"Invalid argument: empty effect."
);
}
const
auto
name
=
effect
[
0
];
if
(
UNSUPPORTED_EFFECTS
.
find
(
name
)
!=
UNSUPPORTED_EFFECTS
.
end
())
{
std
::
ostringstream
stream
;
stream
<<
"Unsupported effect: "
<<
name
;
throw
std
::
runtime_error
(
stream
.
str
());
}
auto
returned_effect
=
sox_find_effect
(
name
.
c_str
());
if
(
!
returned_effect
)
{
std
::
ostringstream
stream
;
stream
<<
"Unsupported effect: "
<<
name
;
throw
std
::
runtime_error
(
stream
.
str
());
}
SoxEffect
e
(
sox_create_effect
(
returned_effect
));
const
auto
num_options
=
num_args
-
1
;
std
::
vector
<
char
*>
opts
;
for
(
size_t
i
=
1
;
i
<
num_args
;
++
i
)
{
opts
.
push_back
((
char
*
)
effect
[
i
].
c_str
());
}
if
(
sox_effect_options
(
e
,
num_options
,
num_options
?
opts
.
data
()
:
nullptr
)
!=
SOX_SUCCESS
)
{
std
::
ostringstream
stream
;
stream
<<
"Invalid effect option:"
;
for
(
const
auto
&
v
:
effect
)
{
stream
<<
" "
<<
v
;
}
throw
std
::
runtime_error
(
stream
.
str
());
}
if
(
sox_add_effect
(
sec_
,
e
,
&
interm_sig_
,
&
in_sig_
)
!=
SOX_SUCCESS
)
{
std
::
ostringstream
stream
;
stream
<<
"Internal Error: Failed to add effect:
\"
"
<<
name
;
for
(
size_t
i
=
1
;
i
<
num_args
;
++
i
)
{
stream
<<
" "
<<
effect
[
i
];
}
stream
<<
"
\"
"
;
throw
std
::
runtime_error
(
stream
.
str
());
}
}
/// Channel count of the intermediate signal, i.e. after the effects added
/// so far.
int64_t SoxEffectsChain::getOutputNumChannels() {
  const int64_t channels = interm_sig_.channels;
  return channels;
}
/// Sample rate of the intermediate signal, i.e. after the effects added so
/// far, converted to an integer.
int64_t SoxEffectsChain::getOutputSampleRate() {
  const int64_t rate = interm_sig_.rate;
  return rate;
}
}
// namespace sox_effects_chain
}
// namespace paddleaudio
paddlespeech/audio/src/sox/effects_chain.h
已删除
100644 → 0
浏览文件 @
c938a468
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h
#pragma once
#include <sox.h>
#include "paddlespeech/audio/src/sox/utils.h"
namespace
paddleaudio
{
namespace
sox_effects_chain
{
// Owning wrapper around the sox_effect_t* returned by sox_create_effect.
// The pointer is released in the destructor; the wrapper can be neither
// copied nor moved.
struct SoxEffect {
  explicit SoxEffect(sox_effect_t* se) noexcept;
  SoxEffect(const SoxEffect&) = delete;
  SoxEffect(const SoxEffect&&) = delete;
  SoxEffect& operator=(const SoxEffect&) = delete;
  SoxEffect& operator=(SoxEffect&&) = delete;
  ~SoxEffect();

  // Implicit conversion so the wrapper can be passed to libsox functions.
  operator sox_effect_t*() const;
  // Member access on the wrapped effect.
  sox_effect_t* operator->() noexcept;

 private:
  sox_effect_t* se_;
};
// Owning wrapper around sox_effects_chain_t with helpers to assemble and
// run a chain. The chain is deleted in the destructor; the wrapper can be
// neither copied nor moved.
class SoxEffectsChain {
  // Encodings handed to sox_create_effects_chain.
  const sox_encodinginfo_t in_enc_;
  const sox_encodinginfo_t out_enc_;

 protected:
  sox_signalinfo_t in_sig_;      // signal info of the chain input
  sox_signalinfo_t interm_sig_;  // signal info after the effects added so far
  sox_signalinfo_t out_sig_;     // signal info of a file output
  sox_effects_chain_t* sec_;     // the owned chain

 public:
  explicit SoxEffectsChain(
      sox_encodinginfo_t input_encoding,
      sox_encodinginfo_t output_encoding);
  SoxEffectsChain(const SoxEffectsChain&) = delete;
  SoxEffectsChain(const SoxEffectsChain&&) = delete;
  SoxEffectsChain& operator=(const SoxEffectsChain&) = delete;
  SoxEffectsChain& operator=(SoxEffectsChain&&) = delete;
  ~SoxEffectsChain();

  // Runs the assembled chain.
  void run();

  // Input ends of the chain.
  void addInputTensor(
      py::array* waveform,
      int64_t sample_rate,
      bool channels_first);
  void addInputFile(sox_format_t* sf);

  // Output ends of the chain.
  void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
  void addOutputFile(sox_format_t* sf);

  // Adds one effect given as {name, option...}.
  void addEffect(const std::vector<std::string> effect);

  // Properties of the signal after the effects added so far.
  int64_t getOutputNumChannels();
  int64_t getOutputSampleRate();
};
}
// namespace sox_effects_chain
}
// namespace paddleaudio
paddlespeech/audio/src/sox/io.cpp
已删除
100644 → 0
浏览文件 @
c938a468
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp
#include "paddlespeech/audio/src/sox/effects.h"
#include "paddlespeech/audio/src/sox/effects_chain.h"
#include "paddlespeech/audio/src/sox/io.h"
#include "paddlespeech/audio/src/sox/types.h"
#include "paddlespeech/audio/src/sox/utils.h"
using
namespace
paddleaudio
::
sox_utils
;
namespace
paddleaudio
{
namespace
sox_io
{
/// Reads the metadata of the audio file at `path`.
///
/// @param path    File to inspect.
/// @param format  Optional file type passed to sox_open_read().
/// @return An empty optional when the file cannot be opened or its encoding
///         is unknown; otherwise (sample rate, length / channels, channels,
///         bits per sample, encoding name).
tl::optional<MetaDataTuple> get_info_file(
    const std::string& path,
    const tl::optional<std::string>& format) {
  const char* filetype = format ? format->c_str() : nullptr;
  SoxFormat sf(sox_open_read(
      path.c_str(),
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      filetype));

  if (static_cast<sox_format_t*>(sf) == nullptr) {
    return {};
  }
  if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
    return {};
  }

  return MetaDataTuple{
      static_cast<int64_t>(sf->signal.rate),
      static_cast<int64_t>(sf->signal.length / sf->signal.channels),
      static_cast<int64_t>(sf->signal.channels),
      static_cast<int64_t>(sf->encoding.bits_per_sample),
      get_encoding(sf->encoding.encoding)};
}
/// Builds the effect list that selects a frame range with SoX's "trim".
///
/// @param frame_offset  First frame to keep; 0 when absent.
/// @param num_frames    Number of frames to keep; -1 (everything) when
///                      absent.
/// @return An empty list when nothing has to be trimmed, otherwise a single
///         {"trim", "<offset>s"[, "+<frames>s"]} entry.
/// @throws std::runtime_error if the offset is negative, or if the frame
///         count is 0 or below -1.
std::vector<std::vector<std::string>> get_effects(
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames) {
  const auto offset = frame_offset.value_or(0);
  if (offset < 0) {
    throw std::runtime_error(
        "Invalid argument: frame_offset must be non-negative.");
  }
  const auto frames = num_frames.value_or(-1);
  if (frames == 0 || frames < -1) {
    throw std::runtime_error(
        "Invalid argument: num_frames must be -1 or greater than 0.");
  }

  std::vector<std::vector<std::string>> effects;
  const bool limited = (frames != -1);
  if (!limited && offset == 0) {
    return effects;  // keep the whole file
  }

  // The "s" suffix makes SoX interpret the numbers as sample counts.
  std::ostringstream start;
  start << offset << "s";
  std::vector<std::string> trim{"trim", start.str()};
  if (limited) {
    std::ostringstream length;
    length << "+" << frames << "s";
    trim.push_back(length.str());
  }
  effects.emplace_back(trim);
  return effects;
}
/// Loads (a range of) an audio file.
///
/// Turns `frame_offset` / `num_frames` into a trim effect with get_effects()
/// and delegates to sox_effects::apply_effects_file(); the remaining
/// arguments are forwarded unchanged.
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
    const std::string& path,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format) {
  const auto trim_effects = get_effects(frame_offset, num_frames);
  return paddleaudio::sox_effects::apply_effects_file(
      path, trim_effects, normalize, channels_first, format);
}
/// Writes an array to an audio file.
///
/// @param path             Destination file.
/// @param tensor           Samples to write; checked by
///                         validate_input_tensor().
/// @param sample_rate      Sample rate of `tensor`.
/// @param channels_first   Whether dimension 0 of `tensor` is the channel
///                         dimension.
/// @param compression      Forwarded to get_encodinginfo_for_save().
/// @param format           File type; derived from `path` with
///                         get_filetype() when absent.
/// @param encoding         Forwarded to get_encodinginfo_for_save().
/// @param bits_per_sample  Forwarded to get_encodinginfo_for_save().
/// @throws std::runtime_error if the tensor is invalid, the format's
///         channel / sample-rate restrictions are violated, or the file
///         cannot be opened for writing.
void save_audio_file(
    const std::string& path,
    py::array tensor,
    int64_t sample_rate,
    bool channels_first,
    tl::optional<double> compression,
    tl::optional<std::string> format,
    tl::optional<std::string> encoding,
    tl::optional<int64_t> bits_per_sample) {
  validate_input_tensor(tensor);

  const auto filetype = [&]() {
    if (format.has_value()) return format.value();
    return get_filetype(path);
  }();

  // Format restrictions. These checks were commented out (TORCH_CHECK) when
  // the code was ported, which left the channel count computed but unused.
  const bool mono_only =
      filetype == "amr-nb" || filetype == "htk" || filetype == "gsm";
  if (mono_only) {
    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
    if (num_channels != 1) {
      throw std::runtime_error(
          filetype + " format only supports single channel audio.");
    }
  }
  if (filetype == "gsm" && sample_rate != 8000) {
    throw std::runtime_error(
        "gsm format only supports a sampling rate of 8kHz.");
  }

  const auto signal_info =
      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
  const auto encoding_info = get_encodinginfo_for_save(
      filetype, tensor.dtype(), compression, encoding, bits_per_sample);

  SoxFormat sf(sox_open_write(
      path.c_str(),
      &signal_info,
      &encoding_info,
      /*filetype=*/filetype.c_str(),
      /*oob=*/nullptr,
      /*overwrite_permitted=*/nullptr));

  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error(
        "Error saving audio file: failed to open file " + path);
  }

  // tensor -> file, with no effects in between.
  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
      /*output_encoding=*/sf->encoding);
  chain.addInputTensor(&tensor, sample_rate, channels_first);
  chain.addOutputFile(sf);
  chain.run();
}
}
// namespace sox_io
}
// namespace paddleaudio
\ No newline at end of file
paddlespeech/audio/src/sox/io.h
已删除
100644 → 0
浏览文件 @
c938a468
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
// All rights reserved.
#pragma once
#include "paddlespeech/audio/src/optional/optional.hpp"
#include "paddlespeech/audio/src/sox/utils.h"
namespace paddleaudio {
namespace sox_io {

/// Builds the SoX "trim" effect list selecting `num_frames` frames starting
/// at `frame_offset`. An absent offset means 0, an absent count means
/// "until the end".
std::vector<std::vector<std::string>> get_effects(
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames);

/// File metadata: (sample rate, number of frames, number of channels,
/// bits per sample, encoding name).
using MetaDataTuple =
    std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;

/// Reads the metadata of the file at `path`; `format` optionally names the
/// file type. Returns an empty optional when the file cannot be opened or
/// its encoding is unknown.
tl::optional<MetaDataTuple> get_info_file(
    const std::string& path,
    const tl::optional<std::string>& format);

/// Loads (a range of) the file at `path` and returns the samples together
/// with the sample rate, or an empty optional when the file cannot be read.
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
    const std::string& path,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format);

/// Writes `tensor` to the file at `path`. When `format` is absent the file
/// type is derived from the path.
void save_audio_file(
    const std::string& path,
    py::array tensor,
    int64_t sample_rate,
    bool channels_first,
    tl::optional<double> compression,
    tl::optional<std::string> format,
    tl::optional<std::string> encoding,
    tl::optional<int64_t> bits_per_sample);

}  // namespace sox_io
}  // namespace paddleaudio
paddlespeech/audio/src/sox/types.cpp
已删除
100644 → 0
浏览文件 @
c938a468
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
#include "paddlespeech/audio/src/sox/types.h"
#include <ostream>
#include <sstream>
namespace
paddleaudio
{
namespace
sox_utils
{
/// Maps a file-type name to the Format enum.
///
/// "ogg" and "vorbis" both map to Format::VORBIS.
///
/// @throws std::runtime_error for any name that is not listed.
Format get_format_from_string(const std::string& format) {
  const struct {
    const char* name;
    Format value;
  } known[] = {
      {"wav", Format::WAV},
      {"mp3", Format::MP3},
      {"flac", Format::FLAC},
      {"ogg", Format::VORBIS},
      {"vorbis", Format::VORBIS},
      {"amr-nb", Format::AMR_NB},
      {"amr-wb", Format::AMR_WB},
      {"amb", Format::AMB},
      {"sph", Format::SPHERE},
      {"htk", Format::HTK},
      {"gsm", Format::GSM},
  };

  for (const auto& entry : known) {
    if (format == entry.name) {
      return entry.value;
    }
  }

  std::ostringstream stream;
  stream << "Internal Error: unexpected format value: " << format;
  throw std::runtime_error(stream.str());
}
/// Returns the textual name of an Encoding value.
///
/// @throws std::runtime_error for values without a name here (this includes
///         Encoding::NOT_PROVIDED).
std::string to_string(Encoding v) {
  const char* label = nullptr;
  switch (v) {
    case Encoding::UNKNOWN:      label = "UNKNOWN"; break;
    case Encoding::PCM_SIGNED:   label = "PCM_S";   break;
    case Encoding::PCM_UNSIGNED: label = "PCM_U";   break;
    case Encoding::PCM_FLOAT:    label = "PCM_F";   break;
    case Encoding::FLAC:         label = "FLAC";    break;
    case Encoding::ULAW:         label = "ULAW";    break;
    case Encoding::ALAW:         label = "ALAW";    break;
    case Encoding::MP3:          label = "MP3";     break;
    case Encoding::VORBIS:       label = "VORBIS";  break;
    case Encoding::AMR_WB:       label = "AMR_WB";  break;
    case Encoding::AMR_NB:       label = "AMR_NB";  break;
    case Encoding::OPUS:         label = "OPUS";    break;
    default:                     break;
  }
  if (label == nullptr) {
    throw std::runtime_error("Internal Error: unexpected encoding.");
  }
  return label;
}
/// Parses an optional encoding name.
///
/// @return Encoding::NOT_PROVIDED when no value is given, otherwise the
///         Encoding matching "PCM_S", "PCM_U", "PCM_F", "ULAW" or "ALAW".
/// @throws std::runtime_error for any other name.
Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
  if (!encoding.has_value()) {
    return Encoding::NOT_PROVIDED;
  }
  std::string v = encoding.value();

  const struct {
    const char* name;
    Encoding value;
  } known[] = {
      {"PCM_S", Encoding::PCM_SIGNED},
      {"PCM_U", Encoding::PCM_UNSIGNED},
      {"PCM_F", Encoding::PCM_FLOAT},
      {"ULAW", Encoding::ULAW},
      {"ALAW", Encoding::ALAW},
  };
  for (const auto& entry : known) {
    if (v == entry.name) {
      return entry.value;
    }
  }

  std::ostringstream stream;
  stream << "Internal Error: unexpected encoding value: " << v;
  throw std::runtime_error(stream.str());
}
/// Parses an optional bit depth.
///
/// @return BitDepth::NOT_PROVIDED when no value is given, otherwise the
///         BitDepth for 8, 16, 24, 32 or 64.
/// @throws std::runtime_error for any other value.
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
  if (!bit_depth.has_value()) {
    return BitDepth::NOT_PROVIDED;
  }
  const int64_t v = bit_depth.value();

  if (v == 8) return BitDepth::B8;
  if (v == 16) return BitDepth::B16;
  if (v == 24) return BitDepth::B24;
  if (v == 32) return BitDepth::B32;
  if (v == 64) return BitDepth::B64;

  std::ostringstream s;
  s << "Internal Error: unexpected bit depth value: " << v;
  throw std::runtime_error(s.str());
}
/// Returns the textual name of a libsox encoding; "UNKNOWN" for
/// SOX_ENCODING_UNKNOWN and for every encoding not listed here.
std::string get_encoding(sox_encoding_t encoding) {
  const char* label = "UNKNOWN";
  switch (encoding) {
    case SOX_ENCODING_SIGN2:    label = "PCM_S";  break;
    case SOX_ENCODING_UNSIGNED: label = "PCM_U";  break;
    case SOX_ENCODING_FLOAT:    label = "PCM_F";  break;
    case SOX_ENCODING_FLAC:     label = "FLAC";   break;
    case SOX_ENCODING_ULAW:     label = "ULAW";   break;
    case SOX_ENCODING_ALAW:     label = "ALAW";   break;
    case SOX_ENCODING_MP3:      label = "MP3";    break;
    case SOX_ENCODING_VORBIS:   label = "VORBIS"; break;
    case SOX_ENCODING_AMR_WB:   label = "AMR_WB"; break;
    case SOX_ENCODING_AMR_NB:   label = "AMR_NB"; break;
    case SOX_ENCODING_OPUS:     label = "OPUS";   break;
    case SOX_ENCODING_GSM:      label = "GSM";    break;
    default:                    break;
  }
  return label;
}
}
// namespace sox_utils
}
// namespace paddleaudio
paddlespeech/audio/src/sox/types.h
已删除
100644 → 0
浏览文件 @
c938a468
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
#pragma once
#include <sox.h>
#include "paddlespeech/audio/src/optional/optional.hpp"
namespace paddleaudio {
namespace sox_utils {

/// Audio container / file types known to this module.
enum class Format {
  WAV,
  MP3,
  FLAC,
  VORBIS,
  AMR_NB,
  AMR_WB,
  AMB,
  SPHERE,
  GSM,
  HTK,
};

/// Maps a file-type name (e.g. "wav") to a Format.
Format get_format_from_string(const std::string& format);

/// Sample encodings. NOT_PROVIDED marks "the caller gave no encoding".
enum class Encoding {
  NOT_PROVIDED,
  UNKNOWN,
  PCM_SIGNED,
  PCM_UNSIGNED,
  PCM_FLOAT,
  FLAC,
  ULAW,
  ALAW,
  MP3,
  VORBIS,
  AMR_WB,
  AMR_NB,
  OPUS,
};

/// Returns the textual name of an Encoding.
std::string to_string(Encoding v);

/// Parses an optional encoding name; absent maps to Encoding::NOT_PROVIDED.
Encoding get_encoding_from_option(const tl::optional<std::string> encoding);

/// Bits per sample; each enumerator's value is the bit count itself.
/// NOT_PROVIDED marks "the caller gave no bit depth".
enum class BitDepth : unsigned {
  NOT_PROVIDED = 0,
  B8 = 8,
  B16 = 16,
  B24 = 24,
  B32 = 32,
  B64 = 64,
};

/// Parses an optional bit depth; absent maps to BitDepth::NOT_PROVIDED.
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);

/// Returns the textual name of a libsox encoding.
std::string get_encoding(sox_encoding_t encoding);

}  // namespace sox_utils
}  // namespace paddleaudio
\ No newline at end of file
paddlespeech/audio/src/sox/utils.cpp
已删除
100644 → 0
浏览文件 @
c938a468
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.cpp
#include <sox.h>
#include "paddlespeech/audio/src/sox/types.h"
#include "paddlespeech/audio/src/sox/utils.h"
namespace
paddleaudio
{
namespace
sox_utils
{
/// Stores `seed` (narrowed to sox_int32_t) in the `ranqd1` field of libsox's
/// global settings.
void set_seed(const int64_t seed) {
  auto* globals = sox_get_globals();
  globals->ranqd1 = static_cast<sox_int32_t>(seed);
}
/// Stores `verbosity` (converted to unsigned) in libsox's global settings.
void set_verbosity(const int64_t verbosity) {
  auto* globals = sox_get_globals();
  globals->verbosity = static_cast<unsigned>(verbosity);
}
/// Stores the `use_threads` flag in libsox's global settings.
void set_use_threads(const bool use_threads) {
  auto* globals = sox_get_globals();
  globals->use_threads = static_cast<sox_bool>(use_threads);
}
/// Stores `buffer_size` (converted to size_t) in the `bufsiz` field of
/// libsox's global settings.
void set_buffer_size(const int64_t buffer_size) {
  auto* globals = sox_get_globals();
  globals->bufsiz = static_cast<size_t>(buffer_size);
}
/// Returns the `bufsiz` field of libsox's global settings.
int64_t get_buffer_size() {
  const auto* globals = sox_get_globals();
  return globals->bufsiz;
}
/// Lists the effects libsox provides, excluding those in
/// UNSUPPORTED_EFFECTS.
///
/// @return One {name, usage} pair per effect; usage is "" when libsox
///         provides none.
std::vector<std::vector<std::string>> list_effects() {
  std::vector<std::vector<std::string>> effects;

  // The table returned by libsox is terminated by a null entry.
  for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
    const sox_effect_handler_t* handler = (*fns)();
    if (!handler || !handler->name) {
      continue;
    }
    if (UNSUPPORTED_EFFECTS.find(handler->name) != UNSUPPORTED_EFFECTS.end()) {
      continue;
    }

    std::vector<std::string> entry;
    entry.emplace_back(handler->name);
    entry.emplace_back(handler->usage ? handler->usage : "");
    effects.emplace_back(entry);
  }
  return effects;
}
std
::
vector
<
std
::
string
>
list_write_formats
()
{
std
::
vector
<
std
::
string
>
formats
;
for
(
const
sox_format_tab_t
*
fns
=
sox_get_format_fns
();
fns
->
fn
;
++
fns
)
{
const
sox_format_handler_t
*
handler
=
fns
->
fn
();
for
(
const
char
*
const
*
names
=
handler
->
names
;
*
names
;
++
names
)
{
if
(
!
strchr
(
*
names
,
'/'
)
&&
handler
->
write
)
formats
.
emplace_back
(
*
names
);
}
}
return
formats
;
}
std
::
vector
<
std
::
string
>
list_read_formats
()
{
std
::
vector
<
std
::
string
>
formats
;
for
(
const
sox_format_tab_t
*
fns
=
sox_get_format_fns
();
fns
->
fn
;
++
fns
)
{
const
sox_format_handler_t
*
handler
=
fns
->
fn
();
for
(
const
char
*
const
*
names
=
handler
->
names
;
*
names
;
++
names
)
{
if
(
!
strchr
(
*
names
,
'/'
)
&&
handler
->
read
)
formats
.
emplace_back
(
*
names
);
}
}
return
formats
;
}
SoxFormat
::
SoxFormat
(
sox_format_t
*
fd
)
noexcept
:
fd_
(
fd
)
{}
/// Closes the wrapped file if it is still open.
SoxFormat::~SoxFormat() { this->close(); }
/// Member access on the wrapped sox_format_t.
sox_format_t* SoxFormat::operator->() const noexcept { return fd_; }
/// Implicit conversion to the wrapped pointer (null when nothing is open).
SoxFormat::operator sox_format_t*() const noexcept { return fd_; }
/// Closes the wrapped file with sox_close() and resets the pointer, so that
/// calling close() again does nothing.
void SoxFormat::close() {
  if (fd_ == nullptr) {
    return;
  }
  sox_close(fd_);
  fd_ = nullptr;
}
void
validate_input_file
(
const
SoxFormat
&
sf
,
const
std
::
string
&
path
)
{
if
(
static_cast
<
sox_format_t
*>
(
sf
)
==
nullptr
)
{
throw
std
::
runtime_error
(
"Error loading audio file: failed to open file "
+
path
);
}
if
(
sf
->
encoding
.
encoding
==
SOX_ENCODING_UNKNOWN
)
{
throw
std
::
runtime_error
(
"Error loading audio file: unknown encoding."
);
}
}
/// Same checks as validate_input_file(), reporting "<in memory buffer>" as
/// the path.
void validate_input_memfile(const SoxFormat& sf) {
  validate_input_file(sf, "<in memory buffer>");
}
/// Checks that `tensor` is two-dimensional and has an accepted element type.
///
/// Accepted NumPy type characters are 'f', 'd', 'l' and 'i'.
/// NOTE(review): the error message names float32, int32, int16 and uint8,
/// which does not match the accepted characters — confirm which is intended.
///
/// @throws std::runtime_error if either check fails.
void validate_input_tensor(const py::array tensor) {
  if (tensor.ndim() != 2) {
    throw std::runtime_error("Input tensor has to be 2D.");
  }

  switch (tensor.dtype().char_()) {
    case 'f':
    case 'd':
    case 'l':
    case 'i':
      return;
    default:
      throw std::runtime_error(
          "Input tensor has to be one of float32, int32, int16 or uint8 type.");
  }
}
/// Chooses the NumPy dtype used to hold samples decoded from a file.
///
/// @param encoding   libsox encoding of the file.
/// @param precision  Bits of precision reported by libsox; only consulted
///                   for signed PCM.
/// @return "u1" for unsigned PCM, "i2" for 16-bit signed PCM, "i" for 24- and
///         32-bit signed PCM, and "f" for every other encoding.
/// @throws std::runtime_error for signed PCM with any other precision.
py::dtype get_dtype(const sox_encoding_t encoding, const unsigned precision) {
  // Every dtype is built from a format *string*. The previous 'u1'
  // (a multi-character char literal) and 'i' (a char) are integers, not
  // format strings.
  switch (encoding) {
    case SOX_ENCODING_UNSIGNED:  // 8-bit PCM WAV
      return py::dtype("u1");
    case SOX_ENCODING_SIGN2:  // 16-bit, 24-bit, or 32-bit PCM WAV
      switch (precision) {
        case 16:
          return py::dtype("i2");
        case 24:  // Cast 24-bit to 32-bit.
        case 32:
          return py::dtype("i");
        default:
          throw std::runtime_error(
              "Only 16, 24, and 32 bits are supported for signed PCM.");
      }
    default:
      // default to float32 for the other formats, including
      // 32-bit floating-point WAV,
      // MP3,
      // FLAC,
      // VORBIS etc...
      return py::dtype("f");
  }
}
/// Copy a buffer of sox samples into a freshly allocated numpy array of shape
/// [num_samples / num_channels, num_channels].
///
/// @param buffer         Interleaved sox samples; only read by this function.
/// @param num_samples    Number of samples available in `buffer`.
/// @param num_channels   Number of channels; must be positive.
/// @param dtype          Requested element type. Supported type codes are
///                       'f' (float32), 'i' (int32), 'h' (int16) and
///                       'B' (uint8); 'b' is also accepted for the 8-bit case
///                       because that is the code the earlier version tested.
/// @param normalize      When true the result is always float32, regardless
///                       of `dtype`.
/// @param channels_first NOTE(review): currently not applied — the result is
///                       always laid out as [frames, channels]. Confirm
///                       whether callers expect a transpose here.
/// @throws std::runtime_error if `num_channels` is not positive or the dtype
///         is not supported.
py::array convert_to_tensor(
    sox_sample_t* buffer,
    const int32_t num_samples,
    const int32_t num_channels,
    const py::dtype dtype,
    const bool normalize,
    const bool channels_first) {
  // Guard the division below; dividing by zero is undefined behaviour.
  if (num_channels <= 0) {
    throw std::runtime_error("Number of channels must be positive.");
  }
  (void)channels_first;  // see NOTE(review) above

  const int32_t num_frames = num_samples / num_channels;
  // Only whole frames fit in the output array, so never write more elements
  // than were allocated even if num_samples is not a multiple of num_channels.
  const int32_t num_values = num_frames * num_channels;
  const char code = dtype.char_();

  uint64_t dummy = 0;  // clip counter required by the SOX_SAMPLE_TO_* macros
  SOX_SAMPLE_LOCALS;

  if (normalize || code == 'f') {
    // The loop writes 4-byte floats, so the array must be float32 even when
    // the caller asked for a narrower dtype together with normalize=true.
    py::array t(py::dtype::of<float>(), {num_frames, num_channels});
    // mutable_data() without indices also works for an empty array.
    auto* out = static_cast<float*>(t.mutable_data());
    for (int32_t i = 0; i < num_values; ++i) {
      out[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
    }
    return t;
  }

  if (code == 'i') {
    py::array t(dtype, {num_frames, num_channels});
    auto* out = static_cast<int32_t*>(t.mutable_data());
    for (int32_t i = 0; i < num_values; ++i) {
      out[i] = buffer[i];
    }
    return t;
  }

  if (code == 'h') {
    // int16
    py::array t(dtype, {num_frames, num_channels});
    auto* out = static_cast<int16_t*>(t.mutable_data());
    for (int32_t i = 0; i < num_values; ++i) {
      out[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
    }
    return t;
  }

  if (code == 'B' || code == 'b') {
    // 8-bit output. The array has to be allocated before it is written to;
    // the previous version wrote through a default-constructed array.
    py::array t(dtype, {num_frames, num_channels});
    auto* out = static_cast<uint8_t*>(t.mutable_data());
    for (int32_t i = 0; i < num_values; ++i) {
      out[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
    }
    return t;
  }

  throw std::runtime_error("Unsupported dtype.");
}
/// Extract the lower-cased extension from a file path.
///
/// Returns everything after the last '.' in `path`, converted to lower case.
/// If `path` contains no '.', the whole (lower-cased) path is returned, and a
/// path ending in '.' yields an empty string.
const std::string get_filetype(const std::string path) {
  // find_last_of returns npos when there is no dot; npos + 1 wraps to 0, so
  // substr then yields the complete path.
  std::string ext = path.substr(path.find_last_of('.') + 1);
  // tolower() is only defined for values representable as unsigned char (or
  // EOF), so route every byte through unsigned char to stay well-defined for
  // non-ASCII bytes.
  std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  });
  return ext;
}
namespace
{
std
::
tuple
<
sox_encoding_t
,
unsigned
>
get_save_encoding_for_wav
(
const
std
::
string
format
,
py
::
dtype
dtype
,
const
Encoding
&
encoding
,
const
BitDepth
&
bits_per_sample
)
{
switch
(
encoding
)
{
case
Encoding
::
NOT_PROVIDED
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
switch
(
dtype
.
num
())
{
case
11
:
// float32 numpy dtype num
return
std
::
make_tuple
<>
(
SOX_ENCODING_FLOAT
,
32
);
case
5
:
// int numpy dtype num
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
32
);
case
3
:
// int16 numpy
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
16
);
case
1
:
// byte numpy
return
std
::
make_tuple
<>
(
SOX_ENCODING_UNSIGNED
,
8
);
default:
throw
std
::
runtime_error
(
"Internal Error: Unexpected dtype."
);
}
case
BitDepth
::
B8
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_UNSIGNED
,
8
);
default:
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
static_cast
<
unsigned
>
(
bits_per_sample
));
}
case
Encoding
::
PCM_SIGNED
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
32
);
case
BitDepth
::
B8
:
throw
std
::
runtime_error
(
format
+
" does not support 8-bit signed PCM encoding."
);
default:
return
std
::
make_tuple
<>
(
SOX_ENCODING_SIGN2
,
static_cast
<
unsigned
>
(
bits_per_sample
));
}
case
Encoding
::
PCM_UNSIGNED
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
case
BitDepth
::
B8
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_UNSIGNED
,
8
);
default:
throw
std
::
runtime_error
(
format
+
" only supports 8-bit for unsigned PCM encoding."
);
}
case
Encoding
::
PCM_FLOAT
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
case
BitDepth
::
B32
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_FLOAT
,
32
);
case
BitDepth
::
B64
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_FLOAT
,
64
);
default:
throw
std
::
runtime_error
(
format
+
" only supports 32-bit or 64-bit for floating-point PCM encoding."
);
}
case
Encoding
::
ULAW
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
case
BitDepth
::
B8
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_ULAW
,
8
);
default:
throw
std
::
runtime_error
(
format
+
" only supports 8-bit for mu-law encoding."
);
}
case
Encoding
::
ALAW
:
switch
(
bits_per_sample
)
{
case
BitDepth
::
NOT_PROVIDED
:
case
BitDepth
::
B8
:
return
std
::
make_tuple
<>
(
SOX_ENCODING_ALAW
,
8
);
default:
throw
std
::
runtime_error
(
format
+
" only supports 8-bit for a-law encoding."
);
}
default:
throw
std
::
runtime_error
(
format
+
" does not support encoding: "
+
to_string
(
encoding
));
}
}
/// Resolve the sox encoding and bit depth for saving in the given format.
///
/// @param format          Format name, parsed with get_format_from_string().
/// @param dtype           dtype of the data; forwarded for wav/amb only.
/// @param encoding        Optional encoding name.
/// @param bits_per_sample Optional bit depth.
/// @return (sox encoding, bits per sample).
/// @throws std::runtime_error when the format is unsupported or does not
///         accept the requested options.
std::tuple<sox_encoding_t, unsigned> get_save_encoding(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  using Result = std::tuple<sox_encoding_t, unsigned>;

  const Format fmt = get_format_from_string(format);
  const Encoding enc = get_encoding_from_option(encoding);
  const BitDepth bps = get_bit_depth_from_option(bits_per_sample);

  const bool enc_unset = enc == Encoding::NOT_PROVIDED;
  const bool bps_unset = bps == BitDepth::NOT_PROVIDED;
  const auto bps_value = static_cast<unsigned>(bps);

  // Formats with a single fixed codec accept neither option and always
  // report 16 bits. The encoding option is checked before the bit depth.
  const auto fixed_codec = [&](const char* enc_msg,
                               const char* bps_msg,
                               sox_encoding_t codec) -> Result {
    if (!enc_unset) {
      throw std::runtime_error(enc_msg);
    }
    if (!bps_unset) {
      throw std::runtime_error(bps_msg);
    }
    return Result(codec, 16u);
  };

  switch (fmt) {
    case Format::WAV:
    case Format::AMB:
      return get_save_encoding_for_wav(format, dtype, enc, bps);

    case Format::MP3:
      return fixed_codec(
          "mp3 does not support `encoding` option.",
          "mp3 does not support `bits_per_sample` option.",
          SOX_ENCODING_MP3);

    case Format::HTK:
      return fixed_codec(
          "htk does not support `encoding` option.",
          "htk does not support `bits_per_sample` option.",
          SOX_ENCODING_SIGN2);

    case Format::VORBIS:
      return fixed_codec(
          "vorbis does not support `encoding` option.",
          "vorbis does not support `bits_per_sample` option.",
          SOX_ENCODING_VORBIS);

    case Format::AMR_NB:
      return fixed_codec(
          "amr-nb does not support `encoding` option.",
          "amr-nb does not support `bits_per_sample` option.",
          SOX_ENCODING_AMR_NB);

    case Format::FLAC: {
      if (!enc_unset) {
        throw std::runtime_error("flac does not support `encoding` option.");
      }
      if (bps == BitDepth::B32 || bps == BitDepth::B64) {
        throw std::runtime_error(
            "flac does not support `bits_per_sample` larger than 24.");
      }
      return Result(SOX_ENCODING_FLAC, bps_value);
    }

    case Format::SPHERE:
      switch (enc) {
        case Encoding::NOT_PROVIDED:
        case Encoding::PCM_SIGNED:
          if (bps_unset) {
            return Result(SOX_ENCODING_SIGN2, 32u);
          }
          return Result(SOX_ENCODING_SIGN2, bps_value);
        case Encoding::PCM_UNSIGNED:
          throw std::runtime_error(
              "sph does not support unsigned integer PCM.");
        case Encoding::PCM_FLOAT:
          throw std::runtime_error("sph does not support floating point PCM.");
        case Encoding::ULAW:
          if (bps_unset || bps == BitDepth::B8) {
            return Result(SOX_ENCODING_ULAW, 8u);
          }
          throw std::runtime_error(
              "sph only supports 8-bit for mu-law encoding.");
        case Encoding::ALAW:
          if (bps_unset || bps == BitDepth::B8) {
            return Result(SOX_ENCODING_ALAW, 8u);
          }
          // NOTE(review): unlike mu-law above, other bit depths are passed
          // through here instead of being rejected — confirm this is intended.
          return Result(SOX_ENCODING_ALAW, bps_value);
        default:
          throw std::runtime_error(
              "sph does not support encoding: " + encoding.value());
      }

    case Format::GSM:
      return fixed_codec(
          "gsm does not support `encoding` option.",
          "gsm does not support `bits_per_sample` option.",
          SOX_ENCODING_GSM);

    default:
      throw std::runtime_error("Unsupported format: " + format);
  }
}
/// Return the sample precision (in bits) reported for the given file type.
///
/// @param filetype File type name such as "wav" or "mp3".
/// @param dtype    dtype of the data; consulted only for "wav" and "amb".
/// @return Precision in bits, or SOX_UNSPEC for mp3/ogg/vorbis.
/// @throws std::runtime_error for an unknown file type, or for a wav/amb
///         dtype whose numpy type number is not 1, 3, 5 or 11.
unsigned get_precision(const std::string filetype, py::dtype dtype) {
  const auto is = [&filetype](const char* name) { return filetype == name; };

  if (is("mp3") || is("ogg") || is("vorbis")) {
    return SOX_UNSPEC;
  }
  if (is("flac")) {
    return 24;
  }
  if (is("wav") || is("amb")) {
    // Keys are numpy type numbers.
    switch (dtype.num()) {
      case 1:  // byte
        return 8;
      case 3:  // short
        return 16;
      case 5:  // int
      case 11:  // float
        return 32;
      default:
        throw std::runtime_error("Unsupported dtype.");
    }
  }
  if (is("sph")) {
    return 32;
  }
  if (is("amr-nb") || is("gsm") || is("htk")) {
    return 16;
  }
  throw std::runtime_error("Unsupported file type: " + filetype);
}
}
// namespace
/// Build the sox_signalinfo_t describing `waveform`.
///
/// @param waveform       Array to describe; dereferenced without a null check.
/// @param sample_rate    Sample rate, converted to sox_rate_t.
/// @param filetype       File type name, forwarded to get_precision().
/// @param channels_first Selects which axis holds the channels: axis 0 when
///                       true, axis 1 otherwise.
sox_signalinfo_t get_signalinfo(
    const py::array* waveform,
    const int64_t sample_rate,
    const std::string filetype,
    const bool channels_first) {
  const int channel_axis = channels_first ? 0 : 1;

  // Evaluated in the same order as the fields they initialise.
  const auto rate = static_cast<sox_rate_t>(sample_rate);
  const auto channels = static_cast<unsigned>(waveform->shape(channel_axis));
  const unsigned precision = get_precision(filetype, waveform->dtype());
  const auto length = static_cast<uint64_t>(waveform->size());

  return sox_signalinfo_t{
      /*rate=*/rate,
      /*channels=*/channels,
      /*precision=*/precision,
      /*length=*/length};
}
/// Build the sox_encodinginfo_t matching an in-memory array of `dtype`.
///
/// The encoding and bit depth are chosen from the dtype's numpy type number;
/// compression is HUGE_VAL, the reverse_* options are left at their default
/// and opposite_endian is false.
///
/// @throws std::runtime_error if the type number is not 1, 3, 5 or 11.
sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
  sox_encoding_t encoding = SOX_ENCODING_UNKNOWN;
  unsigned bits_per_sample = 0;

  switch (dtype.num()) {
    case 1:  // byte
      encoding = SOX_ENCODING_UNSIGNED;
      bits_per_sample = 8;
      break;
    case 3:  // short
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 16;
      break;
    case 5:  // int32
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 32;
      break;
    case 11:  // float
      encoding = SOX_ENCODING_FLOAT;
      bits_per_sample = 32;
      break;
    default:
      throw std::runtime_error("Unsupported dtype.");
  }

  return sox_encodinginfo_t{
      /*encoding=*/encoding,
      /*bits_per_sample=*/bits_per_sample,
      /*compression=*/HUGE_VAL,
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
}
/// Build the sox_encodinginfo_t used when saving to a file or file object.
///
/// The encoding and bit depth come from get_save_encoding(); `compression`
/// falls back to HUGE_VAL when not provided. The reverse_* options are left
/// at their default and opposite_endian is false.
sox_encodinginfo_t get_encodinginfo_for_save(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<double> compression,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  const std::tuple<sox_encoding_t, unsigned> resolved =
      get_save_encoding(format, dtype, encoding, bits_per_sample);
  const sox_encoding_t sox_enc = std::get<0>(resolved);
  const unsigned bits = std::get<1>(resolved);
  const double level = compression.value_or(HUGE_VAL);

  return sox_encodinginfo_t{
      /*encoding=*/sox_enc,
      /*bits_per_sample=*/bits,
      /*compression=*/level,
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
}
}
// namespace sox_utils
}
// namespace paddleaudio
paddlespeech/audio/src/sox/utils.h
已删除
100644 → 0
浏览文件 @
c938a468
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.h
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <sox.h>
#include "paddlespeech/audio/src/optional/optional.hpp"
namespace
py
=
pybind11
;
namespace
paddleaudio
{
namespace
sox_utils
{
////////////////////////////////////////////////////////////////////////////////
// APIs for Python interaction
////////////////////////////////////////////////////////////////////////////////

/// Setters and getters for sox global options.
/// NOTE(review): the definitions of the five functions below are not part of
/// this header; presumably each forwards its argument to the corresponding
/// libsox global — confirm against utils.cpp.
void set_seed(const int64_t seed);
void set_verbosity(const int64_t verbosity);
void set_use_threads(const bool use_threads);
void set_buffer_size(const int64_t buffer_size);
int64_t get_buffer_size();

/// List the available effects.
/// NOTE(review): the meaning of the inner vector is not visible from this
/// header — confirm against the definition.
std::vector<std::vector<std::string>> list_effects();

/// Names of the formats whose libsox handler provides a reader; names
/// containing '/' are left out.
std::vector<std::string> list_read_formats();

/// Counterpart of list_read_formats() for formats that can be written.
std::vector<std::string> list_write_formats();
////////////////////////////////////////////////////////////////////////////////
// Utilities for sox_io / sox_effects implementations
////////////////////////////////////////////////////////////////////////////////

/// Names of sox effects that are treated as unsupported.
/// NOTE(review): presumably consulted by the effects code to reject these
/// names — confirm against the callers.
///
/// Declared `inline` so that every translation unit including this header
/// shares one object, instead of each building its own internal-linkage copy
/// of the set at start-up.
inline const std::unordered_set<std::string> UNSUPPORTED_EFFECTS = {
    "input",
    "output",
    "spectrogram",
    "noiseprof",
    "noisered",
    "splice"};
/// Helper class that automatically closes a sox_format_t* when it goes out
/// of scope.
///
/// The wrapper can be neither copied nor moved, so the handle it was
/// constructed with stays with that one object.
struct SoxFormat {
  /// Wrap `fd`. A null pointer is allowed.
  explicit SoxFormat(sox_format_t* fd) noexcept;

  // Copying and moving are disabled.
  SoxFormat(const SoxFormat& other) = delete;
  SoxFormat(SoxFormat&& other) = delete;
  SoxFormat& operator=(const SoxFormat& other) = delete;
  SoxFormat& operator=(SoxFormat&& other) = delete;

  /// Closes the wrapped handle, if it is still open.
  ~SoxFormat();

  /// Member access on the wrapped handle; no null check is performed.
  sox_format_t* operator->() const noexcept;

  /// Implicit conversion to the raw handle, so the wrapper can be passed to
  /// libsox functions or compared against nullptr.
  operator sox_format_t*() const noexcept;

  /// Close the handle now and reset it to null; later calls do nothing.
  void close();

 private:
  /// The wrapped handle; null when nothing is open or after close().
  sox_format_t* fd_;
};
///
/// Verify that the input array is 2D and has an accepted dtype.
/// Throws std::runtime_error otherwise.
/// NOTE(review): the definition accepts the numpy dtype codes 'f', 'd', 'l'
/// and 'i', while its error message names float32, int32, int16 and uint8 —
/// confirm which set is intended.
void validate_input_tensor(const py::array);

/// Verify that `sf` holds a non-null handle whose encoding is known.
/// Throws std::runtime_error otherwise; `path` is only used in the message.
void validate_input_file(const SoxFormat& sf, const std::string& path);

/// Same as validate_input_file(), reporting "<in memory buffer>" as the path.
void validate_input_memfile(const SoxFormat& sf);
///
/// Get target dtype for the given encoding and precision.
/// `precision` is only consulted for SOX_ENCODING_SIGN2, where 16 selects
/// "i2", 24 and 32 select a 32-bit integer type, and any other value throws
/// std::runtime_error. Encodings other than SOX_ENCODING_UNSIGNED and
/// SOX_ENCODING_SIGN2 map to "f".
py::dtype get_dtype(const sox_encoding_t encoding, const unsigned precision);
///
/// Convert sox_sample_t buffer to a uint8/int16/int32/float32 array
/// NOTE: This function might modify the values in the input buffer to
/// reduce the number of memory copies.
/// @param buffer Pointer to buffer that contains audio data.
/// @param num_samples The number of samples to read.
/// @param num_channels The number of channels. Used to reshape the resulting
/// array.
/// @param dtype Target dtype. Determines the output dtype and value range in
/// conjunction with normalization.
/// @param normalize Perform normalization. Only effective when dtype is not
/// float32. When effective, the output array is meant to be of float32 type
/// with a value range of [-1.0, 1.0]
/// @param channels_first When True, the output array is meant to have the
/// shape [num_channels, num_frames].
/// NOTE(review): the definition in utils.cpp does not read this flag; the
/// array it creates has the shape [num_samples / num_channels, num_channels].
py::array convert_to_tensor(
    sox_sample_t* buffer,
    const int32_t num_samples,
    const int32_t num_channels,
    const py::dtype dtype,
    const bool normalize,
    const bool channels_first);
/// Extract the extension from a file path.
/// Returns the part of `path` after its last '.', converted to lower case.
/// When `path` contains no '.', the whole lower-cased path is returned.
const std::string get_filetype(const std::string path);
/// Get sox_signalinfo_t for passing a py::array object.
/// @param waveform Array to describe; must not be null.
/// @param sample_rate Sample rate stored in the result.
/// @param filetype File type name; together with the array's dtype it
/// determines the precision.
/// @param channels_first Selects the axis holding the channel count: axis 0
/// when true, axis 1 otherwise.
sox_signalinfo_t get_signalinfo(
    const py::array* waveform,
    const int64_t sample_rate,
    const std::string filetype,
    const bool channels_first);
/// Get sox_encodinginfo_t for in-memory array I/O.
/// The encoding and bit depth are selected from the numpy type number of
/// `dtype`; std::runtime_error is thrown for type numbers other than
/// 1, 3, 5 and 11.
sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype);
/// Get sox_encodinginfo_t for saving to file/file object
/// @param format Name of the target format.
/// @param dtype dtype of the data to be saved.
/// @param compression Optional compression value; HUGE_VAL is used when it
/// is not provided.
/// @param encoding Optional encoding name.
/// @param bits_per_sample Optional bit depth.
/// Throws std::runtime_error when the format is unsupported or does not
/// accept the requested encoding / bit depth.
sox_encodinginfo_t get_encodinginfo_for_save(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<double> compression,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample);
}
// namespace sox_utils
}
// namespace paddleaudio
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录