Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
d03ebe87
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
d03ebe87
编写于
4月 11, 2023
作者:
Mars懵
提交者:
GitHub
4月 11, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add vad interface GetVadResult (#3140)
* add vad interface GetVadResult * fix comment
上级
f35a87ab
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
117 addition
and
95 deletion
+117
-95
runtime/engine/vad/interface/vad_interface.cc
runtime/engine/vad/interface/vad_interface.cc
+10
-1
runtime/engine/vad/interface/vad_interface.h
runtime/engine/vad/interface/vad_interface.h
+1
-0
runtime/engine/vad/interface/vad_interface_main.cc
runtime/engine/vad/interface/vad_interface_main.cc
+38
-31
runtime/engine/vad/nnet/vad.cc
runtime/engine/vad/nnet/vad.cc
+64
-54
runtime/engine/vad/nnet/vad.h
runtime/engine/vad/nnet/vad.h
+4
-3
runtime/engine/vad/nnet/vad_nnet_main.cc
runtime/engine/vad/nnet/vad_nnet_main.cc
+0
-6
未找到文件。
runtime/engine/vad/interface/vad_interface.cc
浏览文件 @
d03ebe87
...
...
@@ -91,4 +91,13 @@ int PPSVadReset(PPSHandle_t instance) {
}
model
->
Reset
();
return
0
;
}
\ No newline at end of file
}
int
PPSVadGetResult
(
PPSHandle_t
instance
,
char
*
result
,
int
max_len
){
ppspeech
::
Vad
*
model
=
static_cast
<
ppspeech
::
Vad
*>
(
instance
);
if
(
model
==
nullptr
)
{
printf
(
"instance is null
\n
"
);
return
-
1
;
}
return
model
->
GetResult
(
result
,
max_len
);
};
\ No newline at end of file
runtime/engine/vad/interface/vad_interface.h
浏览文件 @
d03ebe87
...
...
@@ -41,6 +41,7 @@ PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
float
*
chunk
,
int
num_element
);
int
PPSVadGetResult
(
PPSHandle_t
instance
,
char
*
result
,
int
max_len
);
#ifdef __cplusplus
}
#endif // __cplusplus
\ No newline at end of file
runtime/engine/vad/interface/vad_interface_main.cc
浏览文件 @
d03ebe87
...
...
@@ -16,56 +16,63 @@
#include <iostream>
#include <vector>
#include <fstream>
#include "common/base/common.h"
#include "vad/frontend/wav.h"
#include "vad/interface/vad_interface.h"
int
main
(
int
argc
,
char
*
argv
[])
{
if
(
argc
<
3
)
{
std
::
cout
<<
"Usage: vad_interface_main path/to/config
path/to/audio
"
std
::
cout
<<
"Usage: vad_interface_main path/to/config
wav.scp
"
"run_option, "
"e.g ./vad_interface_main config
sample.wav
"
"e.g ./vad_interface_main config
wav.scp
"
<<
std
::
endl
;
return
-
1
;
}
std
::
string
config_path
=
argv
[
1
];
std
::
string
audio_file
=
argv
[
2
];
std
::
string
wav_scp
=
argv
[
2
];
PPSHandle_t
handle
=
PPSVadCreateInstance
(
config_path
.
c_str
());
std
::
vector
<
float
>
inputWav
;
// [0, 1]
wav
::
WavReader
wav_reader
=
wav
::
WavReader
(
audio_file
);
auto
sr
=
wav_reader
.
sample_rate
();
CHECK
(
sr
==
16000
)
<<
" sr is "
<<
sr
<<
" expect 16000"
;
std
::
ifstream
fp_wav
(
wav_scp
);
std
::
string
line
=
""
;
while
(
getline
(
fp_wav
,
line
)){
std
::
vector
<
float
>
inputWav
;
// [0, 1]
wav
::
WavReader
wav_reader
=
wav
::
WavReader
(
line
);
auto
sr
=
wav_reader
.
sample_rate
();
CHECK
(
sr
==
16000
)
<<
" sr is "
<<
sr
<<
" expect 16000"
;
auto
num_samples
=
wav_reader
.
num_samples
();
inputWav
.
resize
(
num_samples
);
for
(
int
i
=
0
;
i
<
num_samples
;
i
++
)
{
inputWav
[
i
]
=
wav_reader
.
data
()[
i
]
/
32768
;
}
ppspeech
::
Timer
timer
;
int
window_size_samples
=
PPSVadChunkSizeSamples
(
handle
);
for
(
int64_t
j
=
0
;
j
<
num_samples
;
j
+=
window_size_samples
)
{
auto
start
=
j
;
auto
end
=
start
+
window_size_samples
>=
num_samples
?
num_samples
:
start
+
window_size_samples
;
auto
current_chunk_size
=
end
-
start
;
auto
num_samples
=
wav_reader
.
num_samples
();
inputWav
.
resize
(
num_samples
);
for
(
int
i
=
0
;
i
<
num_samples
;
i
++
)
{
inputWav
[
i
]
=
wav_reader
.
data
()[
i
]
/
32768
;
}
std
::
vector
<
float
>
r
{
&
inputWav
[
0
]
+
start
,
&
inputWav
[
0
]
+
end
};
assert
(
r
.
size
()
==
static_cast
<
size_t
>
(
current_chunk_size
));
ppspeech
::
Timer
timer
;
int
window_size_samples
=
PPSVadChunkSizeSamples
(
handle
);
for
(
int64_t
j
=
0
;
j
<
num_samples
;
j
+=
window_size_samples
)
{
auto
start
=
j
;
auto
end
=
start
+
window_size_samples
>=
num_samples
?
num_samples
:
start
+
window_size_samples
;
std
::
vector
<
float
>
r
(
window_size_samples
,
0
);
auto
current_chunk_size
=
end
-
start
;
memcpy
(
r
.
data
(),
inputWav
.
data
()
+
start
,
current_chunk_size
*
sizeof
(
float
));
PPSVadState_t
s
=
PPSVadFeedForward
(
handle
,
r
.
data
(),
r
.
size
());
std
::
cout
<<
s
<<
" "
;
}
std
::
cout
<<
std
::
endl
;
PPSVadState_t
s
=
PPSVadFeedForward
(
handle
,
r
.
data
(),
r
.
size
());
}
std
::
cout
<<
"RTF="
<<
timer
.
Elapsed
()
/
double
(
num_samples
/
sr
)
<<
std
::
endl
;
PPSVadReset
(
handle
);
std
::
cout
<<
"RTF="
<<
timer
.
Elapsed
()
/
double
(
num_samples
/
sr
)
<<
std
::
endl
;
char
result
[
10240
]
=
{
0
};
PPSVadGetResult
(
handle
,
result
,
10240
);
std
::
cout
<<
line
<<
" "
<<
result
<<
std
::
endl
;
PPSVadReset
(
handle
);
// getchar();
}
PPSVadDestroyInstance
(
handle
);
return
0
;
}
runtime/engine/vad/nnet/vad.cc
浏览文件 @
d03ebe87
...
...
@@ -100,8 +100,8 @@ void Vad::Reset() {
temp_end_
=
0
;
current_sample_
=
0
;
spe
ak
Start_
.
clear
();
spe
ak
End_
.
clear
();
spe
ech
Start_
.
clear
();
spe
ech
End_
.
clear
();
states_
.
clear
();
}
...
...
@@ -176,34 +176,43 @@ const Vad::State& Vad::Postprocess() {
if
(
outputProb_
<
threshold_
&&
!
triggerd_
)
{
// 1. Silence
#ifdef PPS_DEBUG
DLOG
(
INFO
)
<<
"{ silence: "
<<
1.0
*
current_sample_
/
sample_rate_
<<
" s; prob: "
<<
outputProb_
<<
" }"
;
#endif
states_
.
emplace_back
(
Vad
::
State
::
SIL
);
}
else
if
(
outputProb_
>=
threshold_
&&
!
triggerd_
)
{
// 2. Start
triggerd_
=
true
;
speech_start_
=
current_sample_
-
current_chunk_size_
-
speech_pad_left_samples_
;
speech_start_
=
std
::
max
(
int
(
speech_start_
),
0
);
float
start_sec
=
1.0
*
speech_start_
/
sample_rate_
;
speakStart_
.
emplace_back
(
start_sec
);
speechStart_
.
emplace_back
(
start_sec
);
#ifdef PPS_DEBUG
DLOG
(
INFO
)
<<
"{ speech start: "
<<
start_sec
<<
" s; prob: "
<<
outputProb_
<<
" }"
;
#endif
states_
.
emplace_back
(
Vad
::
State
::
START
);
}
else
if
(
outputProb_
>=
threshold_
-
beam_
&&
triggerd_
)
{
// 3. Continue
if
(
temp_end_
!=
0
)
{
// speech prob relaxation, speech continues again
#ifdef PPS_DEBUG
DLOG
(
INFO
)
<<
"{ speech fake end(sil < min_silence_ms) to continue: "
<<
1.0
*
current_sample_
/
sample_rate_
<<
" s; prob: "
<<
outputProb_
<<
" }"
;
#endif
temp_end_
=
0
;
}
else
{
// speech prob relaxation, keep tracking speech
#ifdef PPS_DEBUG
DLOG
(
INFO
)
<<
"{ speech continue: "
<<
1.0
*
current_sample_
/
sample_rate_
<<
" s; prob: "
<<
outputProb_
<<
" }"
;
#endif
}
states_
.
emplace_back
(
Vad
::
State
::
SPEECH
);
...
...
@@ -216,9 +225,11 @@ const Vad::State& Vad::Postprocess() {
// check possible speech end
if
(
current_sample_
-
temp_end_
<
min_silence_samples_
)
{
// a. silence < min_slience_samples, continue speaking
#ifdef PPS_DEBUG
DLOG
(
INFO
)
<<
"{ speech fake end(sil < min_silence_ms): "
<<
1.0
*
current_sample_
/
sample_rate_
<<
" s; prob: "
<<
outputProb_
<<
" }"
;
#endif
states_
.
emplace_back
(
Vad
::
State
::
SIL
);
}
else
{
// b. silence >= min_slience_samples, end speaking
...
...
@@ -226,9 +237,11 @@ const Vad::State& Vad::Postprocess() {
temp_end_
=
0
;
triggerd_
=
false
;
auto
end_sec
=
1.0
*
speech_end_
/
sample_rate_
;
speakEnd_
.
emplace_back
(
end_sec
);
speechEnd_
.
emplace_back
(
end_sec
);
#ifdef PPS_DEBUG
DLOG
(
INFO
)
<<
"{ speech end: "
<<
end_sec
<<
" s; prob: "
<<
outputProb_
<<
" }"
;
#endif
states_
.
emplace_back
(
Vad
::
State
::
END
);
}
}
...
...
@@ -236,66 +249,63 @@ const Vad::State& Vad::Postprocess() {
return
states_
.
back
();
}
const
std
::
vector
<
std
::
map
<
std
::
string
,
float
>>
Vad
::
GetResult
(
std
::
string
Vad
::
ConvertTime
(
float
time_s
)
const
{
float
seconds_tmp
,
minutes_tmp
,
hours_tmp
;
float
seconds
;
int
minutes
,
hours
;
// 计算小时
hours_tmp
=
time_s
/
60
/
60
;
// 1
hours
=
(
int
)
hours_tmp
;
// 计算分钟
minutes_tmp
=
time_s
/
60
;
if
(
minutes_tmp
>=
60
)
{
minutes
=
minutes_tmp
-
60
*
(
double
)
hours
;
}
else
{
minutes
=
minutes_tmp
;
}
// 计算秒数
seconds_tmp
=
(
60
*
60
*
hours
)
+
(
60
*
minutes
);
seconds
=
time_s
-
seconds_tmp
;
// 输出格式
std
::
stringstream
ss
;
ss
<<
hours
<<
":"
<<
minutes
<<
":"
<<
seconds
;
return
ss
.
str
();
}
int
Vad
::
GetResult
(
char
*
result
,
int
max_len
,
float
removeThreshold
,
float
expandHeadThreshold
,
float
expandTailThreshold
,
float
mergeThreshold
)
const
{
float
audioLength
=
1.0
*
current_sample_
/
sample_rate_
;
if
(
spe
akStart_
.
empty
()
&&
speak
End_
.
empty
())
{
if
(
spe
echStart_
.
empty
()
&&
speech
End_
.
empty
())
{
return
{};
}
if
(
spe
akEnd_
.
size
()
!=
speak
Start_
.
size
())
{
if
(
spe
echEnd_
.
size
()
!=
speech
Start_
.
size
())
{
// set the audio length as the last end
speakEnd_
.
emplace_back
(
audioLength
);
speechEnd_
.
emplace_back
(
audioLength
);
}
std
::
string
json
=
"["
;
for
(
int
i
=
0
;
i
<
speechStart_
.
size
();
++
i
)
{
json
+=
"{
\"
s
\"
:
\"
"
+
ConvertTime
(
speechStart_
[
i
])
+
"
\"
,
\"
e
\"
:
\"
"
+
ConvertTime
(
speechEnd_
[
i
])
+
"
\"
},"
;
}
// Remove too short segments
// auto startIter = speakStart_.begin();
// auto endIter = speakEnd_.begin();
// while (startIter != speakStart_.end()) {
// if (removeThreshold < audioLength &&
// *endIter - *startIter < removeThreshold) {
// startIter = speakStart_.erase(startIter);
// endIter = speakEnd_.erase(endIter);
// } else {
// startIter++;
// endIter++;
// }
// }
// // Expand to avoid to tight cut.
// startIter = speakStart_.begin();
// endIter = speakEnd_.begin();
// *startIter = std::fmax(0.f, *startIter - expandHeadThreshold);
// *endIter = std::fmin(*endIter + expandTailThreshold, *(startIter + 1));
// endIter = speakEnd_.end() - 1;
// startIter = speakStart_.end() - 1;
// *startIter = fmax(*startIter - expandHeadThreshold, *(endIter - 1));
// *endIter = std::fmin(*endIter + expandTailThreshold, audioLength);
// for (int i = 1; i < speakStart_.size() - 1; ++i) {
// speakStart_[i] = std::fmax(speakStart_[i] - expandHeadThreshold,
// speakEnd_[i - 1]);
// speakEnd_[i] = std::fmin(speakEnd_[i] + expandTailThreshold,
// speakStart_[i + 1]);
// }
// // Merge very closed segments
// startIter = speakStart_.begin() + 1;
// endIter = speakEnd_.begin();
// while (startIter != speakStart_.end()) {
// if (*startIter - *endIter < mergeThreshold) {
// startIter = speakStart_.erase(startIter);
// endIter = speakEnd_.erase(endIter);
// } else {
// startIter++;
// endIter++;
// }
// }
std
::
vector
<
std
::
map
<
std
::
string
,
float
>>
result
;
for
(
int
i
=
0
;
i
<
speakStart_
.
size
();
++
i
)
{
result
.
emplace_back
(
std
::
map
<
std
::
string
,
float
>
(
{{
"start"
,
speakStart_
[
i
]},
{
"end"
,
speakEnd_
[
i
]}}));
json
.
pop_back
();
json
+=
"]"
;
if
(
result
!=
NULL
){
snprintf
(
result
,
max_len
,
"%s"
,
json
.
c_str
());
}
else
{
DLOG
(
INFO
)
<<
"result is NULL"
;
}
return
result
;
return
0
;
}
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
Vad
::
State
&
s
)
{
...
...
runtime/engine/vad/nnet/vad.h
浏览文件 @
d03ebe87
...
...
@@ -70,7 +70,7 @@ class Vad : public fastdeploy::FastDeployModel {
const
State
&
Postprocess
();
const
std
::
vector
<
std
::
map
<
std
::
string
,
float
>>
GetResult
(
int
GetResult
(
char
*
result
,
int
max_len
,
float
removeThreshold
=
0.0
,
float
expandHeadThreshold
=
0.0
,
float
expandTailThreshold
=
0
,
...
...
@@ -103,6 +103,7 @@ class Vad : public fastdeploy::FastDeployModel {
private:
bool
Initialize
();
std
::
string
ConvertTime
(
float
time_s
)
const
;
private:
std
::
mutex
init_lock_
;
...
...
@@ -122,8 +123,8 @@ class Vad : public fastdeploy::FastDeployModel {
// MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes
float
outputProb_
;
std
::
vector
<
float
>
spe
ak
Start_
;
mutable
std
::
vector
<
float
>
spe
ak
End_
;
std
::
vector
<
float
>
spe
ech
Start_
;
mutable
std
::
vector
<
float
>
spe
ech
End_
;
std
::
vector
<
State
>
states_
;
...
...
runtime/engine/vad/nnet/vad_nnet_main.cc
浏览文件 @
d03ebe87
...
...
@@ -70,12 +70,6 @@ int main(int argc, char* argv[]) {
std
::
cout
<<
"RTF="
<<
timer
.
Elapsed
()
/
double
(
num_samples
/
sr
)
<<
std
::
endl
;
std
::
vector
<
std
::
map
<
std
::
string
,
float
>>
result
=
vad
.
GetResult
();
for
(
auto
&
res
:
result
)
{
std
::
cout
<<
"speak start: "
<<
res
[
"start"
]
<<
" s, end: "
<<
res
[
"end"
]
<<
" s | "
;
}
std
::
cout
<<
"
\b\b
"
<<
std
::
endl
;
vad
.
Reset
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录