Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
cb0b6785
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
cb0b6785
编写于
5月 04, 2022
作者:
Y
Yang Zhou
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add fbank into feature pipeline
上级
a36ecccf
变更
9
隐藏空白更改
内联
并排
Showing
9 changed files
with
53 additions
and
23 deletions
+53
-23
speechx/examples/ds2_ol/feat/compute_fbank_main.cc
speechx/examples/ds2_ol/feat/compute_fbank_main.cc
+1
-2
speechx/speechx/decoder/param.h
speechx/speechx/decoder/param.h
+16
-7
speechx/speechx/frontend/audio/CMakeLists.txt
speechx/speechx/frontend/audio/CMakeLists.txt
+1
-1
speechx/speechx/frontend/audio/fbank.cc
speechx/speechx/frontend/audio/fbank.cc
+15
-3
speechx/speechx/frontend/audio/fbank.h
speechx/speechx/frontend/audio/fbank.h
+1
-1
speechx/speechx/frontend/audio/feature_pipeline.cc
speechx/speechx/frontend/audio/feature_pipeline.cc
+10
-4
speechx/speechx/frontend/audio/feature_pipeline.h
speechx/speechx/frontend/audio/feature_pipeline.h
+5
-0
speechx/speechx/kaldi/feat/feature-fbank.cc
speechx/speechx/kaldi/feat/feature-fbank.cc
+3
-4
speechx/speechx/kaldi/feat/feature-fbank.h
speechx/speechx/kaldi/feat/feature-fbank.h
+1
-1
未找到文件。
speechx/examples/ds2_ol/feat/compute_fbank_main.cc
浏览文件 @
cb0b6785
...
...
@@ -43,7 +43,7 @@ int main(int argc, char* argv[]) {
int32
num_done
=
0
,
num_err
=
0
;
// feature pipeline: wave cache -->
hanning
window
// feature pipeline: wave cache -->
povey
window
// -->fbank --> global cmvn -> feat cache
std
::
unique_ptr
<
ppspeech
::
FrontendInterface
>
data_source
(
...
...
@@ -78,7 +78,6 @@ int main(int argc, char* argv[]) {
LOG
(
INFO
)
<<
"chunk size (s): "
<<
streaming_chunk
;
LOG
(
INFO
)
<<
"chunk size (sample): "
<<
chunk_sample_size
;
for
(;
!
wav_reader
.
Done
();
wav_reader
.
Next
())
{
std
::
string
utt
=
wav_reader
.
Key
();
const
kaldi
::
WaveData
&
wave_data
=
wav_reader
.
Value
();
...
...
speechx/speechx/decoder/param.h
浏览文件 @
cb0b6785
...
...
@@ -47,7 +47,8 @@ DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box"
,
"model cache names"
);
// Shapes string for the model cache entries (one comma-separated item per cache).
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
// Selects fbank features when true; linear features otherwise (the default).
DEFINE_bool(use_fbank, false, "use fbank or linear feature");
// Number of mel bins; copied into the fbank mel options when use_fbank is set.
DEFINE_int32(num_bins, 161, "num bins of mel");
namespace
ppspeech
{
// todo refactor later
...
...
@@ -57,13 +58,21 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
opts
.
linear_spectrogram_opts
.
streaming_chunk
=
FLAGS_streaming_chunk
;
opts
.
to_float32
=
FLAGS_to_float32
;
kaldi
::
FrameExtractionOptions
frame_opts
;
frame_opts
.
frame_length_ms
=
20
;
frame_opts
.
frame_shift_ms
=
10
;
frame_opts
.
remove_dc_offset
=
false
;
frame_opts
.
window_type
=
"hanning"
;
frame_opts
.
preemph_coeff
=
0.0
;
frame_opts
.
dither
=
0.0
;
opts
.
linear_spectrogram_opts
.
frame_opts
=
frame_opts
;
frame_opts
.
frame_shift_ms
=
10
;
opts
.
use_fbank
=
FLAGS_use_fbank
;
if
(
opts
.
use_fbank
)
{
frame_opts
.
window_type
=
"povey"
;
frame_opts
.
frame_length_ms
=
25
;
opts
.
fbank_opts
.
fbank_opts
.
mel_opts
.
num_bins
=
FLAGS_num_bins
;
opts
.
fbank_opts
.
fbank_opts
.
frame_opts
=
frame_opts
;
}
else
{
frame_opts
.
remove_dc_offset
=
false
;
frame_opts
.
frame_length_ms
=
20
;
frame_opts
.
window_type
=
"hanning"
;
frame_opts
.
preemph_coeff
=
0.0
;
opts
.
linear_spectrogram_opts
.
frame_opts
=
frame_opts
;
}
opts
.
feature_cache_opts
.
frame_chunk_size
=
FLAGS_receptive_field_length
;
opts
.
feature_cache_opts
.
frame_chunk_stride
=
FLAGS_downsampling_rate
;
return
opts
;
...
...
speechx/speechx/frontend/audio/CMakeLists.txt
浏览文件 @
cb0b6785
...
...
@@ -10,4 +10,4 @@ add_library(frontend STATIC
fbank.cc
)
target_link_libraries
(
frontend PUBLIC kaldi-matrix kaldi-feat-common
)
target_link_libraries
(
frontend PUBLIC kaldi-matrix kaldi-feat-common
kaldi-fbank
)
speechx/speechx/frontend/audio/fbank.cc
浏览文件 @
cb0b6785
...
...
@@ -29,6 +29,8 @@ using kaldi::VectorBase;
using
kaldi
::
Matrix
;
using
std
::
vector
;
// todo refactor later:(SmileGoat)
Fbank
::
Fbank
(
const
FbankOptions
&
opts
,
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
)
:
opts_
(
opts
),
...
...
@@ -98,12 +100,22 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
Vector
<
BaseFloat
>
this_feature
(
computer_
.
Dim
(),
kaldi
::
kUndefined
);
// note: this online feature-extraction code does not support VTLN.
BaseFloat
vtln_warp
=
1.0
;
computer_
.
Compute
(
raw_log_energy
,
vtln_warp
,
&
window
,
&
this_feature
);
RealFft
(
&
window
,
true
);
kaldi
::
ComputePowerSpectrum
(
&
window
);
const
kaldi
::
MelBanks
&
mel_bank
=
*
(
computer_
.
GetMelBanks
(
1.0
));
SubVector
<
BaseFloat
>
power_spectrum
(
window
,
0
,
window
.
Dim
()
/
2
+
1
);
if
(
!
opts_
.
fbank_opts
.
use_power
)
{
power_spectrum
.
ApplyPow
(
0.5
);
}
int32
mel_offset
=
((
opts_
.
fbank_opts
.
use_energy
&&
!
opts_
.
fbank_opts
.
htk_compat
)
?
1
:
0
);
SubVector
<
BaseFloat
>
mel_energies
(
this_feature
,
mel_offset
,
opts_
.
fbank_opts
.
mel_opts
.
num_bins
);
mel_bank
.
Compute
(
power_spectrum
,
&
mel_energies
);
mel_energies
.
ApplyFloor
(
1e-07
);
mel_energies
.
ApplyLog
();
SubVector
<
BaseFloat
>
output_row
(
feats
->
Data
()
+
frame
*
Dim
(),
Dim
());
output_row
.
CopyFromVec
(
this_feature
);
}
return
true
;
}
}
// namespace ppspeech
\ No newline at end of file
}
// namespace ppspeech
speechx/speechx/frontend/audio/fbank.h
浏览文件 @
cb0b6785
...
...
@@ -74,4 +74,4 @@ class Fbank : public FrontendInterface {
DISALLOW_COPY_AND_ASSIGN
(
Fbank
);
};
}
// namespace ppspeech
\ No newline at end of file
}
// namespace ppspeech
speechx/speechx/frontend/audio/feature_pipeline.cc
浏览文件 @
cb0b6785
...
...
@@ -22,12 +22,18 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
unique_ptr
<
FrontendInterface
>
data_source
(
new
ppspeech
::
AudioCache
(
1000
*
kint16max
,
opts
.
to_float32
));
unique_ptr
<
FrontendInterface
>
linear_spectrogram
(
new
ppspeech
::
LinearSpectrogram
(
opts
.
linear_spectrogram_opts
,
std
::
move
(
data_source
)));
unique_ptr
<
FrontendInterface
>
base_feature
;
if
(
opts
.
use_fbank
)
{
base_feature
.
reset
(
new
ppspeech
::
Fbank
(
opts
.
fbank_opts
,
std
::
move
(
data_source
)));
}
else
{
base_feature
.
reset
(
new
ppspeech
::
LinearSpectrogram
(
opts
.
linear_spectrogram_opts
,
std
::
move
(
data_source
)));
}
unique_ptr
<
FrontendInterface
>
cmvn
(
new
ppspeech
::
CMVN
(
opts
.
cmvn_file
,
std
::
move
(
linear_spectrogram
)));
new
ppspeech
::
CMVN
(
opts
.
cmvn_file
,
std
::
move
(
base_feature
)));
base_extractor_
.
reset
(
new
ppspeech
::
FeatureCache
(
opts
.
feature_cache_opts
,
std
::
move
(
cmvn
)));
...
...
speechx/speechx/frontend/audio/feature_pipeline.h
浏览文件 @
cb0b6785
...
...
@@ -21,6 +21,7 @@
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/normalizer.h"
namespace
ppspeech
{
...
...
@@ -28,12 +29,16 @@ namespace ppspeech {
// Bundles the configuration of every stage in the audio feature pipeline.
// Each field gets its default from the in-class initializer next to it.
struct FeaturePipelineOptions {
    // CMVN file handed to the CMVN stage; empty by default.
    std::string cmvn_file{""};
    // Forwarded to the AudioCache stage; off by default.
    bool to_float32{false};
    // true  -> the base feature is Fbank (configured by fbank_opts)
    // false -> the base feature is LinearSpectrogram (linear_spectrogram_opts)
    bool use_fbank{false};
    // Per-stage option bundles, value-initialized.
    LinearSpectrogramOptions linear_spectrogram_opts{};
    FbankOptions fbank_opts{};
    FeatureCacheOptions feature_cache_opts{};

    // Intentionally user-provided (not `= default`) so the type keeps the
    // same non-aggregate status as before; all work is done by the member
    // initializers above.
    FeaturePipelineOptions() {}
};
...
...
speechx/speechx/kaldi/feat/feature-fbank.cc
浏览文件 @
cb0b6785
...
...
@@ -85,10 +85,9 @@ void FbankComputer::Compute(BaseFloat signal_raw_log_energy,
signal_raw_log_energy
=
Log
(
std
::
max
<
BaseFloat
>
(
VecVec
(
*
signal_frame
,
*
signal_frame
),
std
::
numeric_limits
<
float
>::
epsilon
()));
// todo : remove later; as align fbank feature in paddleaudio
//if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
// srfft_->Compute(signal_frame->Data(), true);
//else // An alternative algorithm that works for non-powers-of-two.
if
(
srfft_
!=
NULL
)
// Compute FFT using split-radix algorithm.
srfft_
->
Compute
(
signal_frame
->
Data
(),
true
);
else
// An alternative algorithm that works for non-powers-of-two.
RealFft
(
signal_frame
,
true
);
// Convert the FFT into a power spectrum.
...
...
speechx/speechx/kaldi/feat/feature-fbank.h
浏览文件 @
cb0b6785
...
...
@@ -128,8 +128,8 @@ class FbankComputer {
~
FbankComputer
();
private:
const
MelBanks
*
GetMelBanks
(
BaseFloat
vtln_warp
);
private:
FbankOptions
opts_
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录