Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
96825d9c
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
96825d9c
编写于
5月 05, 2022
作者:
H
Hui Zhang
提交者:
GitHub
5月 05, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1827 from SmileGoat/add_fbank
[speechx] align fbank with paddleaudio
上级
a0308992
bd2211f6
变更
14
隐藏空白更改
内联
并排
Showing
14 changed file
with
229 addition
and
32 deletion
+229
-32
speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
...mples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
+3
-0
speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
+9
-0
speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
+4
-1
speechx/examples/ds2_ol/feat/CMakeLists.txt
speechx/examples/ds2_ol/feat/CMakeLists.txt
+5
-1
speechx/examples/ds2_ol/feat/compute_fbank_main.cc
speechx/examples/ds2_ol/feat/compute_fbank_main.cc
+142
-0
speechx/speechx/decoder/param.h
speechx/speechx/decoder/param.h
+16
-7
speechx/speechx/frontend/audio/CMakeLists.txt
speechx/speechx/frontend/audio/CMakeLists.txt
+2
-1
speechx/speechx/frontend/audio/fbank.cc
speechx/speechx/frontend/audio/fbank.cc
+22
-9
speechx/speechx/frontend/audio/fbank.h
speechx/speechx/frontend/audio/fbank.h
+6
-4
speechx/speechx/frontend/audio/feature_pipeline.cc
speechx/speechx/frontend/audio/feature_pipeline.cc
+10
-4
speechx/speechx/frontend/audio/feature_pipeline.h
speechx/speechx/frontend/audio/feature_pipeline.h
+5
-0
speechx/speechx/kaldi/feat/CMakeLists.txt
speechx/speechx/kaldi/feat/CMakeLists.txt
+2
-2
speechx/speechx/kaldi/feat/feature-fbank.h
speechx/speechx/kaldi/feat/feature-fbank.h
+1
-1
speechx/speechx/kaldi/feat/mel-computations.cc
speechx/speechx/kaldi/feat/mel-computations.cc
+2
-2
未找到文件。
speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
浏览文件 @
96825d9c
...
@@ -98,6 +98,7 @@ int main(int argc, char* argv[]) {
...
@@ -98,6 +98,7 @@ int main(int argc, char* argv[]) {
LOG
(
INFO
)
<<
"receptive field (frame): "
<<
receptive_field_length
;
LOG
(
INFO
)
<<
"receptive field (frame): "
<<
receptive_field_length
;
decoder
.
InitDecoder
();
decoder
.
InitDecoder
();
kaldi
::
Timer
timer
;
for
(;
!
feature_reader
.
Done
();
feature_reader
.
Next
())
{
for
(;
!
feature_reader
.
Done
();
feature_reader
.
Next
())
{
string
utt
=
feature_reader
.
Key
();
string
utt
=
feature_reader
.
Key
();
kaldi
::
Matrix
<
BaseFloat
>
feature
=
feature_reader
.
Value
();
kaldi
::
Matrix
<
BaseFloat
>
feature
=
feature_reader
.
Value
();
...
@@ -160,5 +161,7 @@ int main(int argc, char* argv[]) {
...
@@ -160,5 +161,7 @@ int main(int argc, char* argv[]) {
KALDI_LOG
<<
"Done "
<<
num_done
<<
" utterances, "
<<
num_err
KALDI_LOG
<<
"Done "
<<
num_done
<<
" utterances, "
<<
num_err
<<
" with errors."
;
<<
" with errors."
;
double
elapsed
=
timer
.
Elapsed
();
KALDI_LOG
<<
" cost:"
<<
elapsed
<<
" s"
;
return
(
num_done
!=
0
?
0
:
1
);
return
(
num_done
!=
0
?
0
:
1
);
}
}
speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
浏览文件 @
96825d9c
...
@@ -38,6 +38,9 @@ int main(int argc, char* argv[]) {
...
@@ -38,6 +38,9 @@ int main(int argc, char* argv[]) {
LOG
(
INFO
)
<<
"chunk size (sample): "
<<
chunk_sample_size
;
LOG
(
INFO
)
<<
"chunk size (sample): "
<<
chunk_sample_size
;
int32
num_done
=
0
,
num_err
=
0
;
int32
num_done
=
0
,
num_err
=
0
;
double
tot_wav_duration
=
0.0
;
kaldi
::
Timer
timer
;
for
(;
!
wav_reader
.
Done
();
wav_reader
.
Next
())
{
for
(;
!
wav_reader
.
Done
();
wav_reader
.
Next
())
{
std
::
string
utt
=
wav_reader
.
Key
();
std
::
string
utt
=
wav_reader
.
Key
();
...
@@ -47,6 +50,7 @@ int main(int argc, char* argv[]) {
...
@@ -47,6 +50,7 @@ int main(int argc, char* argv[]) {
kaldi
::
SubVector
<
kaldi
::
BaseFloat
>
waveform
(
wave_data
.
Data
(),
kaldi
::
SubVector
<
kaldi
::
BaseFloat
>
waveform
(
wave_data
.
Data
(),
this_channel
);
this_channel
);
int
tot_samples
=
waveform
.
Dim
();
int
tot_samples
=
waveform
.
Dim
();
tot_wav_duration
+=
tot_samples
*
1.0
/
sample_rate
;
LOG
(
INFO
)
<<
"wav len (sample): "
<<
tot_samples
;
LOG
(
INFO
)
<<
"wav len (sample): "
<<
tot_samples
;
int
sample_offset
=
0
;
int
sample_offset
=
0
;
...
@@ -85,4 +89,9 @@ int main(int argc, char* argv[]) {
...
@@ -85,4 +89,9 @@ int main(int argc, char* argv[]) {
result_writer
.
Write
(
utt
,
result
);
result_writer
.
Write
(
utt
,
result
);
++
num_done
;
++
num_done
;
}
}
double
elapsed
=
timer
.
Elapsed
();
KALDI_LOG
<<
"Done "
<<
num_done
<<
" out of "
<<
(
num_err
+
num_done
);
KALDI_LOG
<<
" cost:"
<<
elapsed
<<
" s"
;
KALDI_LOG
<<
"total wav duration is: "
<<
tot_wav_duration
<<
" s"
;
KALDI_LOG
<<
"the RTF is: "
<<
elapsed
/
tot_wav_duration
;
}
}
\ No newline at end of file
speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
浏览文件 @
96825d9c
...
@@ -100,7 +100,7 @@ int main(int argc, char* argv[]) {
...
@@ -100,7 +100,7 @@ int main(int argc, char* argv[]) {
LOG
(
INFO
)
<<
"chunk stride (frame): "
<<
chunk_stride
;
LOG
(
INFO
)
<<
"chunk stride (frame): "
<<
chunk_stride
;
LOG
(
INFO
)
<<
"receptive field (frame): "
<<
receptive_field_length
;
LOG
(
INFO
)
<<
"receptive field (frame): "
<<
receptive_field_length
;
decoder
.
InitDecoder
();
decoder
.
InitDecoder
();
kaldi
::
Timer
timer
;
for
(;
!
feature_reader
.
Done
();
feature_reader
.
Next
())
{
for
(;
!
feature_reader
.
Done
();
feature_reader
.
Next
())
{
string
utt
=
feature_reader
.
Key
();
string
utt
=
feature_reader
.
Key
();
kaldi
::
Matrix
<
BaseFloat
>
feature
=
feature_reader
.
Value
();
kaldi
::
Matrix
<
BaseFloat
>
feature
=
feature_reader
.
Value
();
...
@@ -160,6 +160,9 @@ int main(int argc, char* argv[]) {
...
@@ -160,6 +160,9 @@ int main(int argc, char* argv[]) {
++
num_done
;
++
num_done
;
}
}
double
elapsed
=
timer
.
Elapsed
();
KALDI_LOG
<<
" cost:"
<<
elapsed
<<
" s"
;
KALDI_LOG
<<
"Done "
<<
num_done
<<
" utterances, "
<<
num_err
KALDI_LOG
<<
"Done "
<<
num_done
<<
" utterances, "
<<
num_err
<<
" with errors."
;
<<
" with errors."
;
return
(
num_done
!=
0
?
0
:
1
);
return
(
num_done
!=
0
?
0
:
1
);
...
...
speechx/examples/ds2_ol/feat/CMakeLists.txt
浏览文件 @
96825d9c
...
@@ -5,8 +5,12 @@ add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
...
@@ -5,8 +5,12 @@ add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories
(
${
bin_name
}
PRIVATE
${
SPEECHX_ROOT
}
${
SPEECHX_ROOT
}
/kaldi
)
target_include_directories
(
${
bin_name
}
PRIVATE
${
SPEECHX_ROOT
}
${
SPEECHX_ROOT
}
/kaldi
)
target_link_libraries
(
${
bin_name
}
frontend kaldi-util kaldi-feat-common gflags glog
)
target_link_libraries
(
${
bin_name
}
frontend kaldi-util kaldi-feat-common gflags glog
)
set
(
bin_name compute_fbank_main
)
add_executable
(
${
bin_name
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
bin_name
}
.cc
)
target_include_directories
(
${
bin_name
}
PRIVATE
${
SPEECHX_ROOT
}
${
SPEECHX_ROOT
}
/kaldi
)
target_link_libraries
(
${
bin_name
}
frontend kaldi-util kaldi-feat-common gflags glog
)
set
(
bin_name cmvn-json2kaldi
)
set
(
bin_name cmvn-json2kaldi
)
add_executable
(
${
bin_name
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
bin_name
}
.cc
)
add_executable
(
${
bin_name
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
bin_name
}
.cc
)
target_include_directories
(
${
bin_name
}
PRIVATE
${
SPEECHX_ROOT
}
${
SPEECHX_ROOT
}
/kaldi
)
target_include_directories
(
${
bin_name
}
PRIVATE
${
SPEECHX_ROOT
}
${
SPEECHX_ROOT
}
/kaldi
)
target_link_libraries
(
${
bin_name
}
utils kaldi-util kaldi-matrix gflags glog
${
DEPS
}
)
target_link_libraries
(
${
bin_name
}
utils kaldi-util kaldi-matrix gflags glog
${
DEPS
}
)
\ No newline at end of file
speechx/examples/ds2_ol/feat/compute_fbank_main.cc
0 → 100644
浏览文件 @
96825d9c
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// todo refactor, replace with gtest
#include "base/flags.h"
#include "base/log.h"
#include "kaldi/feat/wave-reader.h"
#include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h"
#include "frontend/audio/audio_cache.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/normalizer.h"
// Command-line flags for the fbank extraction tool.

// Read specifier handed to the kaldi SequentialTableReader<WaveHolder> in main.
DEFINE_string(wav_rspecifier, "", "test wav scp path");

// Write specifier handed to the kaldi BaseFloatMatrixWriter in main.
DEFINE_string(feature_wspecifier, "", "output feats wspecifier");

// Passed as the first argument of ppspeech::CMVN in main.
DEFINE_string(cmvn_file, "", "read cmvn");

// Multiplied by the 16000 Hz sample rate in main to get the number of samples
// fed to the pipeline per step, i.e. the value is in seconds.
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");

// Copied into FbankOptions::fbank_opts.mel_opts.num_bins in main.
DEFINE_int32(num_bins, 161, "fbank num bins");
int
main
(
int
argc
,
char
*
argv
[])
{
gflags
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
false
);
google
::
InitGoogleLogging
(
argv
[
0
]);
kaldi
::
SequentialTableReader
<
kaldi
::
WaveHolder
>
wav_reader
(
FLAGS_wav_rspecifier
);
kaldi
::
BaseFloatMatrixWriter
feat_writer
(
FLAGS_feature_wspecifier
);
int32
num_done
=
0
,
num_err
=
0
;
// feature pipeline: wave cache --> povey window
// -->fbank --> global cmvn -> feat cache
std
::
unique_ptr
<
ppspeech
::
FrontendInterface
>
data_source
(
new
ppspeech
::
AudioCache
(
3600
*
1600
,
false
));
ppspeech
::
FbankOptions
opt
;
opt
.
fbank_opts
.
frame_opts
.
frame_length_ms
=
25
;
opt
.
fbank_opts
.
frame_opts
.
frame_shift_ms
=
10
;
opt
.
streaming_chunk
=
FLAGS_streaming_chunk
;
opt
.
fbank_opts
.
mel_opts
.
num_bins
=
FLAGS_num_bins
;
opt
.
fbank_opts
.
frame_opts
.
dither
=
0.0
;
std
::
unique_ptr
<
ppspeech
::
FrontendInterface
>
fbank
(
new
ppspeech
::
Fbank
(
opt
,
std
::
move
(
data_source
)));
std
::
unique_ptr
<
ppspeech
::
FrontendInterface
>
cmvn
(
new
ppspeech
::
CMVN
(
FLAGS_cmvn_file
,
std
::
move
(
fbank
)));
ppspeech
::
FeatureCacheOptions
feat_cache_opts
;
// the feature cache output feature chunk by chunk.
// frame_chunk_size : num frame of a chunk.
// frame_chunk_stride: chunk sliding window stride.
feat_cache_opts
.
frame_chunk_stride
=
1
;
feat_cache_opts
.
frame_chunk_size
=
1
;
ppspeech
::
FeatureCache
feature_cache
(
feat_cache_opts
,
std
::
move
(
cmvn
));
LOG
(
INFO
)
<<
"feat dim: "
<<
feature_cache
.
Dim
();
int
sample_rate
=
16000
;
float
streaming_chunk
=
FLAGS_streaming_chunk
;
int
chunk_sample_size
=
streaming_chunk
*
sample_rate
;
LOG
(
INFO
)
<<
"sr: "
<<
sample_rate
;
LOG
(
INFO
)
<<
"chunk size (s): "
<<
streaming_chunk
;
LOG
(
INFO
)
<<
"chunk size (sample): "
<<
chunk_sample_size
;
for
(;
!
wav_reader
.
Done
();
wav_reader
.
Next
())
{
std
::
string
utt
=
wav_reader
.
Key
();
const
kaldi
::
WaveData
&
wave_data
=
wav_reader
.
Value
();
LOG
(
INFO
)
<<
"process utt: "
<<
utt
;
int32
this_channel
=
0
;
kaldi
::
SubVector
<
kaldi
::
BaseFloat
>
waveform
(
wave_data
.
Data
(),
this_channel
);
int
tot_samples
=
waveform
.
Dim
();
LOG
(
INFO
)
<<
"wav len (sample): "
<<
tot_samples
;
int
sample_offset
=
0
;
std
::
vector
<
kaldi
::
Vector
<
BaseFloat
>>
feats
;
int
feature_rows
=
0
;
while
(
sample_offset
<
tot_samples
)
{
int
cur_chunk_size
=
std
::
min
(
chunk_sample_size
,
tot_samples
-
sample_offset
);
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
wav_chunk
(
cur_chunk_size
);
for
(
int
i
=
0
;
i
<
cur_chunk_size
;
++
i
)
{
wav_chunk
(
i
)
=
waveform
(
sample_offset
+
i
);
}
kaldi
::
Vector
<
BaseFloat
>
features
;
feature_cache
.
Accept
(
wav_chunk
);
if
(
cur_chunk_size
<
chunk_sample_size
)
{
feature_cache
.
SetFinished
();
}
bool
flag
=
true
;
do
{
flag
=
feature_cache
.
Read
(
&
features
);
feats
.
push_back
(
features
);
feature_rows
+=
features
.
Dim
()
/
feature_cache
.
Dim
();
}
while
(
flag
==
true
&&
features
.
Dim
()
!=
0
);
sample_offset
+=
cur_chunk_size
;
}
int
cur_idx
=
0
;
kaldi
::
Matrix
<
kaldi
::
BaseFloat
>
features
(
feature_rows
,
feature_cache
.
Dim
());
for
(
auto
feat
:
feats
)
{
int
num_rows
=
feat
.
Dim
()
/
feature_cache
.
Dim
();
for
(
int
row_idx
=
0
;
row_idx
<
num_rows
;
++
row_idx
)
{
for
(
size_t
col_idx
=
0
;
col_idx
<
feature_cache
.
Dim
();
++
col_idx
)
{
features
(
cur_idx
,
col_idx
)
=
feat
(
row_idx
*
feature_cache
.
Dim
()
+
col_idx
);
}
++
cur_idx
;
}
}
feat_writer
.
Write
(
utt
,
features
);
feature_cache
.
Reset
();
if
(
num_done
%
50
==
0
&&
num_done
!=
0
)
KALDI_VLOG
(
2
)
<<
"Processed "
<<
num_done
<<
" utterances"
;
num_done
++
;
}
KALDI_LOG
<<
"Done "
<<
num_done
<<
" utterances, "
<<
num_err
<<
" with errors."
;
return
(
num_done
!=
0
?
0
:
1
);
}
speechx/speechx/decoder/param.h
浏览文件 @
96825d9c
...
@@ -47,7 +47,8 @@ DEFINE_string(model_cache_names,
...
@@ -47,7 +47,8 @@ DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box"
,
"chunk_state_h_box,chunk_state_c_box"
,
"model cache names"
);
"model cache names"
);
DEFINE_string
(
model_cache_shapes
,
"5-1-1024,5-1-1024"
,
"model cache shapes"
);
DEFINE_string
(
model_cache_shapes
,
"5-1-1024,5-1-1024"
,
"model cache shapes"
);
DEFINE_bool
(
use_fbank
,
false
,
"use fbank or linear feature"
);
DEFINE_int32
(
num_bins
,
161
,
"num bins of mel"
);
namespace
ppspeech
{
namespace
ppspeech
{
// todo refactor later
// todo refactor later
...
@@ -57,13 +58,21 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
...
@@ -57,13 +58,21 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
opts
.
linear_spectrogram_opts
.
streaming_chunk
=
FLAGS_streaming_chunk
;
opts
.
linear_spectrogram_opts
.
streaming_chunk
=
FLAGS_streaming_chunk
;
opts
.
to_float32
=
FLAGS_to_float32
;
opts
.
to_float32
=
FLAGS_to_float32
;
kaldi
::
FrameExtractionOptions
frame_opts
;
kaldi
::
FrameExtractionOptions
frame_opts
;
frame_opts
.
frame_length_ms
=
20
;
frame_opts
.
frame_shift_ms
=
10
;
frame_opts
.
remove_dc_offset
=
false
;
frame_opts
.
window_type
=
"hanning"
;
frame_opts
.
preemph_coeff
=
0.0
;
frame_opts
.
dither
=
0.0
;
frame_opts
.
dither
=
0.0
;
opts
.
linear_spectrogram_opts
.
frame_opts
=
frame_opts
;
frame_opts
.
frame_shift_ms
=
10
;
opts
.
use_fbank
=
FLAGS_use_fbank
;
if
(
opts
.
use_fbank
)
{
frame_opts
.
window_type
=
"povey"
;
frame_opts
.
frame_length_ms
=
25
;
opts
.
fbank_opts
.
fbank_opts
.
mel_opts
.
num_bins
=
FLAGS_num_bins
;
opts
.
fbank_opts
.
fbank_opts
.
frame_opts
=
frame_opts
;
}
else
{
frame_opts
.
remove_dc_offset
=
false
;
frame_opts
.
frame_length_ms
=
20
;
frame_opts
.
window_type
=
"hanning"
;
frame_opts
.
preemph_coeff
=
0.0
;
opts
.
linear_spectrogram_opts
.
frame_opts
=
frame_opts
;
}
opts
.
feature_cache_opts
.
frame_chunk_size
=
FLAGS_receptive_field_length
;
opts
.
feature_cache_opts
.
frame_chunk_size
=
FLAGS_receptive_field_length
;
opts
.
feature_cache_opts
.
frame_chunk_stride
=
FLAGS_downsampling_rate
;
opts
.
feature_cache_opts
.
frame_chunk_stride
=
FLAGS_downsampling_rate
;
return
opts
;
return
opts
;
...
...
speechx/speechx/frontend/audio/CMakeLists.txt
浏览文件 @
96825d9c
...
@@ -7,6 +7,7 @@ add_library(frontend STATIC
...
@@ -7,6 +7,7 @@ add_library(frontend STATIC
audio_cache.cc
audio_cache.cc
feature_cache.cc
feature_cache.cc
feature_pipeline.cc
feature_pipeline.cc
fbank.cc
)
)
target_link_libraries
(
frontend PUBLIC kaldi-matrix kaldi-feat-common
)
target_link_libraries
(
frontend PUBLIC kaldi-matrix kaldi-feat-common
kaldi-fbank
)
speechx/speechx/frontend/audio/fbank.cc
浏览文件 @
96825d9c
...
@@ -29,14 +29,16 @@ using kaldi::VectorBase;
...
@@ -29,14 +29,16 @@ using kaldi::VectorBase;
using
kaldi
::
Matrix
;
using
kaldi
::
Matrix
;
using
std
::
vector
;
using
std
::
vector
;
// todo refactor later:(SmileGoat)
Fbank
::
Fbank
(
const
FbankOptions
&
opts
,
Fbank
::
Fbank
(
const
FbankOptions
&
opts
,
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
)
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
)
:
opts_
(
opts
),
:
opts_
(
opts
),
computer_
(
opts
.
fbank_opts
),
computer_
(
opts
.
fbank_opts
),
window_function_
(
computer_
.
GetFrameOptions
()
)
{
window_function_
(
opts
.
fbank_opts
.
frame_opts
)
{
base_extractor_
=
std
::
move
(
base_extractor
);
base_extractor_
=
std
::
move
(
base_extractor
);
chunk_sample_size_
=
chunk_sample_size_
=
static_cast
<
int32
>
(
static_cast
<
int32
>
(
opts
.
streaming_chunk
*
opts
.
frame_opts
.
samp_freq
);
opts
.
streaming_chunk
*
opts
.
fbank_
opts
.
frame_opts
.
samp_freq
);
}
}
void
Fbank
::
Accept
(
const
VectorBase
<
BaseFloat
>&
inputs
)
{
void
Fbank
::
Accept
(
const
VectorBase
<
BaseFloat
>&
inputs
)
{
...
@@ -71,7 +73,8 @@ bool Fbank::Read(Vector<BaseFloat>* feats) {
...
@@ -71,7 +73,8 @@ bool Fbank::Read(Vector<BaseFloat>* feats) {
// Compute spectrogram feat
// Compute spectrogram feat
bool
Fbank
::
Compute
(
const
Vector
<
BaseFloat
>&
waves
,
Vector
<
BaseFloat
>*
feats
)
{
bool
Fbank
::
Compute
(
const
Vector
<
BaseFloat
>&
waves
,
Vector
<
BaseFloat
>*
feats
)
{
const
FrameExtractionOptions
&
frame_opts
=
computer_
.
GetFrameOptions
();
const
kaldi
::
FrameExtractionOptions
&
frame_opts
=
computer_
.
GetFrameOptions
();
int32
num_samples
=
waves
.
Dim
();
int32
num_samples
=
waves
.
Dim
();
int32
frame_length
=
frame_opts
.
WindowSize
();
int32
frame_length
=
frame_opts
.
WindowSize
();
int32
sample_rate
=
frame_opts
.
samp_freq
;
int32
sample_rate
=
frame_opts
.
samp_freq
;
...
@@ -80,7 +83,7 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
...
@@ -80,7 +83,7 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
}
}
int32
num_frames
=
kaldi
::
NumFrames
(
num_samples
,
frame_opts
);
int32
num_frames
=
kaldi
::
NumFrames
(
num_samples
,
frame_opts
);
feats
->
Rsize
(
num_frames
*
Dim
());
feats
->
R
e
size
(
num_frames
*
Dim
());
Vector
<
BaseFloat
>
window
;
Vector
<
BaseFloat
>
window
;
bool
need_raw_log_energy
=
computer_
.
NeedRawLogEnergy
();
bool
need_raw_log_energy
=
computer_
.
NeedRawLogEnergy
();
...
@@ -95,14 +98,24 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
...
@@ -95,14 +98,24 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
need_raw_log_energy
?
&
raw_log_energy
:
NULL
);
need_raw_log_energy
?
&
raw_log_energy
:
NULL
);
Vector
<
BaseFloat
>
this_feature
(
computer_
.
Dim
(),
kUndefined
);
Vector
<
BaseFloat
>
this_feature
(
computer_
.
Dim
(),
k
aldi
::
k
Undefined
);
// note: this online feature-extraction code does not support VTLN.
// note: this online feature-extraction code does not support VTLN.
BaseFloat
vtln_warp
=
1.0
;
RealFft
(
&
window
,
true
);
computer_
.
Compute
(
raw_log_energy
,
vtln_warp
,
&
window
,
&
this_feature
);
kaldi
::
ComputePowerSpectrum
(
&
window
);
const
kaldi
::
MelBanks
&
mel_bank
=
*
(
computer_
.
GetMelBanks
(
1.0
));
SubVector
<
BaseFloat
>
power_spectrum
(
window
,
0
,
window
.
Dim
()
/
2
+
1
);
if
(
!
opts_
.
fbank_opts
.
use_power
)
{
power_spectrum
.
ApplyPow
(
0.5
);
}
int32
mel_offset
=
((
opts_
.
fbank_opts
.
use_energy
&&
!
opts_
.
fbank_opts
.
htk_compat
)
?
1
:
0
);
SubVector
<
BaseFloat
>
mel_energies
(
this_feature
,
mel_offset
,
opts_
.
fbank_opts
.
mel_opts
.
num_bins
);
mel_bank
.
Compute
(
power_spectrum
,
&
mel_energies
);
mel_energies
.
ApplyFloor
(
1e-07
);
mel_energies
.
ApplyLog
();
SubVector
<
BaseFloat
>
output_row
(
feats
->
Data
()
+
frame
*
Dim
(),
Dim
());
SubVector
<
BaseFloat
>
output_row
(
feats
->
Data
()
+
frame
*
Dim
(),
Dim
());
output_row
.
CopyFromVec
(
this_feature
);
output_row
.
CopyFromVec
(
this_feature
);
}
}
return
true
;
return
true
;
}
}
}
// namespace ppspeech
}
// namespace ppspeech
\ No newline at end of file
speechx/speechx/frontend/audio/fbank.h
浏览文件 @
96825d9c
...
@@ -14,6 +14,8 @@
...
@@ -14,6 +14,8 @@
#pragma once
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-fbank.h"
#include "kaldi/feat/feature-fbank.h"
#include "kaldi/feat/feature-mfcc.h"
#include "kaldi/feat/feature-mfcc.h"
#include "kaldi/matrix/kaldi-vector.h"
#include "kaldi/matrix/kaldi-vector.h"
...
@@ -38,7 +40,7 @@ struct FbankOptions {
...
@@ -38,7 +40,7 @@ struct FbankOptions {
class
Fbank
:
public
FrontendInterface
{
class
Fbank
:
public
FrontendInterface
{
public:
public:
explicit
Fbank
(
const
FbankOptions
&
opts
,
explicit
Fbank
(
const
FbankOptions
&
opts
,
unique_ptr
<
FrontendInterface
>
base_extractor
);
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
);
virtual
void
Accept
(
const
kaldi
::
VectorBase
<
kaldi
::
BaseFloat
>&
inputs
);
virtual
void
Accept
(
const
kaldi
::
VectorBase
<
kaldi
::
BaseFloat
>&
inputs
);
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
...
@@ -61,15 +63,15 @@ class Fbank : public FrontendInterface {
...
@@ -61,15 +63,15 @@ class Fbank : public FrontendInterface {
FbankOptions
opts_
;
FbankOptions
opts_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
kaldi
::
FeatureWindowFunction
window_function_
;
FeatureWindowFunction
window_function_
;
kaldi
::
FbankComputer
computer_
;
kaldi
::
FbankComputer
computer_
;
// features_ is the Mfcc or Plp or Fbank features that we have already
// features_ is the Mfcc or Plp or Fbank features that we have already
// computed.
// computed.
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
features_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
features_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
remained_wav_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
remained_wav_
;
kaldi
::
int32
chunk_sample_size_
;
DISALLOW_COPY_AND_ASSIGN
(
Fbank
);
DISALLOW_COPY_AND_ASSIGN
(
Fbank
);
};
};
}
// namespace ppspeech
}
// namespace ppspeech
\ No newline at end of file
speechx/speechx/frontend/audio/feature_pipeline.cc
浏览文件 @
96825d9c
...
@@ -22,12 +22,18 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
...
@@ -22,12 +22,18 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
unique_ptr
<
FrontendInterface
>
data_source
(
unique_ptr
<
FrontendInterface
>
data_source
(
new
ppspeech
::
AudioCache
(
1000
*
kint16max
,
opts
.
to_float32
));
new
ppspeech
::
AudioCache
(
1000
*
kint16max
,
opts
.
to_float32
));
unique_ptr
<
FrontendInterface
>
linear_spectrogram
(
unique_ptr
<
FrontendInterface
>
base_feature
;
new
ppspeech
::
LinearSpectrogram
(
opts
.
linear_spectrogram_opts
,
std
::
move
(
data_source
)));
if
(
opts
.
use_fbank
)
{
base_feature
.
reset
(
new
ppspeech
::
Fbank
(
opts
.
fbank_opts
,
std
::
move
(
data_source
)));
}
else
{
base_feature
.
reset
(
new
ppspeech
::
LinearSpectrogram
(
opts
.
linear_spectrogram_opts
,
std
::
move
(
data_source
)));
}
unique_ptr
<
FrontendInterface
>
cmvn
(
unique_ptr
<
FrontendInterface
>
cmvn
(
new
ppspeech
::
CMVN
(
opts
.
cmvn_file
,
std
::
move
(
linear_spectrogram
)));
new
ppspeech
::
CMVN
(
opts
.
cmvn_file
,
std
::
move
(
base_feature
)));
base_extractor_
.
reset
(
base_extractor_
.
reset
(
new
ppspeech
::
FeatureCache
(
opts
.
feature_cache_opts
,
std
::
move
(
cmvn
)));
new
ppspeech
::
FeatureCache
(
opts
.
feature_cache_opts
,
std
::
move
(
cmvn
)));
...
...
speechx/speechx/frontend/audio/feature_pipeline.h
浏览文件 @
96825d9c
...
@@ -21,6 +21,7 @@
...
@@ -21,6 +21,7 @@
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/normalizer.h"
#include "frontend/audio/normalizer.h"
namespace
ppspeech
{
namespace
ppspeech
{
...
@@ -28,12 +29,16 @@ namespace ppspeech {
...
@@ -28,12 +29,16 @@ namespace ppspeech {
struct
FeaturePipelineOptions
{
struct
FeaturePipelineOptions
{
std
::
string
cmvn_file
;
std
::
string
cmvn_file
;
bool
to_float32
;
bool
to_float32
;
bool
use_fbank
;
LinearSpectrogramOptions
linear_spectrogram_opts
;
LinearSpectrogramOptions
linear_spectrogram_opts
;
FbankOptions
fbank_opts
;
FeatureCacheOptions
feature_cache_opts
;
FeatureCacheOptions
feature_cache_opts
;
FeaturePipelineOptions
()
FeaturePipelineOptions
()
:
cmvn_file
(
""
),
:
cmvn_file
(
""
),
to_float32
(
false
),
to_float32
(
false
),
use_fbank
(
false
),
linear_spectrogram_opts
(),
linear_spectrogram_opts
(),
fbank_opts
(),
feature_cache_opts
()
{}
feature_cache_opts
()
{}
};
};
...
...
speechx/speechx/kaldi/feat/CMakeLists.txt
浏览文件 @
96825d9c
...
@@ -3,10 +3,10 @@ add_library(kaldi-mfcc
...
@@ -3,10 +3,10 @@ add_library(kaldi-mfcc
)
)
target_link_libraries
(
kaldi-mfcc PUBLIC kaldi-feat-common
)
target_link_libraries
(
kaldi-mfcc PUBLIC kaldi-feat-common
)
add_library
(
fbank
add_library
(
kaldi-
fbank
feature-fbank.cc
feature-fbank.cc
)
)
target_link_libraries
(
fbank PUBLIC kaldi-feat-common
)
target_link_libraries
(
kaldi-
fbank PUBLIC kaldi-feat-common
)
add_library
(
kaldi-feat-common
add_library
(
kaldi-feat-common
wave-reader.cc
wave-reader.cc
...
...
speechx/speechx/kaldi/feat/feature-fbank.h
浏览文件 @
96825d9c
...
@@ -128,8 +128,8 @@ class FbankComputer {
...
@@ -128,8 +128,8 @@ class FbankComputer {
~
FbankComputer
();
~
FbankComputer
();
private:
const
MelBanks
*
GetMelBanks
(
BaseFloat
vtln_warp
);
const
MelBanks
*
GetMelBanks
(
BaseFloat
vtln_warp
);
private:
FbankOptions
opts_
;
FbankOptions
opts_
;
...
...
speechx/speechx/kaldi/feat/mel-computations.cc
浏览文件 @
96825d9c
...
@@ -120,8 +120,8 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
...
@@ -120,8 +120,8 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
last_index
=
i
;
last_index
=
i
;
}
}
}
}
KALDI_ASSERT
(
first_index
!=
-
1
&&
last_index
>=
first_index
//
KALDI_ASSERT(first_index != -1 && last_index >= first_index
&&
"You may have set --num-mel-bins too large."
);
//
&& "You may have set --num-mel-bins too large.");
bins_
[
bin
].
first
=
first_index
;
bins_
[
bin
].
first
=
first_index
;
int32
size
=
last_index
+
1
-
first_index
;
int32
size
=
last_index
+
1
-
first_index
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录