Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
182858bf
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
206
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
182858bf
编写于
6月 02, 2022
作者:
H
Hui Zhang
提交者:
GitHub
6月 02, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #2003 from SmileGoat/refactor_file_struct
[speechx] refactor frontend
上级
82c1f4c5
6dbf3081
变更
17
隐藏空白更改
内联
并排
Showing
17 changed file
with
241 addition
and
250 deletion
+241
-250
speechx/examples/custom_asr/run.sh
speechx/examples/custom_asr/run.sh
+0
-1
speechx/examples/ds2_ol/aishell/run.sh
speechx/examples/ds2_ol/aishell/run.sh
+0
-2
speechx/examples/ds2_ol/aishell/run_fbank.sh
speechx/examples/ds2_ol/aishell/run_fbank.sh
+0
-1
speechx/examples/ds2_ol/websocket/websocket_client.sh
speechx/examples/ds2_ol/websocket/websocket_client.sh
+1
-1
speechx/examples/ds2_ol/websocket/websocket_server.sh
speechx/examples/ds2_ol/websocket/websocket_server.sh
+0
-1
speechx/speechx/decoder/param.h
speechx/speechx/decoder/param.h
+2
-4
speechx/speechx/decoder/recognizer_main.cc
speechx/speechx/decoder/recognizer_main.cc
+2
-1
speechx/speechx/frontend/audio/audio_cache.h
speechx/speechx/frontend/audio/audio_cache.h
+3
-2
speechx/speechx/frontend/audio/compute_fbank_main.cc
speechx/speechx/frontend/audio/compute_fbank_main.cc
+5
-6
speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
...speechx/frontend/audio/compute_linear_spectrogram_main.cc
+0
-1
speechx/speechx/frontend/audio/fbank.cc
speechx/speechx/frontend/audio/fbank.cc
+20
-83
speechx/speechx/frontend/audio/fbank.h
speechx/speechx/frontend/audio/fbank.h
+15
-42
speechx/speechx/frontend/audio/feature_common.h
speechx/speechx/frontend/audio/feature_common.h
+54
-0
speechx/speechx/frontend/audio/feature_common_inl.h
speechx/speechx/frontend/audio/feature_common_inl.h
+95
-0
speechx/speechx/frontend/audio/feature_pipeline.h
speechx/speechx/frontend/audio/feature_pipeline.h
+1
-1
speechx/speechx/frontend/audio/linear_spectrogram.cc
speechx/speechx/frontend/audio/linear_spectrogram.cc
+21
-70
speechx/speechx/frontend/audio/linear_spectrogram.h
speechx/speechx/frontend/audio/linear_spectrogram.h
+22
-34
未找到文件。
speechx/examples/custom_asr/run.sh
浏览文件 @
182858bf
...
...
@@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
recognizer_test_main
\
--wav_rspecifier
=
scp:
$wav_scp
\
--cmvn_file
=
$cmvn
\
--streaming_chunk
=
30
\
--use_fbank
=
true
\
--model_path
=
$model_dir
/avg_10.jit.pdmodel
\
--param_path
=
$model_dir
/avg_10.jit.pdiparams
\
...
...
speechx/examples/ds2_ol/aishell/run.sh
浏览文件 @
182858bf
...
...
@@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--wav_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/
${
aishell_wav_scp
}
\
--feature_wspecifier
=
ark,scp:
$data
/split
${
nj
}
/JOB/feat.ark,
$data
/split
${
nj
}
/JOB/feat.scp
\
--cmvn_file
=
$cmvn
\
--streaming_chunk
=
0.36
echo
"feature make have finished!!!"
fi
...
...
@@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--wav_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/
${
aishell_wav_scp
}
\
--cmvn_file
=
$cmvn
\
--model_path
=
$model_dir
/avg_1.jit.pdmodel
\
--streaming_chunk
=
30
\
--param_path
=
$model_dir
/avg_1.jit.pdiparams
\
--word_symbol_table
=
$wfst
/words.txt
\
--model_output_names
=
softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0
\
...
...
speechx/examples/ds2_ol/aishell/run_fbank.sh
浏览文件 @
182858bf
...
...
@@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--wav_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/
${
aishell_wav_scp
}
\
--cmvn_file
=
$cmvn
\
--model_path
=
$model_dir
/avg_5.jit.pdmodel
\
--streaming_chunk
=
30
\
--use_fbank
=
true
\
--param_path
=
$model_dir
/avg_5.jit.pdiparams
\
--word_symbol_table
=
$wfst
/words.txt
\
...
...
speechx/examples/ds2_ol/websocket/websocket_client.sh
浏览文件 @
182858bf
...
...
@@ -32,4 +32,4 @@ export GLOG_logtostderr=1
# websocket client
websocket_client_main
\
--wav_rspecifier
=
scp:
$data
/
$aishell_wav_scp
--streaming_chunk
=
0.36
\ No newline at end of file
--wav_rspecifier
=
scp:
$data
/
$aishell_wav_scp
--streaming_chunk
=
0.5
speechx/examples/ds2_ol/websocket/websocket_server.sh
浏览文件 @
182858bf
...
...
@@ -62,7 +62,6 @@ fi
websocket_server_main
\
--cmvn_file
=
$cmvn
\
--model_path
=
$model_dir
/avg_1.jit.pdmodel
\
--streaming_chunk
=
0.1
\
--param_path
=
$model_dir
/avg_1.jit.pdiparams
\
--word_symbol_table
=
$wfst
/words.txt
\
--model_output_names
=
softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0
\
...
...
speechx/speechx/decoder/param.h
浏览文件 @
182858bf
...
...
@@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
// feature, or fbank");
DEFINE_int32
(
num_bins
,
161
,
"num bins of mel"
);
DEFINE_string
(
cmvn_file
,
""
,
"read cmvn"
);
DEFINE_double
(
streaming_chunk
,
0.1
,
"streaming feature chunk size"
);
// feature sliding window
DEFINE_int32
(
receptive_field_length
,
7
,
...
...
@@ -62,7 +61,6 @@ namespace ppspeech {
FeaturePipelineOptions
InitFeaturePipelineOptions
()
{
FeaturePipelineOptions
opts
;
opts
.
cmvn_file
=
FLAGS_cmvn_file
;
opts
.
linear_spectrogram_opts
.
streaming_chunk
=
FLAGS_streaming_chunk
;
kaldi
::
FrameExtractionOptions
frame_opts
;
frame_opts
.
dither
=
0.0
;
frame_opts
.
frame_shift_ms
=
10
;
...
...
@@ -71,8 +69,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
opts
.
to_float32
=
false
;
frame_opts
.
window_type
=
"povey"
;
frame_opts
.
frame_length_ms
=
25
;
opts
.
fbank_opts
.
fbank_opts
.
mel_opts
.
num_bins
=
FLAGS_num_bins
;
opts
.
fbank_opts
.
f
bank_opts
.
f
rame_opts
=
frame_opts
;
opts
.
fbank_opts
.
mel_opts
.
num_bins
=
FLAGS_num_bins
;
opts
.
fbank_opts
.
frame_opts
=
frame_opts
;
}
else
{
opts
.
to_float32
=
true
;
frame_opts
.
remove_dc_offset
=
false
;
...
...
speechx/speechx/decoder/recognizer_main.cc
浏览文件 @
182858bf
...
...
@@ -19,6 +19,7 @@
DEFINE_string
(
wav_rspecifier
,
""
,
"test feature rspecifier"
);
DEFINE_string
(
result_wspecifier
,
""
,
"test result wspecifier"
);
DEFINE_double
(
streaming_chunk
,
0.36
,
"streaming feature chunk size"
);
DEFINE_int32
(
sample_rate
,
16000
,
"sample rate"
);
int
main
(
int
argc
,
char
*
argv
[])
{
...
...
@@ -96,4 +97,4 @@ int main(int argc, char* argv[]) {
KALDI_LOG
<<
" cost:"
<<
elapsed
<<
" s"
;
KALDI_LOG
<<
"total wav duration is: "
<<
tot_wav_duration
<<
" s"
;
KALDI_LOG
<<
"the RTF is: "
<<
elapsed
/
tot_wav_duration
;
}
\ No newline at end of file
}
speechx/speechx/frontend/audio/audio_cache.h
浏览文件 @
182858bf
...
...
@@ -30,8 +30,9 @@ class AudioCache : public FrontendInterface {
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
waves
);
// the audio dim is 1, one sample
virtual
size_t
Dim
()
const
{
return
1
;
}
// the audio dim is 1, one sample, which is useless,
// so we return size_(cache samples) instead.
virtual
size_t
Dim
()
const
{
return
size_
;
}
virtual
void
SetFinished
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
...
...
speechx/speechx/frontend/audio/compute_fbank_main.cc
浏览文件 @
182858bf
...
...
@@ -49,12 +49,11 @@ int main(int argc, char* argv[]) {
std
::
unique_ptr
<
ppspeech
::
FrontendInterface
>
data_source
(
new
ppspeech
::
AudioCache
(
3600
*
1600
,
false
));
ppspeech
::
FbankOptions
opt
;
opt
.
fbank_opts
.
frame_opts
.
frame_length_ms
=
25
;
opt
.
fbank_opts
.
frame_opts
.
frame_shift_ms
=
10
;
opt
.
streaming_chunk
=
FLAGS_streaming_chunk
;
opt
.
fbank_opts
.
mel_opts
.
num_bins
=
FLAGS_num_bins
;
opt
.
fbank_opts
.
frame_opts
.
dither
=
0.0
;
kaldi
::
FbankOptions
opt
;
opt
.
frame_opts
.
frame_length_ms
=
25
;
opt
.
frame_opts
.
frame_shift_ms
=
10
;
opt
.
mel_opts
.
num_bins
=
FLAGS_num_bins
;
opt
.
frame_opts
.
dither
=
0.0
;
std
::
unique_ptr
<
ppspeech
::
FrontendInterface
>
fbank
(
new
ppspeech
::
Fbank
(
opt
,
std
::
move
(
data_source
)));
...
...
speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
浏览文件 @
182858bf
...
...
@@ -49,7 +49,6 @@ int main(int argc, char* argv[]) {
ppspeech
::
LinearSpectrogramOptions
opt
;
opt
.
frame_opts
.
frame_length_ms
=
20
;
opt
.
frame_opts
.
frame_shift_ms
=
10
;
opt
.
streaming_chunk
=
FLAGS_streaming_chunk
;
opt
.
frame_opts
.
dither
=
0.0
;
opt
.
frame_opts
.
remove_dc_offset
=
false
;
opt
.
frame_opts
.
window_type
=
"hanning"
;
...
...
speechx/speechx/frontend/audio/fbank.cc
浏览文件 @
182858bf
...
...
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/fbank.h"
#include "kaldi/base/kaldi-math.h"
#include "kaldi/feat/feature-common.h"
...
...
@@ -29,95 +28,33 @@ using kaldi::VectorBase;
using
kaldi
::
Matrix
;
using
std
::
vector
;
// todo refactor later:(SmileGoat)
Fbank
::
Fbank
(
const
FbankOptions
&
opts
,
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
)
FbankComputer
::
FbankComputer
(
const
Options
&
opts
)
:
opts_
(
opts
),
computer_
(
opts
.
fbank_opts
),
window_function_
(
opts
.
fbank_opts
.
frame_opts
)
{
base_extractor_
=
std
::
move
(
base_extractor
);
chunk_sample_size_
=
static_cast
<
int32
>
(
opts
.
streaming_chunk
*
opts
.
fbank_opts
.
frame_opts
.
samp_freq
);
}
computer_
(
opts
)
{}
void
Fbank
::
Accept
(
const
VectorBase
<
BaseFloat
>&
inputs
)
{
base_extractor_
->
Accept
(
inputs
);
int32
FbankComputer
::
Dim
()
const
{
return
opts_
.
mel_opts
.
num_bins
+
(
opts_
.
use_energy
?
1
:
0
);
}
bool
Fbank
::
Read
(
Vector
<
BaseFloat
>*
feats
)
{
Vector
<
BaseFloat
>
wav
(
chunk_sample_size_
);
bool
flag
=
base_extractor_
->
Read
(
&
wav
);
if
(
flag
==
false
||
wav
.
Dim
()
==
0
)
return
false
;
// append remained waves
int32
wav_len
=
wav
.
Dim
();
int32
left_len
=
remained_wav_
.
Dim
();
Vector
<
BaseFloat
>
waves
(
left_len
+
wav_len
);
waves
.
Range
(
0
,
left_len
).
CopyFromVec
(
remained_wav_
);
waves
.
Range
(
left_len
,
wav_len
).
CopyFromVec
(
wav
);
// compute speech feature
Compute
(
waves
,
feats
);
// cache remained waves
kaldi
::
FrameExtractionOptions
frame_opts
=
computer_
.
GetFrameOptions
();
int32
num_frames
=
kaldi
::
NumFrames
(
waves
.
Dim
(),
frame_opts
);
int32
frame_shift
=
frame_opts
.
WindowShift
();
int32
left_samples
=
waves
.
Dim
()
-
frame_shift
*
num_frames
;
remained_wav_
.
Resize
(
left_samples
);
remained_wav_
.
CopyFromVec
(
waves
.
Range
(
frame_shift
*
num_frames
,
left_samples
));
return
true
;
bool
FbankComputer
::
NeedRawLogEnergy
()
{
return
opts_
.
use_energy
&&
opts_
.
raw_energy
;
}
// Compute spectrogram feat
bool
Fbank
::
Compute
(
const
Vector
<
BaseFloat
>&
waves
,
Vector
<
BaseFloat
>*
feats
)
{
const
kaldi
::
FrameExtractionOptions
&
frame_opts
=
computer_
.
GetFrameOptions
();
int32
num_samples
=
waves
.
Dim
();
int32
frame_length
=
frame_opts
.
WindowSize
();
int32
sample_rate
=
frame_opts
.
samp_freq
;
if
(
num_samples
<
frame_length
)
{
return
true
;
}
int32
num_frames
=
kaldi
::
NumFrames
(
num_samples
,
frame_opts
);
feats
->
Resize
(
num_frames
*
Dim
());
Vector
<
BaseFloat
>
window
;
bool
need_raw_log_energy
=
computer_
.
NeedRawLogEnergy
();
for
(
int32
frame
=
0
;
frame
<
num_frames
;
frame
++
)
{
BaseFloat
raw_log_energy
=
0.0
;
kaldi
::
ExtractWindow
(
0
,
waves
,
frame
,
frame_opts
,
window_function_
,
&
window
,
need_raw_log_energy
?
&
raw_log_energy
:
NULL
);
Vector
<
BaseFloat
>
this_feature
(
computer_
.
Dim
(),
kaldi
::
kUndefined
);
// note: this online feature-extraction code does not support VTLN.
RealFft
(
&
window
,
true
);
kaldi
::
ComputePowerSpectrum
(
&
window
);
const
kaldi
::
MelBanks
&
mel_bank
=
*
(
computer_
.
GetMelBanks
(
1.0
));
SubVector
<
BaseFloat
>
power_spectrum
(
window
,
0
,
window
.
Dim
()
/
2
+
1
);
if
(
!
opts_
.
fbank_opts
.
use_power
)
{
power_spectrum
.
ApplyPow
(
0.5
);
}
int32
mel_offset
=
((
opts_
.
fbank_opts
.
use_energy
&&
!
opts_
.
fbank_opts
.
htk_compat
)
?
1
:
0
);
SubVector
<
BaseFloat
>
mel_energies
(
this_feature
,
mel_offset
,
opts_
.
fbank_opts
.
mel_opts
.
num_bins
);
mel_bank
.
Compute
(
power_spectrum
,
&
mel_energies
);
mel_energies
.
ApplyFloor
(
1e-07
);
mel_energies
.
ApplyLog
();
SubVector
<
BaseFloat
>
output_row
(
feats
->
Data
()
+
frame
*
Dim
(),
Dim
());
output_row
.
CopyFromVec
(
this_feature
);
// Compute feat
bool
FbankComputer
::
Compute
(
Vector
<
BaseFloat
>*
window
,
Vector
<
BaseFloat
>*
feat
)
{
RealFft
(
window
,
true
);
kaldi
::
ComputePowerSpectrum
(
window
);
const
kaldi
::
MelBanks
&
mel_bank
=
*
(
computer_
.
GetMelBanks
(
1.0
));
SubVector
<
BaseFloat
>
power_spectrum
(
*
window
,
0
,
window
->
Dim
()
/
2
+
1
);
if
(
!
opts_
.
use_power
)
{
power_spectrum
.
ApplyPow
(
0.5
);
}
int32
mel_offset
=
((
opts_
.
use_energy
&&
!
opts_
.
htk_compat
)
?
1
:
0
);
SubVector
<
BaseFloat
>
mel_energies
(
*
feat
,
mel_offset
,
opts_
.
mel_opts
.
num_bins
);
mel_bank
.
Compute
(
power_spectrum
,
&
mel_energies
);
mel_energies
.
ApplyFloor
(
1e-07
);
mel_energies
.
ApplyLog
();
return
true
;
}
...
...
speechx/speechx/frontend/audio/fbank.h
浏览文件 @
182858bf
...
...
@@ -15,6 +15,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-fbank.h"
#include "kaldi/feat/feature-mfcc.h"
...
...
@@ -22,56 +23,28 @@
namespace
ppspeech
{
struct
FbankOptions
{
kaldi
::
FbankOptions
fbank_opts
;
kaldi
::
BaseFloat
streaming_chunk
;
// second
FbankOptions
()
:
streaming_chunk
(
0.1
),
fbank_opts
()
{}
void
Register
(
kaldi
::
OptionsItf
*
opts
)
{
opts
->
Register
(
"streaming-chunk"
,
&
streaming_chunk
,
"streaming chunk size, default: 0.1 sec"
);
fbank_opts
.
Register
(
opts
);
}
};
class
Fbank
:
public
FrontendInterface
{
class
FbankComputer
{
public:
explicit
Fbank
(
const
FbankOptions
&
opts
,
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
);
virtual
void
Accept
(
const
kaldi
::
VectorBase
<
kaldi
::
BaseFloat
>&
inputs
);
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
typedef
kaldi
::
FbankOptions
Options
;
explicit
FbankComputer
(
const
Options
&
opts
);
// the dim_ is the dim of single frame feature
virtual
size_t
Dim
()
const
{
return
computer_
.
Dim
();
}
virtual
void
SetFinished
()
{
base_extractor_
->
SetFinished
();
}
kaldi
::
FrameExtractionOptions
&
GetFrameOptions
()
{
return
opts_
.
frame_opts
;
}
virtual
bool
IsFinished
()
const
{
return
base_extractor_
->
IsFinished
();
}
bool
Compute
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
window
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feat
);
int32
Dim
()
const
;
virtual
void
Reset
()
{
base_extractor_
->
Reset
();
remained_wav_
.
Resize
(
0
);
}
bool
NeedRawLogEnergy
();
private:
bool
Compute
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
waves
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
Options
opts_
;
FbankOptions
opts_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
kaldi
::
FeatureWindowFunction
window_function_
;
kaldi
::
FbankComputer
computer_
;
// features_ is the Mfcc or Plp or Fbank features that we have already
// computed.
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
features_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
remained_wav_
;
kaldi
::
int32
chunk_sample_size_
;
DISALLOW_COPY_AND_ASSIGN
(
Fbank
);
DISALLOW_COPY_AND_ASSIGN
(
FbankComputer
);
};
typedef
StreamingFeatureTpl
<
FbankComputer
>
Fbank
;
}
// namespace ppspeech
speechx/speechx/frontend/audio/feature_common.h
0 → 100644
浏览文件 @
182858bf
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "frontend_itf.h"
#include "kaldi/feat/feature-window.h"
namespace
ppspeech
{
// Streaming feature front end parameterized on a feature computer F.
// It reads audio from an upstream FrontendInterface and uses F to turn that
// audio into per-frame features. This template relies on F providing a nested
// Options type and the Dim(), GetFrameOptions(), NeedRawLogEnergy() and
// Compute() members.
template <class F>
class StreamingFeatureTpl : public FrontendInterface {
  public:
    using Options = typename F::Options;

    // `opts` configures the computer and the window function;
    // `base_extractor` is the upstream stage and is owned by this object.
    StreamingFeatureTpl(const Options& opts,
                        std::unique_ptr<FrontendInterface> base_extractor);

    // Forwards `waves` to the upstream stage.
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);

    // Reads from the upstream stage and computes features into `feats`.
    // Returns false when the upstream read fails or yields no samples.
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

    // Dimension of a single frame of features, as reported by the computer.
    virtual size_t Dim() const { return computer_.Dim(); }

    // Finished state is tracked by the upstream stage.
    virtual void SetFinished() { base_extractor_->SetFinished(); }

    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

    // Resets the upstream stage and drops any samples carried over between
    // Read() calls.
    virtual void Reset() {
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    }

  private:
    // Computes features for every frame found in `waves` and stores them,
    // one frame after another, in `feats`.
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                 kaldi::Vector<kaldi::BaseFloat>* feats);

    // NOTE: members are initialized in the order they are declared here.
    Options opts_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    kaldi::FeatureWindowFunction window_function_;
    // Samples left over from the previous Read() call.
    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
    F computer_;
};
}
// namespace ppspeech
#include "frontend/audio/feature_common_inl.h"
speechx/speechx/frontend/audio/feature_common_inl.h
0 → 100644
浏览文件 @
182858bf
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace
ppspeech
{
// Constructs the streaming feature front end.
//   opts:           copied into opts_ and used to build the computer;
//                   opts.frame_opts configures the window function.
//   base_extractor: upstream stage; ownership moves into this object.
// Initializers are listed in member declaration order (opts_,
// base_extractor_, window_function_, computer_), which is the order in which
// they actually run. remained_wav_ is default-constructed.
template <class F>
StreamingFeatureTpl<F>::StreamingFeatureTpl(
    const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts),
      base_extractor_(std::move(base_extractor)),
      window_function_(opts.frame_opts),
      computer_(opts) {}
// Passes incoming samples straight through to the upstream stage; nothing is
// stored in this object.
template <class F>
void StreamingFeatureTpl<F>::Accept(
    const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
    base_extractor_->Accept(waves);
}
// Reads one chunk from the upstream stage, prepends the samples left over
// from the previous call, computes features for the combined buffer and
// keeps the unconsumed tail for the next call.
//   feats: output; filled by Compute(). When the combined buffer is shorter
//          than one window, Compute() returns early and leaves *feats
//          untouched, yet this function still returns true.
// Returns false when the upstream read fails or yields no samples.
template <class F>
bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    kaldi::Vector<kaldi::BaseFloat> wav(base_extractor_->Dim());
    if (!base_extractor_->Read(&wav) || wav.Dim() == 0) return false;

    // append remained waves
    const int32 wav_len = wav.Dim();
    const int32 left_len = remained_wav_.Dim();
    kaldi::Vector<kaldi::BaseFloat> waves(left_len + wav_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, wav_len).CopyFromVec(wav);

    // compute speech feature
    Compute(waves, feats);

    // cache remained waves
    // Bind by reference: the options are only read here, so there is no need
    // to copy the whole struct on every call.
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    const int32 consumed = frame_opts.WindowShift() * num_frames;
    // NOTE(review): assumes consumed <= waves.Dim(); confirm this holds for
    // every frame option combination that callers use.
    const int32 left_samples = waves.Dim() - consumed;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));
    return true;
}
// Compute feat
// Splits `waves` into frames and lets the computer turn each windowed frame
// into one feature row.
//   waves: input samples.
//   feats: output; resized to num_frames * Dim() and filled one frame after
//          another. Left untouched when `waves` is shorter than one window.
// Always returns true.
template <class F>
bool StreamingFeatureTpl<F>::Compute(
    const kaldi::Vector<kaldi::BaseFloat>& waves,
    kaldi::Vector<kaldi::BaseFloat>* feats) {
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const int32 num_samples = waves.Dim();
    const int32 frame_length = frame_opts.WindowSize();
    if (num_samples < frame_length) {
        return true;
    }

    const int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
    const size_t feat_dim = Dim();
    feats->Resize(num_frames * feat_dim);

    const bool need_raw_log_energy = computer_.NeedRawLogEnergy();
    kaldi::Vector<kaldi::BaseFloat> window;
    // Scratch row reused for every frame instead of being reallocated.
    kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(),
                                                 kaldi::kUndefined);
    for (int32 frame = 0; frame < num_frames; frame++) {
        // NOTE(review): raw_log_energy is filled in by ExtractWindow when
        // requested but is never read afterwards in this function.
        kaldi::BaseFloat raw_log_energy = 0.0;
        kaldi::ExtractWindow(0,
                             waves,
                             frame,
                             frame_opts,
                             window_function_,
                             &window,
                             need_raw_log_energy ? &raw_log_energy : nullptr);

        computer_.Compute(&window, &this_feature);

        kaldi::SubVector<kaldi::BaseFloat> output_row(
            feats->Data() + frame * feat_dim, feat_dim);
        output_row.CopyFromVec(this_feature);
    }
    return true;
}
}
// namespace ppspeech
speechx/speechx/frontend/audio/feature_pipeline.h
浏览文件 @
182858bf
...
...
@@ -32,7 +32,7 @@ struct FeaturePipelineOptions {
bool
to_float32
;
// true, only for linear feature
bool
use_fbank
;
LinearSpectrogramOptions
linear_spectrogram_opts
;
FbankOptions
fbank_opts
;
kaldi
::
FbankOptions
fbank_opts
;
FeatureCacheOptions
feature_cache_opts
;
AssemblerOptions
assembler_opts
;
...
...
speechx/speechx/frontend/audio/linear_spectrogram.cc
浏览文件 @
182858bf
...
...
@@ -28,81 +28,32 @@ using kaldi::VectorBase;
using
kaldi
::
Matrix
;
using
std
::
vector
;
LinearSpectrogram
::
LinearSpectrogram
(
const
LinearSpectrogramOptions
&
opts
,
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
)
:
opts_
(
opts
),
feature_window_funtion_
(
opts
.
frame_opts
)
{
base_extractor_
=
std
::
move
(
base_extractor
);
LinearSpectrogramComputer
::
LinearSpectrogramComputer
(
const
Options
&
opts
)
:
opts_
(
opts
)
{
kaldi
::
FeatureWindowFunction
feature_window_function
(
opts
.
frame_opts
);
int32
window_size
=
opts
.
frame_opts
.
WindowSize
();
int32
window_shift
=
opts
.
frame_opts
.
WindowShift
()
;
frame_length_
=
window_size
;
dim_
=
window_size
/
2
+
1
;
chunk_sample_size_
=
static_cast
<
int32
>
(
opts
.
streaming_chunk
*
opts
.
frame_opts
.
samp_freq
);
hanning_window_energy_
=
kaldi
::
VecVec
(
feature_window_funtion_
.
window
,
feature_window_funtion_
.
window
);
}
void
LinearSpectrogram
::
Accept
(
const
VectorBase
<
BaseFloat
>&
inputs
)
{
base_extractor_
->
Accept
(
inputs
);
}
bool
LinearSpectrogram
::
Read
(
Vector
<
BaseFloat
>*
feats
)
{
Vector
<
BaseFloat
>
input_feats
(
chunk_sample_size_
);
bool
flag
=
base_extractor_
->
Read
(
&
input_feats
);
if
(
flag
==
false
||
input_feats
.
Dim
()
==
0
)
return
false
;
int32
feat_len
=
input_feats
.
Dim
();
int32
left_len
=
remained_wav_
.
Dim
();
Vector
<
BaseFloat
>
waves
(
feat_len
+
left_len
);
waves
.
Range
(
0
,
left_len
).
CopyFromVec
(
remained_wav_
);
waves
.
Range
(
left_len
,
feat_len
).
CopyFromVec
(
input_feats
);
Compute
(
waves
,
feats
);
int32
frame_shift
=
opts_
.
frame_opts
.
WindowShift
();
int32
num_frames
=
kaldi
::
NumFrames
(
waves
.
Dim
(),
opts_
.
frame_opts
);
int32
left_samples
=
waves
.
Dim
()
-
frame_shift
*
num_frames
;
remained_wav_
.
Resize
(
left_samples
);
remained_wav_
.
CopyFromVec
(
waves
.
Range
(
frame_shift
*
num_frames
,
left_samples
));
return
true
;
BaseFloat
hanning_window_energy
=
kaldi
::
VecVec
(
feature_window_function
.
window
,
feature_window_function
.
window
);
int32
sample_rate
=
opts
.
frame_opts
.
samp_freq
;
scale_
=
2.0
/
(
hanning_window_energy
*
sample_rate
);
}
// Compute spectrogram feat
bool
LinearSpectrogram
::
Compute
(
const
Vector
<
BaseFloat
>&
waves
,
Vector
<
BaseFloat
>*
feats
)
{
int32
num_samples
=
waves
.
Dim
();
int32
frame_length
=
opts_
.
frame_opts
.
WindowSize
();
int32
sample_rate
=
opts_
.
frame_opts
.
samp_freq
;
BaseFloat
scale
=
2.0
/
(
hanning_window_energy_
*
sample_rate
);
if
(
num_samples
<
frame_length
)
{
return
true
;
}
int32
num_frames
=
kaldi
::
NumFrames
(
num_samples
,
opts_
.
frame_opts
);
feats
->
Resize
(
num_frames
*
dim_
);
Vector
<
BaseFloat
>
window
;
for
(
int
frame_idx
=
0
;
frame_idx
<
num_frames
;
++
frame_idx
)
{
kaldi
::
ExtractWindow
(
0
,
waves
,
frame_idx
,
opts_
.
frame_opts
,
feature_window_funtion_
,
&
window
,
NULL
);
SubVector
<
BaseFloat
>
output_row
(
feats
->
Data
()
+
frame_idx
*
dim_
,
dim_
);
window
.
Resize
(
frame_length
,
kaldi
::
kCopyData
);
RealFft
(
&
window
,
true
);
kaldi
::
ComputePowerSpectrum
(
&
window
);
SubVector
<
BaseFloat
>
power_spectrum
(
window
,
0
,
dim_
);
power_spectrum
.
Scale
(
scale
);
power_spectrum
(
0
)
=
power_spectrum
(
0
)
/
2
;
power_spectrum
(
dim_
-
1
)
=
power_spectrum
(
dim_
-
1
)
/
2
;
power_spectrum
.
Add
(
1e-14
);
power_spectrum
.
ApplyLog
();
output_row
.
CopyFromVec
(
power_spectrum
);
}
bool
LinearSpectrogramComputer
::
Compute
(
Vector
<
BaseFloat
>*
window
,
Vector
<
BaseFloat
>*
feat
)
{
window
->
Resize
(
frame_length_
,
kaldi
::
kCopyData
);
RealFft
(
window
,
true
);
kaldi
::
ComputePowerSpectrum
(
window
);
SubVector
<
BaseFloat
>
power_spectrum
(
*
window
,
0
,
dim_
);
power_spectrum
.
Scale
(
scale_
);
power_spectrum
(
0
)
=
power_spectrum
(
0
)
/
2
;
power_spectrum
(
dim_
-
1
)
=
power_spectrum
(
dim_
-
1
)
/
2
;
power_spectrum
.
Add
(
1e-14
);
power_spectrum
.
ApplyLog
();
feat
->
CopyFromVec
(
power_spectrum
);
return
true
;
}
...
...
speechx/speechx/frontend/audio/linear_spectrogram.h
浏览文件 @
182858bf
...
...
@@ -16,6 +16,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-window.h"
...
...
@@ -23,47 +24,34 @@ namespace ppspeech {
struct
LinearSpectrogramOptions
{
kaldi
::
FrameExtractionOptions
frame_opts
;
kaldi
::
BaseFloat
streaming_chunk
;
// second
LinearSpectrogramOptions
()
:
streaming_chunk
(
0.1
),
frame_opts
()
{}
void
Register
(
kaldi
::
OptionsItf
*
opts
)
{
opts
->
Register
(
"streaming-chunk"
,
&
streaming_chunk
,
"streaming chunk size, default: 0.1 sec"
);
frame_opts
.
Register
(
opts
);
}
LinearSpectrogramOptions
()
:
frame_opts
()
{}
};
class
LinearSpectrogram
:
public
FrontendInterface
{
class
LinearSpectrogram
Computer
{
public:
explicit
LinearSpectrogram
(
const
LinearSpectrogramOptions
&
opts
,
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
);
virtual
void
Accept
(
const
kaldi
::
VectorBase
<
kaldi
::
BaseFloat
>&
inputs
);
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
// the dim_ is the dim of single frame feature
virtual
size_t
Dim
()
const
{
return
dim_
;
}
virtual
void
SetFinished
()
{
base_extractor_
->
SetFinished
();
}
virtual
bool
IsFinished
()
const
{
return
base_extractor_
->
IsFinished
();
}
virtual
void
Reset
()
{
base_extractor_
->
Reset
();
remained_wav_
.
Resize
(
0
);
typedef
LinearSpectrogramOptions
Options
;
explicit
LinearSpectrogramComputer
(
const
Options
&
opts
);
kaldi
::
FrameExtractionOptions
&
GetFrameOptions
()
{
return
opts_
.
frame_opts
;
}
private:
bool
Compute
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
waves
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
bool
Compute
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
window
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feat
);
size_t
dim_
;
kaldi
::
FeatureWindowFunction
feature_window_funtion_
;
kaldi
::
BaseFloat
hanning_window_energy_
;
LinearSpectrogramOptions
opts_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
remained_wav_
;
int
chunk_sample_size_
;
DISALLOW_COPY_AND_ASSIGN
(
LinearSpectrogram
);
int32
Dim
()
const
{
return
dim_
;
}
bool
NeedRawLogEnergy
()
{
return
false
;
}
private:
kaldi
::
BaseFloat
scale_
;
Options
opts_
;
int32
frame_length_
;
int32
dim_
;
DISALLOW_COPY_AND_ASSIGN
(
LinearSpectrogramComputer
);
};
typedef
StreamingFeatureTpl
<
LinearSpectrogramComputer
>
LinearSpectrogram
;
}
// namespace ppspeech
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录