Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
ab3097b7
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
206
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
ab3097b7
编写于
4月 24, 2022
作者:
H
Hui Zhang
提交者:
GitHub
4月 24, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1765 from zh794390558/fbank
[speechx] fbank and mfcc
上级
f8cb0c8e
44e14515
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
298 addition
and
15 deletion
+298
-15
speechx/speechx/frontend/audio/data_cache.h
speechx/speechx/frontend/audio/data_cache.h
+4
-2
speechx/speechx/frontend/audio/fbank.cc
speechx/speechx/frontend/audio/fbank.cc
+108
-0
speechx/speechx/frontend/audio/fbank.h
speechx/speechx/frontend/audio/fbank.h
+16
-11
speechx/speechx/frontend/audio/mfcc.cc
speechx/speechx/frontend/audio/mfcc.cc
+108
-0
speechx/speechx/frontend/audio/mfcc.h
speechx/speechx/frontend/audio/mfcc.h
+62
-2
未找到文件。
speechx/speechx/frontend/audio/data_cache.h
浏览文件 @
ab3097b7
...
@@ -21,8 +21,10 @@
...
@@ -21,8 +21,10 @@
namespace
ppspeech
{
namespace
ppspeech
{
// A data source for testing different frontend module.
// It accepts waves or feats.
// Simulates audio/feature input, by returning data from a Vector.
// This class is mostly meant to be used for online decoder testing using
// pre-recorded audio/feature
class
DataCache
:
public
FrontendInterface
{
class
DataCache
:
public
FrontendInterface
{
public:
public:
explicit
DataCache
()
{
finished_
=
false
;
}
explicit
DataCache
()
{
finished_
=
false
;
}
...
...
speechx/speechx/frontend/audio/fbank.cc
0 → 100644
浏览文件 @
ab3097b7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/fbank.h"
#include "kaldi/base/kaldi-math.h"
#include "kaldi/feat/feature-common.h"
#include "kaldi/feat/feature-functions.h"
#include "kaldi/matrix/matrix-functions.h"
namespace
ppspeech
{
using
kaldi
::
int32
;
using
kaldi
::
BaseFloat
;
using
kaldi
::
Vector
;
using
kaldi
::
SubVector
;
using
kaldi
::
VectorBase
;
using
kaldi
::
Matrix
;
using
std
::
vector
;
// Builds an Fbank front end that pulls raw samples from `base_extractor`.
//
// opts:           filter-bank configuration plus the streaming chunk length
//                 (in seconds).
// base_extractor: upstream FrontendInterface; ownership is transferred to
//                 this object.
//
// The initializer list follows the member declaration order in fbank.h
// (opts_, base_extractor_, window_function_, computer_). window_function_ is
// declared before computer_, so it is constructed first and must NOT be
// derived from computer_.GetFrameOptions(); it is built directly from the
// frame options stored in `opts` instead.
Fbank::Fbank(const FbankOptions& opts,
             std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts),
      base_extractor_(std::move(base_extractor)),
      window_function_(opts.fbank_opts.frame_opts),
      computer_(opts.fbank_opts) {
    // Number of samples requested from the upstream extractor per Read():
    // chunk duration (seconds) * sampling frequency (Hz). FbankOptions keeps
    // the frame options nested inside fbank_opts.
    chunk_sample_size_ = static_cast<int32>(
        opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq);
}
void
Fbank
::
Accept
(
const
VectorBase
<
BaseFloat
>&
inputs
)
{
base_extractor_
->
Accept
(
inputs
);
}
// Pulls one chunk from the upstream extractor, prepends the samples left
// over from the previous call, computes features into `feats`, and stores
// the samples not yet consumed by a frame shift for the next call.
//
// Returns false when the upstream Read() fails or yields no samples;
// returns true otherwise.
bool Fbank::Read(Vector<BaseFloat>* feats) {
    Vector<BaseFloat> chunk(chunk_sample_size_);
    const bool got_data = base_extractor_->Read(&chunk);
    if (!got_data || chunk.Dim() == 0) {
        return false;
    }

    // Concatenate [remaining samples from last call | new chunk].
    const int32 chunk_len = chunk.Dim();
    const int32 carry_len = remained_wav_.Dim();
    Vector<BaseFloat> waves(carry_len + chunk_len);
    waves.Range(0, carry_len).CopyFromVec(remained_wav_);
    waves.Range(carry_len, chunk_len).CopyFromVec(chunk);

    // Compute the speech features for the concatenated buffer.
    Compute(waves, feats);

    // Keep everything past the last frame shift that was consumed.
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    const int32 frame_shift = frame_opts.WindowShift();
    const int32 consumed = frame_shift * num_frames;
    const int32 left_samples = waves.Dim() - consumed;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));
    return true;
}
// Compute spectrogram feat
bool
Fbank
::
Compute
(
const
Vector
<
BaseFloat
>&
waves
,
Vector
<
BaseFloat
>*
feats
)
{
const
FrameExtractionOptions
&
frame_opts
=
computer_
.
GetFrameOptions
();
int32
num_samples
=
waves
.
Dim
();
int32
frame_length
=
frame_opts
.
WindowSize
();
int32
sample_rate
=
frame_opts
.
samp_freq
;
if
(
num_samples
<
frame_length
)
{
return
true
;
}
int32
num_frames
=
kaldi
::
NumFrames
(
num_samples
,
frame_opts
);
feats
->
Rsize
(
num_frames
*
Dim
());
Vector
<
BaseFloat
>
window
;
bool
need_raw_log_energy
=
computer_
.
NeedRawLogEnergy
();
for
(
int32
frame
=
0
;
frame
<
num_frames
;
frame
++
)
{
BaseFloat
raw_log_energy
=
0.0
;
kaldi
::
ExtractWindow
(
0
,
waves
,
frame
,
frame_opts
,
window_function_
,
&
window
,
need_raw_log_energy
?
&
raw_log_energy
:
NULL
);
Vector
<
BaseFloat
>
this_feature
(
computer_
.
Dim
(),
kUndefined
);
// note: this online feature-extraction code does not support VTLN.
BaseFloat
vtln_warp
=
1.0
;
computer_
.
Compute
(
raw_log_energy
,
vtln_warp
,
&
window
,
&
this_feature
);
SubVector
<
BaseFloat
>
output_row
(
feats
->
Data
()
+
frame
*
Dim
(),
Dim
());
output_row
.
CopyFromVec
(
this_feature
);
}
return
true
;
}
}
// namespace ppspeech
\ No newline at end of file
speechx/speechx/frontend/audio/fbank.h
浏览文件 @
ab3097b7
...
@@ -12,29 +12,30 @@
...
@@ -12,29 +12,30 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
// wrap the fbank feat of kaldi, todo (SmileGoat)
#pragma once
#include "kaldi/feat/feature-fbank.h"
#include "kaldi/feat/feature-mfcc.h"
#include "kaldi/feat/feature-mfcc.h"
#incl
du
e "kaldi/matrix/kaldi-vector.h"
#incl
ud
e "kaldi/matrix/kaldi-vector.h"
namespace
ppspeech
{
namespace
ppspeech
{
struct
FbankOptions
{
struct
FbankOptions
{
kaldi
::
F
rameExtractionOptions
frame
_opts
;
kaldi
::
F
bankOptions
fbank
_opts
;
kaldi
::
BaseFloat
streaming_chunk
;
// second
kaldi
::
BaseFloat
streaming_chunk
;
// second
LinearSpectrogramOptions
()
:
streaming_chunk
(
0.1
),
frame
_opts
()
{}
FbankOptions
()
:
streaming_chunk
(
0.1
),
fbank
_opts
()
{}
void
Register
(
kaldi
::
OptionsItf
*
opts
)
{
void
Register
(
kaldi
::
OptionsItf
*
opts
)
{
opts
->
Register
(
"streaming-chunk"
,
opts
->
Register
(
"streaming-chunk"
,
&
streaming_chunk
,
&
streaming_chunk
,
"streaming chunk size, default: 0.1 sec"
);
"streaming chunk size, default: 0.1 sec"
);
f
rame
_opts
.
Register
(
opts
);
f
bank
_opts
.
Register
(
opts
);
}
}
};
};
class
Fbank
:
FrontendInterface
{
class
Fbank
:
public
FrontendInterface
{
public:
public:
explicit
Fbank
(
const
FbankOptions
&
opts
,
explicit
Fbank
(
const
FbankOptions
&
opts
,
unique_ptr
<
FrontendInterface
>
base_extractor
);
unique_ptr
<
FrontendInterface
>
base_extractor
);
...
@@ -42,7 +43,7 @@ class Fbank : FrontendInterface {
...
@@ -42,7 +43,7 @@ class Fbank : FrontendInterface {
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
// the dim_ is the dim of single frame feature
// the dim_ is the dim of single frame feature
virtual
size_t
Dim
()
const
{
return
dim_
;
}
virtual
size_t
Dim
()
const
{
return
computer_
.
Dim
()
;
}
virtual
void
SetFinished
()
{
base_extractor_
->
SetFinished
();
}
virtual
void
SetFinished
()
{
base_extractor_
->
SetFinished
();
}
...
@@ -57,13 +58,17 @@ class Fbank : FrontendInterface {
...
@@ -57,13 +58,17 @@ class Fbank : FrontendInterface {
bool
Compute
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
waves
,
bool
Compute
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
waves
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
// kaldi::FeatureWindowFunction feature_window_funtion_;
// kaldi::BaseFloat hanning_window_energy_;
size_t
dim_
;
FbankOptions
opts_
;
FbankOptions
opts_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
FeatureWindowFunction
window_function_
;
kaldi
::
FbankComputer
computer_
;
// features_ is the Mfcc or Plp or Fbank features that we have already
// computed.
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
features_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
remained_wav_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
remained_wav_
;
int
chunk_sample_size_
;
DISALLOW_COPY_AND_ASSIGN
(
Fbank
);
DISALLOW_COPY_AND_ASSIGN
(
Fbank
);
};
};
...
...
speechx/speechx/frontend/audio/mfcc.cc
0 → 100644
浏览文件 @
ab3097b7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/mfcc.h"
#include "kaldi/base/kaldi-math.h"
#include "kaldi/feat/feature-common.h"
#include "kaldi/feat/feature-functions.h"
#include "kaldi/matrix/matrix-functions.h"
namespace
ppspeech
{
using
kaldi
::
int32
;
using
kaldi
::
BaseFloat
;
using
kaldi
::
Vector
;
using
kaldi
::
SubVector
;
using
kaldi
::
VectorBase
;
using
kaldi
::
Matrix
;
using
std
::
vector
;
// Builds an Mfcc front end that pulls raw samples from `base_extractor`.
//
// opts:           MFCC configuration plus the streaming chunk length
//                 (in seconds).
// base_extractor: upstream FrontendInterface; ownership is transferred to
//                 this object.
//
// The initializer list follows the member declaration order in mfcc.h
// (opts_, base_extractor_, window_function_, computer_). window_function_ is
// declared before computer_, so it is constructed first and must NOT be
// derived from computer_.GetFrameOptions(); it is built directly from the
// frame options stored in `opts` instead.
Mfcc::Mfcc(const MfccOptions& opts,
           std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts),
      base_extractor_(std::move(base_extractor)),
      window_function_(opts.mfcc_opts.frame_opts),
      computer_(opts.mfcc_opts) {
    // Number of samples requested from the upstream extractor per Read():
    // chunk duration (seconds) * sampling frequency (Hz). MfccOptions keeps
    // the frame options nested inside mfcc_opts.
    chunk_sample_size_ = static_cast<int32>(
        opts.streaming_chunk * opts.mfcc_opts.frame_opts.samp_freq);
}
void
Mfcc
::
Accept
(
const
VectorBase
<
BaseFloat
>&
inputs
)
{
base_extractor_
->
Accept
(
inputs
);
}
// Pulls one chunk from the upstream extractor, prepends the samples left
// over from the previous call, computes features into `feats`, and stores
// the samples not yet consumed by a frame shift for the next call.
//
// Returns false when the upstream Read() fails or yields no samples;
// returns true otherwise.
bool Mfcc::Read(Vector<BaseFloat>* feats) {
    Vector<BaseFloat> chunk(chunk_sample_size_);
    const bool got_data = base_extractor_->Read(&chunk);
    if (!got_data || chunk.Dim() == 0) {
        return false;
    }

    // Concatenate [remaining samples from last call | new chunk].
    const int32 chunk_len = chunk.Dim();
    const int32 carry_len = remained_wav_.Dim();
    Vector<BaseFloat> waves(carry_len + chunk_len);
    waves.Range(0, carry_len).CopyFromVec(remained_wav_);
    waves.Range(carry_len, chunk_len).CopyFromVec(chunk);

    // Compute the speech features for the concatenated buffer.
    Compute(waves, feats);

    // Keep everything past the last frame shift that was consumed.
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    const int32 frame_shift = frame_opts.WindowShift();
    const int32 consumed = frame_shift * num_frames;
    const int32 left_samples = waves.Dim() - consumed;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));
    return true;
}
// Compute spectrogram feat
bool
Mfcc
::
Compute
(
const
Vector
<
BaseFloat
>&
waves
,
Vector
<
BaseFloat
>*
feats
)
{
const
FrameExtractionOptions
&
frame_opts
=
computer_
.
GetFrameOptions
();
int32
num_samples
=
waves
.
Dim
();
int32
frame_length
=
frame_opts
.
WindowSize
();
int32
sample_rate
=
frame_opts
.
samp_freq
;
if
(
num_samples
<
frame_length
)
{
return
true
;
}
int32
num_frames
=
kaldi
::
NumFrames
(
num_samples
,
frame_opts
);
feats
->
Rsize
(
num_frames
*
Dim
());
Vector
<
BaseFloat
>
window
;
bool
need_raw_log_energy
=
computer_
.
NeedRawLogEnergy
();
for
(
int32
frame
=
0
;
frame
<
num_frames
;
frame
++
)
{
BaseFloat
raw_log_energy
=
0.0
;
kaldi
::
ExtractWindow
(
0
,
waves
,
frame
,
frame_opts
,
window_function_
,
&
window
,
need_raw_log_energy
?
&
raw_log_energy
:
NULL
);
Vector
<
BaseFloat
>
this_feature
(
computer_
.
Dim
(),
kUndefined
);
// note: this online feature-extraction code does not support VTLN.
BaseFloat
vtln_warp
=
1.0
;
computer_
.
Compute
(
raw_log_energy
,
vtln_warp
,
&
window
,
&
this_feature
);
SubVector
<
BaseFloat
>
output_row
(
feats
->
Data
()
+
frame
*
Dim
(),
Dim
());
output_row
.
CopyFromVec
(
this_feature
);
}
return
true
;
}
}
// namespace ppspeech
\ No newline at end of file
speechx/speechx/frontend/audio/mfcc.h
浏览文件 @
ab3097b7
...
@@ -12,5 +12,65 @@
...
@@ -12,5 +12,65 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
// wrap the mfcc feat of kaldi, todo (SmileGoat)
#pragma once
#include "kaldi/feat/feature-mfcc.h"
\ No newline at end of file
#include "kaldi/feat/feature-mfcc.h"
#include "kaldi/feat/feature-mfcc.h"
#include "kaldi/matrix/kaldi-vector.h"
namespace
ppspeech
{
// Options for the Mfcc front end: the Kaldi MFCC configuration plus the
// length of audio requested per streaming read.
struct MfccOptions {
    kaldi::MfccOptions mfcc_opts;
    kaldi::BaseFloat streaming_chunk;  // second

    // Initializers are listed in declaration order, which is the order the
    // members are actually constructed in.
    MfccOptions() : mfcc_opts(), streaming_chunk(0.1) {}

    // Registers "streaming-chunk" and then every option of mfcc_opts on
    // `opts`.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("streaming-chunk",
                       &streaming_chunk,
                       "streaming chunk size, default: 0.1 sec");
        mfcc_opts.Register(opts);
    }
};
// Streaming MFCC front end. Wraps kaldi::MfccComputer and reads its input
// from another FrontendInterface (`base_extractor`).
class Mfcc : public FrontendInterface {
  public:
    // Takes ownership of `base_extractor`, the upstream data source.
    explicit Mfcc(const MfccOptions& opts,
                  std::unique_ptr<FrontendInterface> base_extractor);

    // Forwards `inputs` to the upstream extractor.
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);

    // Reads one chunk from upstream and writes the computed features to
    // `feats`. Returns false when upstream has nothing to deliver.
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

    // Dimension of a single frame's feature vector.
    virtual size_t Dim() const { return computer_.Dim(); }

    virtual void SetFinished() { base_extractor_->SetFinished(); }

    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

    // Resets the upstream extractor and drops any cached leftover samples.
    virtual void Reset() {
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    }

  private:
    // Computes features for `waves` into `feats`; see mfcc.cc.
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                 kaldi::Vector<kaldi::BaseFloat>* feats);

    MfccOptions opts_;
    std::unique_ptr<FrontendInterface> base_extractor_;

    // NOTE: members are constructed in declaration order, so
    // window_function_ is constructed before computer_ and must not be
    // initialized from computer_.
    kaldi::FeatureWindowFunction window_function_;
    kaldi::MfccComputer computer_;

    // features_ is the Mfcc or Plp or Fbank features that we have already
    // computed.
    kaldi::Vector<kaldi::BaseFloat> features_;

    // Samples left over from the previous Read(), not yet consumed by a
    // frame shift.
    kaldi::Vector<kaldi::BaseFloat> remained_wav_;

    // Number of samples requested from base_extractor_ per Read(); set by
    // the constructor in mfcc.cc.
    int chunk_sample_size_;

    DISALLOW_COPY_AND_ASSIGN(Mfcc);
};
}
// namespace ppspeech
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录