Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
a01fa866
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a01fa866
编写于
1月 28, 2022
作者:
S
SmileGoat
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add normalizer
上级
88275aff
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
167 addition
and
5 deletion
+167
-5
speechx/speechx/frontend/linear_spectrogram.cc
speechx/speechx/frontend/linear_spectrogram.cc
+4
-4
speechx/speechx/frontend/linear_spectrogram.h
speechx/speechx/frontend/linear_spectrogram.h
+1
-1
speechx/speechx/frontend/normalizer.cc
speechx/speechx/frontend/normalizer.cc
+97
-0
speechx/speechx/frontend/normalizer.h
speechx/speechx/frontend/normalizer.h
+65
-0
未找到文件。
speechx/speechx/frontend/linear_spectrogram.cc
浏览文件 @
a01fa866
...
...
@@ -89,7 +89,7 @@ bool LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) const {
// Compute spectrogram feat, return num frames
// todo: refactor later (SmileGoat)
int32
LinearSpectrogram
::
Compute
(
const
vector
<
float
>&
wave
,
bool
LinearSpectrogram
::
Compute
(
const
vector
<
float
>&
wave
,
vector
<
vector
<
float
>>&
feat
)
{
int
num_samples
=
wave
.
size
();
const
int
&
frame_length
=
opts
.
frame_opts
.
WindowSize
();
...
...
@@ -99,7 +99,7 @@ int32 LinearSpectrogram::Compute(const vector<float>& wave,
const
float
scale
=
hanning_window_energy_
*
frame_shift
;
if
(
num_samples
<
frame_length
)
{
return
0
;
return
true
;
}
int
num_frames
=
1
+
((
num_samples
-
frame_length
)
/
frame_shift
);
...
...
@@ -118,7 +118,7 @@ int32 LinearSpectrogram::Compute(const vector<float>& wave,
v
.
assign
(
data
.
begin
(),
data
.
end
());
if
(
NumpyFft
(
&
v
,
fft_real
,
fft_img
))
{
LOG
(
ERROR
)
<<
i
<<
" fft compute occurs error, please checkout the input data"
;
return
-
1
;
return
false
;
}
feat
[
i
].
resize
(
fft_points
/
2
+
1
);
// the last dimension is Fs/2 Hz
...
...
@@ -135,5 +135,5 @@ int32 LinearSpectrogram::Compute(const vector<float>& wave,
// log added eps=1e-14
feat
[
i
][
j
]
=
std
::
log
(
feat
[
i
][
j
]
+
1e-14
);
}
return
0
;
return
true
;
}
speechx/speechx/frontend/linear_spectrogram.h
浏览文件 @
a01fa866
...
...
@@ -28,7 +28,7 @@ class LinearSpectrogram : public FeatureExtractorInterface {
private:
void
Hanning
(
std
::
vector
<
kaldi
::
BaseFloat
>&
data
)
const
;
kaldi
::
int32
Compute
(
const
std
::
vector
<
kaldi
::
BaseFloat
>&
wave
,
std
::
vector
<
std
::
vector
<
kaldi
::
BaseFloat
>>&
feat
)
const
;
std
::
vector
<
std
::
vector
<
kaldi
::
BaseFloat
>>&
feat
);
bool
NumpyFft
(
std
::
vector
<
kaldi
::
BaseFloat
>*
v
,
std
::
vector
<
kaldi
::
BaseFloat
>*
real
,
std
::
vector
<
kaldi
::
BaseFloat
>*
img
)
const
;
...
...
speechx/speechx/frontend/normalizer.cc
0 → 100644
浏览文件 @
a01fa866
#include "frontend/normalizer.h"
// Constructs a decibel normalizer.
//
// opts:          normalization settings.
// pre_extractor: the upstream feature extractor.
//
// NOTE(review): placeholder implementation; both arguments are currently
// unused and nothing is copied or retained here.
DecibelNormalizer::DecibelNormalizer(
    const DecibelNormalizerOptions& opts,
    const std::unique_ptr<FeatureExtractorInterface>& pre_extractor) {}
// Receives a block of input samples.
//
// NOTE(review): placeholder implementation; `input` is currently ignored.
void DecibelNormalizer::AcceptWavefrom(
    const kaldi::Vector<kaldi::BaseFloat>& input) {}
// Hands processed data back to the caller through `feat`.
//
// NOTE(review): placeholder implementation; `feat` is currently left
// untouched.
void DecibelNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {}
bool
DecibelNormalizer
::
Compute
(
const
Vector
<
kaldi
::
BaseFloat
>&
input
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feat
)
{
// calculate db rms
float
rms_db
=
0.0
;
float
mean_square
=
0.0
;
float
gain
=
0.0
;
vector
<
BaseFloat
>
smaples
;
samples
.
resize
(
input
.
Size
());
for
(
int32
i
=
0
;
i
<
samples
.
size
();
++
i
)
{
samples
[
i
]
=
input
(
i
);
}
// square
for
(
auto
&
d
:
samples
)
{
if
(
_opts
.
convert_int_float
)
{
d
=
d
*
WAVE_FLOAT_NORMALIZATION
;
}
mean_square
+=
d
*
d
;
}
// mean
mean_square
/=
samples
.
size
();
rms_db
=
10
*
std
::
log10
(
mean_square
);
gain
=
opts
.
target_db
-
rms_db
;
if
(
gain
>
opts
.
max_gain_db
)
{
LOG
(
ERROR
)
<<
"Unable to normalize segment to "
<<
opts
.
target_db
<<
"dB,"
<<
"because the the probable gain have exceeds opts.max_gain_db"
<<
opts
.
max_gain_db
<<
"dB."
;
return
false
;
}
// Note that this is an in-place transformation.
for
(
auto
&
item
:
samples
)
{
// python item *= 10.0 ** (gain / 20.0)
item
*=
std
::
pow
(
10.0
,
gain
/
20.0
);
}
return
true
;
}
// Constructs a mean/variance normalizer.
//
// opts:          normalization settings. The type is NormalizerOptions, as
//                declared for this constructor in normalizer.h.
// pre_extractor: the upstream feature extractor.
//
// NOTE(review): placeholder implementation; both arguments are currently
// unused and no member is initialized here.
PPNormalizer::PPNormalizer(
    const NormalizerOptions& opts,
    const std::unique_ptr<FeatureExtractorInterface>& pre_extractor) {}
// Receives a block of input samples.
//
// NOTE(review): placeholder implementation; `input` is currently ignored.
void PPNormalizer::AcceptWavefrom(
    const kaldi::Vector<kaldi::BaseFloat>& input) {}
// Hands processed data back to the caller through `feat`.
//
// NOTE(review): placeholder implementation; `feat` is currently left
// untouched.
void PPNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {}
bool
PPNormalizer
::
Compute
(
const
Vector
<
kaldi
::
BaseFloat
>&
input
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>>*
feat
)
{
if
((
input
.
Dim
()
%
mean_
.
Dim
())
==
0
)
{
LOG
(
ERROR
)
<<
"CMVN dimension is wrong!"
;
return
false
;
}
try
{
int32
size
=
mean_
.
Dim
();
feat
->
Resize
(
input
.
Dim
());
for
(
int32
row_idx
=
0
;
row_idx
<
j
;
++
row_idx
)
{
int32
base_idx
=
row_idx
*
size
;
for
(
int32
idx
=
0
;
idx
<
mean_
.
Dim
();
++
idx
)
{
(
*
feat
)(
base_idx
+
idx
)
=
(
input
(
base_dix
+
idx
)
-
mean_
(
idx
))
*
variance_
(
idx
);
}
}
}
catch
(
const
std
::
exception
&
e
)
{
std
::
cerr
<<
e
.
what
()
<<
'\n'
;
return
false
;
}
return
true
;
}
speechx/speechx/frontend/normalizer.h
0 → 100644
浏览文件 @
a01fa866
#pragma once
#include "frontend/feature_extractor_interface.h"
namespace
ppspeech
{
// Settings for decibel normalization. Each field can be registered as a
// command-line option through Register().
struct DecibelNormalizerOptions {
    float target_db;         // "target db for db normalization"
    float max_gain_db;       // "max gain db for db normalization"
    bool convert_int_float;  // "if convert int samples to float"

    // Defaults: target_db = -20, max_gain_db = 300, convert_int_float = false.
    DecibelNormalizerOptions()
        : target_db(-20), max_gain_db(300.0), convert_int_float(false) {}

    // Registers every field with `opts` under its command-line name.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register(
            "target-db", &target_db, "target db for db normalization");
        opts->Register(
            "max-gain-db", &max_gain_db, "max gain db for db normalization");
        opts->Register("convert-int-float",
                       &convert_int_float,
                       "if convert int samples to float");
    }
};
class
DecibelNormalizer
:
public
FeatureExtractorInterface
{
public:
explict
DecibelNormalizer
(
const
DecibelNormalizerOptions
&
opts
,
const
std
::
unique_ptr
<
FeatureExtractorInterface
>&
pre_extractor
);
virtual
void
AcceptWavefrom
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
input
);
virtual
void
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feat
);
virtual
size_t
Dim
()
const
;
bool
Compute
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
input
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>>*
feat
);
private:
};
// Settings for PPNormalizer.
struct NormalizerOptions {
    // Path given through the "mean-std" option ("mean std file"); empty by
    // default.
    std::string mean_std_path;

    NormalizerOptions() : mean_std_path("") {}

    // Registers the field with `opts` under its command-line name.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("mean-std", &mean_std_path, "mean std file");
    }
};
// todo refactor later (SmileGoat)
//
// Feature-extractor stage that normalizes features with a per-dimension mean
// (mean_) and scale (variance_).
class PPNormalizer : public FeatureExtractorInterface {
  public:
    // opts:          normalization settings.
    // pre_extractor: the upstream feature extractor.
    explicit PPNormalizer(
        const NormalizerOptions& opts,
        const std::unique_ptr<FeatureExtractorInterface>& pre_extractor);
    ~PPNormalizer() {}

    // NOTE(review): the name is spelled "Wavefrom"; it is kept as-is because
    // it presumably mirrors FeatureExtractorInterface -- confirm before
    // renaming.
    virtual void AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input);
    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
    virtual size_t Dim() const;

    // Normalizes `input` and stores the result in `feat`; returns false on
    // failure. `feat` is a pointer, matching the definition in normalizer.cc.
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
                 kaldi::Vector<kaldi::BaseFloat>* feat);

  private:
    bool _initialized;
    kaldi::Vector<float> mean_;      // per-dimension value subtracted
    kaldi::Vector<float> variance_;  // per-dimension multiplier
    NormalizerOptions _opts;
};
}
// namespace ppspeech
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录