Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
8d66a254
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8d66a254
编写于
4月 01, 2022
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
cmvn and db norm
上级
a9f4ce47
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
199 addition
and
166 deletion
+199
-166
speechx/speechx/frontend/CMakeLists.txt
speechx/speechx/frontend/CMakeLists.txt
+2
-1
speechx/speechx/frontend/cmvn.cc
speechx/speechx/frontend/cmvn.cc
+1
-78
speechx/speechx/frontend/cmvn.h
speechx/speechx/frontend/cmvn.h
+34
-0
speechx/speechx/frontend/db_norm.cc
speechx/speechx/frontend/db_norm.cc
+95
-0
speechx/speechx/frontend/db_norm.h
speechx/speechx/frontend/db_norm.h
+65
-0
speechx/speechx/frontend/normalizer.h
speechx/speechx/frontend/normalizer.h
+2
-87
未找到文件。
speechx/speechx/frontend/CMakeLists.txt
浏览文件 @
8d66a254
project
(
frontend
)
add_library
(
frontend STATIC
normalizer.cc
cmvn.cc
db_norm.cc
linear_spectrogram.cc
audio_cache.cc
feature_cache.cc
...
...
speechx/speechx/frontend/
normalizer
.cc
→
speechx/speechx/frontend/
cmvn
.cc
浏览文件 @
8d66a254
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/normalizer.h"
#include "kaldi/feat/cmvn.h"
...
...
@@ -26,70 +12,7 @@ using std::vector;
using
kaldi
::
SubVector
;
using
std
::
unique_ptr
;
// Builds a decibel normalizer on top of `base_extractor`.
//
// opts:            normalization settings; a copy is stored in opts_.
// base_extractor:  upstream frontend stage; ownership is moved into this
//                  object.
// The reported dimension (dim_) is fixed at 1.
//
// Members are set in the initializer list (in declaration order) instead of
// being default-constructed first and assigned afterwards.
DecibelNormalizer::DecibelNormalizer(
    const DecibelNormalizerOptions& opts,
    std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts), dim_(1), base_extractor_(std::move(base_extractor)) {}
// Hands the incoming samples straight to the wrapped base extractor; no
// processing is done on this path.
void DecibelNormalizer::Accept(const kaldi::VectorBase<BaseFloat>& waves) {
    base_extractor_->Accept(waves);
}
// Reads data from the wrapped base extractor into `waves` and runs Compute
// on it.
//
// Returns false when the upstream Read fails or when it produced an empty
// vector; returns true otherwise. Compute's boolean result is ignored here,
// so Read still returns true when Compute reports failure.
bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) {
    if (!base_extractor_->Read(waves)) {
        return false;
    }
    if (waves->Dim() == 0) {
        return false;
    }
    Compute(waves);
    return true;
}
// Scales `waves` in place so that its RMS level, expressed as
// 10 * log10(mean of squared samples), is moved to opts_.target_db.
//
// When opts_.convert_int_float is set, every sample is first multiplied by
// 1 / 2^15; the rescaled value is both what is measured and what is written
// back.
//
// Returns false, leaving `waves` unmodified, when the required gain is larger
// than opts_.max_gain_db. Returns true otherwise (including for an empty
// vector, where there is nothing to scale).
bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
    // 1 / 2^15 written as a constant rather than a run-time std::pow call.
    constexpr BaseFloat kInt16ToFloat = 1.0f / 32768.0f;

    const size_t num_samples = waves->Dim();
    if (num_samples == 0) {
        // Avoids the 0 / 0 mean below; the result matches the old behavior.
        return true;
    }
    BaseFloat* const data = waves->Data();
    const bool rescale = opts_.convert_int_float;

    // Pass 1: mean of the squared samples. The input is only read here, so a
    // rejected gain leaves it untouched without needing a scratch copy.
    BaseFloat mean_square = 0.0;
    for (size_t i = 0; i < num_samples; ++i) {
        const BaseFloat d = rescale ? data[i] * kInt16ToFloat : data[i];
        mean_square += d * d;
    }
    mean_square /= num_samples;

    const BaseFloat rms_db = 10 * std::log10(mean_square);
    const BaseFloat gain = opts_.target_db - rms_db;
    if (gain > opts_.max_gain_db) {
        LOG(ERROR)
            << "Unable to normalize segment to " << opts_.target_db << "dB,"
            << "because the the probable gain have exceeds opts_.max_gain_db"
            << opts_.max_gain_db << "dB.";
        return false;
    }

    // Pass 2: apply the gain in place.
    // python: item *= 10.0 ** (gain / 20.0)
    // The factor does not depend on the sample, so it is computed once.
    const double scale = std::pow(10.0, gain / 20.0);
    for (size_t i = 0; i < num_samples; ++i) {
        BaseFloat d = rescale ? data[i] * kInt16ToFloat : data[i];
        d *= scale;
        data[i] = d;
    }
    return true;
}
CMVN
::
CMVN
(
std
::
string
cmvn_file
,
unique_ptr
<
FrontendInterface
>
base_extractor
)
...
...
@@ -185,4 +108,4 @@ void CMVN::ApplyCMVN(kaldi::MatrixBase<BaseFloat>* feats) {
ApplyCmvn
(
stats_
,
var_norm_
,
feats
);
}
}
// namespace ppspeech
}
// namespace ppspeech
\ No newline at end of file
speechx/speechx/frontend/cmvn.h
0 → 100644
浏览文件 @
8d66a254
#pragma once
#include "base/common.h"
#include "frontend/frontend_itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
namespace
ppspeech
{
// Frontend stage that wraps another FrontendInterface (`base_extractor`).
// Accept, Read, Compute and ApplyCMVN are defined out of line; the state
// queries below are forwarded to the wrapped stage.
class CMVN : public FrontendInterface {
  public:
    // cmvn_file:       passed by value to the out-of-line constructor.
    // base_extractor:  upstream stage; passed by unique_ptr, so ownership is
    //                  transferred to the constructor.
    explicit CMVN(std::string cmvn_file,
                  std::unique_ptr<FrontendInterface> base_extractor);

    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);

    // the length of feats = feature_row * feature_dim,
    // the Matrix is squashed into Vector
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

    // the dim_ is the feature dim.
    virtual size_t Dim() const { return dim_; }

    // The three calls below only forward to the wrapped stage.
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
    virtual void Reset() { base_extractor_->Reset(); }

  private:
    void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;

    // Element type spelled kaldi::BaseFloat, like every other signature in
    // this class, so the declaration does not rely on an unqualified
    // BaseFloat being visible at this point.
    void ApplyCMVN(kaldi::MatrixBase<kaldi::BaseFloat>* feats);

    kaldi::Matrix<double> stats_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    size_t dim_;
    bool var_norm_;
};
}
// namespace ppspeech
\ No newline at end of file
speechx/speechx/frontend/db_norm.cc
0 → 100644
浏览文件 @
8d66a254
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/normalizer.h"
#include "kaldi/feat/cmvn.h"
#include "kaldi/util/kaldi-io.h"
namespace
ppspeech
{
using
kaldi
::
Vector
;
using
kaldi
::
VectorBase
;
using
kaldi
::
BaseFloat
;
using
std
::
vector
;
using
kaldi
::
SubVector
;
using
std
::
unique_ptr
;
// Builds a decibel normalizer on top of `base_extractor`.
//
// opts:            normalization settings; a copy is stored in opts_.
// base_extractor:  upstream frontend stage; ownership is moved into this
//                  object.
// The reported dimension (dim_) is fixed at 1.
//
// Members are set in the initializer list (in declaration order) instead of
// being default-constructed first and assigned afterwards.
DecibelNormalizer::DecibelNormalizer(
    const DecibelNormalizerOptions& opts,
    std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts), dim_(1), base_extractor_(std::move(base_extractor)) {}
// Hands the incoming samples straight to the wrapped base extractor; no
// processing is done on this path.
void DecibelNormalizer::Accept(const kaldi::VectorBase<BaseFloat>& waves) {
    base_extractor_->Accept(waves);
}
// Reads data from the wrapped base extractor into `waves` and runs Compute
// on it.
//
// Returns false when the upstream Read fails or when it produced an empty
// vector; returns true otherwise. Compute's boolean result is ignored here,
// so Read still returns true when Compute reports failure.
bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) {
    if (!base_extractor_->Read(waves)) {
        return false;
    }
    if (waves->Dim() == 0) {
        return false;
    }
    Compute(waves);
    return true;
}
// Scales `waves` in place so that its RMS level, expressed as
// 10 * log10(mean of squared samples), is moved to opts_.target_db.
//
// When opts_.convert_int_float is set, every sample is first multiplied by
// 1 / 2^15; the rescaled value is both what is measured and what is written
// back.
//
// Returns false, leaving `waves` unmodified, when the required gain is larger
// than opts_.max_gain_db. Returns true otherwise (including for an empty
// vector, where there is nothing to scale).
bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
    // 1 / 2^15 written as a constant rather than a run-time std::pow call.
    constexpr BaseFloat kInt16ToFloat = 1.0f / 32768.0f;

    const size_t num_samples = waves->Dim();
    if (num_samples == 0) {
        // Avoids the 0 / 0 mean below; the result matches the old behavior.
        return true;
    }
    BaseFloat* const data = waves->Data();
    const bool rescale = opts_.convert_int_float;

    // Pass 1: mean of the squared samples. The input is only read here, so a
    // rejected gain leaves it untouched without needing a scratch copy.
    BaseFloat mean_square = 0.0;
    for (size_t i = 0; i < num_samples; ++i) {
        const BaseFloat d = rescale ? data[i] * kInt16ToFloat : data[i];
        mean_square += d * d;
    }
    mean_square /= num_samples;

    const BaseFloat rms_db = 10 * std::log10(mean_square);
    const BaseFloat gain = opts_.target_db - rms_db;
    if (gain > opts_.max_gain_db) {
        LOG(ERROR)
            << "Unable to normalize segment to " << opts_.target_db << "dB,"
            << "because the the probable gain have exceeds opts_.max_gain_db"
            << opts_.max_gain_db << "dB.";
        return false;
    }

    // Pass 2: apply the gain in place.
    // python: item *= 10.0 ** (gain / 20.0)
    // The factor does not depend on the sample, so it is computed once.
    const double scale = std::pow(10.0, gain / 20.0);
    for (size_t i = 0; i < num_samples; ++i) {
        BaseFloat d = rescale ? data[i] * kInt16ToFloat : data[i];
        d *= scale;
        data[i] = d;
    }
    return true;
}
}
// namespace ppspeech
speechx/speechx/frontend/db_norm.h
0 → 100644
浏览文件 @
8d66a254
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "base/common.h"
#include "frontend/frontend_itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
namespace
ppspeech
{
// Settings consumed by DecibelNormalizer. Each field can be exposed as a
// command-line style option through Register().
struct DecibelNormalizerOptions {
    // "target-db": target db for db normalization.
    float target_db = -20;
    // "max-gain-db": max gain db for db normalization.
    float max_gain_db = 300.0;
    // "convert-int-float": if convert int samples to float.
    bool convert_int_float = false;

    // User-provided (empty) constructor; the defaults come from the member
    // initializers above.
    DecibelNormalizerOptions() {}

    // Registers the three fields with `opts` under their option names so the
    // option parser can overwrite them.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register(
            "target-db", &target_db, "target db for db normalization");
        opts->Register(
            "max-gain-db", &max_gain_db, "max gain db for db normalization");
        opts->Register("convert-int-float",
                       &convert_int_float,
                       "if convert int samples to float");
    }
};
// Frontend stage that wraps another FrontendInterface (`base_extractor`) and
// is configured by a DecibelNormalizerOptions. Accept, Read and Compute are
// defined out of line; the state queries below are forwarded to the wrapped
// stage.
class DecibelNormalizer : public FrontendInterface {
  public:
    // opts:            normalization settings.
    // base_extractor:  upstream stage; passed by unique_ptr, so ownership is
    //                  transferred to the constructor.
    explicit DecibelNormalizer(
        const DecibelNormalizerOptions& opts,
        std::unique_ptr<FrontendInterface> base_extractor);

    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);

    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);

    // normalize audio, the dim is 1.
    virtual size_t Dim() const { return dim_; }

    // The three calls below only forward to the wrapped stage.
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
    virtual void Reset() { base_extractor_->Reset(); }

  private:
    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const;

    DecibelNormalizerOptions opts_;
    size_t dim_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    // NOTE(review): not referenced by any inline code in this class; confirm
    // whether it is still needed.
    kaldi::Vector<kaldi::BaseFloat> waveform_;
};
}
// namespace ppspeech
\ No newline at end of file
speechx/speechx/frontend/normalizer.h
浏览文件 @
8d66a254
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "base/common.h"
#include "frontend/frontend_itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
namespace
ppspeech
{
// Settings consumed by DecibelNormalizer. Each field can be exposed as a
// command-line style option through Register().
struct DecibelNormalizerOptions {
    // "target-db": target db for db normalization.
    float target_db = -20;
    // "max-gain-db": max gain db for db normalization.
    float max_gain_db = 300.0;
    // "convert-int-float": if convert int samples to float.
    bool convert_int_float = false;

    // User-provided (empty) constructor; the defaults come from the member
    // initializers above.
    DecibelNormalizerOptions() {}

    // Registers the three fields with `opts` under their option names so the
    // option parser can overwrite them.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register(
            "target-db", &target_db, "target db for db normalization");
        opts->Register(
            "max-gain-db", &max_gain_db, "max gain db for db normalization");
        opts->Register("convert-int-float",
                       &convert_int_float,
                       "if convert int samples to float");
    }
};
// Frontend stage that wraps another FrontendInterface (`base_extractor`) and
// is configured by a DecibelNormalizerOptions. Accept, Read and Compute are
// defined out of line; the state queries below are forwarded to the wrapped
// stage.
class DecibelNormalizer : public FrontendInterface {
  public:
    // opts:            normalization settings.
    // base_extractor:  upstream stage; passed by unique_ptr, so ownership is
    //                  transferred to the constructor.
    explicit DecibelNormalizer(
        const DecibelNormalizerOptions& opts,
        std::unique_ptr<FrontendInterface> base_extractor);

    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);

    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);

    // normalize audio, the dim is 1.
    virtual size_t Dim() const { return dim_; }

    // The three calls below only forward to the wrapped stage.
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
    virtual void Reset() { base_extractor_->Reset(); }

  private:
    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const;

    DecibelNormalizerOptions opts_;
    size_t dim_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    // NOTE(review): not referenced by any inline code in this class; confirm
    // whether it is still needed.
    kaldi::Vector<kaldi::BaseFloat> waveform_;
};
// Frontend stage that wraps another FrontendInterface (`base_extractor`).
// Accept, Read, Compute and ApplyCMVN are defined out of line; the state
// queries below are forwarded to the wrapped stage.
class CMVN : public FrontendInterface {
  public:
    // cmvn_file:       passed by value to the out-of-line constructor.
    // base_extractor:  upstream stage; passed by unique_ptr, so ownership is
    //                  transferred to the constructor.
    explicit CMVN(std::string cmvn_file,
                  std::unique_ptr<FrontendInterface> base_extractor);

    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);

    // the length of feats = feature_row * feature_dim,
    // the Matrix is squashed into Vector
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

    // the dim_ is the feature dim.
    virtual size_t Dim() const { return dim_; }

    // The three calls below only forward to the wrapped stage.
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
    virtual void Reset() { base_extractor_->Reset(); }

  private:
    void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;

    // Element type spelled kaldi::BaseFloat, like every other signature in
    // this class, so the declaration does not rely on an unqualified
    // BaseFloat being visible at this point.
    void ApplyCMVN(kaldi::MatrixBase<kaldi::BaseFloat>* feats);

    kaldi::Matrix<double> stats_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    size_t dim_;
    bool var_norm_;
};
}
// namespace ppspeech
\ No newline at end of file
#include "frontend/cmvn.h"
#include "frontend/db_norm.h"
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录