Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
fc72ab1e
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
fc72ab1e
编写于
10月 21, 2022
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
more debug info
上级
48271260
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
57 addition
and
20 deletion
+57
-20
speechx/build.sh
speechx/build.sh
+1
-1
speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc
speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc
+22
-3
speechx/speechx/frontend/audio/assembler.cc
speechx/speechx/frontend/audio/assembler.cc
+13
-6
speechx/speechx/frontend/audio/feature_cache.h
speechx/speechx/frontend/audio/feature_cache.h
+3
-1
speechx/speechx/nnet/decodable.cc
speechx/speechx/nnet/decodable.cc
+8
-3
speechx/speechx/nnet/u2_nnet.cc
speechx/speechx/nnet/u2_nnet.cc
+3
-2
speechx/speechx/recognizer/u2_recognizer.cc
speechx/speechx/recognizer/u2_recognizer.cc
+3
-3
speechx/speechx/recognizer/u2_recognizer_main.cc
speechx/speechx/recognizer/u2_recognizer_main.cc
+4
-1
未找到文件。
speechx/build.sh
浏览文件 @
fc72ab1e
...
@@ -20,4 +20,4 @@ fi
...
@@ -20,4 +20,4 @@ fi
mkdir
-p
build
mkdir
-p
build
cmake
-B
build
-DBOOST_ROOT
:STRING
=
${
boost_SOURCE_DIR
}
cmake
-B
build
-DBOOST_ROOT
:STRING
=
${
boost_SOURCE_DIR
}
cmake
--build
build
cmake
--build
build
-j
speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc
浏览文件 @
fc72ab1e
...
@@ -76,11 +76,15 @@ void CTCPrefixBeamSearch::AdvanceDecode(
...
@@ -76,11 +76,15 @@ void CTCPrefixBeamSearch::AdvanceDecode(
// forward frame by frame
// forward frame by frame
std
::
vector
<
kaldi
::
BaseFloat
>
frame_prob
;
std
::
vector
<
kaldi
::
BaseFloat
>
frame_prob
;
bool
flag
=
decodable
->
FrameLikelihood
(
num_frame_decoded_
,
&
frame_prob
);
bool
flag
=
decodable
->
FrameLikelihood
(
num_frame_decoded_
,
&
frame_prob
);
if
(
flag
==
false
)
break
;
if
(
flag
==
false
)
{
LOG
(
INFO
)
<<
"decoder advance decode exit."
<<
frame_prob
.
size
();
break
;
}
std
::
vector
<
std
::
vector
<
kaldi
::
BaseFloat
>>
likelihood
;
std
::
vector
<
std
::
vector
<
kaldi
::
BaseFloat
>>
likelihood
;
likelihood
.
push_back
(
frame_prob
);
likelihood
.
push_back
(
frame_prob
);
AdvanceDecoding
(
likelihood
);
AdvanceDecoding
(
likelihood
);
VLOG
(
2
)
<<
"num_frame_decoded_: "
<<
num_frame_decoded_
;
}
}
}
}
...
@@ -114,7 +118,11 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
...
@@ -114,7 +118,11 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
std
::
vector
<
float
>
topk_score
;
std
::
vector
<
float
>
topk_score
;
std
::
vector
<
int32_t
>
topk_index
;
std
::
vector
<
int32_t
>
topk_index
;
TopK
(
logp_t
,
first_beam_size
,
&
topk_score
,
&
topk_index
);
TopK
(
logp_t
,
first_beam_size
,
&
topk_score
,
&
topk_index
);
VLOG
(
2
)
<<
"topk: "
<<
num_frame_decoded_
<<
" "
<<
*
std
::
max_element
(
logp_t
.
begin
(),
logp_t
.
end
())
<<
" "
<<
topk_score
[
0
];
for
(
int
i
=
0
;
i
<
topk_score
.
size
();
i
++
){
VLOG
(
2
)
<<
"topk: "
<<
num_frame_decoded_
<<
" "
<<
topk_score
[
i
];
}
// 2. token passing
// 2. token passing
for
(
int
i
=
0
;
i
<
topk_index
.
size
();
++
i
)
{
for
(
int
i
=
0
;
i
<
topk_index
.
size
();
++
i
)
{
int
id
=
topk_index
[
i
];
int
id
=
topk_index
[
i
];
...
@@ -295,7 +303,18 @@ void CTCPrefixBeamSearch::UpdateOutputs(
...
@@ -295,7 +303,18 @@ void CTCPrefixBeamSearch::UpdateOutputs(
outputs_
.
emplace_back
(
output
);
outputs_
.
emplace_back
(
output
);
}
}
void
CTCPrefixBeamSearch
::
FinalizeSearch
()
{
UpdateFinalContext
();
}
void
CTCPrefixBeamSearch
::
FinalizeSearch
()
{
UpdateFinalContext
();
VLOG
(
2
)
<<
"num_frame_decoded_: "
<<
num_frame_decoded_
;
int
cnt
=
0
;
for
(
int
i
=
0
;
i
<
hypotheses_
.
size
();
i
++
){
VLOG
(
2
)
<<
"hyp "
<<
cnt
<<
" len: "
<<
hypotheses_
[
i
].
size
()
<<
" ctc score: "
<<
likelihood_
[
i
];
for
(
int
j
=
0
;
j
<
hypotheses_
[
i
].
size
();
j
++
){
VLOG
(
2
)
<<
hypotheses_
[
i
][
j
];
}
}
}
void
CTCPrefixBeamSearch
::
UpdateFinalContext
()
{
void
CTCPrefixBeamSearch
::
UpdateFinalContext
()
{
if
(
context_graph_
==
nullptr
)
return
;
if
(
context_graph_
==
nullptr
)
return
;
...
...
speechx/speechx/frontend/audio/assembler.cc
浏览文件 @
fc72ab1e
...
@@ -52,15 +52,21 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
...
@@ -52,15 +52,21 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
Vector
<
BaseFloat
>
feature
;
Vector
<
BaseFloat
>
feature
;
result
=
base_extractor_
->
Read
(
&
feature
);
result
=
base_extractor_
->
Read
(
&
feature
);
if
(
result
==
false
||
feature
.
Dim
()
==
0
)
{
if
(
result
==
false
||
feature
.
Dim
()
==
0
)
{
if
(
IsFinished
()
==
false
)
return
false
;
VLOG
(
1
)
<<
"result: "
<<
result
<<
"feature dim: "
<<
feature
.
Dim
();
break
;
if
(
IsFinished
()
==
false
)
{
LOG
(
INFO
)
<<
"finished reading feature. cache size: "
<<
feature_cache_
.
size
();
return
false
;
}
else
{
LOG
(
INFO
)
<<
"break"
;
break
;
}
}
}
CHECK
(
feature
.
Dim
()
==
dim_
);
CHECK
(
feature
.
Dim
()
==
dim_
);
feature_cache_
.
push
(
feature
);
nframes_
+=
1
;
nframes_
+=
1
;
VLOG
(
1
)
<<
"nframes: "
<<
nframes_
;
VLOG
(
1
)
<<
"nframes: "
<<
nframes_
;
feature_cache_
.
push
(
feature
);
}
}
if
(
feature_cache_
.
size
()
<
receptive_filed_length_
)
{
if
(
feature_cache_
.
size
()
<
receptive_filed_length_
)
{
...
@@ -68,8 +74,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
...
@@ -68,8 +74,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
return
false
;
return
false
;
}
}
if
(
fill_zero_
)
{
if
(
fill_zero_
){
while
(
feature_cache_
.
size
()
<
frame_chunk_size_
)
{
while
(
feature_cache_
.
size
()
<
frame_chunk_size_
)
{
Vector
<
BaseFloat
>
feature
(
dim_
,
kaldi
::
kSetZero
);
Vector
<
BaseFloat
>
feature
(
dim_
,
kaldi
::
kSetZero
);
nframes_
+=
1
;
nframes_
+=
1
;
...
@@ -79,6 +84,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
...
@@ -79,6 +84,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
int32
this_chunk_size
=
std
::
min
(
static_cast
<
int32
>
(
feature_cache_
.
size
()),
frame_chunk_size_
);
int32
this_chunk_size
=
std
::
min
(
static_cast
<
int32
>
(
feature_cache_
.
size
()),
frame_chunk_size_
);
feats
->
Resize
(
dim_
*
this_chunk_size
);
feats
->
Resize
(
dim_
*
this_chunk_size
);
VLOG
(
1
)
<<
"read "
<<
this_chunk_size
<<
" feat."
;
int32
counter
=
0
;
int32
counter
=
0
;
while
(
counter
<
this_chunk_size
)
{
while
(
counter
<
this_chunk_size
)
{
...
@@ -97,6 +103,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
...
@@ -97,6 +103,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
counter
++
;
counter
++
;
}
}
CHECK
(
feature_cache_
.
size
()
==
cache_size_
);
return
result
;
return
result
;
}
}
...
...
speechx/speechx/frontend/audio/feature_cache.h
浏览文件 @
fc72ab1e
...
@@ -41,12 +41,14 @@ class FeatureCache : public FrontendInterface {
...
@@ -41,12 +41,14 @@ class FeatureCache : public FrontendInterface {
virtual
size_t
Dim
()
const
{
return
dim_
;
}
virtual
size_t
Dim
()
const
{
return
dim_
;
}
virtual
void
SetFinished
()
{
virtual
void
SetFinished
()
{
LOG
(
INFO
)
<<
"set finished"
;
// std::unique_lock<std::mutex> lock(mutex_);
// std::unique_lock<std::mutex> lock(mutex_);
base_extractor_
->
SetFinished
();
base_extractor_
->
SetFinished
();
LOG
(
INFO
)
<<
"set finished"
;
// read the last chunk data
// read the last chunk data
Compute
();
Compute
();
// ready_feed_condition_.notify_one();
// ready_feed_condition_.notify_one();
LOG
(
INFO
)
<<
"compute last feats done."
;
}
}
virtual
bool
IsFinished
()
const
{
return
base_extractor_
->
IsFinished
();
}
virtual
bool
IsFinished
()
const
{
return
base_extractor_
->
IsFinished
();
}
...
...
speechx/speechx/nnet/decodable.cc
浏览文件 @
fc72ab1e
...
@@ -36,8 +36,6 @@ void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
...
@@ -36,8 +36,6 @@ void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
frames_ready_
+=
likelihood
.
NumRows
();
frames_ready_
+=
likelihood
.
NumRows
();
}
}
// Decodable::Init(DecodableConfig config) {
//}
// return the size of frame have computed.
// return the size of frame have computed.
int32
Decodable
::
NumFramesReady
()
const
{
return
frames_ready_
;
}
int32
Decodable
::
NumFramesReady
()
const
{
return
frames_ready_
;
}
...
@@ -70,9 +68,10 @@ bool Decodable::AdvanceChunk() {
...
@@ -70,9 +68,10 @@ bool Decodable::AdvanceChunk() {
Vector
<
BaseFloat
>
features
;
Vector
<
BaseFloat
>
features
;
if
(
frontend_
==
NULL
||
frontend_
->
Read
(
&
features
)
==
false
)
{
if
(
frontend_
==
NULL
||
frontend_
->
Read
(
&
features
)
==
false
)
{
// no feat or frontend_ not init.
// no feat or frontend_ not init.
VLOG
(
1
)
<<
"decodable exit;"
;
return
false
;
return
false
;
}
}
VLOG
(
2
)
<<
"Forward
with "
<<
features
.
Dim
()
<<
" frame
s."
;
VLOG
(
2
)
<<
"Forward
in "
<<
features
.
Dim
()
/
frontend_
->
Dim
()
<<
" feat
s."
;
// forward feats
// forward feats
NnetOut
out
;
NnetOut
out
;
...
@@ -80,6 +79,7 @@ bool Decodable::AdvanceChunk() {
...
@@ -80,6 +79,7 @@ bool Decodable::AdvanceChunk() {
int32
&
vocab_dim
=
out
.
vocab_dim
;
int32
&
vocab_dim
=
out
.
vocab_dim
;
Vector
<
BaseFloat
>&
logprobs
=
out
.
logprobs
;
Vector
<
BaseFloat
>&
logprobs
=
out
.
logprobs
;
VLOG
(
2
)
<<
"Forward out "
<<
logprobs
.
Dim
()
/
vocab_dim
<<
" decoder frames."
;
// cache nnet outupts
// cache nnet outupts
nnet_out_cache_
.
Resize
(
logprobs
.
Dim
()
/
vocab_dim
,
vocab_dim
);
nnet_out_cache_
.
Resize
(
logprobs
.
Dim
()
/
vocab_dim
,
vocab_dim
);
nnet_out_cache_
.
CopyRowsFromVec
(
logprobs
);
nnet_out_cache_
.
CopyRowsFromVec
(
logprobs
);
...
@@ -114,15 +114,20 @@ bool Decodable::AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
...
@@ -114,15 +114,20 @@ bool Decodable::AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
// read one frame likelihood
// read one frame likelihood
bool
Decodable
::
FrameLikelihood
(
int32
frame
,
vector
<
BaseFloat
>*
likelihood
)
{
bool
Decodable
::
FrameLikelihood
(
int32
frame
,
vector
<
BaseFloat
>*
likelihood
)
{
if
(
EnsureFrameHaveComputed
(
frame
)
==
false
)
{
if
(
EnsureFrameHaveComputed
(
frame
)
==
false
)
{
LOG
(
INFO
)
<<
"framelikehood exit."
;
return
false
;
return
false
;
}
}
int
nrows
=
nnet_out_cache_
.
NumRows
();
CHECK
(
nrows
==
(
frames_ready_
-
frame_offset_
));
int
vocab_size
=
nnet_out_cache_
.
NumCols
();
int
vocab_size
=
nnet_out_cache_
.
NumCols
();
likelihood
->
resize
(
vocab_size
);
likelihood
->
resize
(
vocab_size
);
for
(
int32
idx
=
0
;
idx
<
vocab_size
;
++
idx
)
{
for
(
int32
idx
=
0
;
idx
<
vocab_size
;
++
idx
)
{
(
*
likelihood
)[
idx
]
=
(
*
likelihood
)[
idx
]
=
nnet_out_cache_
(
frame
-
frame_offset_
,
idx
)
*
acoustic_scale_
;
nnet_out_cache_
(
frame
-
frame_offset_
,
idx
)
*
acoustic_scale_
;
VLOG
(
4
)
<<
"nnet out: "
<<
frame
<<
" offset:"
<<
frame_offset_
<<
" "
<<
nnet_out_cache_
.
NumRows
()
<<
" logprob: "
<<
nnet_out_cache_
(
frame
-
frame_offset_
,
idx
);
}
}
return
true
;
return
true
;
}
}
...
...
speechx/speechx/nnet/u2_nnet.cc
浏览文件 @
fc72ab1e
...
@@ -440,6 +440,7 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
...
@@ -440,6 +440,7 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
max_hyps_len
=
std
::
max
(
max_hyps_len
,
len
);
max_hyps_len
=
std
::
max
(
max_hyps_len
,
len
);
hyps_len_ptr
[
i
]
=
static_cast
<
int64_t
>
(
len
);
hyps_len_ptr
[
i
]
=
static_cast
<
int64_t
>
(
len
);
}
}
VLOG
(
2
)
<<
"max_hyps_len: "
<<
max_hyps_len
;
paddle
::
Tensor
hyps_tensor
=
paddle
::
Tensor
hyps_tensor
=
paddle
::
full
({
num_hyps
,
max_hyps_len
},
eos_
,
paddle
::
DataType
::
INT64
);
paddle
::
full
({
num_hyps
,
max_hyps_len
},
eos_
,
paddle
::
DataType
::
INT64
);
...
@@ -625,8 +626,8 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
...
@@ -625,8 +626,8 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
// combinded left-to-right and right-to-lfet score
// combinded left-to-right and right-to-lfet score
(
*
rescoring_score
)[
i
]
=
(
*
rescoring_score
)[
i
]
=
score
*
(
1
-
reverse_weight
)
+
r_score
*
reverse_weight
;
score
*
(
1
-
reverse_weight
)
+
r_score
*
reverse_weight
;
VLOG
(
1
)
<<
"hyp "
<<
i
<<
" score: "
<<
score
<<
" r_score: "
<<
r_score
VLOG
(
1
)
<<
"hyp "
<<
i
<<
"
"
<<
hyp
.
size
()
<<
"
score: "
<<
score
<<
" r_score: "
<<
r_score
<<
" reverse_weight: "
<<
reverse_weight
;
<<
" reverse_weight: "
<<
reverse_weight
<<
" final score: "
<<
(
*
rescoring_score
)[
i
]
;
}
}
}
}
...
...
speechx/speechx/recognizer/u2_recognizer.cc
浏览文件 @
fc72ab1e
...
@@ -52,7 +52,6 @@ void U2Recognizer::Reset() {
...
@@ -52,7 +52,6 @@ void U2Recognizer::Reset() {
num_frames_
=
0
;
num_frames_
=
0
;
result_
.
clear
();
result_
.
clear
();
feature_pipeline_
->
Reset
();
decodable_
->
Reset
();
decodable_
->
Reset
();
decoder_
->
Reset
();
decoder_
->
Reset
();
}
}
...
@@ -62,7 +61,6 @@ void U2Recognizer::ResetContinuousDecoding() {
...
@@ -62,7 +61,6 @@ void U2Recognizer::ResetContinuousDecoding() {
num_frames_
=
0
;
num_frames_
=
0
;
result_
.
clear
();
result_
.
clear
();
feature_pipeline_
->
Reset
();
decodable_
->
Reset
();
decodable_
->
Reset
();
decoder_
->
Reset
();
decoder_
->
Reset
();
}
}
...
@@ -192,10 +190,12 @@ void U2Recognizer::AttentionRescoring() {
...
@@ -192,10 +190,12 @@ void U2Recognizer::AttentionRescoring() {
// combine ctc score and rescoring score
// combine ctc score and rescoring score
for
(
size_t
i
=
0
;
i
<
num_hyps
;
i
++
)
{
for
(
size_t
i
=
0
;
i
<
num_hyps
;
i
++
)
{
VLOG
(
1
)
<<
"hyp "
<<
i
<<
" rescoring_score: "
<<
rescoring_score
[
i
]
VLOG
(
1
)
<<
"hyp "
<<
i
<<
" rescoring_score: "
<<
rescoring_score
[
i
]
<<
" ctc_score: "
<<
result_
[
i
].
score
;
<<
" ctc_score: "
<<
result_
[
i
].
score
<<
" rescoring_weight: "
<<
opts_
.
decoder_opts
.
rescoring_weight
<<
" ctc_weight: "
<<
opts_
.
decoder_opts
.
ctc_weight
;
result_
[
i
].
score
=
result_
[
i
].
score
=
opts_
.
decoder_opts
.
rescoring_weight
*
rescoring_score
[
i
]
+
opts_
.
decoder_opts
.
rescoring_weight
*
rescoring_score
[
i
]
+
opts_
.
decoder_opts
.
ctc_weight
*
result_
[
i
].
score
;
opts_
.
decoder_opts
.
ctc_weight
*
result_
[
i
].
score
;
VLOG
(
1
)
<<
"hyp: "
<<
result_
[
0
].
sentence
<<
" score: "
<<
result_
[
0
].
score
;
}
}
std
::
sort
(
result_
.
begin
(),
result_
.
end
(),
DecodeResult
::
CompareFunc
);
std
::
sort
(
result_
.
begin
(),
result_
.
end
(),
DecodeResult
::
CompareFunc
);
...
...
speechx/speechx/recognizer/u2_recognizer_main.cc
浏览文件 @
fc72ab1e
...
@@ -62,6 +62,7 @@ int main(int argc, char* argv[]) {
...
@@ -62,6 +62,7 @@ int main(int argc, char* argv[]) {
LOG
(
INFO
)
<<
"wav len (sample): "
<<
tot_samples
;
LOG
(
INFO
)
<<
"wav len (sample): "
<<
tot_samples
;
int
sample_offset
=
0
;
int
sample_offset
=
0
;
int
cnt
=
0
;
while
(
sample_offset
<
tot_samples
)
{
while
(
sample_offset
<
tot_samples
)
{
int
cur_chunk_size
=
int
cur_chunk_size
=
std
::
min
(
chunk_sample_size
,
tot_samples
-
sample_offset
);
std
::
min
(
chunk_sample_size
,
tot_samples
-
sample_offset
);
...
@@ -77,12 +78,14 @@ int main(int argc, char* argv[]) {
...
@@ -77,12 +78,14 @@ int main(int argc, char* argv[]) {
recognizer
.
SetFinished
();
recognizer
.
SetFinished
();
}
}
recognizer
.
Decode
();
recognizer
.
Decode
();
LOG
(
INFO
)
<<
"Pratial result: "
<<
recognizer
.
GetPartialResult
();
LOG
(
INFO
)
<<
"Pratial result: "
<<
cnt
<<
" "
<<
recognizer
.
GetPartialResult
();
// no overlap
// no overlap
sample_offset
+=
cur_chunk_size
;
sample_offset
+=
cur_chunk_size
;
cnt
++
;
}
}
CHECK
(
sample_offset
==
tot_samples
);
CHECK
(
sample_offset
==
tot_samples
);
VLOG
(
1
)
<<
"num decode: "
<<
cnt
;
// recognizer.SetFinished();
// recognizer.SetFinished();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录