Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
8c2196ea
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
8c2196ea
编写于
4月 24, 2023
作者:
Y
YangZhou
提交者:
GitHub
4月 24, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[engine] add wfst recognizer in example (#3173)
* update wfst script * add skip blank
上级
5e2251af
变更
21
隐藏空白更改
内联
并排
Showing
21 changed file
with
178 addition
and
50 deletion
+178
-50
runtime/engine/asr/decoder/ctc_prefix_beam_search_decoder.cc
runtime/engine/asr/decoder/ctc_prefix_beam_search_decoder.cc
+2
-2
runtime/engine/asr/decoder/ctc_prefix_beam_search_decoder_main.cc
...engine/asr/decoder/ctc_prefix_beam_search_decoder_main.cc
+1
-1
runtime/engine/asr/decoder/ctc_tlg_decoder.h
runtime/engine/asr/decoder/ctc_tlg_decoder.h
+1
-1
runtime/engine/asr/decoder/ctc_tlg_decoder_main.cc
runtime/engine/asr/decoder/ctc_tlg_decoder_main.cc
+1
-1
runtime/engine/asr/decoder/param.h
runtime/engine/asr/decoder/param.h
+1
-4
runtime/engine/asr/nnet/nnet_producer.cc
runtime/engine/asr/nnet/nnet_producer.cc
+19
-4
runtime/engine/asr/nnet/nnet_producer.h
runtime/engine/asr/nnet/nnet_producer.h
+6
-1
runtime/engine/asr/nnet/u2_nnet.cc
runtime/engine/asr/nnet/u2_nnet.cc
+13
-2
runtime/engine/asr/nnet/u2_nnet.h
runtime/engine/asr/nnet/u2_nnet.h
+2
-1
runtime/engine/asr/recognizer/recognizer_controller_impl.cc
runtime/engine/asr/recognizer/recognizer_controller_impl.cc
+2
-1
runtime/engine/asr/recognizer/recognizer_main.cc
runtime/engine/asr/recognizer/recognizer_main.cc
+3
-2
runtime/engine/asr/recognizer/recognizer_resource.h
runtime/engine/asr/recognizer/recognizer_resource.h
+3
-0
runtime/engine/kaldi/fstbin/CMakeLists.txt
runtime/engine/kaldi/fstbin/CMakeLists.txt
+1
-1
runtime/examples/u2pp_ol/wenetspeech/RESULTS.md
runtime/examples/u2pp_ol/wenetspeech/RESULTS.md
+15
-12
runtime/examples/u2pp_ol/wenetspeech/local/recognizer_fastdeploy.sh
...amples/u2pp_ol/wenetspeech/local/recognizer_fastdeploy.sh
+36
-0
runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh
...me/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh
+1
-1
runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh
...ime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh
+12
-1
runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst_fastdeploy.sh
...s/u2pp_ol/wenetspeech/local/recognizer_wfst_fastdeploy.sh
+51
-0
runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh
runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh
+1
-2
runtime/examples/u2pp_ol/wenetspeech/path.sh
runtime/examples/u2pp_ol/wenetspeech/path.sh
+1
-1
runtime/examples/u2pp_ol/wenetspeech/run.sh
runtime/examples/u2pp_ol/wenetspeech/run.sh
+6
-12
未找到文件。
runtime/engine/asr/decoder/ctc_prefix_beam_search_decoder.cc
浏览文件 @
8c2196ea
...
@@ -87,9 +87,9 @@ void CTCPrefixBeamSearch::AdvanceDecode(
...
@@ -87,9 +87,9 @@ void CTCPrefixBeamSearch::AdvanceDecode(
VLOG
(
1
)
<<
"num_frame_decoded_: "
<<
num_frame_decoded_
;
VLOG
(
1
)
<<
"num_frame_decoded_: "
<<
num_frame_decoded_
;
}
}
VLOG
(
1
)
<<
"AdvanceDecode feat + forward cost: "
<<
feat_nnet_cost
VLOG
(
2
)
<<
"AdvanceDecode feat + forward cost: "
<<
feat_nnet_cost
<<
" sec."
;
<<
" sec."
;
VLOG
(
1
)
<<
"AdvanceDecode search cost: "
<<
search_cost
<<
" sec."
;
VLOG
(
2
)
<<
"AdvanceDecode search cost: "
<<
search_cost
<<
" sec."
;
}
}
static
bool
PrefixScoreCompare
(
static
bool
PrefixScoreCompare
(
...
...
runtime/engine/asr/decoder/ctc_prefix_beam_search_decoder_main.cc
浏览文件 @
8c2196ea
...
@@ -71,7 +71,7 @@ int main(int argc, char* argv[]) {
...
@@ -71,7 +71,7 @@ int main(int argc, char* argv[]) {
std
::
shared_ptr
<
ppspeech
::
DataCache
>
raw_data
=
std
::
shared_ptr
<
ppspeech
::
DataCache
>
raw_data
=
std
::
make_shared
<
ppspeech
::
DataCache
>
();
std
::
make_shared
<
ppspeech
::
DataCache
>
();
std
::
shared_ptr
<
ppspeech
::
NnetProducer
>
nnet_producer
=
std
::
shared_ptr
<
ppspeech
::
NnetProducer
>
nnet_producer
=
std
::
make_shared
<
ppspeech
::
NnetProducer
>
(
nnet
,
raw_data
);
std
::
make_shared
<
ppspeech
::
NnetProducer
>
(
nnet
,
raw_data
,
1.0
);
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
=
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
=
std
::
make_shared
<
ppspeech
::
Decodable
>
(
nnet_producer
);
std
::
make_shared
<
ppspeech
::
Decodable
>
(
nnet_producer
);
...
...
runtime/engine/asr/decoder/ctc_tlg_decoder.h
浏览文件 @
8c2196ea
...
@@ -44,7 +44,7 @@ struct TLGDecoderOptions {
...
@@ -44,7 +44,7 @@ struct TLGDecoderOptions {
decoder_opts
.
word_symbol_table
=
FLAGS_word_symbol_table
;
decoder_opts
.
word_symbol_table
=
FLAGS_word_symbol_table
;
decoder_opts
.
fst_path
=
FLAGS_graph_path
;
decoder_opts
.
fst_path
=
FLAGS_graph_path
;
LOG
(
INFO
)
<<
"fst path: "
<<
decoder_opts
.
fst_path
;
LOG
(
INFO
)
<<
"fst path: "
<<
decoder_opts
.
fst_path
;
LOG
(
INFO
)
<<
"
fst
symbole table: "
<<
decoder_opts
.
word_symbol_table
;
LOG
(
INFO
)
<<
"symbole table: "
<<
decoder_opts
.
word_symbol_table
;
if
(
!
decoder_opts
.
fst_path
.
empty
())
{
if
(
!
decoder_opts
.
fst_path
.
empty
())
{
CHECK
(
FileExists
(
decoder_opts
.
fst_path
));
CHECK
(
FileExists
(
decoder_opts
.
fst_path
));
...
...
runtime/engine/asr/decoder/ctc_tlg_decoder_main.cc
浏览文件 @
8c2196ea
...
@@ -54,7 +54,7 @@ int main(int argc, char* argv[]) {
...
@@ -54,7 +54,7 @@ int main(int argc, char* argv[]) {
ppspeech
::
ModelOptions
model_opts
=
ppspeech
::
ModelOptions
::
InitFromFlags
();
ppspeech
::
ModelOptions
model_opts
=
ppspeech
::
ModelOptions
::
InitFromFlags
();
std
::
shared_ptr
<
ppspeech
::
NnetProducer
>
nnet_producer
=
std
::
shared_ptr
<
ppspeech
::
NnetProducer
>
nnet_producer
=
std
::
make_shared
<
ppspeech
::
NnetProducer
>
(
nullptr
);
std
::
make_shared
<
ppspeech
::
NnetProducer
>
(
nullptr
,
nullptr
,
1.0
);
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
(
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
(
new
ppspeech
::
Decodable
(
nnet_producer
,
FLAGS_acoustic_scale
));
new
ppspeech
::
Decodable
(
nnet_producer
,
FLAGS_acoustic_scale
));
...
...
runtime/engine/asr/decoder/param.h
浏览文件 @
8c2196ea
...
@@ -35,13 +35,11 @@ DEFINE_int32(subsampling_rate,
...
@@ -35,13 +35,11 @@ DEFINE_int32(subsampling_rate,
"two CNN(kernel=3) module downsampling rate."
);
"two CNN(kernel=3) module downsampling rate."
);
DEFINE_int32
(
nnet_decoder_chunk
,
1
,
"paddle nnet forward chunk"
);
DEFINE_int32
(
nnet_decoder_chunk
,
1
,
"paddle nnet forward chunk"
);
// nnet
// nnet
DEFINE_string
(
model_path
,
"avg_1.jit.pdmodel"
,
"paddle nnet model"
);
DEFINE_string
(
model_path
,
"avg_1.jit.pdmodel"
,
"paddle nnet model"
);
#ifdef USE_ONNX
#ifdef USE_ONNX
DEFINE_bool
(
with_onnx_model
,
false
,
"True mean the model path is onnx model path"
);
DEFINE_bool
(
with_onnx_model
,
false
,
"True mean the model path is onnx model path"
);
#endif
#endif
//DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
// decoder
// decoder
DEFINE_double
(
acoustic_scale
,
1
.
0
,
"acoustic scale"
);
DEFINE_double
(
acoustic_scale
,
1
.
0
,
"acoustic scale"
);
...
@@ -50,10 +48,9 @@ DEFINE_string(word_symbol_table, "", "word symbol table");
...
@@ -50,10 +48,9 @@ DEFINE_string(word_symbol_table, "", "word symbol table");
DEFINE_int32
(
max_active
,
7500
,
"max active"
);
DEFINE_int32
(
max_active
,
7500
,
"max active"
);
DEFINE_double
(
beam
,
15
.
0
,
"decoder beam"
);
DEFINE_double
(
beam
,
15
.
0
,
"decoder beam"
);
DEFINE_double
(
lattice_beam
,
7
.
5
,
"decoder beam"
);
DEFINE_double
(
lattice_beam
,
7
.
5
,
"decoder beam"
);
DEFINE_double
(
blank_threshold
,
0
.
98
,
"blank skip threshold"
);
// DecodeOptions flags
// DecodeOptions flags
// DEFINE_int32(chunk_size, -1, "decoding chunk size");
DEFINE_int32
(
num_left_chunks
,
-
1
,
"left chunks in decoding"
);
DEFINE_int32
(
num_left_chunks
,
-
1
,
"left chunks in decoding"
);
DEFINE_double
(
ctc_weight
,
DEFINE_double
(
ctc_weight
,
0
.
5
,
0
.
5
,
...
...
runtime/engine/asr/nnet/nnet_producer.cc
浏览文件 @
8c2196ea
...
@@ -22,8 +22,9 @@ using kaldi::BaseFloat;
...
@@ -22,8 +22,9 @@ using kaldi::BaseFloat;
using
std
::
vector
;
using
std
::
vector
;
NnetProducer
::
NnetProducer
(
std
::
shared_ptr
<
NnetBase
>
nnet
,
NnetProducer
::
NnetProducer
(
std
::
shared_ptr
<
NnetBase
>
nnet
,
std
::
shared_ptr
<
FrontendInterface
>
frontend
)
std
::
shared_ptr
<
FrontendInterface
>
frontend
,
:
nnet_
(
nnet
),
frontend_
(
frontend
)
{
float
blank_threshold
)
:
nnet_
(
nnet
),
frontend_
(
frontend
),
blank_threshold_
(
blank_threshold
)
{
Reset
();
Reset
();
}
}
...
@@ -45,7 +46,6 @@ void NnetProducer::Acceptlikelihood(
...
@@ -45,7 +46,6 @@ void NnetProducer::Acceptlikelihood(
bool
NnetProducer
::
Read
(
std
::
vector
<
kaldi
::
BaseFloat
>*
nnet_prob
)
{
bool
NnetProducer
::
Read
(
std
::
vector
<
kaldi
::
BaseFloat
>*
nnet_prob
)
{
bool
flag
=
cache_
.
pop
(
nnet_prob
);
bool
flag
=
cache_
.
pop
(
nnet_prob
);
VLOG
(
1
)
<<
"nnet cache_ size: "
<<
cache_
.
size
();
return
flag
;
return
flag
;
}
}
...
@@ -70,7 +70,22 @@ bool NnetProducer::Compute() {
...
@@ -70,7 +70,22 @@ bool NnetProducer::Compute() {
std
::
vector
<
BaseFloat
>
logprob
(
std
::
vector
<
BaseFloat
>
logprob
(
out
.
logprobs
.
data
()
+
idx
*
vocab_dim
,
out
.
logprobs
.
data
()
+
idx
*
vocab_dim
,
out
.
logprobs
.
data
()
+
(
idx
+
1
)
*
vocab_dim
);
out
.
logprobs
.
data
()
+
(
idx
+
1
)
*
vocab_dim
);
cache_
.
push_back
(
logprob
);
// process blank prob
float
blank_prob
=
std
::
exp
(
logprob
[
0
]);
if
(
blank_prob
>
blank_threshold_
)
{
last_frame_logprob_
=
logprob
;
is_last_frame_skip_
=
true
;
continue
;
}
else
{
int
cur_max
=
std
::
max
(
logprob
.
begin
(),
logprob
.
end
())
-
logprob
.
begin
();
if
(
cur_max
==
last_max_elem_
&&
cur_max
!=
0
&&
is_last_frame_skip_
)
{
cache_
.
push_back
(
last_frame_logprob_
);
last_max_elem_
=
cur_max
;
}
last_max_elem_
=
cur_max
;
is_last_frame_skip_
=
false
;
cache_
.
push_back
(
logprob
);
}
}
}
return
true
;
return
true
;
}
}
...
...
runtime/engine/asr/nnet/nnet_producer.h
浏览文件 @
8c2196ea
...
@@ -24,7 +24,8 @@ namespace ppspeech {
...
@@ -24,7 +24,8 @@ namespace ppspeech {
class
NnetProducer
{
class
NnetProducer
{
public:
public:
explicit
NnetProducer
(
std
::
shared_ptr
<
NnetBase
>
nnet
,
explicit
NnetProducer
(
std
::
shared_ptr
<
NnetBase
>
nnet
,
std
::
shared_ptr
<
FrontendInterface
>
frontend
=
NULL
);
std
::
shared_ptr
<
FrontendInterface
>
frontend
,
float
blank_threshold
);
// Feed feats or waves
// Feed feats or waves
void
Accept
(
const
std
::
vector
<
kaldi
::
BaseFloat
>&
inputs
);
void
Accept
(
const
std
::
vector
<
kaldi
::
BaseFloat
>&
inputs
);
...
@@ -64,6 +65,10 @@ class NnetProducer {
...
@@ -64,6 +65,10 @@ class NnetProducer {
std
::
shared_ptr
<
FrontendInterface
>
frontend_
;
std
::
shared_ptr
<
FrontendInterface
>
frontend_
;
std
::
shared_ptr
<
NnetBase
>
nnet_
;
std
::
shared_ptr
<
NnetBase
>
nnet_
;
SafeQueue
<
std
::
vector
<
kaldi
::
BaseFloat
>>
cache_
;
SafeQueue
<
std
::
vector
<
kaldi
::
BaseFloat
>>
cache_
;
std
::
vector
<
BaseFloat
>
last_frame_logprob_
;
bool
is_last_frame_skip_
=
false
;
int
last_max_elem_
=
-
1
;
float
blank_threshold_
=
0.0
;
bool
finished_
;
bool
finished_
;
DISALLOW_COPY_AND_ASSIGN
(
NnetProducer
);
DISALLOW_COPY_AND_ASSIGN
(
NnetProducer
);
...
...
runtime/engine/asr/nnet/u2_nnet.cc
浏览文件 @
8c2196ea
...
@@ -124,7 +124,15 @@ U2Nnet::U2Nnet(const U2Nnet& other) {
...
@@ -124,7 +124,15 @@ U2Nnet::U2Nnet(const U2Nnet& other) {
offset_
=
other
.
offset_
;
offset_
=
other
.
offset_
;
// copy model ptr
// copy model ptr
model_
=
other
.
model_
->
Clone
();
// model_ = other.model_->Clone();
// hack, fix later
#ifdef WITH_GPU
dev_
=
phi
::
GPUPlace
();
#else
dev_
=
phi
::
CPUPlace
();
#endif
paddle
::
jit
::
Layer
model
=
paddle
::
jit
::
Load
(
other
.
opts_
.
model_path
,
dev_
);
model_
=
std
::
make_shared
<
paddle
::
jit
::
Layer
>
(
std
::
move
(
model
));
ctc_activation_
=
model_
->
Function
(
"ctc_activation"
);
ctc_activation_
=
model_
->
Function
(
"ctc_activation"
);
subsampling_rate_
=
model_
->
Attribute
<
int
>
(
"subsampling_rate"
);
subsampling_rate_
=
model_
->
Attribute
<
int
>
(
"subsampling_rate"
);
right_context_
=
model_
->
Attribute
<
int
>
(
"right_context"
);
right_context_
=
model_
->
Attribute
<
int
>
(
"right_context"
);
...
@@ -166,6 +174,7 @@ void U2Nnet::Reset() {
...
@@ -166,6 +174,7 @@ void U2Nnet::Reset() {
std
::
move
(
paddle
::
zeros
({
0
,
0
,
0
,
0
},
paddle
::
DataType
::
FLOAT32
));
std
::
move
(
paddle
::
zeros
({
0
,
0
,
0
,
0
},
paddle
::
DataType
::
FLOAT32
));
encoder_outs_
.
clear
();
encoder_outs_
.
clear
();
VLOG
(
1
)
<<
"FeedForward cost: "
<<
cost_time_
<<
" sec. "
;
VLOG
(
3
)
<<
"u2nnet reset"
;
VLOG
(
3
)
<<
"u2nnet reset"
;
}
}
...
@@ -185,8 +194,10 @@ void U2Nnet::FeedForward(const std::vector<BaseFloat>& features,
...
@@ -185,8 +194,10 @@ void U2Nnet::FeedForward(const std::vector<BaseFloat>& features,
std
::
vector
<
kaldi
::
BaseFloat
>
ctc_probs
;
std
::
vector
<
kaldi
::
BaseFloat
>
ctc_probs
;
ForwardEncoderChunkImpl
(
ForwardEncoderChunkImpl
(
features
,
feature_dim
,
&
out
->
logprobs
,
&
out
->
vocab_dim
);
features
,
feature_dim
,
&
out
->
logprobs
,
&
out
->
vocab_dim
);
VLOG
(
1
)
<<
"FeedForward cost: "
<<
timer
.
Elapsed
()
<<
" sec. "
float
forward_chunk_time
=
timer
.
Elapsed
();
VLOG
(
1
)
<<
"FeedForward cost: "
<<
forward_chunk_time
<<
" sec. "
<<
features
.
size
()
/
feature_dim
<<
" frames."
;
<<
features
.
size
()
/
feature_dim
<<
" frames."
;
cost_time_
+=
forward_chunk_time
;
}
}
...
...
runtime/engine/asr/nnet/u2_nnet.h
浏览文件 @
8c2196ea
...
@@ -113,8 +113,8 @@ class U2Nnet : public U2NnetBase {
...
@@ -113,8 +113,8 @@ class U2Nnet : public U2NnetBase {
void
EncoderOuts
(
void
EncoderOuts
(
std
::
vector
<
std
::
vector
<
kaldi
::
BaseFloat
>>*
encoder_out
)
const
;
std
::
vector
<
std
::
vector
<
kaldi
::
BaseFloat
>>*
encoder_out
)
const
;
ModelOptions
opts_
;
// hack, fix later
private:
private:
ModelOptions
opts_
;
phi
::
Place
dev_
;
phi
::
Place
dev_
;
std
::
shared_ptr
<
paddle
::
jit
::
Layer
>
model_
{
nullptr
};
std
::
shared_ptr
<
paddle
::
jit
::
Layer
>
model_
{
nullptr
};
...
@@ -127,6 +127,7 @@ class U2Nnet : public U2NnetBase {
...
@@ -127,6 +127,7 @@ class U2Nnet : public U2NnetBase {
paddle
::
jit
::
Function
forward_encoder_chunk_
;
paddle
::
jit
::
Function
forward_encoder_chunk_
;
paddle
::
jit
::
Function
forward_attention_decoder_
;
paddle
::
jit
::
Function
forward_attention_decoder_
;
paddle
::
jit
::
Function
ctc_activation_
;
paddle
::
jit
::
Function
ctc_activation_
;
float
cost_time_
=
0.0
;
};
};
}
// namespace ppspeech
}
// namespace ppspeech
\ No newline at end of file
runtime/engine/asr/recognizer/recognizer_controller_impl.cc
浏览文件 @
8c2196ea
...
@@ -21,6 +21,7 @@ namespace ppspeech {
...
@@ -21,6 +21,7 @@ namespace ppspeech {
RecognizerControllerImpl
::
RecognizerControllerImpl
(
const
RecognizerResource
&
resource
)
RecognizerControllerImpl
::
RecognizerControllerImpl
(
const
RecognizerResource
&
resource
)
:
opts_
(
resource
)
{
:
opts_
(
resource
)
{
BaseFloat
am_scale
=
resource
.
acoustic_scale
;
BaseFloat
am_scale
=
resource
.
acoustic_scale
;
BaseFloat
blank_threshold
=
resource
.
blank_threshold
;
const
FeaturePipelineOptions
&
feature_opts
=
resource
.
feature_pipeline_opts
;
const
FeaturePipelineOptions
&
feature_opts
=
resource
.
feature_pipeline_opts
;
std
::
shared_ptr
<
FeaturePipeline
>
feature_pipeline
(
std
::
shared_ptr
<
FeaturePipeline
>
feature_pipeline
(
new
FeaturePipeline
(
feature_opts
));
new
FeaturePipeline
(
feature_opts
));
...
@@ -34,7 +35,7 @@ RecognizerControllerImpl::RecognizerControllerImpl(const RecognizerResource& res
...
@@ -34,7 +35,7 @@ RecognizerControllerImpl::RecognizerControllerImpl(const RecognizerResource& res
nnet
=
resource
.
nnet
->
Clone
();
nnet
=
resource
.
nnet
->
Clone
();
}
}
#endif
#endif
nnet_producer_
.
reset
(
new
NnetProducer
(
nnet
,
feature_pipeline
));
nnet_producer_
.
reset
(
new
NnetProducer
(
nnet
,
feature_pipeline
,
blank_threshold
));
nnet_thread_
=
std
::
thread
(
RunNnetEvaluation
,
this
);
nnet_thread_
=
std
::
thread
(
RunNnetEvaluation
,
this
);
decodable_
.
reset
(
new
Decodable
(
nnet_producer_
,
am_scale
));
decodable_
.
reset
(
new
Decodable
(
nnet_producer_
,
am_scale
));
...
...
runtime/engine/asr/recognizer/recognizer_main.cc
浏览文件 @
8c2196ea
...
@@ -88,7 +88,8 @@ int main(int argc, char* argv[]) {
...
@@ -88,7 +88,8 @@ int main(int argc, char* argv[]) {
kaldi
::
Timer
timer
;
kaldi
::
Timer
timer
;
recognizer_ptr
->
AttentionRescoring
();
recognizer_ptr
->
AttentionRescoring
();
tot_attention_rescore_time
+=
timer
.
Elapsed
();
float
rescore_time
=
timer
.
Elapsed
();
tot_attention_rescore_time
+=
rescore_time
;
std
::
string
result
=
recognizer_ptr
->
GetFinalResult
();
std
::
string
result
=
recognizer_ptr
->
GetFinalResult
();
if
(
result
.
empty
())
{
if
(
result
.
empty
())
{
...
@@ -101,7 +102,7 @@ int main(int argc, char* argv[]) {
...
@@ -101,7 +102,7 @@ int main(int argc, char* argv[]) {
tot_decode_time
+=
local_timer
.
Elapsed
();
tot_decode_time
+=
local_timer
.
Elapsed
();
LOG
(
INFO
)
<<
utt
<<
" "
<<
result
;
LOG
(
INFO
)
<<
utt
<<
" "
<<
result
;
LOG
(
INFO
)
<<
" RTF: "
<<
local_timer
.
Elapsed
()
/
dur
<<
" dur: "
<<
dur
LOG
(
INFO
)
<<
" RTF: "
<<
local_timer
.
Elapsed
()
/
dur
<<
" dur: "
<<
dur
<<
" cost: "
<<
local_timer
.
Elapsed
();
<<
" cost: "
<<
local_timer
.
Elapsed
()
<<
" rescore:"
<<
rescore_time
;
result_writer
.
Write
(
utt
,
result
);
result_writer
.
Write
(
utt
,
result
);
...
...
runtime/engine/asr/recognizer/recognizer_resource.h
浏览文件 @
8c2196ea
...
@@ -12,6 +12,7 @@ DECLARE_double(reverse_weight);
...
@@ -12,6 +12,7 @@ DECLARE_double(reverse_weight);
DECLARE_int32
(
nbest
);
DECLARE_int32
(
nbest
);
DECLARE_int32
(
blank
);
DECLARE_int32
(
blank
);
DECLARE_double
(
acoustic_scale
);
DECLARE_double
(
acoustic_scale
);
DECLARE_double
(
blank_threshold
);
DECLARE_string
(
word_symbol_table
);
DECLARE_string
(
word_symbol_table
);
namespace
ppspeech
{
namespace
ppspeech
{
...
@@ -71,6 +72,7 @@ struct DecodeOptions {
...
@@ -71,6 +72,7 @@ struct DecodeOptions {
struct
RecognizerResource
{
struct
RecognizerResource
{
// decodable opt
// decodable opt
kaldi
::
BaseFloat
acoustic_scale
{
1.0
};
kaldi
::
BaseFloat
acoustic_scale
{
1.0
};
kaldi
::
BaseFloat
blank_threshold
{
0.98
};
FeaturePipelineOptions
feature_pipeline_opts
{};
FeaturePipelineOptions
feature_pipeline_opts
{};
ModelOptions
model_opts
{};
ModelOptions
model_opts
{};
...
@@ -80,6 +82,7 @@ struct RecognizerResource {
...
@@ -80,6 +82,7 @@ struct RecognizerResource {
static
RecognizerResource
InitFromFlags
()
{
static
RecognizerResource
InitFromFlags
()
{
RecognizerResource
resource
;
RecognizerResource
resource
;
resource
.
acoustic_scale
=
FLAGS_acoustic_scale
;
resource
.
acoustic_scale
=
FLAGS_acoustic_scale
;
resource
.
blank_threshold
=
FLAGS_blank_threshold
;
LOG
(
INFO
)
<<
"acoustic_scale: "
<<
resource
.
acoustic_scale
;
LOG
(
INFO
)
<<
"acoustic_scale: "
<<
resource
.
acoustic_scale
;
resource
.
feature_pipeline_opts
=
resource
.
feature_pipeline_opts
=
...
...
runtime/engine/kaldi/fstbin/CMakeLists.txt
浏览文件 @
8c2196ea
...
@@ -11,5 +11,5 @@ fsttablecompose
...
@@ -11,5 +11,5 @@ fsttablecompose
foreach
(
binary IN LISTS BINS
)
foreach
(
binary IN LISTS BINS
)
add_executable
(
${
binary
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
binary
}
.cc
)
add_executable
(
${
binary
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
binary
}
.cc
)
target_include_directories
(
${
binary
}
PRIVATE
${
SPEECHX_ROOT
}
${
SPEECHX_ROOT
}
/kaldi
)
target_include_directories
(
${
binary
}
PRIVATE
${
SPEECHX_ROOT
}
${
SPEECHX_ROOT
}
/kaldi
)
target_link_libraries
(
${
binary
}
PUBLIC kaldi-fstext glog
libgflags_nothreads.so
fst dl
)
target_link_libraries
(
${
binary
}
PUBLIC kaldi-fstext glog
gflags
fst dl
)
endforeach
()
endforeach
()
runtime/examples/u2pp_ol/wenetspeech/RESULTS.md
浏览文件 @
8c2196ea
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
## U2++ Attention Rescore
## U2++ Attention Rescore
> Intel(R) Xeon(R) Gold 6
271C CPU @ 2.6
0GHz, support `avx512_vnni`
> Intel(R) Xeon(R) Gold 6
148 CPU @ 2.4
0GHz, support `avx512_vnni`
> RTF with feature and decoder which is more end to end.
> RTF with feature and decoder which is more end to end.
### FP32
### FP32
...
@@ -23,18 +23,15 @@ Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
...
@@ -23,18 +23,15 @@ Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
#### RTF
#### RTF
```
```
I1027 10:52:38.662868 51665
u2_
recognizer_main.cc:122] total wav duration is: 36108.9 sec
I1027 10:52:38.662868 51665 recognizer_main.cc:122] total wav duration is: 36108.9 sec
I1027 10:52:38.662858 51665
u2_recognizer_main.cc:121] total cost:11169.
1 sec
I1027 10:52:38.662858 51665
recognizer_main.cc:121] total cost:9577.3
1 sec
I1027 10:52:38.662876 51665
u2_recognizer_main.cc:123] RTF is: 0.309318
I1027 10:52:38.662876 51665
recognizer_main.cc:123] RTF is: 0.265234
```
```
### INT8
### INT8
`local/recognizer_quant.sh`
`local/recognizer_quant.sh`
> RTF relative improve 12.8%, which count feature and decoder time.
> Test under Paddle commit c331e2ce2031d68a553bc9469a07c30d718438f3
#### CER
#### CER
```
```
...
@@ -52,16 +49,22 @@ I1110 09:59:52.551717 37249 u2_recognizer_main.cc:123] total decode cost:9737.63
...
@@ -52,16 +49,22 @@ I1110 09:59:52.551717 37249 u2_recognizer_main.cc:123] total decode cost:9737.63
I1110 09:59:52.551723 37249 u2_recognizer_main.cc:124] RTF is: 0.269674
I1110 09:59:52.551723 37249 u2_recognizer_main.cc:124] RTF is: 0.269674
```
```
###
CTC Prefix Beam Search
###
TLG decoder without attention rescore
`local/
decode
.sh`
`local/
recognizer_wfst
.sh`
#### CER
#### CER
```
```
Overall -> 6.74 % N=104765 C=98106 S=6516 D=143 I=401
Overall -> 4.73 % N=104765 C=100001 S=4283 D=481 I=187
Mandarin -> 6.74 % N=104762 C=98106 S=6513 D=143 I=401
Mandarin -> 4.72 % N=104762 C=100001 S=4280 D=481 I=187
English -> 0.00 % N=0 C=0 S=0 D=0 I=0
Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
```
#### RTF
```
I0417 08:07:15.300631 75784 recognizer_main.cc:113] total wav duration is: 36108.9 sec
I0417 08:07:15.300642 75784 recognizer_main.cc:114] total decode cost:10247.7 sec
I0417 08:07:15.300648 75784 recognizer_main.cc:115] total rescore cost:908.228 sec
I0417 08:07:15.300653 75784 recognizer_main.cc:116] RTF is: 0.283
```
```
runtime/examples/u2pp_ol/wenetspeech/local/recognizer_fastdeploy.sh
0 → 100755
浏览文件 @
8c2196ea
#!/bin/bash
set
-e
data
=
data
exp
=
exp
nj
=
20
.
utils/parse_options.sh
mkdir
-p
$exp
ckpt_dir
=
./data/model
model_dir
=
$ckpt_dir
/onnx_model/
aishell_wav_scp
=
aishell_test.scp
text
=
$data
/test/text
./local/split_data.sh
$data
$data
/
$aishell_wav_scp
$aishell_wav_scp
$nj
utils/run.pl
JOB
=
1:
$nj
$data
/split
${
nj
}
/JOB/recognizer.fd.log
\
recognizer_main
\
--use_fbank
=
true
\
--num_bins
=
80
\
--model_path
=
$model_dir
\
--word_symbol_table
=
$model_dir
/unit.txt
\
--nnet_decoder_chunk
=
16
\
--receptive_field_length
=
7
\
--subsampling_rate
=
4
\
--with_onnx_model
=
true
\
--wav_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/
${
aishell_wav_scp
}
\
--result_wspecifier
=
ark,t:
$data
/split
${
nj
}
/JOB/recognizer.fd.rsl.ark
cat
$data
/split
${
nj
}
/
*
/recognizer.fd.rsl.ark
>
$exp
/aishell.recognizer.fd.rsl
utils/compute-wer.py
--char
=
1
--v
=
1
$text
$exp
/aishell.recognizer.fd.rsl
>
$exp
/aishell.recognizer.fd.err
echo
"recognizer fd test have finished!!!"
echo
"please checkout in
$exp
/aishell.recognizer.fd.err"
tail
-n
7
$exp
/aishell.recognizer.fd.err
runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh
浏览文件 @
8c2196ea
...
@@ -16,7 +16,7 @@ text=$data/test/text
...
@@ -16,7 +16,7 @@ text=$data/test/text
./local/split_data.sh
$data
$data
/
$aishell_wav_scp
$aishell_wav_scp
$nj
./local/split_data.sh
$data
$data
/
$aishell_wav_scp
$aishell_wav_scp
$nj
utils/run.pl
JOB
=
1:
$nj
$data
/split
${
nj
}
/JOB/recognizer.quant.log
\
utils/run.pl
JOB
=
1:
$nj
$data
/split
${
nj
}
/JOB/recognizer.quant.log
\
u2_
recognizer_main
\
recognizer_main
\
--use_fbank
=
true
\
--use_fbank
=
true
\
--num_bins
=
80
\
--num_bins
=
80
\
--cmvn_file
=
$model_dir
/mean_std.json
\
--cmvn_file
=
$model_dir
/mean_std.json
\
...
...
runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh
浏览文件 @
8c2196ea
...
@@ -3,7 +3,7 @@ set -e
...
@@ -3,7 +3,7 @@ set -e
data
=
data
data
=
data
exp
=
exp
exp
=
exp
nj
=
4
0
nj
=
2
0
.
utils/parse_options.sh
.
utils/parse_options.sh
...
@@ -19,6 +19,15 @@ lang_dir=./data/lang_test/
...
@@ -19,6 +19,15 @@ lang_dir=./data/lang_test/
graph
=
$lang_dir
/TLG.fst
graph
=
$lang_dir
/TLG.fst
word_table
=
$lang_dir
/words.txt
word_table
=
$lang_dir
/words.txt
if
[
!
-f
$graph
]
;
then
# download ngram, if you want to make graph by yourself, please refer local/run_build_tlg.sh
mkdir
-p
$lang_dir
pushd
$lang_dir
wget
-c
https://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/tlg.zip
unzip tlg.zip
popd
fi
utils/run.pl
JOB
=
1:
$nj
$data
/split
${
nj
}
/JOB/recognizer_wfst.log
\
utils/run.pl
JOB
=
1:
$nj
$data
/split
${
nj
}
/JOB/recognizer_wfst.log
\
recognizer_main
\
recognizer_main
\
--use_fbank
=
true
\
--use_fbank
=
true
\
...
@@ -31,6 +40,8 @@ recognizer_main \
...
@@ -31,6 +40,8 @@ recognizer_main \
--receptive_field_length
=
7
\
--receptive_field_length
=
7
\
--subsampling_rate
=
4
\
--subsampling_rate
=
4
\
--wav_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/
${
aishell_wav_scp
}
\
--wav_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/
${
aishell_wav_scp
}
\
--rescoring_weight
=
0.0
\
--acoustic_scale
=
2
\
--result_wspecifier
=
ark,t:
$data
/split
${
nj
}
/JOB/result_recognizer_wfst.ark
--result_wspecifier
=
ark,t:
$data
/split
${
nj
}
/JOB/result_recognizer_wfst.ark
...
...
runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst_fastdeploy.sh
0 → 100755
浏览文件 @
8c2196ea
#!/bin/bash
set
-e
data
=
data
exp
=
exp
nj
=
20
.
utils/parse_options.sh
mkdir
-p
$exp
ckpt_dir
=
./data/model
model_dir
=
$ckpt_dir
/onnx_model/
aishell_wav_scp
=
aishell_test.scp
text
=
$data
/test/text
./local/split_data.sh
$data
$data
/
$aishell_wav_scp
$aishell_wav_scp
$nj
lang_dir
=
./data/lang_test/
graph
=
$lang_dir
/TLG.fst
word_table
=
$lang_dir
/words.txt
if
[
!
-f
$graph
]
;
then
# download ngram, if you want to make graph by yourself, please refer local/run_build_tlg.sh
mkdir
-p
$lang_dir
pushd
$lang_dir
wget
-c
https://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/tlg.zip
unzip tlg.zip
popd
fi
utils/run.pl
JOB
=
1:
$nj
$data
/split
${
nj
}
/JOB/recognizer_wfst_fd.log
\
recognizer_main
\
--use_fbank
=
true
\
--num_bins
=
80
\
--model_path
=
$model_dir
\
--graph_path
=
$lang_dir
/TLG.fst
\
--word_symbol_table
=
$word_table
\
--nnet_decoder_chunk
=
16
\
--receptive_field_length
=
7
\
--subsampling_rate
=
4
\
--wav_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/
${
aishell_wav_scp
}
\
--rescoring_weight
=
0.0
\
--acoustic_scale
=
2
\
--result_wspecifier
=
ark,t:
$data
/split
${
nj
}
/JOB/result_recognizer_wfst_fd.ark
cat
$data
/split
${
nj
}
/
*
/result_recognizer_wfst_fd.ark
>
$exp
/aishell_recognizer_wfst_fd
utils/compute-wer.py
--char
=
1
--v
=
1
$text
$exp
/aishell_recognizer_wfst_fd
>
$exp
/aishell.recognizer_wfst_fd.err
echo
"recognizer test have finished!!!"
echo
"please checkout in
$exp
/aishell.recognizer_wfst_fd.err"
tail
-n
7
$exp
/aishell.recognizer_wfst_fd.err
runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh
浏览文件 @
8c2196ea
...
@@ -7,13 +7,12 @@ set -eo pipefail
...
@@ -7,13 +7,12 @@ set -eo pipefail
# different acustic model has different vocab
# different acustic model has different vocab
ckpt_dir
=
data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model
ckpt_dir
=
data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model
unit
=
$ckpt_dir
/vocab.txt
# vocab file, line: char/spm_pice
unit
=
$ckpt_dir
/vocab.txt
# vocab file, line: char/spm_pice
model_dir
=
$ckpt_dir
/exp/deepspeech2_online/checkpoints/
stage
=
2
stage
=
2
stop_stage
=
100
stop_stage
=
100
corpus
=
aishell
corpus
=
aishell
lexicon
=
data/lexicon.txt
# line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
lexicon
=
data/lexicon.txt
# line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text
=
data/text
# line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
text
=
data/text
# line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
filter by data/train/text
.
utils/parse_options.sh
.
utils/parse_options.sh
...
...
runtime/examples/u2pp_ol/wenetspeech/path.sh
浏览文件 @
8c2196ea
...
@@ -12,7 +12,7 @@ TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin
...
@@ -12,7 +12,7 @@ TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin
export
LC_AL
=
C
export
LC_AL
=
C
export
PATH
=
$PATH
:
$TOOLS_BIN
:
$ENGINE_BUILD
/nnet:
$ENGINE_BUILD
/decoder:
$ENGINE_BUILD
/../common/frontend/audio:
$ENGINE_BUILD
/recognizer
export
PATH
=
$PATH
:
$TOOLS_BIN
:
$ENGINE_BUILD
/nnet:
$ENGINE_BUILD
/decoder:
$ENGINE_BUILD
/../common/frontend/audio:
$ENGINE_BUILD
/recognizer
:../../../fc_patch/openfst/bin:
$ENGINE_BUILD
/../kaldi/fstbin:
$ENGINE_BUILD
/../kaldi/lmbin
#PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);")
#PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);")
export
LD_LIBRARY_PATH
=
$PADDLE_LIB_PATH
:
$LD_LIBRARY_PATH
export
LD_LIBRARY_PATH
=
$PADDLE_LIB_PATH
:
$LD_LIBRARY_PATH
runtime/examples/u2pp_ol/wenetspeech/run.sh
浏览文件 @
8c2196ea
...
@@ -69,23 +69,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
...
@@ -69,23 +69,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
fi
fi
fi
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# process compute fbank feat
./local/feat.sh
fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# decode with fbank feat input
./local/decode.sh
fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# decode with wav input
# decode with wav input
./local/recognizer.sh
./local/recognizer.sh
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# decode with wav input with quanted model
# decode with wav input with quanted model
./local/recognizer_quant.sh
./local/recognizer_quant.sh
fi
fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# decode with wfst
./local/recognizer_wfst.sh
fi
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录