PaddlePaddle / DeepSpeech
Commit f3132ce2 (unverified)
Authored on Jun 07, 2022 by Hui Zhang; committed via GitHub on Jun 07, 2022

Merge pull request #2014 from SmileGoat/refactor_file_struct

[speechx] add nnet_decoder_chunk opt

Parents: 7fc9f5d4, 46107f71
Changes: 9 changed files, +232 additions, -24 deletions

speechx/examples/ds2_ol/aishell/run.sh                           +8   -0
speechx/examples/ds2_ol/aishell/run_fbank.sh                     +10  -2
speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc   +6   -4
speechx/speechx/decoder/param.h                                  +7   -5
speechx/speechx/decoder/tlg_decoder_main.cc                      +6   -5
speechx/speechx/frontend/audio/assembler.cc                      +16  -3
speechx/speechx/frontend/audio/assembler.h                       +9   -4
speechx/speechx/nnet/CMakeLists.txt                              +8   -1
speechx/speechx/nnet/nnet_forward_main.cc                        +162 -0
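The core of the commit is the new --nnet_decoder_chunk flag: instead of forwarding one subsampled frame per step, the decoders forward a chunk of them. Before the per-file diffs, here is a minimal sketch of the frame arithmetic the updated binaries log, assuming the flag defaults from the diff (receptive field 7, downsampling rate 4) and the --nnet_decoder_chunk=8 that the aishell scripts now pass; the shell variable names are only for illustration.

#!/usr/bin/env bash
# Hedged sketch of the chunk arithmetic introduced by this commit.
receptive_field_length=7   # default from the diff
downsampling_rate=4        # default from the diff
nnet_decoder_chunk=8       # value used in run.sh / run_fbank.sh

# chunk_size: feature frames fed to one nnet forward call
chunk_size=$(( receptive_field_length + (nnet_decoder_chunk - 1) * downsampling_rate ))
# chunk_stride: feature frames the window advances per forward call
chunk_stride=$(( downsampling_rate * nnet_decoder_chunk ))

echo "chunk size (frame): ${chunk_size}"      # 35
echo "chunk stride (frame): ${chunk_stride}"  # 32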
speechx/examples/ds2_ol/aishell/run.sh @ f3132ce2

@@ -89,6 +89,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     --model_path=$model_dir/avg_1.jit.pdmodel \
     --param_path=$model_dir/avg_1.jit.pdiparams \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+    --nnet_decoder_chunk=8 \
     --dict_file=$vocb_dir/vocab.txt \
     --result_wspecifier=ark,t:$data/split${nj}/JOB/result

@@ -96,6 +97,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer}
   echo "ctc-prefix-beam-search-decoder-ol without lm has finished!!!"
   echo "please checkout in ${exp}/${wer}"
+  tail -n 7 $exp/${wer}
 fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@@ -106,6 +108,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     --model_path=$model_dir/avg_1.jit.pdmodel \
     --param_path=$model_dir/avg_1.jit.pdiparams \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+    --nnet_decoder_chunk=8 \
     --dict_file=$vocb_dir/vocab.txt \
     --lm_path=$lm \
     --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm

@@ -114,6 +117,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm
   echo "ctc-prefix-beam-search-decoder-ol with lm test has finished!!!"
   echo "please checkout in ${exp}/${wer}.lm"
+  tail -n 7 $exp/${wer}.lm
 fi

 wfst=$data/wfst/

@@ -138,6 +142,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     --word_symbol_table=$wfst/words.txt \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
     --graph_path=$wfst/TLG.fst --max_active=7500 \
+    --nnet_decoder_chunk=8 \
     --acoustic_scale=1.2 \
     --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg

@@ -145,6 +150,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_tlg > $exp/${wer}.tlg
   echo "wfst-decoder-ol have finished!!!"
   echo "please checkout in ${exp}/${wer}.tlg"
+  tail -n 7 $exp/${wer}.tlg
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then

@@ -156,6 +162,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     --model_path=$model_dir/avg_1.jit.pdmodel \
     --param_path=$model_dir/avg_1.jit.pdiparams \
     --word_symbol_table=$wfst/words.txt \
+    --nnet_decoder_chunk=8 \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
     --graph_path=$wfst/TLG.fst --max_active=7500 \
     --acoustic_scale=1.2 \

@@ -165,4 +172,5 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer
   echo "recognizer test have finished!!!"
   echo "please checkout in ${exp}/${wer}.recognizer"
+  tail -n 7 $exp/${wer}.recognizer
 fi
speechx/examples/ds2_ol/aishell/run_fbank.sh @ f3132ce2

@@ -90,11 +90,13 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     --param_path=$model_dir/avg_5.jit.pdiparams \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
     --model_cache_shapes="5-1-2048,5-1-2048" \
+    --nnet_decoder_chunk=8 \
     --dict_file=$vocb_dir/vocab.txt \
     --result_wspecifier=ark,t:$data/split${nj}/JOB/result_fbank

   cat $data/split${nj}/*/result_fbank > $exp/${label_file}
   utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer}
+  tail -n 7 $exp/${wer}
 fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@@ -105,13 +107,15 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     --model_path=$model_dir/avg_5.jit.pdmodel \
     --param_path=$model_dir/avg_5.jit.pdiparams \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
     --model_cache_shapes="5-1-2048,5-1-2048" \
+    --nnet_decoder_chunk=8 \
     --dict_file=$vocb_dir/vocab.txt \
     --lm_path=$lm \
     --result_wspecifier=ark,t:$data/split${nj}/JOB/fbank_result_lm

   cat $data/split${nj}/*/fbank_result_lm > $exp/${label_file}_lm
   utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm
+  tail -n 7 $exp/${wer}.lm
 fi

 wfst=$data/wfst_fbank/

@@ -135,7 +139,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     --param_path=$model_dir/avg_5.jit.pdiparams \
     --word_symbol_table=$wfst/words.txt \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
     --model_cache_shapes="5-1-2048,5-1-2048" \
+    --nnet_decoder_chunk=8 \
     --graph_path=$wfst/TLG.fst --max_active=7500 \
     --acoustic_scale=1.2 \
     --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg

@@ -144,6 +149,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_tlg > $exp/${wer}.tlg
   echo "wfst-decoder-ol have finished!!!"
   echo "please checkout in ${exp}/${wer}.tlg"
+  tail -n 7 $exp/${wer}.tlg
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then

@@ -157,6 +163,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     --word_symbol_table=$wfst/words.txt \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
     --model_cache_shapes="5-1-2048,5-1-2048" \
+    --nnet_decoder_chunk=8 \
     --graph_path=$wfst/TLG.fst --max_active=7500 \
     --acoustic_scale=1.2 \
     --result_wspecifier=ark,t:$data/split${nj}/JOB/result_fbank_recognizer

@@ -165,4 +172,5 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer
   echo "recognizer test have finished!!!"
   echo "please checkout in ${exp}/${wer}.recognizer"
+  tail -n 7 $exp/${wer}.recognizer
 fi
speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @ f3132ce2

@@ -30,10 +30,10 @@ DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
 DEFINE_string(lm_path, "", "language model");
 DEFINE_int32(receptive_field_length,
              7,
-             "receptive field of two CNN(kernel=5) downsampling module.");
+             "receptive field of two CNN(kernel=3) downsampling module.");
 DEFINE_int32(downsampling_rate,
              4,
-             "two CNN(kernel=5) module downsampling rate.");
+             "two CNN(kernel=3) module downsampling rate.");
 DEFINE_string(
     model_input_names,
     "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",

@@ -45,6 +45,7 @@ DEFINE_string(model_cache_names,
               "chunk_state_h_box,chunk_state_c_box",
               "model cache names");
 DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
+DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk");

 using kaldi::BaseFloat;
 using kaldi::Matrix;

@@ -90,8 +91,9 @@ int main(int argc, char* argv[]) {
     std::shared_ptr<ppspeech::Decodable> decodable(
         new ppspeech::Decodable(nnet, raw_data));

-    int32 chunk_size = FLAGS_receptive_field_length;
-    int32 chunk_stride = FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
     int32 receptive_field_length = FLAGS_receptive_field_length;
     LOG(INFO) << "chunk size (frame): " << chunk_size;
     LOG(INFO) << "chunk stride (frame): " << chunk_stride;
speechx/speechx/decoder/param.h @ f3132ce2

@@ -28,11 +28,11 @@ DEFINE_string(cmvn_file, "", "read cmvn");
 // feature sliding window
 DEFINE_int32(receptive_field_length,
              7,
-             "receptive field of two CNN(kernel=5) downsampling module.");
+             "receptive field of two CNN(kernel=3) downsampling module.");
 DEFINE_int32(downsampling_rate,
              4,
-             "two CNN(kernel=5) module downsampling rate.");
+             "two CNN(kernel=3) module downsampling rate.");
+DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk");

 // nnet
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");

@@ -79,8 +79,10 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
         frame_opts.preemph_coeff = 0.0;
         opts.linear_spectrogram_opts.frame_opts = frame_opts;
     }
-    opts.assembler_opts.frame_chunk_size = FLAGS_receptive_field_length;
-    opts.assembler_opts.frame_chunk_stride = FLAGS_downsampling_rate;
+    opts.assembler_opts.subsampling_rate = FLAGS_downsampling_rate;
+    opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length;
+    opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk;

     return opts;
 }
speechx/speechx/decoder/tlg_decoder_main.cc @ f3132ce2

@@ -28,15 +28,15 @@ DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
 DEFINE_string(graph_path, "TLG", "decoder graph");
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
 DEFINE_int32(max_active, 7500, "decoder graph");
+DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk");
 DEFINE_int32(receptive_field_length,
              7,
-             "receptive field of two CNN(kernel=5) downsampling module.");
+             "receptive field of two CNN(kernel=3) downsampling module.");
 DEFINE_int32(downsampling_rate,
              4,
-             "two CNN(kernel=5) module downsampling rate.");
+             "two CNN(kernel=3) module downsampling rate.");
 DEFINE_string(
     model_input_names,
     "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",

@@ -93,8 +93,9 @@ int main(int argc, char* argv[]) {
     std::shared_ptr<ppspeech::Decodable> decodable(
         new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));

-    int32 chunk_size = FLAGS_receptive_field_length;
-    int32 chunk_stride = FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
     int32 receptive_field_length = FLAGS_receptive_field_length;
     LOG(INFO) << "chunk size (frame): " << chunk_size;
     LOG(INFO) << "chunk stride (frame): " << chunk_stride;
speechx/speechx/frontend/audio/assembler.cc @ f3132ce2

@@ -23,8 +23,9 @@ using std::unique_ptr;
 Assembler::Assembler(AssemblerOptions opts,
                      unique_ptr<FrontendInterface> base_extractor) {
-    frame_chunk_stride_ = opts.frame_chunk_stride;
-    frame_chunk_size_ = opts.frame_chunk_size;
+    frame_chunk_stride_ = opts.subsampling_rate * opts.nnet_decoder_chunk;
+    frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate +
+                        opts.receptive_filed_length;
+    receptive_filed_length_ = opts.receptive_filed_length;
     base_extractor_ = std::move(base_extractor);
     dim_ = base_extractor_->Dim();
 }

@@ -48,10 +49,22 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
     while (feature_cache_.size() < frame_chunk_size_) {
         Vector<BaseFloat> feature;
         result = base_extractor_->Read(&feature);
-        if (result == false || feature.Dim() == 0) return false;
+        if (result == false || feature.Dim() == 0) {
+            if (IsFinished() == false) return false;
+            break;
+        }
         feature_cache_.push(feature);
     }

+    if (feature_cache_.size() < receptive_filed_length_) {
+        return false;
+    }
+
+    while (feature_cache_.size() < frame_chunk_size_) {
+        Vector<BaseFloat> feature(dim_, kaldi::kSetZero);
+        feature_cache_.push(feature);
+    }
+
     int32 counter = 0;
     int32 cache_size = frame_chunk_size_ - frame_chunk_stride_;
     int32 elem_dim = base_extractor_->Dim();
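With these options the Assembler buffers frame_chunk_size_ frames, advances by frame_chunk_stride_, keeps the difference as left context, and at utterance end zero-pads the last partial chunk instead of dropping it (provided at least receptive_filed_length_ frames remain). A small worked example of the cache arithmetic, reusing the same settings as the sketch above; this is only an illustration, not code from the commit.

# Hedged worked example of the Assembler window math (settings as above).
frame_chunk_size=$(( (8 - 1) * 4 + 7 ))                  # 35 frames buffered per chunk
frame_chunk_stride=$(( 4 * 8 ))                          # 32 frames consumed per chunk
cache_size=$(( frame_chunk_size - frame_chunk_stride ))  # 3 frames carried over as left context
echo "cache_size=${cache_size}"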
speechx/speechx/frontend/audio/assembler.h @ f3132ce2

@@ -20,12 +20,16 @@
 namespace ppspeech {

 struct AssemblerOptions {
-    int32 frame_chunk_size;
-    int32 frame_chunk_stride;
+    // refer:https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/s2t/exps/deepspeech2/model.py
+    // the nnet batch forward
+    int32 receptive_filed_length;
+    int32 subsampling_rate;
+    int32 nnet_decoder_chunk;

     AssemblerOptions()
-        : frame_chunk_size(1),
-          frame_chunk_stride(1) {}
+        : receptive_filed_length(1),
+          subsampling_rate(1),
+          nnet_decoder_chunk(1) {}
 };

 class Assembler : public FrontendInterface {

@@ -59,6 +63,7 @@ class Assembler : public FrontendInterface {
     int32 dim_;
     int32 frame_chunk_size_;    // window
     int32 frame_chunk_stride_;  // stride
+    int32 receptive_filed_length_;
     std::queue<kaldi::Vector<kaldi::BaseFloat>> feature_cache_;
     std::unique_ptr<FrontendInterface> base_extractor_;
     DISALLOW_COPY_AND_ASSIGN(Assembler);
speechx/speechx/nnet/CMakeLists.txt @ f3132ce2

@@ -4,4 +4,11 @@ add_library(nnet STATIC
     decodable.cc
     paddle_nnet.cc
 )
-target_link_libraries(nnet absl::strings)
\ No newline at end of file
+target_link_libraries(nnet absl::strings)
+
+set(bin_name nnet_forward_main)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+
+target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS})
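Once the project is configured, the new executable can be built on its own; a hypothetical invocation from a configured CMake build tree named "build" (the speechx repo's own build scripts may drive this differently):

# Hypothetical: build only the new target from an already-configured build tree.
cmake --build build --target nnet_forward_main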
speechx/speechx/nnet/nnet_forward_main.cc (new file, mode 100644) @ f3132ce2

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "base/flags.h"
#include "base/log.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/assembler.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/paddle_nnet.h"

DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier");
DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk");
DEFINE_int32(receptive_field_length,
             7,
             "receptive field of two CNN(kernel=3) downsampling module.");
DEFINE_int32(downsampling_rate,
             4,
             "two CNN(kernel=3) module downsampling rate.");
DEFINE_string(
    model_input_names,
    "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
    "model input names");
DEFINE_string(model_output_names,
              "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
              "model output names");
DEFINE_string(model_cache_names,
              "chunk_state_h_box,chunk_state_c_box",
              "model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
DEFINE_double(acoustic_scale, 1.0, "acoustic scale");

using kaldi::BaseFloat;
using kaldi::Matrix;
using std::vector;

int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, false);
    google::InitGoogleLogging(argv[0]);

    kaldi::SequentialBaseFloatMatrixReader feature_reader(
        FLAGS_feature_rspecifier);
    kaldi::BaseFloatMatrixWriter nnet_writer(FLAGS_nnet_prob_wspecifier);
    std::string model_graph = FLAGS_model_path;
    std::string model_params = FLAGS_param_path;
    LOG(INFO) << "model path: " << model_graph;
    LOG(INFO) << "model param: " << model_params;

    int32 num_done = 0, num_err = 0;

    ppspeech::ModelOptions model_opts;
    model_opts.model_path = model_graph;
    model_opts.param_path = model_params;
    model_opts.cache_names = FLAGS_model_cache_names;
    model_opts.cache_shape = FLAGS_model_cache_shapes;
    model_opts.input_names = FLAGS_model_input_names;
    model_opts.output_names = FLAGS_model_output_names;
    std::shared_ptr<ppspeech::PaddleNnet> nnet(
        new ppspeech::PaddleNnet(model_opts));
    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
    std::shared_ptr<ppspeech::Decodable> decodable(
        new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));

    int32 chunk_size = FLAGS_receptive_field_length +
                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
    int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
    int32 receptive_field_length = FLAGS_receptive_field_length;
    LOG(INFO) << "chunk size (frame): " << chunk_size;
    LOG(INFO) << "chunk stride (frame): " << chunk_stride;
    LOG(INFO) << "receptive field (frame): " << receptive_field_length;

    kaldi::Timer timer;
    for (; !feature_reader.Done(); feature_reader.Next()) {
        string utt = feature_reader.Key();
        kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
        raw_data->SetDim(feature.NumCols());
        LOG(INFO) << "process utt: " << utt;
        LOG(INFO) << "rows: " << feature.NumRows();
        LOG(INFO) << "cols: " << feature.NumCols();

        int32 row_idx = 0;
        int32 padding_len = 0;
        int32 ori_feature_len = feature.NumRows();
        if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
            padding_len =
                chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
            feature.Resize(feature.NumRows() + padding_len,
                           feature.NumCols(),
                           kaldi::kCopyData);
        }
        int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
        int32 frame_idx = 0;
        std::vector<kaldi::Vector<kaldi::BaseFloat>> prob_vec;
        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
                                                          feature.NumCols());
            int32 feature_chunk_size = 0;
            if (ori_feature_len > chunk_idx * chunk_stride) {
                feature_chunk_size = std::min(
                    ori_feature_len - chunk_idx * chunk_stride, chunk_size);
            }
            if (feature_chunk_size < receptive_field_length) break;

            int32 start = chunk_idx * chunk_stride;
            for (int row_id = 0; row_id < chunk_size; ++row_id) {
                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, start);
                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
                    feature_chunk.Data() + row_id * feature.NumCols(),
                    feature.NumCols());
                f_chunk_tmp.CopyFromVec(tmp);
                ++start;
            }
            raw_data->Accept(feature_chunk);
            if (chunk_idx == num_chunks - 1) {
                raw_data->SetFinished();
            }

            vector<kaldi::BaseFloat> prob;
            while (decodable->FrameLikelihood(frame_idx, &prob)) {
                kaldi::Vector<kaldi::BaseFloat> vec_tmp(prob.size());
                std::memcpy(vec_tmp.Data(),
                            prob.data(),
                            sizeof(kaldi::BaseFloat) * prob.size());
                prob_vec.push_back(vec_tmp);
                frame_idx++;
            }
        }
        decodable->Reset();
        if (prob_vec.size() == 0) {
            // the TokenWriter can not write empty string.
            ++num_err;
            KALDI_LOG << " the nnet prob of " << utt << " is empty";
            continue;
        }
        kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),
                                               prob_vec[0].Dim());
        for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) {
            for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) {
                result(row_idx, col_idx) = prob_vec[row_idx](col_idx);
            }
        }

        nnet_writer.Write(utt, result);
        ++num_done;
    }

    double elapsed = timer.Elapsed();
    KALDI_LOG << " cost:" << elapsed << " s";

    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
              << " with errors.";
    return (num_done != 0 ? 0 : 1);
}
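For reference, a sketch of how the new nnet_forward_main tool might be invoked once built. It uses only flags defined in the file above; the feature and output specifiers and the model filenames are hypothetical placeholders, not values from the commit.

# Hypothetical invocation of nnet_forward_main; paths are placeholders.
./nnet_forward_main \
    --feature_rspecifier=ark,t:feats.ark \
    --nnet_prob_wspecifier=ark,t:nnet_prob.ark \
    --model_path=avg_1.jit.pdmodel \
    --param_path=avg_1.jit.pdiparams \
    --nnet_decoder_chunk=8 \
    --receptive_field_length=7 \
    --downsampling_rate=4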