Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
11335406
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
“e5b7736e1af960923c0156927f831a8fd5b8e654”上不存在“tests/pytest/tools/git@gitcode.net:taosdata/tdengine.git”
提交
11335406
编写于
4月 19, 2022
作者:
Y
Yang Zhou
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add websocket
上级
b78bc637
变更
28
显示空白变更内容
内联
并排
Showing
28 changed file
with
537 addition
and
69 deletion
+537
-69
speechx/CMakeLists.txt
speechx/CMakeLists.txt
+3
-2
speechx/examples/ds2_ol/CMakeLists.txt
speechx/examples/ds2_ol/CMakeLists.txt
+2
-1
speechx/examples/ds2_ol/aishell/path.sh
speechx/examples/ds2_ol/aishell/path.sh
+2
-2
speechx/examples/ds2_ol/aishell/run.sh
speechx/examples/ds2_ol/aishell/run.sh
+6
-6
speechx/examples/ds2_ol/decoder/CMakeLists.txt
speechx/examples/ds2_ol/decoder/CMakeLists.txt
+3
-0
speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
...mples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
+6
-14
speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc
speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc
+2
-2
speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
.../examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
+2
-2
speechx/examples/ds2_ol/websocket/CMakeLists.txt
speechx/examples/ds2_ol/websocket/CMakeLists.txt
+10
-0
speechx/examples/ds2_ol/websocket/websocket_client_main.cc
speechx/examples/ds2_ol/websocket/websocket_client_main.cc
+82
-0
speechx/examples/ds2_ol/websocket/websocket_server_main.cc
speechx/examples/ds2_ol/websocket/websocket_server_main.cc
+30
-0
speechx/speechx/CMakeLists.txt
speechx/speechx/CMakeLists.txt
+7
-1
speechx/speechx/base/common.h
speechx/speechx/base/common.h
+2
-0
speechx/speechx/decoder/CMakeLists.txt
speechx/speechx/decoder/CMakeLists.txt
+2
-1
speechx/speechx/decoder/ctc_tlg_decoder.cc
speechx/speechx/decoder/ctc_tlg_decoder.cc
+1
-2
speechx/speechx/decoder/param.h
speechx/speechx/decoder/param.h
+94
-0
speechx/speechx/decoder/recognizer.cc
speechx/speechx/decoder/recognizer.cc
+60
-0
speechx/speechx/decoder/recognizer.h
speechx/speechx/decoder/recognizer.h
+59
-0
speechx/speechx/frontend/audio/CMakeLists.txt
speechx/speechx/frontend/audio/CMakeLists.txt
+2
-1
speechx/speechx/frontend/audio/audio_cache.cc
speechx/speechx/frontend/audio/audio_cache.cc
+1
-1
speechx/speechx/frontend/audio/audio_cache.h
speechx/speechx/frontend/audio/audio_cache.h
+1
-1
speechx/speechx/frontend/audio/feature_cache.cc
speechx/speechx/frontend/audio/feature_cache.cc
+41
-21
speechx/speechx/frontend/audio/feature_cache.h
speechx/speechx/frontend/audio/feature_cache.h
+18
-3
speechx/speechx/frontend/audio/feature_pipeline.cc
speechx/speechx/frontend/audio/feature_pipeline.cc
+36
-0
speechx/speechx/frontend/audio/feature_pipeline.h
speechx/speechx/frontend/audio/feature_pipeline.h
+57
-0
speechx/speechx/frontend/audio/linear_spectrogram.cc
speechx/speechx/frontend/audio/linear_spectrogram.cc
+4
-4
speechx/speechx/frontend/audio/linear_spectrogram.h
speechx/speechx/frontend/audio/linear_spectrogram.h
+4
-4
speechx/speechx/nnet/decodable.cc
speechx/speechx/nnet/decodable.cc
+0
-1
未找到文件。
speechx/CMakeLists.txt
浏览文件 @
11335406
...
...
@@ -63,7 +63,8 @@ include(libsndfile)
# include(boost) # not work
set
(
boost_SOURCE_DIR
${
fc_patch
}
/boost-src
)
set
(
BOOST_ROOT
${
boost_SOURCE_DIR
}
)
# #find_package(boost REQUIRED PATHS ${BOOST_ROOT})
include_directories
(
${
boost_SOURCE_DIR
}
)
link_directories
(
${
boost_SOURCE_DIR
}
/stage/lib
)
# Eigen
include
(
eigen
)
...
...
speechx/examples/ds2_ol/CMakeLists.txt
浏览文件 @
11335406
...
...
@@ -3,3 +3,4 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
add_subdirectory
(
feat
)
add_subdirectory
(
nnet
)
add_subdirectory
(
decoder
)
add_subdirectory
(
websocket
)
speechx/examples/ds2_ol/aishell/path.sh
浏览文件 @
11335406
# This contains the locations of the built binaries required for running the examples.
SPEECHX_ROOT
=
$PWD
/../../..
/
SPEECHX_ROOT
=
$PWD
/../../..
SPEECHX_EXAMPLES
=
$SPEECHX_ROOT
/build/examples
SPEECHX_TOOLS
=
$SPEECHX_ROOT
/tools
...
...
@@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export
LC_AL
=
C
SPEECHX_BIN
=
$SPEECHX_EXAMPLES
/ds2_ol/decoder:
$SPEECHX_EXAMPLES
/ds2_ol/feat
SPEECHX_BIN
=
$SPEECHX_EXAMPLES
/ds2_ol/decoder:
$SPEECHX_EXAMPLES
/ds2_ol/feat
:
$SPEECHX_EXAMPLES
/ds2_ol/websocket
export
PATH
=
$PATH
:
$SPEECHX_BIN
:
$TOOLS_BIN
speechx/examples/ds2_ol/aishell/run.sh
浏览文件 @
11335406
...
...
@@ -42,7 +42,7 @@ fi
if
[
!
-d
$ckpt_dir
]
;
then
mkdir
-p
$ckpt_dir
wget
-P
$ckpt_dir
-c
https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
tar
xzfv
$
model
_dir
/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-C
$ckpt_dir
tar
xzfv
$
ckpt
_dir
/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-C
$ckpt_dir
fi
lm
=
$data
/zh_giga.no_cna_cmn.prune01244.klm
...
...
@@ -79,7 +79,7 @@ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
ctc-prefix-beam-search-decoder-ol
\
--feature_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/feat.scp
\
--model_path
=
$model_dir
/avg_1.jit.pdmodel
\
--param_path
=
$model_dir
/avg_1.jit.pdiparams
\
--param
s
_path
=
$model_dir
/avg_1.jit.pdiparams
\
--model_output_names
=
softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0
\
--dict_file
=
$vocb_dir
/vocab.txt
\
--result_wspecifier
=
ark,t:
$data
/split
${
nj
}
/JOB/result
...
...
@@ -92,7 +92,7 @@ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
ctc-prefix-beam-search-decoder-ol
\
--feature_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/feat.scp
\
--model_path
=
$model_dir
/avg_1.jit.pdmodel
\
--param_path
=
$model_dir
/avg_1.jit.pdiparams
\
--param
s
_path
=
$model_dir
/avg_1.jit.pdiparams
\
--model_output_names
=
softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0
\
--dict_file
=
$vocb_dir
/vocab.txt
\
--lm_path
=
$lm
\
...
...
@@ -104,9 +104,9 @@ utils/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm
graph_dir
=
./aishell_graph
if
[
!
-d
$
]
;
then
if
[
!
-d
$
graph_dir
]
;
then
wget
-c
https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip
-d
aishell_graph.zip
unzip aishell_graph.zip
fi
...
...
@@ -115,7 +115,7 @@ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
wfst-decoder-ol
\
--feature_rspecifier
=
scp:
$data
/split
${
nj
}
/JOB/feat.scp
\
--model_path
=
$model_dir
/avg_1.jit.pdmodel
\
--param_path
=
$model_dir
/avg_1.jit.pdiparams
\
--param
s
_path
=
$model_dir
/avg_1.jit.pdiparams
\
--word_symbol_table
=
$graph_dir
/words.txt
\
--model_output_names
=
softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0
\
--graph_path
=
$graph_dir
/TLG.fst
--max_active
=
7500
\
...
...
speechx/examples/ds2_ol/decoder/CMakeLists.txt
浏览文件 @
11335406
...
...
@@ -17,3 +17,6 @@ add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories
(
${
bin_name
}
PRIVATE
${
SPEECHX_ROOT
}
${
SPEECHX_ROOT
}
/kaldi
)
target_link_libraries
(
${
bin_name
}
PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util
${
DEPS
}
)
add_executable
(
recognizer_test_main
${
CMAKE_CURRENT_SOURCE_DIR
}
/recognizer_test_main.cc
)
target_include_directories
(
recognizer_test_main PRIVATE
${
SPEECHX_ROOT
}
${
SPEECHX_ROOT
}
/kaldi
)
target_link_libraries
(
recognizer_test_main PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder
${
DEPS
}
)
speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
浏览文件 @
11335406
...
...
@@ -34,12 +34,10 @@ DEFINE_int32(receptive_field_length,
DEFINE_int32
(
downsampling_rate
,
4
,
"two CNN(kernel=5) module downsampling rate."
);
DEFINE_string
(
model_input_names
,
"audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box"
,
"model input names"
);
DEFINE_string
(
model_output_names
,
"softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0"
,
"save_infer_model/scale_0.tmp_1,save_infer_model/"
"scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
"scale_3.tmp_1"
,
"model output names"
);
DEFINE_string
(
model_cache_names
,
"5-1-1024,5-1-1024"
,
"model cache names"
);
...
...
@@ -52,18 +50,14 @@ int main(int argc, char* argv[]) {
gflags
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
false
);
google
::
InitGoogleLogging
(
argv
[
0
]);
CHECK
(
FLAGS_result_wspecifier
!=
""
);
CHECK
(
FLAGS_feature_rspecifier
!=
""
);
kaldi
::
SequentialBaseFloatMatrixReader
feature_reader
(
FLAGS_feature_rspecifier
);
kaldi
::
TokenWriter
result_writer
(
FLAGS_result_wspecifier
);
std
::
string
model_graph
=
FLAGS_model_path
;
std
::
string
model_path
=
FLAGS_model_path
;
std
::
string
model_params
=
FLAGS_param_path
;
std
::
string
dict_file
=
FLAGS_dict_file
;
std
::
string
lm_path
=
FLAGS_lm_path
;
LOG
(
INFO
)
<<
"model path: "
<<
model_
grap
h
;
LOG
(
INFO
)
<<
"model path: "
<<
model_
pat
h
;
LOG
(
INFO
)
<<
"model param: "
<<
model_params
;
LOG
(
INFO
)
<<
"dict path: "
<<
dict_file
;
LOG
(
INFO
)
<<
"lm path: "
<<
lm_path
;
...
...
@@ -76,10 +70,9 @@ int main(int argc, char* argv[]) {
ppspeech
::
CTCBeamSearch
decoder
(
opts
);
ppspeech
::
ModelOptions
model_opts
;
model_opts
.
model_path
=
model_
grap
h
;
model_opts
.
model_path
=
model_
pat
h
;
model_opts
.
params_path
=
model_params
;
model_opts
.
cache_shape
=
FLAGS_model_cache_names
;
model_opts
.
input_names
=
FLAGS_model_input_names
;
model_opts
.
output_names
=
FLAGS_model_output_names
;
std
::
shared_ptr
<
ppspeech
::
PaddleNnet
>
nnet
(
new
ppspeech
::
PaddleNnet
(
model_opts
));
...
...
@@ -125,7 +118,6 @@ int main(int argc, char* argv[]) {
if
(
feature_chunk_size
<
receptive_field_length
)
break
;
int32
start
=
chunk_idx
*
chunk_stride
;
int32
end
=
start
+
chunk_size
;
for
(
int
row_id
=
0
;
row_id
<
chunk_size
;
++
row_id
)
{
kaldi
::
SubVector
<
kaldi
::
BaseFloat
>
tmp
(
feature
,
start
);
...
...
speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc
浏览文件 @
11335406
...
...
@@ -73,7 +73,7 @@ int main(int argc, char* argv[]) {
LOG
(
INFO
)
<<
"cmvn stats have write into: "
<<
FLAGS_cmvn_write_path
;
LOG
(
INFO
)
<<
"Binary: "
<<
FLAGS_binary
;
}
catch
(
simdjson
::
simdjson_error
&
err
)
{
LOG
(
ERR
)
<<
err
.
what
();
LOG
(
ERR
OR
)
<<
err
.
what
();
}
...
...
speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
浏览文件 @
11335406
...
...
@@ -32,7 +32,6 @@ DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
DEFINE_string
(
cmvn_file
,
"./cmvn.ark"
,
"read cmvn"
);
DEFINE_double
(
streaming_chunk
,
0.36
,
"streaming feature chunk size"
);
int
main
(
int
argc
,
char
*
argv
[])
{
gflags
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
false
);
google
::
InitGoogleLogging
(
argv
[
0
]);
...
...
@@ -66,7 +65,8 @@ int main(int argc, char* argv[]) {
std
::
unique_ptr
<
ppspeech
::
FrontendInterface
>
cmvn
(
new
ppspeech
::
CMVN
(
FLAGS_cmvn_file
,
std
::
move
(
linear_spectrogram
)));
ppspeech
::
FeatureCache
feature_cache
(
kint16max
,
std
::
move
(
cmvn
));
ppspeech
::
FeatureCacheOptions
feat_cache_opts
;
ppspeech
::
FeatureCache
feature_cache
(
feat_cache_opts
,
std
::
move
(
cmvn
));
LOG
(
INFO
)
<<
"feat dim: "
<<
feature_cache
.
Dim
();
int
sample_rate
=
16000
;
...
...
speechx/examples/ds2_ol/websocket/CMakeLists.txt
0 → 100644
浏览文件 @
11335406
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

# The websocket demo server and client are built from <name>.cc with the
# same include directories and link libraries, so declare them in one loop.
foreach(bin_name websocket_server_main websocket_client_main)
    add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
    target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
    target_link_libraries(${bin_name} PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder websocket ${DEPS})
endforeach()
speechx/examples/ds2_ol/websocket/websocket_client_main.cc
0 → 100644
浏览文件 @
11335406
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "websocket/websocket_client.h"
#include "kaldi/feat/wave-reader.h"
#include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h"
// Command-line flags of the websocket demo client.
// Host passed to ppspeech::WebSocketClient in main().
DEFINE_string
(
host
,
"127.0.0.1"
,
"host of websocket server"
);
// Port passed to ppspeech::WebSocketClient in main().
// NOTE(review): the default 201314 is larger than 65535, the highest valid
// TCP port number. Confirm how the client/server convert this value, and keep
// it in sync with the default in websocket_server_main.cc if it is changed.
DEFINE_int32
(
port
,
201314
,
"port of websocket server"
);
// Kaldi rspecifier handed to SequentialTableReader<WaveHolder> in main().
DEFINE_string
(
wav_rspecifier
,
""
,
"test wav scp path"
);
// Chunk length; main() multiplies it by the 16000 Hz sample rate to get the
// number of samples sent per message, so the unit is seconds.
DEFINE_double
(
streaming_chunk
,
0.1
,
"streaming feature chunk size"
);
// int16 is the sample type of the chunks sent to the server.
using
kaldi
::
int16
;
int
main
(
int
argc
,
char
*
argv
[])
{
gflags
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
false
);
google
::
InitGoogleLogging
(
argv
[
0
]);
ppspeech
::
WebSocketClient
client
(
FLAGS_host
,
FLAGS_port
);
kaldi
::
SequentialTableReader
<
kaldi
::
WaveHolder
>
wav_reader
(
FLAGS_wav_rspecifier
);
const
int
sample_rate
=
16000
;
const
float
streaming_chunk
=
FLAGS_streaming_chunk
;
const
int
chunk_sample_size
=
streaming_chunk
*
sample_rate
;
for
(;
!
wav_reader
.
Done
();
wav_reader
.
Next
())
{
client
.
SendStartSignal
();
std
::
string
utt
=
wav_reader
.
Key
();
const
kaldi
::
WaveData
&
wave_data
=
wav_reader
.
Value
();
CHECK_EQ
(
wave_data
.
SampFreq
(),
sample_rate
);
int32
this_channel
=
0
;
kaldi
::
SubVector
<
kaldi
::
BaseFloat
>
waveform
(
wave_data
.
Data
(),
this_channel
);
const
int
tot_samples
=
waveform
.
Dim
();
int
sample_offset
=
0
;
while
(
sample_offset
<
tot_samples
)
{
int
cur_chunk_size
=
std
::
min
(
chunk_sample_size
,
tot_samples
-
sample_offset
);
std
::
vector
<
int16
>
wav_chunk
(
cur_chunk_size
);
for
(
int
i
=
0
;
i
<
cur_chunk_size
;
++
i
)
{
wav_chunk
[
i
]
=
static_cast
<
int16
>
(
waveform
(
sample_offset
+
i
));
}
client
.
SendBinaryData
(
wav_chunk
.
data
(),
wav_chunk
.
size
()
*
sizeof
(
int16
));
sample_offset
+=
cur_chunk_size
;
LOG
(
INFO
)
<<
"Send "
<<
cur_chunk_size
<<
" samples"
;
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
static_cast
<
int
>
(
1
*
1000
)));
if
(
cur_chunk_size
<
chunk_sample_size
)
{
client
.
SendEndSignal
();
}
}
while
(
!
client
.
Done
())
{
}
std
::
string
result
=
client
.
GetResult
();
LOG
(
INFO
)
<<
"utt: "
<<
utt
<<
" "
<<
result
;
client
.
Join
();
return
0
;
}
return
0
;
}
speechx/examples/ds2_ol/websocket/websocket_server_main.cc
0 → 100644
浏览文件 @
11335406
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "websocket/websocket_server.h"
#include "decoder/param.h"
// Port handed to ppspeech::WebSocketServer in main().
// NOTE(review): the default 201314 is larger than 65535, the highest valid
// TCP port number. Confirm how the server converts this value, and keep it in
// sync with the default in websocket_client_main.cc if it is changed.
DEFINE_int32
(
port
,
201314
,
"websocket listening port"
);
int
main
(
int
argc
,
char
*
argv
[])
{
gflags
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
false
);
google
::
InitGoogleLogging
(
argv
[
0
]);
ppspeech
::
RecognizerResource
resource
=
ppspeech
::
InitRecognizerResoure
();
ppspeech
::
WebSocketServer
server
(
FLAGS_port
,
resource
);
LOG
(
INFO
)
<<
"Listening at port "
<<
FLAGS_port
;
server
.
Start
();
return
0
;
}
speechx/speechx/CMakeLists.txt
浏览文件 @
11335406
...
...
@@ -31,3 +31,9 @@ ${CMAKE_CURRENT_SOURCE_DIR}
${
CMAKE_CURRENT_SOURCE_DIR
}
/decoder
)
add_subdirectory
(
decoder
)
include_directories
(
${
CMAKE_CURRENT_SOURCE_DIR
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/websocket
)
add_subdirectory
(
websocket
)
speechx/speechx/base/common.h
浏览文件 @
11335406
...
...
@@ -28,8 +28,10 @@
#include <sstream>
#include <stack>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "base/basic_types.h"
...
...
speechx/speechx/decoder/CMakeLists.txt
浏览文件 @
11335406
...
...
@@ -7,5 +7,6 @@ add_library(decoder STATIC
ctc_decoders/path_trie.cpp
ctc_decoders/scorer.cpp
ctc_tlg_decoder.cc
recognizer.cc
)
target_link_libraries
(
decoder PUBLIC kenlm utils fst
)
target_link_libraries
(
decoder PUBLIC kenlm utils fst
frontend nnet kaldi-decoder
)
speechx/speechx/decoder/ctc_tlg_decoder.cc
浏览文件 @
11335406
...
...
@@ -33,7 +33,6 @@ void TLGDecoder::InitDecoder() {
void
TLGDecoder
::
AdvanceDecode
(
const
std
::
shared_ptr
<
kaldi
::
DecodableInterface
>&
decodable
)
{
while
(
!
decodable
->
IsLastFrame
(
frame_decoded_size_
))
{
LOG
(
INFO
)
<<
"num frame decode: "
<<
frame_decoded_size_
;
AdvanceDecoding
(
decodable
.
get
());
}
}
...
...
speechx/speechx/decoder/param.h
0 → 100644
浏览文件 @
11335406
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "base/common.h"
#include "decoder/ctc_beam_search_decoder.h"
#include "decoder/ctc_tlg_decoder.h"
#include "frontend/audio/feature_pipeline.h"
// Command-line flags read by the Init*Options() helpers below.
//
// NOTE(review): gflags DEFINE_* macros define the flag itself. Because they
// live in a header, every translation unit that includes this file defines
// the flags again; confirm the header is included from exactly one .cc per
// binary, or move the definitions to a .cc and use DECLARE_* here.

// Feature pipeline.
DEFINE_string(cmvn_file, "", "read cmvn");
DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
DEFINE_bool(convert2PCM32, true, "audio convert to pcm32");

// Paddle model files.
DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
DEFINE_string(params_path, "avg_1.jit.pdiparams", "paddle nnet model param");

// TLG decoder.
DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
DEFINE_string(graph_path, "TLG", "decoder graph");
DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
DEFINE_int32(max_active, 7500, "max active");
DEFINE_double(beam, 15.0, "decoder beam");
// The help text used to be a copy of the one for --beam.
DEFINE_double(lattice_beam, 7.5, "decoder lattice beam");

// Chunking of the features fed to the model; these values are copied into
// FeatureCacheOptions::frame_chunk_size / frame_chunk_stride.
DEFINE_int32(receptive_field_length,
             7,
             "receptive field of two CNN(kernel=5) downsampling module.");
DEFINE_int32(downsampling_rate,
             4,
             "two CNN(kernel=5) module downsampling rate.");

// Model output names and cache shapes.
DEFINE_string(
    model_output_names,
    "save_infer_model/scale_0.tmp_1,save_infer_model/"
    "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
    "scale_3.tmp_1",
    "model output names");
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
namespace ppspeech {
// todo refactor later

// The helpers below are defined in a header, so they are marked `inline`:
// a non-inline definition would be emitted by every translation unit that
// includes this file and fail at link time with duplicate symbols.

// Builds the feature pipeline options from the command-line flags.
// The frame extraction settings (20 ms window, 10 ms shift, hanning window,
// no DC-offset removal, no pre-emphasis, no dither) are fixed here and are
// not configurable through flags.
inline FeaturePipelineOptions InitFeaturePipelineOptions() {
    FeaturePipelineOptions opts;
    opts.cmvn_file = FLAGS_cmvn_file;
    opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
    opts.convert2PCM32 = FLAGS_convert2PCM32;

    kaldi::FrameExtractionOptions frame_opts;
    frame_opts.frame_length_ms = 20;
    frame_opts.frame_shift_ms = 10;
    frame_opts.remove_dc_offset = false;
    frame_opts.window_type = "hanning";
    frame_opts.preemph_coeff = 0.0;
    frame_opts.dither = 0.0;
    opts.linear_spectrogram_opts.frame_opts = frame_opts;

    opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length;
    opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate;
    return opts;
}

// Builds the model options (model/params paths, cache shape, output names)
// from the command-line flags. input_names is left at its default.
inline ModelOptions InitModelOptions() {
    ModelOptions model_opts;
    model_opts.model_path = FLAGS_model_path;
    model_opts.params_path = FLAGS_params_path;
    model_opts.cache_shape = FLAGS_model_cache_names;
    model_opts.output_names = FLAGS_model_output_names;
    return model_opts;
}

// Builds the TLG decoder options (symbol table, graph path, search beams)
// from the command-line flags.
inline TLGDecoderOptions InitDecoderOptions() {
    TLGDecoderOptions decoder_opts;
    decoder_opts.word_symbol_table = FLAGS_word_symbol_table;
    decoder_opts.fst_path = FLAGS_graph_path;
    decoder_opts.opts.max_active = FLAGS_max_active;
    decoder_opts.opts.beam = FLAGS_beam;
    decoder_opts.opts.lattice_beam = FLAGS_lattice_beam;
    return decoder_opts;
}

// Bundles the acoustic scale and the three option sets above into the
// RecognizerResource used to construct a Recognizer.
// (The spelling "Resoure" is kept because callers use this name.)
inline RecognizerResource InitRecognizerResoure() {
    RecognizerResource resource;
    resource.acoustic_scale = FLAGS_acoustic_scale;
    resource.feature_pipeline_opts = InitFeaturePipelineOptions();
    resource.model_opts = InitModelOptions();
    resource.tlg_opts = InitDecoderOptions();
    return resource;
}
}  // namespace ppspeech
\ No newline at end of file
speechx/speechx/decoder/recognizer.cc
0 → 100644
浏览文件 @
11335406
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "decoder/recognizer.h"
namespace ppspeech {

using kaldi::Vector;
using kaldi::VectorBase;
using kaldi::BaseFloat;
using std::vector;
using kaldi::SubVector;
using std::unique_ptr;

// Builds the recognizer from `resource`: a feature pipeline, a PaddleNnet
// model, a Decodable that is given the model, the feature pipeline and the
// acoustic scale, and a TLG decoder.
Recognizer::Recognizer(const RecognizerResource& resource) {
    // resource_ = resource;
    const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts;
    feature_pipeline_.reset(new FeaturePipeline(feature_opts));
    std::shared_ptr<PaddleNnet> nnet(new PaddleNnet(resource.model_opts));
    BaseFloat ac_scale = resource.acoustic_scale;
    decodable_.reset(new Decodable(nnet, feature_pipeline_, ac_scale));
    decoder_.reset(new TLGDecoder(resource.tlg_opts));
    input_finished_ = false;
}

// Forwards a block of samples to the feature pipeline.
void Recognizer::Accept(const Vector<BaseFloat>& waves) {
    feature_pipeline_->Accept(waves);
}

// Advances the decoder over the decodable.
void Recognizer::Decode() { decoder_->AdvanceDecode(decodable_); }

// Returns the decoder's final best path.
std::string Recognizer::GetFinalResult() {
    return decoder_->GetFinalBestPath();
}

// Marks the input as complete, both on the feature pipeline and locally.
void Recognizer::SetFinished() {
    feature_pipeline_->SetFinished();
    input_finished_ = true;
}

// Returns whether SetFinished() has been called since construction or the
// last Reset().
bool Recognizer::IsFinished() { return input_finished_; }

// Resets the feature pipeline, the decodable and the decoder, and clears the
// finished flag. Without clearing it, a recognizer reused for another
// utterance would report IsFinished() == true before any input was marked
// as finished.
void Recognizer::Reset() {
    feature_pipeline_->Reset();
    decodable_->Reset();
    decoder_->Reset();
    input_finished_ = false;
}

}  // namespace ppspeech
\ No newline at end of file
speechx/speechx/decoder/recognizer.h
0 → 100644
浏览文件 @
11335406
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// todo refactor later (SGoat)
#pragma once
#include "decoder/ctc_beam_search_decoder.h"
#include "decoder/ctc_tlg_decoder.h"
#include "frontend/audio/feature_pipeline.h"
#include "nnet/decodable.h"
#include "nnet/paddle_nnet.h"
namespace ppspeech {

// Options needed to construct a Recognizer: feature pipeline, model and TLG
// decoder options plus the acoustic scale (defaults to 1.0).
struct RecognizerResource {
    FeaturePipelineOptions feature_pipeline_opts;
    ModelOptions model_opts;
    TLGDecoderOptions tlg_opts;
    //    CTCBeamSearchOptions beam_search_opts;
    kaldi::BaseFloat acoustic_scale;

    // Initializers are listed in declaration order; members are always
    // initialized in that order, and listing them differently triggers
    // -Wreorder warnings.
    RecognizerResource()
        : feature_pipeline_opts(),
          model_opts(),
          tlg_opts(),
          acoustic_scale(1.0) {}
};

// Ties a feature pipeline, a Decodable and a TLG decoder together behind a
// small accept/decode/result interface.
class Recognizer {
  public:
    // Builds all components from the options in `resource`.
    explicit Recognizer(const RecognizerResource& resource);
    // Feeds a block of samples to the feature pipeline.
    void Accept(const kaldi::Vector<kaldi::BaseFloat>& waves);
    // Advances decoding.
    void Decode();
    // Returns the decoder's final best path.
    std::string GetFinalResult();
    // Marks the input as complete.
    void SetFinished();
    // Returns whether SetFinished() has marked the input as complete.
    bool IsFinished();
    // Resets the feature pipeline, the decodable and the decoder.
    void Reset();

  private:
    // std::shared_ptr<RecognizerResource> resource_;
    // RecognizerResource resource_;
    std::shared_ptr<FeaturePipeline> feature_pipeline_;
    std::shared_ptr<Decodable> decodable_;
    std::unique_ptr<TLGDecoder> decoder_;
    bool input_finished_;
};

}  // namespace ppspeech
\ No newline at end of file
speechx/speechx/frontend/audio/CMakeLists.txt
浏览文件 @
11335406
...
...
@@ -6,6 +6,7 @@ add_library(frontend STATIC
linear_spectrogram.cc
audio_cache.cc
feature_cache.cc
feature_pipeline.cc
)
target_link_libraries
(
frontend PUBLIC kaldi-matrix
)
\ No newline at end of file
target_link_libraries
(
frontend PUBLIC kaldi-matrix kaldi-feat-common
)
speechx/speechx/frontend/audio/audio_cache.cc
浏览文件 @
11335406
...
...
@@ -41,7 +41,7 @@ void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
ready_feed_condition_
.
wait
(
lock
);
}
for
(
size_t
idx
=
0
;
idx
<
waves
.
Dim
();
++
idx
)
{
int32
buffer_idx
=
(
idx
+
offset_
)
%
ring_buffer_
.
size
();
int32
buffer_idx
=
(
idx
+
offset_
+
size_
)
%
ring_buffer_
.
size
();
ring_buffer_
[
buffer_idx
]
=
waves
(
idx
);
if
(
convert2PCM32_
)
ring_buffer_
[
buffer_idx
]
=
Convert2PCM32
(
waves
(
idx
));
...
...
speechx/speechx/frontend/audio/audio_cache.h
浏览文件 @
11335406
...
...
@@ -24,7 +24,7 @@ namespace ppspeech {
class
AudioCache
:
public
FrontendInterface
{
public:
explicit
AudioCache
(
int
buffer_size
=
1000
*
kint16max
,
bool
convert2PCM32
=
fals
e
);
bool
convert2PCM32
=
tru
e
);
virtual
void
Accept
(
const
kaldi
::
VectorBase
<
BaseFloat
>&
waves
);
...
...
speechx/speechx/frontend/audio/feature_cache.cc
浏览文件 @
11335406
...
...
@@ -23,10 +23,13 @@ using std::vector;
using
kaldi
::
SubVector
;
using
std
::
unique_ptr
;
FeatureCache
::
FeatureCache
(
int
max_size
,
FeatureCache
::
FeatureCache
(
FeatureCacheOptions
opts
,
unique_ptr
<
FrontendInterface
>
base_extractor
)
{
max_size_
=
max_size
;
max_size_
=
opts
.
max_size
;
frame_chunk_stride_
=
opts
.
frame_chunk_stride
;
frame_chunk_size_
=
opts
.
frame_chunk_size
;
base_extractor_
=
std
::
move
(
base_extractor
);
dim_
=
base_extractor_
->
Dim
();
}
void
FeatureCache
::
Accept
(
const
kaldi
::
VectorBase
<
kaldi
::
BaseFloat
>&
inputs
)
{
...
...
@@ -44,13 +47,14 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
while
(
cache_
.
empty
()
&&
base_extractor_
->
IsFinished
()
==
false
)
{
ready_read_condition_
.
wait
(
lock
);
BaseFloat
elapsed
=
timer
.
Elapsed
()
*
1000
;
// todo replace 1.0 with timeout_
if
(
elapsed
>
1.0
)
{
// todo refactor: wait
// ready_read_condition_.wait(lock);
int32
elapsed
=
static_cast
<
int32
>
(
timer
.
Elapsed
()
*
1000
);
// todo replace 1 with timeout_, 1 ms
if
(
elapsed
>
1
)
{
return
false
;
}
usleep
(
100
0
);
// sleep
1 ms
usleep
(
100
);
// sleep 0.
1 ms
}
if
(
cache_
.
empty
())
return
false
;
feats
->
Resize
(
cache_
.
front
().
Dim
());
...
...
@@ -63,8 +67,25 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
// read all data from base_feature_extractor_ into cache_
bool
FeatureCache
::
Compute
()
{
// compute and feed
Vector
<
BaseFloat
>
feature_chunk
;
bool
result
=
base_extractor_
->
Read
(
&
feature_chunk
);
Vector
<
BaseFloat
>
feature
;
bool
result
=
base_extractor_
->
Read
(
&
feature
);
if
(
result
==
false
||
feature
.
Dim
()
==
0
)
return
false
;
int32
joint_len
=
feature
.
Dim
()
+
remained_feature_
.
Dim
();
int32
num_chunk
=
((
joint_len
/
dim_
)
-
frame_chunk_size_
)
/
frame_chunk_stride_
+
1
;
Vector
<
BaseFloat
>
joint_feature
(
joint_len
);
joint_feature
.
Range
(
0
,
remained_feature_
.
Dim
())
.
CopyFromVec
(
remained_feature_
);
joint_feature
.
Range
(
remained_feature_
.
Dim
(),
feature
.
Dim
())
.
CopyFromVec
(
feature
);
for
(
int
chunk_idx
=
0
;
chunk_idx
<
num_chunk
;
++
chunk_idx
)
{
int32
start
=
chunk_idx
*
frame_chunk_stride_
*
dim_
;
Vector
<
BaseFloat
>
feature_chunk
(
frame_chunk_size_
*
dim_
);
SubVector
<
BaseFloat
>
tmp
(
joint_feature
.
Data
()
+
start
,
frame_chunk_size_
*
dim_
);
feature_chunk
.
CopyFromVec
(
tmp
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
while
(
cache_
.
size
()
>=
max_size_
)
{
...
...
@@ -72,16 +93,15 @@ bool FeatureCache::Compute() {
}
// feed cache
if
(
feature_chunk
.
Dim
()
!=
0
)
{
cache_
.
push
(
feature_chunk
);
}
ready_read_condition_
.
notify_one
();
}
int32
remained_feature_len
=
joint_len
-
num_chunk
*
frame_chunk_stride_
*
dim_
;
remained_feature_
.
Resize
(
remained_feature_len
);
remained_feature_
.
CopyFromVec
(
joint_feature
.
Range
(
frame_chunk_stride_
*
num_chunk
*
dim_
,
remained_feature_len
));
return
result
;
}
void
Reset
()
{
// std::lock_guard<std::mutex> lock(mutex_);
return
;
}
}
// namespace ppspeech
\ No newline at end of file
speechx/speechx/frontend/audio/feature_cache.h
浏览文件 @
11335406
...
...
@@ -19,10 +19,18 @@
namespace
ppspeech
{
struct
FeatureCacheOptions
{
int32
max_size
;
int32
frame_chunk_size
;
int32
frame_chunk_stride
;
FeatureCacheOptions
()
:
max_size
(
kint16max
),
frame_chunk_size
(
1
),
frame_chunk_stride
(
1
)
{}
};
class
FeatureCache
:
public
FrontendInterface
{
public:
explicit
FeatureCache
(
int32
max_size
=
kint16max
,
FeatureCacheOptions
opts
,
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
=
NULL
);
// Feed feats or waves
...
...
@@ -32,12 +40,15 @@ class FeatureCache : public FrontendInterface {
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
// feat dim
virtual
size_t
Dim
()
const
{
return
base_extractor_
->
Dim
()
;
}
virtual
size_t
Dim
()
const
{
return
dim_
;
}
virtual
void
SetFinished
()
{
// std::unique_lock<std::mutex> lock(mutex_);
base_extractor_
->
SetFinished
();
LOG
(
INFO
)
<<
"set finished"
;
// read the last chunk data
Compute
();
// ready_feed_condition_.notify_one();
}
virtual
bool
IsFinished
()
const
{
return
base_extractor_
->
IsFinished
();
}
...
...
@@ -52,9 +63,13 @@ class FeatureCache : public FrontendInterface {
private:
bool
Compute
();
int32
dim_
;
size_t
max_size_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
int32
frame_chunk_size_
;
int32
frame_chunk_stride_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
remained_feature_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
std
::
mutex
mutex_
;
std
::
queue
<
kaldi
::
Vector
<
BaseFloat
>>
cache_
;
std
::
condition_variable
ready_feed_condition_
;
...
...
speechx/speechx/frontend/audio/feature_pipeline.cc
0 → 100644
浏览文件 @
11335406
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/feature_pipeline.h"
namespace
ppspeech
{
using
std
::
unique_ptr
;
// Assembles the front-end chain
//   AudioCache -> LinearSpectrogram -> CMVN -> FeatureCache
// Each stage takes ownership of the previous one via std::move, and the final
// FeatureCache is stored in base_extractor_.
FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
    // Capacity argument handed to the AudioCache at the head of the chain.
    const auto audio_cache_capacity = 1000 * kint16max;

    unique_ptr<FrontendInterface> audio_source(
        new ppspeech::AudioCache(audio_cache_capacity, opts.convert2PCM32));

    unique_ptr<FrontendInterface> spectrogram(new ppspeech::LinearSpectrogram(
        opts.linear_spectrogram_opts, std::move(audio_source)));

    unique_ptr<FrontendInterface> normalizer(
        new ppspeech::CMVN(opts.cmvn_file, std::move(spectrogram)));

    base_extractor_.reset(new ppspeech::FeatureCache(opts.feature_cache_opts,
                                                     std::move(normalizer)));
}
}
// ppspeech
\ No newline at end of file
speechx/speechx/frontend/audio/feature_pipeline.h
0 → 100644
浏览文件 @
11335406
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// todo refactor later (SGoat)
#pragma once
#include "frontend/audio/audio_cache.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/normalizer.h"
namespace
ppspeech
{
// Options bundle consumed by FeaturePipeline.
struct FeaturePipelineOptions {
    // CMVN file handed to the CMVN stage; empty by default.
    std::string cmvn_file;
    // Forwarded to the AudioCache stage; off by default.
    bool convert2PCM32 = false;
    // Options for the LinearSpectrogram stage (its own defaults apply).
    LinearSpectrogramOptions linear_spectrogram_opts;
    // Options for the FeatureCache stage (its own defaults apply).
    FeatureCacheOptions feature_cache_opts;

    // Every member is set by its default initializer or its own default
    // constructor, so there is nothing left to do here.
    FeaturePipelineOptions() {}
};
// FrontendInterface implementation that owns a single extractor chain
// (base_extractor_) and forwards every interface call to it.
class FeaturePipeline : public FrontendInterface {
  public:
    // Constructs the pipeline from `opts`.
    explicit FeaturePipeline(const FeaturePipelineOptions& opts);

    // Passes `waves` on to the wrapped extractor.
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
        base_extractor_->Accept(waves);
    }

    // Reads into `feats` from the wrapped extractor and returns its result.
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
        return base_extractor_->Read(feats);
    }

    // Feature dimension as reported by the wrapped extractor.
    virtual size_t Dim() const { return base_extractor_->Dim(); }

    // Forwards the end-of-input signal to the wrapped extractor.
    virtual void SetFinished() { base_extractor_->SetFinished(); }

    // True when the wrapped extractor reports it has finished.
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

    // Resets the wrapped extractor.
    virtual void Reset() { base_extractor_->Reset(); }

  private:
    // Head of the extractor chain; all calls above delegate to it.
    std::unique_ptr<FrontendInterface> base_extractor_;
};
}
\ No newline at end of file
speechx/speechx/frontend/audio/linear_spectrogram.cc
浏览文件 @
11335406
...
...
@@ -52,16 +52,16 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
if
(
flag
==
false
||
input_feats
.
Dim
()
==
0
)
return
false
;
int32
feat_len
=
input_feats
.
Dim
();
int32
left_len
=
rem
ind
ed_wav_
.
Dim
();
int32
left_len
=
rem
ain
ed_wav_
.
Dim
();
Vector
<
BaseFloat
>
waves
(
feat_len
+
left_len
);
waves
.
Range
(
0
,
left_len
).
CopyFromVec
(
rem
ind
ed_wav_
);
waves
.
Range
(
0
,
left_len
).
CopyFromVec
(
rem
ain
ed_wav_
);
waves
.
Range
(
left_len
,
feat_len
).
CopyFromVec
(
input_feats
);
Compute
(
waves
,
feats
);
int32
frame_shift
=
opts_
.
frame_opts
.
WindowShift
();
int32
num_frames
=
kaldi
::
NumFrames
(
waves
.
Dim
(),
opts_
.
frame_opts
);
int32
left_samples
=
waves
.
Dim
()
-
frame_shift
*
num_frames
;
rem
ind
ed_wav_
.
Resize
(
left_samples
);
rem
ind
ed_wav_
.
CopyFromVec
(
rem
ain
ed_wav_
.
Resize
(
left_samples
);
rem
ain
ed_wav_
.
CopyFromVec
(
waves
.
Range
(
frame_shift
*
num_frames
,
left_samples
));
return
true
;
}
...
...
speechx/speechx/frontend/audio/linear_spectrogram.h
浏览文件 @
11335406
...
...
@@ -25,12 +25,12 @@ struct LinearSpectrogramOptions {
kaldi
::
FrameExtractionOptions
frame_opts
;
kaldi
::
BaseFloat
streaming_chunk
;
// second
LinearSpectrogramOptions
()
:
streaming_chunk
(
0.
36
),
frame_opts
()
{}
LinearSpectrogramOptions
()
:
streaming_chunk
(
0.
1
),
frame_opts
()
{}
void
Register
(
kaldi
::
OptionsItf
*
opts
)
{
opts
->
Register
(
"streaming-chunk"
,
&
streaming_chunk
,
"streaming chunk size, default: 0.
36
sec"
);
"streaming chunk size, default: 0.
1
sec"
);
frame_opts
.
Register
(
opts
);
}
};
...
...
@@ -48,7 +48,7 @@ class LinearSpectrogram : public FrontendInterface {
// Reports whether the upstream extractor has finished; the answer is relayed
// unchanged.
virtual bool IsFinished() const {
    const bool upstream_finished = base_extractor_->IsFinished();
    return upstream_finished;
}
virtual
void
Reset
()
{
base_extractor_
->
Reset
();
rem
ind
ed_wav_
.
Resize
(
0
);
rem
ain
ed_wav_
.
Resize
(
0
);
}
private:
...
...
@@ -60,7 +60,7 @@ class LinearSpectrogram : public FrontendInterface {
kaldi
::
BaseFloat
hanning_window_energy_
;
LinearSpectrogramOptions
opts_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
rem
ind
ed_wav_
;
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
rem
ain
ed_wav_
;
int
chunk_sample_size_
;
DISALLOW_COPY_AND_ASSIGN
(
LinearSpectrogram
);
};
...
...
speechx/speechx/nnet/decodable.cc
浏览文件 @
11335406
...
...
@@ -78,7 +78,6 @@ bool Decodable::AdvanceChunk() {
}
int32
nnet_dim
=
0
;
Vector
<
BaseFloat
>
inferences
;
Matrix
<
BaseFloat
>
nnet_cache_tmp
;
nnet_
->
FeedForward
(
features
,
frontend_
->
Dim
(),
&
inferences
,
&
nnet_dim
);
nnet_cache_
.
Resize
(
inferences
.
Dim
()
/
nnet_dim
,
nnet_dim
);
nnet_cache_
.
CopyRowsFromVec
(
inferences
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录