diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md
index 68c3b0453bffa8f10af87b62bae75af80c60c87e..0eed8e5615f5185af884e372bf25d27b09a93936 100644
--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -352,4 +352,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
   [2022-04-21 15:59:08,024] [    INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'}
   [2022-04-21 15:59:12,883] [    INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'}
   [2022-04-21 15:59:12,884] [    INFO] - 我认为跑步最重要的就是给我带来了身体健康
-  ```
\ No newline at end of file
+  ```
diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md
index c58e17e984289cc02ea3dade7a3e390f4937639a..bf122bb3afe845d76a6327c378917169c4dbf3ff 100644
--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -353,4 +353,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
   [2022-04-21 15:59:08,024] [    INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'}
   [2022-04-21 15:59:12,883] [    INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'}
   [2022-04-21 15:59:12,884] [    INFO] - 我认为跑步最重要的就是给我带来了身体健康
-  ```
\ No newline at end of file
+  ```
diff --git a/speechx/examples/ds2_ol/README.md b/speechx/examples/ds2_ol/README.md
index 3fabd3e7a5360605dc87aef5cebb891cea893da7..18f248a12d3c7c3ee68a7d75bfb1f7317a5b8c26 100644
--- a/speechx/examples/ds2_ol/README.md
+++ b/speechx/examples/ds2_ol/README.md
@@ -4,6 +4,8 @@ Please go to `aishell` to test it.
 * aishell
 Deepspeech2 Streaming Decoding under aishell dataset.
 
+* websocket
+Streaming ASR with websocket.
 
 The below is for developing and offline testing:
 * nnet
diff --git a/speechx/examples/ds2_ol/aishell/README.md b/speechx/examples/ds2_ol/aishell/README.md
index 115bf85f42bfd477635a21d0d69b765f94cf1dfe..01c899799cfceb9ed3f75ed15aebe418dfc0e52d 100644
--- a/speechx/examples/ds2_ol/aishell/README.md
+++ b/speechx/examples/ds2_ol/aishell/README.md
@@ -33,4 +33,4 @@ LM: [wenetspeech](http://paddlespeech.bj.bcebos.com/speechx/examples/ds2_ol/aish
 Overall -> 10.93 % N=104765 C=93410 S=9780 D=1575 I=95
 Mandarin -> 10.93 % N=104762 C=93410 S=9779 D=1573 I=95
 Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
-```
\ No newline at end of file
+```
diff --git a/speechx/examples/ds2_ol/aishell/path.sh b/speechx/examples/ds2_ol/aishell/path.sh
old mode 100644
new mode 100755
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index 9906de3a7a2c5f65616e320bb6d8eabb10727e74..06f274276f5e61d232e6cd505ba84936ad167edf 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -119,7 +119,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     pushd $wfst
     wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
     unzip aishell_graph.zip
-    mv aishell_graph/*
+    mv aishell_graph/* $wfst
     popd
 fi
 
diff --git a/speechx/examples/ds2_ol/aishell/websocket_server.sh b/speechx/examples/ds2_ol/aishell/websocket_server.sh
deleted file mode 100644
index ea619d544b7421ca9dc49d2c01c8acbe2583237c..0000000000000000000000000000000000000000
--- a/speechx/examples/ds2_ol/aishell/websocket_server.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-set +x
-set -e
-
-. path.sh
-
-
-# 1. compile
-if [ ! -d ${SPEECHX_EXAMPLES} ]; then
-    pushd ${SPEECHX_ROOT}
-    bash build.sh
-    popd
-fi
-
-# input
-mkdir -p data
-data=$PWD/data
-ckpt_dir=$data/model
-model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
-vocb_dir=$ckpt_dir/data/lang_char/
-
-# output
-aishell_wav_scp=aishell_test.scp
-if [ ! -d $data/test ]; then
-    pushd $data
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
-    unzip aishell_test.zip
-    popd
-
-    realpath $data/test/*/*.wav > $data/wavlist
-    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
-    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
-fi
-
-
-if [ ! -d $ckpt_dir ]; then
-    mkdir -p $ckpt_dir
-    wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-    tar xzfv $ckpt_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir
-fi
-
-
-export GLOG_logtostderr=1
-
-# 3. gen cmvn
-cmvn=$PWD/cmvn.ark
-cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
-
-text=$data/test/text
-graph_dir=./aishell_graph
-if [ ! -d $graph_dir ]; then
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
-    unzip aishell_graph.zip
-fi
-
-# 5. test websocket server
-websocket_server_main \
-    --cmvn_file=$cmvn \
-    --model_path=$model_dir/avg_1.jit.pdmodel \
-    --streaming_chunk=0.1 \
-    --convert2PCM32=true \
-    --params_path=$model_dir/avg_1.jit.pdiparams \
-    --word_symbol_table=$graph_dir/words.txt \
-    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-    --graph_path=$graph_dir/TLG.fst --max_active=7500 \
-    --acoustic_scale=1.2
diff --git a/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc b/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
index 4a39217cbe021569bacfc71af098634fee3e9c99..6a6495aa1a9313ce3c93e7db2b05be079a22c328 100644
--- a/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
+++ b/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
@@ -76,7 +76,7 @@ int main(int argc, char* argv[]) {
 
     ppspeech::ModelOptions model_opts;
     model_opts.model_path = model_path;
-    model_opts.params_path = model_params;
+    model_opts.param_path = model_params;
     model_opts.cache_shape = FLAGS_model_cache_names;
     model_opts.input_names = FLAGS_model_input_names;
     model_opts.output_names = FLAGS_model_output_names;
diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
index 198a8ec2c11413eda5a9516cb8b2bea0f607aaa2..e6fed0ed9d5c24518ac64936c235788d8d10456e 100644
--- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
+++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
@@ -60,6 +60,7 @@ int main(int argc, char* argv[]) {
         for (int i = 0; i < cur_chunk_size; ++i) {
             wav_chunk(i) = waveform(sample_offset + i);
         }
+        // wav_chunk = waveform.Range(sample_offset + i, cur_chunk_size);
 
         recognizer.Accept(wav_chunk);
         if (cur_chunk_size < chunk_sample_size) {
@@ -67,8 +68,10 @@ int main(int argc, char* argv[]) {
         }
 
         recognizer.Decode();
+        // no overlap
         sample_offset += cur_chunk_size;
     }
+
     std::string result;
     result = recognizer.GetFinalResult();
     recognizer.Reset();
diff --git a/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc b/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
index 92b3d8ec78d7b4b09354fa4e5f487c56b9d5b674..544e59cb1a765b62728de3e75ea010544ea71e70 100644
--- a/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
+++ b/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
@@ -79,7 +79,7 @@ int main(int argc, char* argv[]) {
 
     ppspeech::ModelOptions model_opts;
     model_opts.model_path = model_graph;
-    model_opts.params_path = model_params;
+    model_opts.param_path = model_params;
     model_opts.cache_shape = FLAGS_model_cache_names;
     model_opts.input_names = FLAGS_model_input_names;
     model_opts.output_names = FLAGS_model_output_names;
diff --git a/speechx/examples/ds2_ol/websocket/.gitignore b/speechx/examples/ds2_ol/websocket/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..bbd86a25b018bc611bf6ae52cbb6afa5f60bce62
--- /dev/null
+++ b/speechx/examples/ds2_ol/websocket/.gitignore
@@ -0,0 +1,2 @@
+data
+exp
diff --git a/speechx/examples/ds2_ol/websocket/CMakeLists.txt b/speechx/examples/ds2_ol/websocket/CMakeLists.txt
index 754b528efc0f617d350063ec9539321b965f44ff..ed542aad07b729f5c0797a10a5733cbb7c1bc7f6 100644
--- a/speechx/examples/ds2_ol/websocket/CMakeLists.txt
+++ b/speechx/examples/ds2_ol/websocket/CMakeLists.txt
@@ -6,5 +6,4 @@ target_link_libraries(websocket_server_main PUBLIC frontend kaldi-feat-common nn
 
 add_executable(websocket_client_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_client_main.cc)
 target_include_directories(websocket_client_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(websocket_client_main PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder websocket ${DEPS})
-
+target_link_libraries(websocket_client_main PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder websocket ${DEPS})
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/websocket/path.sh b/speechx/examples/ds2_ol/websocket/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d66b5dccea6fd44a9180020f6b557bfcf89c7875
--- /dev/null
+++ b/speechx/examples/ds2_ol/websocket/path.sh
@@ -0,0 +1,14 @@
+# This contains the locations of the binaries required for running the examples.
+
+SPEECHX_ROOT=$PWD/../../..
+SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. Please ensure the project has been built successfully."; }
+
+export LC_ALL=C
+
+SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/websocket:$SPEECHX_EXAMPLES/ds2_ol/feat
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/aishell/websocket_client.sh b/speechx/examples/ds2_ol/websocket/websocket_client.sh
old mode 100644
new mode 100755
similarity index 100%
rename from speechx/examples/ds2_ol/aishell/websocket_client.sh
rename to speechx/examples/ds2_ol/websocket/websocket_client.sh
diff --git a/speechx/examples/ds2_ol/websocket/websocket_client_main.cc b/speechx/examples/ds2_ol/websocket/websocket_client_main.cc
index d6f0d4806ce9ff17a1cb9564b499576e12adbd0b..df658b0a2218b72af99c23d7a986aa4867732c6d 100644
--- a/speechx/examples/ds2_ol/websocket/websocket_client_main.cc
+++ b/speechx/examples/ds2_ol/websocket/websocket_client_main.cc
@@ -26,7 +26,6 @@ using kaldi::int16;
 int main(int argc, char* argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
-    ppspeech::WebSocketClient client(FLAGS_host, FLAGS_port);
 
     kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
         FLAGS_wav_rspecifier);
@@ -36,6 +35,8 @@ int main(int argc, char* argv[]) {
     const int chunk_sample_size = streaming_chunk * sample_rate;
 
     for (; !wav_reader.Done(); wav_reader.Next()) {
+        ppspeech::WebSocketClient client(FLAGS_host, FLAGS_port);
+        client.SendStartSignal();
         std::string utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
 
@@ -74,9 +75,8 @@ int main(int argc, char* argv[]) {
 
         std::string result = client.GetResult();
         LOG(INFO) << "utt: " << utt << " " << result;
-        client.Join();
-        return 0;
     }
+    return 0;
 }
diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0e9e796cf0652de2727d7f52c3d62051c1204243
--- /dev/null
+++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+set +x
+set -e
+
+. path.sh
+
+
+# 1. compile
+if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+    pushd ${SPEECHX_ROOT}
+    bash build.sh
+    popd
+fi
+
+# input
+mkdir -p data
+data=$PWD/data
+ckpt_dir=$data/model
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
+vocb_dir=$ckpt_dir/data/lang_char/
+
+if [ ! -f $ckpt_dir/data/mean_std.json ]; then
+    mkdir -p $ckpt_dir
+    pushd $ckpt_dir
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    popd
+fi
+
+export GLOG_logtostderr=1
+
+# 3. gen cmvn
+cmvn=$data/cmvn.ark
+cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
+
+
+wfst=$data/wfst/
+mkdir -p $wfst
+if [ ! -f $wfst/aishell_graph.zip ]; then
+    pushd $wfst
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
+    unzip aishell_graph.zip
+    mv aishell_graph/* $wfst
+    popd
+fi
+
+# 5. test websocket server
+websocket_server_main \
+    --cmvn_file=$cmvn \
+    --model_path=$model_dir/avg_1.jit.pdmodel \
+    --streaming_chunk=0.1 \
+    --convert2PCM32=true \
+    --param_path=$model_dir/avg_1.jit.pdiparams \
+    --word_symbol_table=$data/wfst/words.txt \
+    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+    --graph_path=$data/wfst/TLG.fst --max_active=7500 \
+    --acoustic_scale=1.2
diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
index cd50ef53b8d2f68031b797bcf98523a46c13da56..aff8d39a83a529f494faf9870c0183ad2b72aac4 100644
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "base/common.h"
-
 #include "decoder/ctc_beam_search_decoder.h"
 #include "decoder/ctc_tlg_decoder.h"
 #include "frontend/audio/feature_pipeline.h"
@@ -24,7 +23,7 @@ DEFINE_string(cmvn_file, "", "read cmvn");
 DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
 DEFINE_bool(convert2PCM32, true, "audio convert to pcm32");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
-DEFINE_string(params_path, "avg_1.jit.pdiparams", "paddle nnet model param");
+DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
 DEFINE_string(graph_path, "TLG", "decoder graph");
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
@@ -37,13 +36,16 @@ DEFINE_int32(receptive_field_length,
 DEFINE_int32(downsampling_rate,
              4,
              "two CNN(kernel=5) module downsampling rate.");
+DEFINE_string(
+    model_input_names,
+    "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
+    "model input names");
 DEFINE_string(model_output_names,
-              "save_infer_model/scale_0.tmp_1,save_infer_model/"
-              "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
-              "scale_3.tmp_1",
+              "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
               "model output names");
 DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
 
+
 namespace ppspeech {
 // todo refactor later
 FeaturePipelineOptions InitFeaturePipelineOptions() {
@@ -67,7 +69,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
 ModelOptions InitModelOptions() {
     ModelOptions model_opts;
     model_opts.model_path = FLAGS_model_path;
-    model_opts.params_path = FLAGS_params_path;
+    model_opts.param_path = FLAGS_param_path;
     model_opts.cache_shape = FLAGS_model_cache_names;
     model_opts.output_names = FLAGS_model_output_names;
     return model_opts;
diff --git a/speechx/speechx/nnet/paddle_nnet.cc b/speechx/speechx/nnet/paddle_nnet.cc
index 5c4da11ac9b3006a1c66fd6f693a8e35eda0d4a3..f8e1f697b8367eea17fe7c32341bbc8e575e388e 100644
--- a/speechx/speechx/nnet/paddle_nnet.cc
+++ b/speechx/speechx/nnet/paddle_nnet.cc
@@ -49,7 +49,7 @@ void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) {
 
 PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) {
     paddle_infer::Config config;
-    config.SetModel(opts.model_path, opts.params_path);
+    config.SetModel(opts.model_path, opts.param_path);
     if (opts.use_gpu) {
         config.EnableUseGpu(500, 0);
     }
diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/paddle_nnet.h
index 906994d06c81b753689e5172d7e61fda60b27fbf..8b4ed4785fe824c2a0ca624dd91e74d24a15a916 100644
--- a/speechx/speechx/nnet/paddle_nnet.h
+++ b/speechx/speechx/nnet/paddle_nnet.h
@@ -11,25 +11,19 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
-
 #pragma once
-
-
+#include
+#include "base/common.h"
 #include "kaldi/matrix/kaldi-matrix.h"
 #include "kaldi/util/options-itf.h"
-
-#include "base/common.h"
 #include "nnet/nnet_itf.h"
 #include "paddle_inference_api.h"
-#include
-
 namespace ppspeech {
 
 struct ModelOptions {
     std::string model_path;
-    std::string params_path;
+    std::string param_path;
     int thread_num;
     bool use_gpu;
     bool switch_ir_optim;
@@ -41,7 +35,7 @@ struct ModelOptions {
     bool enable_profile;
     ModelOptions()
         : model_path("avg_1.jit.pdmodel"),
-          params_path("avg_1.jit.pdiparams"),
+          param_path("avg_1.jit.pdiparams"),
           thread_num(2),
           use_gpu(false),
           input_names(
@@ -59,7 +53,7 @@ struct ModelOptions {
 
     void Register(kaldi::OptionsItf* opts) {
         opts->Register("model-path", &model_path, "model file path");
-        opts->Register("model-params", &params_path, "params model file path");
+        opts->Register("model-param", &param_path, "params model file path");
         opts->Register("thread-num", &thread_num, "thread num");
         opts->Register("use-gpu", &use_gpu, "if use gpu");
         opts->Register("input-names", &input_names, "paddle input names");
diff --git a/utils/format_rsl.py b/utils/format_rsl.py
index 1a714253522b7d655f885a7e69adcd685003b102..8230416c4ac9cbebad9a317dff56a9cf380221ab 100644
--- a/utils/format_rsl.py
+++ b/utils/format_rsl.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
+
 import jsonlines