diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index d693dc4198a070eb519b2c9d83ac81309b89dc62..6808de5e22d4ec824c3172c0c1934b9600d364dd 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -630,4 +630,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index b768c43599fc0b5b01a6c6e903229026a78f986f..5fa81d4bf9af3cd06fc0d2e9f950169436136bd2 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -638,4 +638,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 3111badf4d53cb638582eb349f9f0e8702460ea3..0a169f8bbbc68d24cd373793b2534d88ad29d8ce 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -437,7 +437,9 @@ class VectorExecutor(BaseExecutor): if self.sample_rate != 16000 and self.sample_rate != 8000: logger.error( "invalid sample rate, please input --sr 8000 or --sr 16000") - logger.error(f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}") + logger.error( + f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}" + ) return False if isinstance(audio_file, (str, os.PathLike)): diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index 010d3d5189ab554ad94b5f0defa5d22e78bca5e5..a974d40ff26e74ad0fe77322f8cc3fef19efbb8f 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -82,4 +82,4 @@ paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input ``` paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 123456789.wav --test 85236145389.wav -``` \ No newline at end of file +``` diff --git a/paddlespeech/server/engine/vector/__init__.py b/paddlespeech/server/engine/vector/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..97043fd7ba6885aac81cad5a49924c23c67d4d47 100644 --- a/paddlespeech/server/engine/vector/__init__.py +++ b/paddlespeech/server/engine/vector/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/vector/python/__init__.py b/paddlespeech/server/engine/vector/python/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..97043fd7ba6885aac81cad5a49924c23c67d4d47 100644 --- a/paddlespeech/server/engine/vector/python/__init__.py +++ b/paddlespeech/server/engine/vector/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 2fd8dec60cfedb6ed55aacd768b2d8e09095e681..854303701d1717478b5083a27879a7071434702f 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -16,9 +16,9 @@ from collections import OrderedDict import numpy as np import paddle - from paddleaudio.backends import load as load_audio from paddleaudio.compliance.librosa import melspectrogram + from paddlespeech.cli.log import logger from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.server.engine.base_engine import BaseEngine diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index b44200b0bccadae84b6bc221ac317a5436f86231..650cb14090eea9b33e11d24c6d1951033f3e4c78 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -155,7 +155,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ - --to_float32=true \ --streaming_chunk=30 \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc index 00764f533a3ae1352c2184ad027b56eb7397efa5..7aef73f7439ef7cf56e46486ca6e2baf8e1640f0 100644 --- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc +++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc @@ -19,6 +19,7 @@ DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -30,7 +31,8 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - int sample_rate = 16000; + + int sample_rate = FLAGS_sample_rate; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; LOG(INFO) << "sr: " << sample_rate; diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc index 7beaa587796d3d5162daa833389b8f4ac7e64758..67683eebf6e2fb3f73b6a44f6c9ac682c6c5cda7 100644 --- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc +++ b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc @@ -69,6 +69,7 @@ int main(int argc, char* argv[]) { feat_cache_opts.frame_chunk_stride = 1; feat_cache_opts.frame_chunk_size = 1; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); + LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); int sample_rate = 16000; diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc index c3652ad4adab8205cd960d31580ca209f5dcd354..bbf0e6908dddb0e7b1c776d9f42616fb70d92e81 100644 --- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc +++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc @@ -56,6 +56,7 @@ int main(int argc, char* argv[]) { opt.frame_opts.remove_dc_offset = false; opt.frame_opts.window_type = "hanning"; opt.frame_opts.preemph_coeff = 0.0; + LOG(INFO) << "linear feature: " << true; LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; @@ -77,7 +78,7 @@ int main(int argc, char* argv[]) { int sample_rate = 16000; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "sample rate: " << sample_rate; LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh index 0e389f89917b82bad6847a043acbe642a401916e..fc57e326fb8cc2491d2443738fb8552e052fd033 100755 --- a/speechx/examples/ds2_ol/websocket/websocket_server.sh +++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh @@ -63,7 +63,6 @@ websocket_server_main \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ --streaming_chunk=0.1 \ - --to_float32=true \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 85de08ca1b3b964a93b0af7a2f14a5077a40baef..b2bf1890a41c5d831d5fc777bbec7aea93e1eb0e 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -19,23 +19,24 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" +// feature +DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear +// feature, or fbank"); +DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); -DEFINE_bool(to_float32, true, "audio convert to pcm32"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "max active"); -DEFINE_double(beam, 15.0, "decoder beam"); -DEFINE_double(lattice_beam, 7.5, "decoder beam"); +// feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=5) downsampling module."); DEFINE_int32(downsampling_rate, 4, "two CNN(kernel=5) module downsampling rate."); + +// nnet +DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); +DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string( model_input_names, "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", @@ -47,8 +48,14 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_bool(use_fbank, false, "use fbank or linear feature"); -DEFINE_int32(num_bins, 161, "num bins of mel"); + +// decoder +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_int32(max_active, 7500, "max active"); +DEFINE_double(beam, 15.0, "decoder beam"); +DEFINE_double(lattice_beam, 7.5, "decoder beam"); namespace ppspeech { // todo refactor later @@ -56,22 +63,23 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions opts; opts.cmvn_file = FLAGS_cmvn_file; opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; - opts.to_float32 = FLAGS_to_float32; kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; if (opts.use_fbank) { - frame_opts.window_type = "povey"; - frame_opts.frame_length_ms = 25; - opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.fbank_opts.frame_opts = frame_opts; + opts.to_float32 = false; + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + opts.fbank_opts.fbank_opts.frame_opts = frame_opts; } else { - frame_opts.remove_dc_offset = false; - frame_opts.frame_length_ms = 20; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; + opts.to_float32 = true; + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + opts.linear_spectrogram_opts.frame_opts = frame_opts; } opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length; opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate; diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index a865db5939d6a8bfa01c8caa79c58cfa43325a4c..fea9032acf7a799785b620610987e4d0bc170c59 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -102,13 +102,16 @@ bool Fbank::Compute(const Vector& waves, Vector* feats) { // note: this online feature-extraction code does not support VTLN. RealFft(&window, true); kaldi::ComputePowerSpectrum(&window); - const kaldi::MelBanks &mel_bank = *(computer_.GetMelBanks(1.0)); - SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); + const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0)); + SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); if (!opts_.fbank_opts.use_power) { power_spectrum.ApplyPow(0.5); } - int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0); - SubVector mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); + int32 mel_offset = + ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 + : 0); + SubVector mel_energies( + this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); mel_bank.Compute(power_spectrum, &mel_energies); mel_energies.ApplyFloor(1e-07); mel_energies.ApplyLog(); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 40891871a14322ef19bcb002a4591e280db70077..087de0f0d14cc7c760389b4fb69ad216e0b77db9 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -23,13 +23,13 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); unique_ptr base_feature; - + if (opts.use_fbank) { - base_feature.reset(new ppspeech::Fbank(opts.fbank_opts, - std::move(data_source))); + base_feature.reset( + new ppspeech::Fbank(opts.fbank_opts, std::move(data_source))); } else { - base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts, - std::move(data_source))); + base_feature.reset(new ppspeech::LinearSpectrogram( + opts.linear_spectrogram_opts, std::move(data_source))); } unique_ptr cmvn( diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 4868d37e430f94e626bafaed407b0d2e70a117e3..6b9b4795e5431b20116a1e7cbed59415e8f1e0c7 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -18,25 +18,25 @@ #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" +#include "frontend/audio/fbank.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" -#include "frontend/audio/fbank.h" #include "frontend/audio/normalizer.h" namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; - bool to_float32; + bool to_float32; // true, only for linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), - to_float32(false), - use_fbank(false), + to_float32(false), // true, only for linear feature + use_fbank(true), linear_spectrogram_opts(), fbank_opts(), feature_cache_opts() {}