diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 206b7be67548b4d84655df882d4392367832f40a..3a648649f5f8873dc8944a4d10776f616a8d4a0b 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -14,10 +14,10 @@ #pragma once -#include "kaldi/base/kaldi-types.h" - #include +#include "kaldi/base/kaldi-types.h" + typedef float BaseFloat; typedef double double64; diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index b470b9de5e9219276a9f2bba33769fbb472c7b87..97bff96620e45eaf5b3e591b65ec31007e0dbce8 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -47,6 +47,5 @@ #include "base/flags.h" #include "base/log.h" #include "base/macros.h" - #include "utils/file_utils.h" #include "utils/math.h" \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index 2c28bee1b663999204b69133fb3247e3a32d57d1..eef8823da4aa2522cdf36521b3deb134a1e9c7f5 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -17,7 +17,6 @@ #include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_prefix_beam_search_score.h" #include "decoder/decoder_itf.h" - #include "fst/symbol-table.h" namespace ppspeech { diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 76bbcf42d2c7f67b929d2a37cf16b42221af83a4..cf8a9b7303a48b276f5f6eef5ea1a0d2a970eecf 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -16,7 +16,6 @@ #include "base/common.h" #include "decoder/decoder_itf.h" - #include "kaldi/decoder/lattice-faster-online-decoder.h" #include "util/parse-options.h" @@ -35,7 +34,7 @@ struct TLGDecoderOptions { std::string word_symbol_table{}; std::string fst_path{}; - static TLGDecoderOptions InitFromFlags(){ + static TLGDecoderOptions InitFromFlags() { TLGDecoderOptions decoder_opts; decoder_opts.word_symbol_table = FLAGS_word_symbol_table; decoder_opts.fst_path = FLAGS_graph_path; @@ -45,9 +44,11 @@ struct TLGDecoderOptions { decoder_opts.opts.max_active = FLAGS_max_active; decoder_opts.opts.beam = FLAGS_beam; decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - LOG(INFO) << "LatticeFasterDecoder max active: " << decoder_opts.opts.max_active ; - LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam ; - LOG(INFO) << "LatticeFasterDecoder lattice_beam: " << decoder_opts.opts.lattice_beam ; + LOG(INFO) << "LatticeFasterDecoder max active: " + << decoder_opts.opts.max_active; + LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam; + LOG(INFO) << "LatticeFasterDecoder lattice_beam: " + << decoder_opts.opts.lattice_beam; return decoder_opts; } diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 5e1120ad8feab482035d5d7d8e581b42bf993c48..1f13bbc0e468c97e7206006a9087a93dfa4e6600 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -30,7 +30,7 @@ DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); DEFINE_int32(subsampling_rate, - 4, + 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); @@ -62,7 +62,6 @@ DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); - // DecodeOptions flags // DEFINE_int32(chunk_size, -1, "decoding chunk size"); DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h index 51b6667393aaebf7846a542c4f8102af00f4c73b..0402bcd3ccd5582b4b248c44a309c978046fad16 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/decoder/recognizer.h @@ -32,15 +32,15 @@ struct RecognizerResource { ModelOptions model_opts{}; TLGDecoderOptions tlg_opts{}; // CTCBeamSearchOptions beam_search_opts; - - static RecognizerResource InitFromFlags(){ + + static RecognizerResource InitFromFlags() { RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts = + FeaturePipelineOptions::InitFromFlags(); resource.model_opts = ModelOptions::InitFromFlags(); - resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); - return resource; - + resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); + return resource; } }; diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h index 86bd482164dbeebe85c9842c6ee332a5250547de..f4e91b18f966e7ad7dc69507d1868cac230335ff 100644 --- a/speechx/speechx/decoder/u2_recognizer.h +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -21,10 +21,9 @@ #include "decoder/ctc_prefix_beam_search_decoder.h" #include "decoder/decoder_itf.h" #include "frontend/audio/feature_pipeline.h" -#include "nnet/decodable.h" - #include "fst/fstlib.h" #include "fst/symbol-table.h" +#include "nnet/decodable.h" DECLARE_int32(nnet_decoder_chunk); DECLARE_int32(num_left_chunks); @@ -63,9 +62,9 @@ struct DecodeOptions { // CtcEndpointConfig ctc_endpoint_opts; CTCBeamSearchOptions ctc_prefix_search_opts{}; - static DecodeOptions InitFromFlags(){ + static DecodeOptions InitFromFlags() { DecodeOptions decoder_opts; - decoder_opts.chunk_size=FLAGS_nnet_decoder_chunk; + decoder_opts.chunk_size = FLAGS_nnet_decoder_chunk; decoder_opts.num_left_chunks = FLAGS_num_left_chunks; decoder_opts.ctc_weight = FLAGS_ctc_weight; decoder_opts.rescoring_weight = FLAGS_rescoring_weight; @@ -86,15 +85,16 @@ struct U2RecognizerResource { DecodeOptions decoder_opts{}; static U2RecognizerResource InitFromFlags() { - U2RecognizerResource resource; - resource.vocab_path = FLAGS_vocab_path; - resource.acoustic_scale = FLAGS_acoustic_scale; - - resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); - resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); - resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); - return resource; -} + U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; + resource.acoustic_scale = FLAGS_acoustic_scale; + + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); + return resource; + } }; diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h index 64e9db860ec7fd978e2623de453d8be6d622640c..5fafdeb285aa16714b5c779b53a68b2b3046801e 100644 --- a/speechx/speechx/frontend/audio/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -56,4 +56,4 @@ class DataCache : public FrontendInterface { DISALLOW_COPY_AND_ASSIGN(DataCache); }; -} \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 38a47433ffa3b3e7d2accf9459cd006454a286cf..d91a70e352ffe85689e666b37d1b6e2d8f971e81 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -46,17 +46,17 @@ struct FeaturePipelineOptions { FeatureCacheOptions feature_cache_opts{}; AssemblerOptions assembler_opts{}; - static FeaturePipelineOptions InitFromFlags(){ + static FeaturePipelineOptions InitFromFlags() { FeaturePipelineOptions opts; opts.cmvn_file = FLAGS_cmvn_file; - LOG(INFO) << "cmvn file: " << opts.cmvn_file; + LOG(INFO) << "cmvn file: " << opts.cmvn_file; // frame options kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; - LOG(INFO) << "dither: " << frame_opts.dither; + LOG(INFO) << "dither: " << frame_opts.dither; frame_opts.frame_shift_ms = 10; - LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; + LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; opts.use_fbank = FLAGS_use_fbank; LOG(INFO) << "feature type: " << (opts.use_fbank ? "fbank" : "linear"); if (opts.use_fbank) { @@ -76,15 +76,19 @@ struct FeaturePipelineOptions { opts.linear_spectrogram_opts.frame_opts = frame_opts; } - LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; + LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; // assembler opts opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; - LOG(INFO) << "subsampling rate: " << opts.assembler_opts.subsampling_rate; - opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; - LOG(INFO) << "nnet receptive filed length: " << opts.assembler_opts.receptive_filed_length; + LOG(INFO) << "subsampling rate: " + << opts.assembler_opts.subsampling_rate; + opts.assembler_opts.receptive_filed_length = + FLAGS_receptive_field_length; + LOG(INFO) << "nnet receptive filed length: " + << opts.assembler_opts.receptive_filed_length; opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; - LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; + LOG(INFO) << "nnet chunk size: " + << opts.assembler_opts.nnet_decoder_chunk; return opts; } }; diff --git a/speechx/speechx/frontend/audio/mfcc.h b/speechx/speechx/frontend/audio/mfcc.h index 62b0078c7d4fe3f1509a72318470fc23b53bc1b1..6c1c2f7dfb6bfbfd3451a07cef6272bcca2d32dd 100644 --- a/speechx/speechx/frontend/audio/mfcc.h +++ b/speechx/speechx/frontend/audio/mfcc.h @@ -14,7 +14,6 @@ #pragma once -#include "kaldi/feat/feature-mfcc.h" #include "kaldi/feat/feature-mfcc.h" #include "kaldi/matrix/kaldi-vector.h" diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 2a53e5f7e341e12585a9bf253d2a3112cf6f8c49..4aeec32f388164460f47cf6bb733bab7851abae6 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include + #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" #include "nnet/nnet_itf.h" diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index f8105b7f077cf55d1d7398e362bdb30b20e3012a..cc737ce054b084259bee23b3f9e8123f34dac7b7 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -48,25 +48,25 @@ struct ModelOptions { bool enable_fc_padding{false}; bool enable_profile{false}; - static ModelOptions InitFromFlags(){ + static ModelOptions InitFromFlags() { ModelOptions opts; opts.subsample_rate = FLAGS_subsampling_rate; - LOG(INFO) << "subsampling rate: " << opts.subsample_rate; + LOG(INFO) << "subsampling rate: " << opts.subsample_rate; opts.model_path = FLAGS_model_path; - LOG(INFO) << "model path: " << opts.model_path ; + LOG(INFO) << "model path: " << opts.model_path; opts.param_path = FLAGS_param_path; - LOG(INFO) << "param path: " << opts.param_path ; + LOG(INFO) << "param path: " << opts.param_path; LOG(INFO) << "DS2 param: "; opts.cache_names = FLAGS_model_cache_names; - LOG(INFO) << " cache names: " << opts.cache_names; + LOG(INFO) << " cache names: " << opts.cache_names; opts.cache_shape = FLAGS_model_cache_shapes; - LOG(INFO) << " cache shape: " << opts.cache_shape; + LOG(INFO) << " cache shape: " << opts.cache_shape; opts.input_names = FLAGS_model_input_names; - LOG(INFO) << " input names: " << opts.input_names; + LOG(INFO) << " input names: " << opts.input_names; opts.output_names = FLAGS_model_output_names; - LOG(INFO) << " output names: " << opts.output_names; + LOG(INFO) << " output names: " << opts.output_names; return opts; } }; diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 697ac20c83682f7681be286a2a24166677a7f0fc..3435bca8ba0dfbdeafe915ac8fc4fda18924a83b 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -16,7 +16,6 @@ #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" - #include "nnet/nnet_itf.h" #include "paddle/extension.h" #include "paddle/jit/all.h" diff --git a/speechx/speechx/protocol/websocket/websocket_client.h b/speechx/speechx/protocol/websocket/websocket_client.h index 886da29295c5cdac19e4ffa91328c2b8291378a6..7ae6d98d5155d50f6389489328da32ea9e5e6022 100644 --- a/speechx/speechx/protocol/websocket/websocket_client.h +++ b/speechx/speechx/protocol/websocket/websocket_client.h @@ -13,7 +13,6 @@ // limitations under the License. #include "base/common.h" - #include "boost/asio/connect.hpp" #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" @@ -54,4 +53,4 @@ class WebSocketClient { websocket::stream ws_{ioc_}; std::unique_ptr t_{nullptr}; }; -} \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h index 009fc42ed827fed1258f241a3e48936bf71a7daf..8f3360e40ba0b74cc8282f00afc49663cb19bab7 100644 --- a/speechx/speechx/protocol/websocket/websocket_server.h +++ b/speechx/speechx/protocol/websocket/websocket_server.h @@ -15,12 +15,10 @@ #pragma once #include "base/common.h" - #include "boost/asio/connect.hpp" #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" #include "boost/beast/websocket.hpp" - #include "decoder/recognizer.h" #include "frontend/audio/feature_pipeline.h" diff --git a/speechx/speechx/utils/file_utils.h b/speechx/speechx/utils/file_utils.h index 8c56c02eba5fb65c2069d69a78d238fda4fac1c3..a471e024e4844a14e83170be7a3d3e23935415a8 100644 --- a/speechx/speechx/utils/file_utils.h +++ b/speechx/speechx/utils/file_utils.h @@ -20,4 +20,4 @@ bool ReadFileToVector(const std::string& filename, std::vector* data); std::string ReadFile2String(const std::string& path); -} +} // namespace ppspeech