diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 00f2129324cc818430bd39bb8bbe59dfb5c1df4a..e020b5015e9b94acc6c1871031f35c80e4b30b02 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -179,7 +179,7 @@ class ASRExecutor(BaseExecutor): self.collate_fn_test = SpeechCollator.from_config(self.config) text_feature = TextFeaturizer( unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, + vocab=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) self.config.model.input_dim = self.collate_fn_test.feature_size self.config.model.output_dim = text_feature.vocab_size @@ -192,7 +192,7 @@ class ASRExecutor(BaseExecutor): res_path, self.config.collator.spm_model_prefix) text_feature = TextFeaturizer( unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, + vocab=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) self.config.model.input_dim = self.config.collator.feat_dim self.config.model.output_dim = text_feature.vocab_size @@ -279,7 +279,7 @@ class ASRExecutor(BaseExecutor): audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) text_feature = TextFeaturizer( unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, + vocab=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) self._inputs["audio"] = audio self._inputs["audio_len"] = audio_len @@ -295,7 +295,7 @@ class ASRExecutor(BaseExecutor): """ text_feature = TextFeaturizer( unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, + vocab=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) cfg = self.config.decoding audio = self._inputs["audio"] @@ -321,13 +321,7 @@ class ASRExecutor(BaseExecutor): audio_len, text_feature=text_feature, decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch, ctc_weight=cfg.ctc_weight, decoding_chunk_size=cfg.decoding_chunk_size, num_decoding_left_chunks=cfg.num_decoding_left_chunks, diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 6bb8282107260f69726ba22a3e3883a0803a5922..fd32e3b42c4594cf7fbedee91435e29e28c2e7ea 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -180,7 +180,7 @@ class STExecutor(BaseExecutor): res_path, self.config.collator.spm_model_prefix) self.text_feature = TextFeaturizer( unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, + vocab=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) self.config.model.input_dim = self.config.collator.feat_dim self.config.model.output_dim = self.text_feature.vocab_size @@ -292,14 +292,7 @@ class STExecutor(BaseExecutor): audio_len, text_feature=self.text_feature, decoding_method=cfg.decoding_method, - lang_model_path=None, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch, - ctc_weight=cfg.ctc_weight, word_reward=cfg.word_reward, decoding_chunk_size=cfg.decoding_chunk_size, num_decoding_left_chunks=cfg.num_decoding_left_chunks, diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py index b8544dc2bf4666cb62ff61c5fbc476c33be783a9..cf2ca0d642c11db133fe348b49e2c633da3a55c8 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py @@ -41,7 +41,7 @@ class DeepSpeech2Tester_hub(): self.audio_file = args.audio_file self.collate_fn_test = SpeechCollator.from_config(config) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab_filepath=None) + unit_type=config.collator.unit_type, vocab=None) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): result_transcripts = self.model.decode( diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 3e4ff1a8b10be88fc6990850ae1dda63fe1a2b21..a0b69d64f58a66373435e8c8c28456cc6b86cb4a 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -286,7 +286,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def __init__(self, config, args): super().__init__(config, args) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab_filepath=None) + unit_type=config.collator.unit_type, vocab=None) def ordid2token(self, texts, texts_len): """ ord() id to chr() chr """ diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index a9450129fb2d98c8d15acca8c996549d3673188e..556316ec0065f76e337a6ea00cf78313778ce04a 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -44,7 +44,7 @@ class U2Infer(): self.text_feature = TextFeaturizer( unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, + vocab=config.collator.vocab_filepath, spm_model_prefix=config.collator.spm_model_prefix) paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') @@ -91,13 +91,7 @@ class U2Infer(): ilen, text_feature=self.text_feature, decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch, ctc_weight=cfg.ctc_weight, decoding_chunk_size=cfg.decoding_chunk_size, num_decoding_left_chunks=cfg.num_decoding_left_chunks, diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index d448021cb1bc3d58d76d29f5c703bedf113a8713..404058edcad00a4c6cc4833f772cba9b3b183a28 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -437,7 +437,7 @@ class U2Tester(U2Trainer): super().__init__(config, args) self.text_feature = TextFeaturizer( unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, + vocab=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list @@ -469,13 +469,7 @@ class U2Tester(U2Trainer): audio_len, text_feature=self.text_feature, decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch, ctc_weight=cfg.ctc_weight, decoding_chunk_size=cfg.decoding_chunk_size, num_decoding_left_chunks=cfg.num_decoding_left_chunks, diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 43e31a60dc127d607b7a644d1ce553832cb909a3..9b8274ad64bbdd704e030c8f3f61d72d71c9c9b6 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -393,7 +393,7 @@ class U2Tester(U2Trainer): super().__init__(config, args) self.text_feature = TextFeaturizer( unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, + vocab=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list @@ -425,13 +425,7 @@ class U2Tester(U2Trainer): audio_len, text_feature=self.text_feature, decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch, ctc_weight=cfg.ctc_weight, decoding_chunk_size=cfg.decoding_chunk_size, num_decoding_left_chunks=cfg.num_decoding_left_chunks, diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 3ec2c920e9864929866bcb0f7b05dfa19d83be60..a3b39df7cd4d8d6a5783aa874e0164b589aac01d 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -437,14 +437,7 @@ class U2STTester(U2STTrainer): audio_len, text_feature=text_feature, decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch, - ctc_weight=cfg.ctc_weight, word_reward=cfg.word_reward, decoding_chunk_size=cfg.decoding_chunk_size, num_decoding_left_chunks=cfg.num_decoding_left_chunks, @@ -475,14 +468,7 @@ class U2STTester(U2STTrainer): audio_len, text_feature=text_feature, decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch, - ctc_weight=cfg.ctc_weight, word_reward=cfg.word_reward, decoding_chunk_size=cfg.decoding_chunk_size, num_decoding_left_chunks=cfg.num_decoding_left_chunks, diff --git a/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py b/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py index 591df96e1f23067ae1645f0690de119f3b85bd73..9dc86829ae2aa9068393462e27f4701efe965585 100644 --- a/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py @@ -55,7 +55,7 @@ class SpeechFeaturizer(): self.text_feature = TextFeaturizer( unit_type=unit_type, - vocab_filepath=vocab_filepath, + vocab=vocab_filepath, spm_model_prefix=spm_model_prefix, maskctc=maskctc) self.vocab_size = self.text_feature.vocab_size diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py index 812be6e41422e451e6ea89d5fdfdbc3704c9421a..0c0fa5e2f63b05387cd6ce9af6fb0331c400cfb8 100644 --- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -13,6 +13,7 @@ # limitations under the License. """Contains the text featurizer class.""" from pprint import pformat +from typing import Union import sentencepiece as spm @@ -31,11 +32,7 @@ __all__ = ["TextFeaturizer"] class TextFeaturizer(): - def __init__(self, - unit_type, - vocab_filepath, - spm_model_prefix=None, - maskctc=False): + def __init__(self, unit_type, vocab, spm_model_prefix=None, maskctc=False): """Text featurizer, for processing or extracting features from text. Currently, it supports char/word/sentence-piece level tokenizing and conversion into @@ -44,7 +41,7 @@ class TextFeaturizer(): Args: unit_type (str): unit type, e.g. char, word, spm - vocab_filepath (str): Filepath to load vocabulary for token indices conversion. + vocab Option[str, list]: Filepath to load vocabulary for token indices conversion, or vocab list. spm_model_prefix (str, optional): spm model prefix. Defaults to None. """ assert unit_type in ('char', 'spm', 'word') @@ -52,12 +49,12 @@ class TextFeaturizer(): self.unk = UNK self.maskctc = maskctc - if vocab_filepath: + if vocab: self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file( - vocab_filepath, maskctc) + vocab, maskctc) self.vocab_size = len(self.vocab_list) else: - logger.warning("TextFeaturizer: not have vocab file.") + logger.warning("TextFeaturizer: not have vocab file or vocab list.") if unit_type == 'spm': spm_model = spm_model_prefix + '.model' @@ -207,9 +204,13 @@ class TextFeaturizer(): return decode(tokens) - def _load_vocabulary_from_file(self, vocab_filepath: str, maskctc: bool): + def _load_vocabulary_from_file(self, vocab: Union[str, list], + maskctc: bool): """Load vocabulary from file.""" - vocab_list = load_dict(vocab_filepath, maskctc) + if isinstance(vocab, list): + vocab_list = vocab + else: + vocab_list = load_dict(vocab, maskctc) assert vocab_list is not None logger.debug(f"Vocab: {pformat(vocab_list)}") diff --git a/paddlespeech/s2t/models/lm/dataset.py b/paddlespeech/s2t/models/lm/dataset.py index 4059dfe2c1b50fe69a8776979065c608462f65bf..25a47be60398aec7e311de5703532b37f1dc03c6 100644 --- a/paddlespeech/s2t/models/lm/dataset.py +++ b/paddlespeech/s2t/models/lm/dataset.py @@ -42,7 +42,7 @@ class TextCollatorSpm(): assert (vocab_filepath is not None) self.text_featurizer = TextFeaturizer( unit_type=unit_type, - vocab_filepath=vocab_filepath, + vocab=vocab_filepath, spm_model_prefix=spm_model_prefix) self.eos_id = self.text_featurizer.eos_id self.blank_id = self.text_featurizer.blank_id diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 8053ed3a8bd7a0efcd975e7e59a5a7779917564d..fdcab0187c415393927fac732c38cb7a0934f307 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -717,13 +717,7 @@ class U2BaseModel(ASRInterface, nn.Layer): feats_lengths: paddle.Tensor, text_feature: Dict[str, int], decoding_method: str, - lang_model_path: str, - beam_alpha: float, - beam_beta: float, beam_size: int, - cutoff_prob: float, - cutoff_top_n: int, - num_processes: int, ctc_weight: float=0.0, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, @@ -737,13 +731,7 @@ class U2BaseModel(ASRInterface, nn.Layer): decoding_method (str): decoding mode, e.g. 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path (str): lm path. - beam_alpha (float): lm weight. - beam_beta (float): length penalty. beam_size (int): beam size for search - cutoff_prob (float): for prune. - cutoff_top_n (int): for prune. - num_processes (int): ctc_weight (float, optional): ctc weight for attention rescoring decode mode. Defaults to 0.0. decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. <0: for decoding, use full chunk. @@ -839,12 +827,13 @@ class U2Model(U2DecodeModel): def __init__(self, configs: dict): vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs) + model_conf = configs.get('model_conf', dict()) super().__init__( vocab_size=vocab_size, encoder=encoder, decoder=decoder, ctc=ctc, - **configs['model_conf']) + **model_conf) @classmethod def _init_from_config(cls, configs: dict): @@ -893,7 +882,7 @@ class U2Model(U2DecodeModel): **configs['decoder_conf']) # ctc decoder and ctc loss - model_conf = configs['model_conf'] + model_conf = configs.get('model_conf', dict()) dropout_rate = model_conf.get('ctc_dropout_rate', 0.0) grad_norm_type = model_conf.get('ctc_grad_norm_type', None) ctc = CTCDecoder( diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 3a23804fe2c6e4e9096e2a76eecd30c94f33190a..8b07e389d011cdc410c10c6abe1ba194e40389dd 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -522,14 +522,7 @@ class U2STBaseModel(nn.Layer): feats_lengths: paddle.Tensor, text_feature: Dict[str, int], decoding_method: str, - lang_model_path: str, - beam_alpha: float, - beam_beta: float, beam_size: int, - cutoff_prob: float, - cutoff_top_n: int, - num_processes: int, - ctc_weight: float=0.0, word_reward: float=0.0, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, @@ -543,14 +536,7 @@ class U2STBaseModel(nn.Layer): decoding_method (str): decoding mode, e.g. 'fullsentence', 'simultaneous' - lang_model_path (str): lm path. - beam_alpha (float): lm weight. - beam_beta (float): length penalty. beam_size (int): beam size for search - cutoff_prob (float): for prune. - cutoff_top_n (int): for prune. - num_processes (int): - ctc_weight (float, optional): ctc weight for attention rescoring decode mode. Defaults to 0.0. decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. <0: for decoding, use full chunk. >0: for decoding, use fixed chunk size as set. diff --git a/paddlespeech/s2t/transform/cmvn.py b/paddlespeech/s2t/transform/cmvn.py index aa1e6b4450f41103c1f0c9f2e723bfcdbe2cde9d..2db0070bf38e690fdeb4c4212e2f683773574703 100644 --- a/paddlespeech/s2t/transform/cmvn.py +++ b/paddlespeech/s2t/transform/cmvn.py @@ -168,13 +168,17 @@ class GlobalCMVN(): norm_means=True, norm_vars=True, std_floor=1.0e-20): - self.cmvn_path = cmvn_path + # cmvn_path: Option[str, dict] + cmvn = cmvn_path + self.cmvn = cmvn self.norm_means = norm_means self.norm_vars = norm_vars self.std_floor = std_floor - - with open(cmvn_path) as f: - cmvn_stats = json.load(f) + if isinstance(cmvn, dict): + cmvn_stats = cmvn + else: + with open(cmvn) as f: + cmvn_stats = json.load(f) self.count = cmvn_stats['frame_num'] self.mean = np.array(cmvn_stats['mean_stat']) / self.count self.square_sums = np.array(cmvn_stats['var_stat']) @@ -183,8 +187,8 @@ class GlobalCMVN(): def __repr__(self): return f"""{self.__class__.__name__}( - cmvn_path={self.cmvn_path}, - norm_means={self.norm_means}, + cmvn_path={self.cmvn}, + norm_means={self.norm_means}, norm_vars={self.norm_vars},)""" def __call__(self, x, uttid=None):