use reverse_weight in decode.yaml

cda440e6 · tianhao zhang · c98b5dd1 · cda440e6 · cda440e6 · cda440e6
6 changed files
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -9,7 +9,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
 [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB  | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) | onnx/inference/python |
 [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python |
 [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python |
-[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.1.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB  | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python |
+[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB  | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python |
 [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python |
 [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python |
 [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer  Aishell ASR1](../../examples/aishell/asr1) | python |

--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -69,11 +69,11 @@ asr_dynamic_pretrained_models = {
        },
    },
    "conformer_u2pp_wenetspeech-zh-16k": {
-        '1.1': {
+        '1.3': {
            'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.3.model.tar.gz',
            'md5':
-            'eae678c04ed3b3f89672052fdc0c5e10',
+            '662b347e1d2131b7a4dc5398365e2134',
            'cfg_path':
            'model.yaml',
            'ckpt_path':
@@ -89,11 +89,11 @@ asr_dynamic_pretrained_models = {
        },
    },
    "conformer_u2pp_online_wenetspeech-zh-16k": {
-        '1.1': {
+        '1.4': {
            'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.2.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz',
            'md5':
-            '925d047e9188dea7f421a718230c9ae3',
+            '3100fc1eac5779486cab859366992d0b',
            'cfg_path':
            'model.yaml',
            'ckpt_path':

--- a/paddlespeech/s2t/exps/u2/bin/quant.py
+++ b/paddlespeech/s2t/exps/u2/bin/quant.py
@@ -39,7 +39,6 @@ class U2Infer():
        self.preprocess_conf = config.preprocess_config
        self.preprocess_args = {"train": False}
        self.preprocessing = Transformation(self.preprocess_conf)
-        self.reverse_weight = getattr(config.model_conf, 'reverse_weight', 0.0)
        self.text_feature = TextFeaturizer(
            unit_type=config.unit_type,
            vocab=config.vocab_filepath,
@@ -81,6 +80,7 @@ class U2Infer():
            xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
            decode_config = self.config.decode
            logger.info(f"decode cfg: {decode_config}")
+            reverse_weight = getattr(decode_config, 'reverse_weight', 0.0)
            result_transcripts = self.model.decode(
                xs,
                ilen,
@@ -91,7 +91,7 @@ class U2Infer():
                decoding_chunk_size=decode_config.decoding_chunk_size,
                num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
                simulate_streaming=decode_config.simulate_streaming,
-                reverse_weight=decode_config.reverse_weight)
+                reverse_weight=reverse_weight)
            rsl = result_transcripts[0][0]
            utt = Path(self.audio_file).name
            logger.info(f"hyp: {utt} {rsl}")

--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -79,6 +79,7 @@ class U2Infer():
            xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
            decode_config = self.config.decode
            logger.info(f"decode cfg: {decode_config}")
+            reverse_weight = getattr(decode_config, 'reverse_weight', 0.0)
            result_transcripts = self.model.decode(
                xs,
                ilen,
@@ -89,7 +90,7 @@ class U2Infer():
                decoding_chunk_size=decode_config.decoding_chunk_size,
                num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
                simulate_streaming=decode_config.simulate_streaming,
-                reverse_weight=decode_config.reverse_weight)
+                reverse_weight=reverse_weight)
            rsl = result_transcripts[0][0]
            utt = Path(self.audio_file).name
            logger.info(f"hyp: {utt} {result_transcripts[0][0]}")

--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -337,6 +337,7 @@ class U2Tester(U2Trainer):
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors
        error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer
+        reverse_weight = getattr(decode_config, 'reverse_weight', 0.0)
        start_time = time.time()
        target_transcripts = self.id2token(texts, texts_len, self.text_feature)
@@ -351,7 +352,7 @@ class U2Tester(U2Trainer):
            decoding_chunk_size=decode_config.decoding_chunk_size,
            num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
            simulate_streaming=decode_config.simulate_streaming,
-            reverse_weight=decode_config.reverse_weight)
+            reverse_weight=reverse_weight)
        decode_time = time.time() - start_time
        for utt, target, result, rec_tids in zip(

--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -580,6 +580,7 @@ class PaddleASRConnectionHanddler:
        self.update_result()
        beam_size = self.ctc_decode_config.beam_size
+        reverse_weight = getattr(self.ctc_decode_config, 'reverse_weight', 0.0)
        hyps = self.searcher.get_hyps()
        if hyps is None or len(hyps) == 0:
            logger.info("No Hyps!")
@@ -613,7 +614,7 @@ class PaddleASRConnectionHanddler:
        # ctc score in ln domain
        # (beam_size, max_hyps_len, vocab_size)
        decoder_out, r_decoder_out = self.model.forward_attention_decoder(
-            hyps_pad, hyps_lens, self.encoder_out, self.model.reverse_weight)
+            hyps_pad, hyps_lens, self.encoder_out, reverse_weight)
        decoder_out = decoder_out.numpy()
        # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a
@@ -631,13 +632,12 @@ class PaddleASRConnectionHanddler:
            # last decoder output token is `eos`, for laste decoder input token.
            score += decoder_out[i][len(hyp[0])][self.model.eos]
-            if self.model.reverse_weight > 0:
+            if reverse_weight > 0:
                r_score = 0.0
                for j, w in enumerate(hyp[0]):
                    r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w]
                r_score += r_decoder_out[i][len(hyp[0])][self.model.eos]
-                score = score * (1 - self.model.reverse_weight
-                                 ) + r_score * self.model.reverse_weight
+                score = score * (1 - reverse_weight) + r_score * reverse_weight
            # add ctc score (which in ln domain)
            score += hyp[1] * self.ctc_decode_config.ctc_weight