Commit d4ee5916 authored by gongel

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSpeech into ted_en_zh_t0

......@@ -128,9 +128,9 @@ For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC:
```shell
cd examples/csmsc/tts3
# download the pretrained models and unzip them
wget https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
unzip pwg_baker_ckpt_0.4.zip
wget https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip fastspeech2_nosil_baker_ckpt_0.4.zip
# source the environment
source path.sh
......
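For reference, the same fetch-and-unpack step can be scripted portably. Below is a minimal Python sketch using only the standard library; the archive URLs are the ones from the snippet above, and extracting into the current directory is an assumption:

```python
# Minimal sketch: download the two pretrained archives and unpack them,
# mirroring the wget/unzip commands above. Standard library only.
import urllib.request
import zipfile

URLS = [
    "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip",
    "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip",
]

for url in URLS:
    filename = url.rsplit("/", 1)[-1]
    urllib.request.urlretrieve(url, filename)  # fetch the archive
    with zipfile.ZipFile(filename) as zf:
        zf.extractall(".")  # unpack next to the script (assumed target)
```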
......@@ -25,9 +25,9 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# download pretrained tts models and unzip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
......
......@@ -19,9 +19,9 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download pretrained tts models and unzip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
......
......@@ -14,9 +14,9 @@ mkdir -p download
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# download pretrained tts models and unzip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
......
# Released Models
## Speech-to-Text Models
......@@ -32,27 +31,28 @@ Language Model | Training Data | Token-based | Size | Descriptions
### Acoustic Models
Model Type | Dataset | Example Link | Pretrained Models | Static Models | Size (static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)|||
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)|||
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)|||
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
### Vocoders
Model Type | Dataset | Example Link | Pretrained Models | Static Models | Size (static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)|||
Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB|
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)|||
Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)|||
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)|||
Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB|
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)|||
Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB|
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|||
Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)|||
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|||
Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
### Voice Cloning
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----:
GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
GE2E + Tacotron2| AISHELL-3 |[ge2e-tacotron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip)
GE2E + Tacotron2| AISHELL-3 |[ge2e-tacotron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip)
GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
......@@ -52,7 +52,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -72,7 +72,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -91,7 +91,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -110,7 +110,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -129,7 +129,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -281,7 +281,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -300,7 +300,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -320,7 +320,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -341,7 +341,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -361,7 +361,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -381,7 +381,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -401,7 +401,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -421,7 +421,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......@@ -441,7 +441,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
......
# ASR
* s0 for deepspeech2
* s1 for u2/transformer/conformer
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
## Data
......
......@@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
......@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
......@@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
\ No newline at end of file
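The error rates in these tables are edit-distance based. A minimal sketch of the computation (character-level CER for this Mandarin test set; the helper below is illustrative, not the project's scoring script):

```python
# Illustrative error-rate computation: Levenshtein distance between the
# reference and hypothesis token sequences, normalized by reference length.
# Pass word lists for WER or character lists for CER.
def error_rate(ref, hyp):
    m, n = len(ref), len(hyp)
    # dp[i][j]: edits needed to turn ref[:i] into hyp[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            sub = dp[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            dp[i][j] = min(sub, dp[i - 1][j] + 1, dp[i][j - 1] + 1)
    return dp[m][n] / max(m, 1)

print(error_rate(list("今天天气"), list("今天天期")))  # 0.25
```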
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
process:
  # extract kaldi fbank from PCM
  - type: fbank_kaldi
    fs: 16000
    n_mels: 80
    n_shift: 160
    win_length: 400
    dither: true
  - type: cmvn_json
    cmvn_path: data/mean_std.json
  # these three processes are a.k.a. SpecAugment
  - type: time_warp
    max_time_warp: 5
    inplace: true
    mode: PIL
  - type: freq_mask
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: time_mask
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false
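For readers new to SpecAugment, here is a rough numpy sketch of the two masking steps configured above (the `time_warp` step is omitted for brevity; parameter names and defaults mirror the YAML, and the fill value is an assumption):

```python
# Rough sketch of freq_mask/time_mask from the config above.
# `feat` is a (time, n_mels) fbank matrix.
import numpy as np

def spec_augment(feat, F=30, T=40, n_mask=2, replace_with_zero=False):
    feat = feat.copy()
    n_frames, n_freq = feat.shape
    fill = 0.0 if replace_with_zero else feat.mean()  # fill value assumed
    for _ in range(n_mask):  # frequency masks: blank out f consecutive bins
        f = np.random.randint(0, F + 1)
        f0 = np.random.randint(0, max(n_freq - f, 1))
        feat[:, f0:f0 + f] = fill
    for _ in range(n_mask):  # time masks: blank out t consecutive frames
        t = np.random.randint(0, T + 1)
        t0 = np.random.randint(0, max(n_frames - t, 1))
        feat[t0:t0 + t, :] = fill
    return feat
```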
# https://yaml.org/type/float.html
data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  min_input_len: 0.5
  max_input_len: 20.0 # second
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0

collator:
  vocab_filepath: data/vocab.txt
  unit_type: 'char'
  spm_model_prefix: ''
  augmentation_config: conf/preprocess.yaml
  batch_size: 64
  raw_wav: True # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 2

# network architecture
model:
  cmvn_file:
  cmvn_file_type: "json"
  # encoder related
  encoder: transformer
  encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
    linear_units: 2048 # the number of units of position-wise feed forward
    num_blocks: 12 # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
  # decoder related
  decoder: transformer
  decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
  # hybrid CTC/attention
  model_conf:
    ctc_weight: 0.3
    ctc_dropoutrate: 0.0
    ctc_grad_norm_type: null
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false

training:
  n_epoch: 120
  accum_grad: 2
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
    lr: 0.002
    weight_decay: 1e-6
  scheduler: warmuplr # pytorch v1.1.0+ required
  scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
  log_interval: 100
  checkpoint:
    kbest_n: 50
    latest_n: 5

decoding:
  batch_size: 128
  error_rate_type: cer
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here.
  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False # simulate streaming inference. Defaults to False.
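The `warmuplr` scheduler above ramps the learning rate up linearly and then decays it with the inverse square root of the step. A sketch assuming the usual Noam-style semantics (the trainer's exact implementation may differ):

```python
# Sketch of a Noam-style warmup schedule with lr=0.002 and
# warmup_steps=25000 as configured above; the peak lr equals base_lr.
def warmup_lr(step, base_lr=0.002, warmup_steps=25000):
    step = max(step, 1)
    return base_lr * warmup_steps ** 0.5 * min(step ** -0.5,
                                               step * warmup_steps ** -1.5)

print(warmup_lr(1), warmup_lr(25000), warmup_lr(100000))
```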
......@@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
......@@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
......@@ -23,8 +23,6 @@ fi
# exit 1
#fi
for type in attention_rescoring; do
echo "decoding ${type}"
batch_size=1
......
......@@ -97,7 +97,7 @@ optional arguments:
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_aishell3_ckpt_0.5.zip
```
......@@ -202,7 +202,7 @@ optional arguments:
6. `--ngpu` is the number of gpus to use; if ngpu == 0, use cpu.
## Pretrained Model
Pretrained FastSpeech2 model with no silence at the edges of audios: [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
Pretrained FastSpeech2 model with no silence at the edges of audios: [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
......
......@@ -41,7 +41,7 @@ We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so th
We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (currently MFA 1.x) in our repo.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (currently MFA 1.x) in our repo.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
......@@ -86,4 +86,4 @@ In addition, in order to accelerate the convergence of the model, we add `guided
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
```
## Pretrained Model
[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip).
......@@ -22,7 +22,7 @@ You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech
## Pretrained GE2E model
We use a pretrained GE2E model to generate a speaker embedding for each sentence.
Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and `unzip` it.
Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
......@@ -84,7 +84,7 @@ The training step is very similar to that one of [tts3](https://github.com/Paddl
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_aishell3_ckpt_0.5.zip
```
......@@ -115,7 +115,7 @@ ref_audio
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
```
## Pretrained Model
[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
(There is no need for `speaker_id_map.txt` here.)
......
......@@ -132,7 +132,7 @@ optional arguments:
5. `--ngpu` is the number of gpus to use; if ngpu == 0, use cpu.
## Pretrained Models
Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip).
Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip).
Parallel WaveGAN checkpoint contains files listed below.
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
process:
  # extract kaldi fbank from PCM
  - type: fbank_kaldi
    fs: 16000
    n_mels: 80
    n_shift: 160
    win_length: 400
    dither: true
  - type: cmvn_json
    cmvn_path: data/mean_std.json
  # these three processes are a.k.a. SpecAugment
  - type: time_warp
    max_time_warp: 5
    inplace: true
    mode: PIL
  - type: freq_mask
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: time_mask
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false
......@@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=8000 \
--use_dB_normalization=False \
--num_samples=-1 \
......@@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
......@@ -90,7 +90,7 @@ optional arguments:
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
```
......@@ -208,9 +208,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
```
## Pretrained Model
Pretrained SpeedySpeech model with no silence at the edges of audios: [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip).
Pretrained SpeedySpeech model with no silence at the edges of audios: [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip).
Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip).
Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip).
SpeedySpeech checkpoint contains files listed below.
```text
......
......@@ -88,7 +88,7 @@ optional arguments:
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
```
......@@ -199,9 +199,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
```
## Pretrained Model
Pretrained FastSpeech2 model with no silence at the edges of audios: [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip).
Pretrained FastSpeech2 model with no silence at the edges of audios: [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip).
Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip).
Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip).
FastSpeech2 checkpoint contains files listed below.
```text
......
......@@ -122,9 +122,9 @@ optional arguments:
5. `--ngpu` is the number of gpus to use; if ngpu == 0, use cpu.
## Pretrained Models
Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip).
Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip).
Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip).
Parallel WaveGAN checkpoint contains files listed below.
......
......@@ -113,7 +113,7 @@ The length of mel-spectrograms should align with the length of wavs, so we shoul
But since we are fine-tuning, we should use the statistics computed during the training step.
You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it.
You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it.
Assume the path to the dump dir of the training step is `dump`.
Assume the path to the duration results of CSMSC is `durations.txt` (generated during the training step's preprocessing).
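As a concrete illustration of the alignment constraint above, one common fix is to clip the waveform to an exact multiple of the hop size so that `len(wav) == n_frames * n_shift`. A minimal sketch; the hop size `n_shift` is an assumption and must match your feature config:

```python
# Minimal sketch: make the wav length an exact multiple of the hop size so
# the mel-spectrogram and waveform lengths agree. n_shift is assumed.
def align_wav_to_mel(wav, mel, n_shift=300):
    n_frames = min(len(mel), len(wav) // n_shift)
    return wav[:n_frames * n_shift], mel[:n_frames]
```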
......@@ -147,11 +147,11 @@ TODO:
The hyperparameters in `finetune.yaml` are not good enough; a smaller `learning_rate` should be used (and more `milestones` should be set).
## Pretrained Models
Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip).
Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip).
Finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_finetune_ckpt_0.5.zip).
Finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip).
Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip)
Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip)
Multi Band MelGAN checkpoint contains files listed below.
......
......@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
......@@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
......@@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
......
......@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
......@@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix):
# if no transcription for audio then skipped
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
......@@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
......
......@@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
print("Creating manifest %s ..." % manifest_path)
json_lines = []
total_sec = 0.0
total_text = 0.0
total_char = 0.0
total_num = 0
for subfolder, _, filelist in sorted(os.walk(data_dir)):
......@@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path):
text_filepath = os.path.join(subfolder, text_filelist[0])
for line in io.open(text_filepath, encoding="utf8"):
segments = line.strip().split()
nchars = len(segments[1:])
text = ' '.join(segments[1:]).lower()
audio_filepath = os.path.abspath(
os.path.join(subfolder, segments[0] + '.flac'))
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(audio_filepath))[0]
utt2spk = '-'.join(utt.split('-')[:2])
json_lines.append(
json.dumps({
'utt':
os.path.splitext(os.path.basename(audio_filepath))[0],
'feat':
audio_filepath,
'feat_shape': (duration, ), #second
'text':
text
'utt': utt,
'utt2spk': utt2spk,
'feat': audio_filepath,
'feat_shape': (duration, ), # second
'text': text,
}))
total_sec += duration
total_text += len(text)
total_char += nchars
total_num += 1
with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
......@@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
print(f"{subset}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_char} char", file=f)
print(f"{total_char / total_sec} char/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
......
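Each manifest produced by these scripts is a JSON-lines file whose entries now carry `utt`, `utt2spk`, `feat`, `feat_shape` and `text` keys. A short sketch of consuming one (the manifest path is an assumption):

```python
# Read a JSON-lines manifest and print a few entries.
import json

def read_manifest(path):
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

for entry in read_manifest("data/manifest.test-clean")[:3]:  # path assumed
    print(entry["utt"], entry["utt2spk"], entry["feat_shape"][0], "sec")
```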
......@@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path):
audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(audio_filepath))[0]
utt2spk = '-'.join(utt.split('-')[:2])
json_lines.append(
json.dumps({
'utt':
os.path.splitext(os.path.basename(audio_filepath))[0],
'feat':
audio_filepath,
'utt': utt,
'utt2spk': utt2spk,
'feat': audio_filepath,
'feat_shape': (duration, ), #second
'text':
text
'text': text,
}))
total_sec += duration
......
......@@ -72,14 +72,16 @@ def create_manifest(data_dir, manifest_path_prefix):
continue
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
translation_str = " ".join(translation.split())
trancription_str = " ".join(trancription.split())
json_lines.append(
json.dumps(
{
'utt': utt,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': " ".join(translation.split()),
'text1': " ".join(trancription.split())
'text': [translation_str, trancription_str],
},
ensure_ascii=False))
......
......@@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix):
assert os.path.exists(audio_path) and os.path.exists(text_path)
audio_id = os.path.basename(audio_path)[:-4]
spk = audio_id.split('_')[0]
word_text, syllable_text, phone_text = read_trn(text_path)
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
......@@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk': spk,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': word_text, # charactor
......
......@@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': utt_id,
'utt2spk': spk,
'utt2gender': gender,
'feat': str(audio_path),
'feat_shape': (duration, ), # second
'text': word_text, # word
'phone': phone_text,
'spk': spk,
'gender': gender,
},
ensure_ascii=False))
......
......@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
......@@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = phn_dict[audio_id]
gender_spk = str(Path(audio_path).parent.stem)
spk = gender_spk[1:]
gender = gender_spk[0]
utt_id = '_'.join([spk, gender, audio_id])
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': spk,
'utt2gender': gender,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
......
......@@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path):
audio_data, samplerate = soundfile.read(u)
duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(u))[0]
json_lines.append(
json.dumps({
'utt': os.path.splitext(os.path.basename(u))[0],
'utt': utt,
'utt2spk': speaker,
'feat': u,
'feat_shape': (duration, ), #second
'text': trans.lower()
......
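The `utt2spk` field these changes add mirrors Kaldi's utt2spk mapping. If you need the classic two-column file, a small sketch (both paths are assumptions):

```python
# Dump 'utt'/'utt2spk' pairs from a manifest to a Kaldi-style utt2spk file
# (two columns: utterance-id speaker-id).
import json

with open("data/manifest.train", encoding="utf-8") as fin, \
     open("data/utt2spk", "w", encoding="utf-8") as fout:
    for line in fin:
        entry = json.loads(line)
        fout.write(f"{entry['utt']} {entry['utt2spk']}\n")
```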
# ASR
* s0 is for deepspeech2 offline
* s1 is for transformer/conformer/U2
* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
## Data
| Data Subset | Duration in Seconds |
......
......@@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--use_dB_normalization=True \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
......@@ -21,7 +21,7 @@
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 |
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
process:
  # extract kaldi fbank from PCM
  - type: fbank_kaldi
    fs: 16000
    n_mels: 80
    n_shift: 160
    win_length: 400
    dither: true
  - type: cmvn_json
    cmvn_path: data/mean_std.json
  # these three processes are a.k.a. SpecAugment
  - type: time_warp
    max_time_warp: 5
    inplace: true
    mode: PIL
  - type: freq_mask
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: time_mask
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -8,6 +8,11 @@ nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
stride_ms=10
window_ms=25
sample_rate=16000
feat_dim=80
source ${MAIN_ROOT}/utils/parse_options.sh
......@@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
exit 1
fi
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${set} data/manifest.${set}.raw
for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${sub} data/manifest.${sub}.raw
done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for set in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw
for sub in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${sub}.raw >> data/manifest.train.raw
done
for set in dev-clean dev-other; do
cat data/manifest.${set}.raw >> data/manifest.dev.raw
for sub in dev-clean dev-other; do
cat data/manifest.${sub}.raw >> data/manifest.dev.raw
done
for set in test-clean test-other; do
cat data/manifest.${set}.raw >> data/manifest.test.raw
for sub in test-clean test-other; do
cat data/manifest.${sub}.raw >> data/manifest.test.raw
done
fi
......@@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--spectrum_type="fbank" \
--feat_dim=80 \
--feat_dim=${feat_dim} \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--sample_rate=${sample_rate} \
--stride_ms=${stride_ms} \
--window_ms=${window_ms} \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -85,16 +90,15 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test dev-clean dev-other test-clean test-other; do
for sub in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
--manifest_path="data/manifest.${sub}.raw" \
--output_path="data/manifest.${sub}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
......@@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
}&
done
wait
for sub in train dev; do
mv data/manifest.${sub} data/manifest.${sub}.fmt
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
for sub in train dev; do
remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
done
fi
echo "LibriSpeech Data preparation done."
......
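The new stage 3 filters out utterances that are too long before training. A hypothetical sketch of what `remove_longshortdata.py` does, inferred from its flags; the real script may differ in its details:

```python
# Hypothetical filter: drop utterances whose frame count exceeds maxframes
# or whose transcript exceeds maxchars. feat_shape[0] is duration in
# seconds, so frames = seconds * 1000 / stride_ms.
import json

def remove_longshortdata(src, dst, maxframes=3000, maxchars=400, stride_ms=10):
    with open(src, encoding="utf-8") as fin, \
         open(dst, "w", encoding="utf-8") as fout:
        for line in fin:
            entry = json.loads(line)
            frames = entry["feat_shape"][0] * 1000 / stride_ms
            if frames <= maxframes and len(entry["text"]) <= maxchars:
                fout.write(line)
```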