diff --git a/README.md b/README.md
index 2f9d992895309f28ccfabc5d0bf83dfa94aaa443..ec2d0f3022b85310ed8c0fec36fd98c91c474d71 100644
--- a/README.md
+++ b/README.md
@@ -128,9 +128,9 @@ For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC:
```shell
cd examples/csmsc/tts3
# download the pretrained models and unzip them
-wget https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
unzip pwg_baker_ckpt_0.4.zip
-wget https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip fastspeech2_nosil_baker_ckpt_0.4.zip
# source the environment
source path.sh
diff --git a/demos/metaverse/run.sh b/demos/metaverse/run.sh
index ea7f683c8e428c6f3f311f9de591f9faa2f9f7b2..ba7d7980f7db4fead012d772980b99ad741dca12 100755
--- a/demos/metaverse/run.sh
+++ b/demos/metaverse/run.sh
@@ -25,9 +25,9 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# download pretrained tts models and unzip
- wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+ wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
- wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+ wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
diff --git a/demos/story_talker/run.sh b/demos/story_talker/run.sh
index 069ec12ee92f2a20f1943fde7891a024f184d13e..44259cd3065869f283ecb1653a0709256971f8ae 100755
--- a/demos/story_talker/run.sh
+++ b/demos/story_talker/run.sh
@@ -19,9 +19,9 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download pretrained tts models and unzip
- wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+ wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
- wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+ wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
diff --git a/demos/style_fs2/run.sh b/demos/style_fs2/run.sh
index f035dd1be7d7815baba8cc28cbbdbece156a39d0..6f6d606809a6182c9ed654fbe4e14c1073bf2543 100755
--- a/demos/style_fs2/run.sh
+++ b/demos/style_fs2/run.sh
@@ -14,9 +14,9 @@ mkdir -p download
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# download pretrained tts models and unzip
- wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+ wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
- wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+ wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 78f5c92f0ed0aae46c5ec3a82d59c10bf4360064..ca04f6a7416ca924195b04194cca342d7b855ec1 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -1,4 +1,3 @@
-
# Released Models
## Speech-to-Text Models
@@ -32,27 +31,28 @@ Language Model | Training Data | Token-based | Size | Descriptions
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
-Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)|||
-TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)|||
-SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
-FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
-FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
-FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
-FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
+Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)|||
+TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
+SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
+FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
+FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
+FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
+FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
### Vocoders
Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
-WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)|||
-Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB|
-Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)|||
-Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)|||
-Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)|||
-|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB|
+WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)|||
+Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB|
+Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|||
+Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)|||
+Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|||
+|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip) <br> [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
### Voice Cloning
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----:
-GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
-GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
+GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip)
+GE2E + Tacotron2| AISHELL-3 |[ge2e-tacotron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip)
+GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst
index f47c0892b91b60c2fec11e11cf107329ac1411b7..4c2f86b148a9895a1bfaa3c1f61be7580fb2fd3e 100644
--- a/docs/source/tts/demo.rst
+++ b/docs/source/tts/demo.rst
@@ -52,7 +52,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
@@ -72,7 +72,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
|
@@ -91,7 +91,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
|
@@ -110,7 +110,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
|
@@ -129,7 +129,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
|
@@ -281,7 +281,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
|
@@ -300,7 +300,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
|
@@ -320,7 +320,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
|
@@ -341,7 +341,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
|
@@ -361,7 +361,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
|
@@ -381,7 +381,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
|
@@ -401,7 +401,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
|
@@ -421,7 +421,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
|
@@ -441,7 +441,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
|
diff --git a/examples/aishell/README.md b/examples/aishell/README.md
index 82ef91da96f47e99e5589695f8de9776279aece8..a9bba074f2f6bff624976260c847fce8cc6a75f1 100644
--- a/examples/aishell/README.md
+++ b/examples/aishell/README.md
@@ -1,7 +1,9 @@
# ASR
-* s0 for deepspeech2
-* s1 for u2/transformer/conformer
+* asr0 - deepspeech2 Streaming/Non-Streaming
+* asr1 - transformer/conformer Streaming/Non-Streaming
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
+
## Data
diff --git a/examples/aishell/s0/.gitignore b/examples/aishell/asr0/.gitignore
similarity index 100%
rename from examples/aishell/s0/.gitignore
rename to examples/aishell/asr0/.gitignore
diff --git a/examples/aishell/s0/README.md b/examples/aishell/asr0/README.md
similarity index 100%
rename from examples/aishell/s0/README.md
rename to examples/aishell/asr0/README.md
diff --git a/examples/aishell/s0/conf/augmentation.json b/examples/aishell/asr0/conf/augmentation.json
similarity index 100%
rename from examples/aishell/s0/conf/augmentation.json
rename to examples/aishell/asr0/conf/augmentation.json
diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml
similarity index 100%
rename from examples/aishell/s0/conf/deepspeech2.yaml
rename to examples/aishell/asr0/conf/deepspeech2.yaml
diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml
similarity index 100%
rename from examples/aishell/s0/conf/deepspeech2_online.yaml
rename to examples/aishell/asr0/conf/deepspeech2_online.yaml
diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/asr0/local/data.sh
similarity index 96%
rename from examples/aishell/s0/local/data.sh
rename to examples/aishell/asr0/local/data.sh
index f4fccbe6e034064935f1d41abd9aebdc0cc7d7ac..23f04f2a640b5cbeb0ac43fbd07877642dee38ae 100755
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/asr0/local/data.sh
@@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
- --stride_ms=10.0 \
- --window_ms=20.0 \
+ --stride_ms=10 \
+ --window_ms=20 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
diff --git a/examples/aishell/s0/local/download_lm_ch.sh b/examples/aishell/asr0/local/download_lm_ch.sh
similarity index 100%
rename from examples/aishell/s0/local/download_lm_ch.sh
rename to examples/aishell/asr0/local/download_lm_ch.sh
diff --git a/examples/aishell/s0/local/export.sh b/examples/aishell/asr0/local/export.sh
similarity index 100%
rename from examples/aishell/s0/local/export.sh
rename to examples/aishell/asr0/local/export.sh
diff --git a/examples/aishell/s0/local/test.sh b/examples/aishell/asr0/local/test.sh
similarity index 100%
rename from examples/aishell/s0/local/test.sh
rename to examples/aishell/asr0/local/test.sh
diff --git a/examples/aishell/s0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh
similarity index 100%
rename from examples/aishell/s0/local/test_export.sh
rename to examples/aishell/asr0/local/test_export.sh
diff --git a/examples/aishell/s0/local/test_hub.sh b/examples/aishell/asr0/local/test_hub.sh
similarity index 100%
rename from examples/aishell/s0/local/test_hub.sh
rename to examples/aishell/asr0/local/test_hub.sh
diff --git a/examples/aishell/s0/local/train.sh b/examples/aishell/asr0/local/train.sh
similarity index 100%
rename from examples/aishell/s0/local/train.sh
rename to examples/aishell/asr0/local/train.sh
diff --git a/examples/aishell/s0/path.sh b/examples/aishell/asr0/path.sh
similarity index 100%
rename from examples/aishell/s0/path.sh
rename to examples/aishell/asr0/path.sh
diff --git a/examples/aishell/s0/run.sh b/examples/aishell/asr0/run.sh
similarity index 100%
rename from examples/aishell/s0/run.sh
rename to examples/aishell/asr0/run.sh
diff --git a/examples/aishell/s1/.gitignore b/examples/aishell/asr1/.gitignore
similarity index 100%
rename from examples/aishell/s1/.gitignore
rename to examples/aishell/asr1/.gitignore
diff --git a/examples/aishell/s1/README.md b/examples/aishell/asr1/README.md
similarity index 67%
rename from examples/aishell/s1/README.md
rename to examples/aishell/asr1/README.md
index 0096c73e30ec57bb41ddf54508a9d197c31adf3a..8c53f95f67514ad8ba5f9050b4d5e1aa3651ecbc 100644
--- a/examples/aishell/s1/README.md
+++ b/examples/aishell/asr1/README.md
@@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
+
+
+## Transformer
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
\ No newline at end of file
diff --git a/examples/aishell/s1/conf/augmentation.json b/examples/aishell/asr1/conf/augmentation.json
similarity index 100%
rename from examples/aishell/s1/conf/augmentation.json
rename to examples/aishell/asr1/conf/augmentation.json
diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml
similarity index 97%
rename from examples/aishell/s1/conf/chunk_conformer.yaml
rename to examples/aishell/asr1/conf/chunk_conformer.yaml
index 8682538b303e9a4c1a2b0d928a06c4a33454f9e2..336a6c46224d6be6cd0d1846a742de66973f5109 100644
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml
similarity index 97%
rename from examples/aishell/s1/conf/conformer.yaml
rename to examples/aishell/asr1/conf/conformer.yaml
index 71cd044ed19dd91ece61673d5c0852a4a0cef4f9..0e9d79d8b25e0a19955db8d7c3f9f79c25501ae8 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -37,7 +37,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c
--- /dev/null
+++ b/examples/aishell/asr1/conf/preprocess.yaml
@@ -0,0 +1,29 @@
+process:
+ # extract kaldi fbank from PCM
+ - type: fbank_kaldi
+ fs: 16000
+ n_mels: 80
+ n_shift: 160
+ win_length: 400
+ dither: true
+ - type: cmvn_json
+ cmvn_path: data/mean_std.json
+ # these three processes are a.k.a. SpecAugment
+ - type: time_warp
+ max_time_warp: 5
+ inplace: true
+ mode: PIL
+ - type: freq_mask
+ F: 30
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+ - type: time_mask
+ T: 40
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+
+
+
+
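As a side note on the new `preprocess.yaml` above: it chains Kaldi-style fbank extraction, JSON CMVN, and the three SpecAugment stages (time warp, frequency mask, time mask). Below is a minimal, hypothetical sanity check — not part of this patch — that loads the file with PyYAML (assuming it is run from the repository root) and prints the configured steps:

```python
# Hypothetical sanity check (not part of this patch): list the preprocessing
# steps configured in the new preprocess.yaml, assuming the repo root as cwd.
import yaml  # PyYAML

with open("examples/aishell/asr1/conf/preprocess.yaml") as f:
    cfg = yaml.safe_load(f)

for step in cfg["process"]:
    # print each step type together with its parameters
    params = {k: v for k, v in step.items() if k != "type"}
    print(step["type"], params)
```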
diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c021f66b71513b300b98d6f50fcc39573cc85dca
--- /dev/null
+++ b/examples/aishell/asr1/conf/transformer.yaml
@@ -0,0 +1,112 @@
+# https://yaml.org/type/float.html
+data:
+ train_manifest: data/manifest.train
+ dev_manifest: data/manifest.dev
+ test_manifest: data/manifest.test
+ min_input_len: 0.5
+ max_input_len: 20.0 # second
+ min_output_len: 0.0
+ max_output_len: 400.0
+ min_output_input_ratio: 0.05
+ max_output_input_ratio: 10.0
+
+
+collator:
+ vocab_filepath: data/vocab.txt
+ unit_type: 'char'
+ spm_model_prefix: ''
+ augmentation_config: conf/preprocess.yaml
+ batch_size: 64
+ raw_wav: True # use raw_wav or kaldi feature
+ spectrum_type: fbank #linear, mfcc, fbank
+ feat_dim: 80
+ delta_delta: False
+ dither: 1.0
+ target_sample_rate: 16000
+ max_freq: None
+ n_fft: None
+ stride_ms: 10.0
+ window_ms: 25.0
+ use_dB_normalization: True
+ target_dB: -20
+ random_seed: 0
+ keep_transcription_text: False
+ sortagrad: True
+ shuffle_method: batch_shuffle
+ num_workers: 2
+
+# network architecture
+model:
+ cmvn_file:
+ cmvn_file_type: "json"
+ # encoder related
+ encoder: transformer
+ encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+ input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: true
+
+ # decoder related
+ decoder: transformer
+ decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+
+ # hybrid CTC/attention
+ model_conf:
+ ctc_weight: 0.3
+ ctc_dropoutrate: 0.0
+ ctc_grad_norm_type: null
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false
+
+
+training:
+ n_epoch: 120
+ accum_grad: 2
+ global_grad_clip: 5.0
+ optim: adam
+ optim_conf:
+ lr: 0.002
+ weight_decay: 1e-6
+ scheduler: warmuplr # pytorch v1.1.0+ required
+ scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+ log_interval: 100
+ checkpoint:
+ kbest_n: 50
+ latest_n: 5
+
+
+decoding:
+ batch_size: 128
+ error_rate_type: cer
+ decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+ lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+ alpha: 2.5
+ beta: 0.3
+ beam_size: 10
+ cutoff_prob: 1.0
+ cutoff_top_n: 0
+ num_proc_bsearch: 8
+ ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+ decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+ # <0: for decoding, use full chunk.
+ # >0: for decoding, use fixed chunk size as set.
+ # 0: used for training, it's prohibited here.
+ num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+ simulate_streaming: False # simulate streaming inference. Defaults to False.
+
+
diff --git a/examples/aishell/s1/local/aishell_train_lms.sh b/examples/aishell/asr1/local/aishell_train_lms.sh
similarity index 100%
rename from examples/aishell/s1/local/aishell_train_lms.sh
rename to examples/aishell/asr1/local/aishell_train_lms.sh
diff --git a/examples/aishell/s1/local/align.sh b/examples/aishell/asr1/local/align.sh
similarity index 100%
rename from examples/aishell/s1/local/align.sh
rename to examples/aishell/asr1/local/align.sh
diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/asr1/local/data.sh
similarity index 96%
rename from examples/aishell/s1/local/data.sh
rename to examples/aishell/asr1/local/data.sh
index 2b9f69ae46c35fbaba51ba7e9629147e053ffdc5..76e28075298c4817e6b553fcb0870cf145fa06f0 100755
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/asr1/local/data.sh
@@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
- --stride_ms=10.0 \
- --window_ms=25.0 \
+ --stride_ms=10 \
+ --window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
@@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
diff --git a/examples/aishell/s1/local/export.sh b/examples/aishell/asr1/local/export.sh
similarity index 100%
rename from examples/aishell/s1/local/export.sh
rename to examples/aishell/asr1/local/export.sh
diff --git a/examples/aishell/s1/local/test.sh b/examples/aishell/asr1/local/test.sh
similarity index 100%
rename from examples/aishell/s1/local/test.sh
rename to examples/aishell/asr1/local/test.sh
diff --git a/examples/aishell/s1/local/test_hub.sh b/examples/aishell/asr1/local/test_hub.sh
similarity index 99%
rename from examples/aishell/s1/local/test_hub.sh
rename to examples/aishell/asr1/local/test_hub.sh
index 99b141c8107ddd4d4bf118d90fd6d9b2441d69af..6e78ec784bb8cf1445ebbe83fd39f4c8a441f418 100755
--- a/examples/aishell/s1/local/test_hub.sh
+++ b/examples/aishell/asr1/local/test_hub.sh
@@ -23,8 +23,6 @@ fi
# exit 1
#fi
-
-
for type in attention_rescoring; do
echo "decoding ${type}"
batch_size=1
diff --git a/examples/aishell/s1/local/tlg.sh b/examples/aishell/asr1/local/tlg.sh
similarity index 100%
rename from examples/aishell/s1/local/tlg.sh
rename to examples/aishell/asr1/local/tlg.sh
diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/asr1/local/train.sh
similarity index 100%
rename from examples/aishell/s1/local/train.sh
rename to examples/aishell/asr1/local/train.sh
diff --git a/examples/aishell/s1/path.sh b/examples/aishell/asr1/path.sh
similarity index 100%
rename from examples/aishell/s1/path.sh
rename to examples/aishell/asr1/path.sh
diff --git a/examples/aishell/s1/run.sh b/examples/aishell/asr1/run.sh
similarity index 100%
rename from examples/aishell/s1/run.sh
rename to examples/aishell/asr1/run.sh
diff --git a/examples/aishell/s1/utils b/examples/aishell/asr1/utils
similarity index 100%
rename from examples/aishell/s1/utils
rename to examples/aishell/asr1/utils
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index fe4887b935e329abe58ac075a57d7f1e993ad49d..056f35ba96dd1342fafdf9bfba7e840662c738cf 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -97,7 +97,7 @@ optional arguments:
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_aishell3_ckpt_0.5.zip
```
@@ -202,7 +202,7 @@ optional arguments:
6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
+Pretrained FastSpeech2 model with no silence at the edges of audios: [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md
index 2f1b37ee2fcd0906265ffe19101365189dd155e5..376d4a3317d0b5e139f7c31e2efdc2fc5c0fafc3 100644
--- a/examples/aishell3/vc0/README.md
+++ b/examples/aishell3/vc0/README.md
@@ -41,7 +41,7 @@ We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so th
We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
-You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
+You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x for now) in our repo.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -86,4 +86,4 @@ In addition, in order to accelerate the convergence of the model, we add `guided
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
```
## Pretrained Model
-[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
+[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip).
diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md
index 834942fa05c0d8863ae6592db4fa9d099331c845..ae53443efe0471f200468b8a404aa29d55915dc0 100644
--- a/examples/aishell3/vc1/README.md
+++ b/examples/aishell3/vc1/README.md
@@ -22,7 +22,7 @@ You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech
## Pretrained GE2E model
We use a pretrained GE2E model to generate speaker embeddings for each sentence.
-Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and `unzip` it.
+Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
@@ -84,7 +84,7 @@ The training step is very similar to that one of [tts3](https://github.com/Paddl
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_aishell3_ckpt_0.5.zip
```
@@ -115,7 +115,7 @@ ref_audio
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
```
## Pretrained Model
-[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
+[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
(There is no need for `speaker_id_map.txt` here )
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
index d67af726ebd001e29716ed0681327c2430bf43a9..bc28bba10f8773c4822c6153a2945804bd43700a 100644
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -132,7 +132,7 @@ optional arguments:
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
-Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip).
+Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip).
Parallel WaveGAN checkpoint contains files listed below.
diff --git a/examples/callcenter/s1/.gitignore b/examples/callcenter/asr1/.gitignore
similarity index 100%
rename from examples/callcenter/s1/.gitignore
rename to examples/callcenter/asr1/.gitignore
diff --git a/examples/callcenter/s1/README.md b/examples/callcenter/asr1/README.md
similarity index 100%
rename from examples/callcenter/s1/README.md
rename to examples/callcenter/asr1/README.md
diff --git a/examples/callcenter/s1/conf/augmentation.json b/examples/callcenter/asr1/conf/augmentation.json
similarity index 100%
rename from examples/callcenter/s1/conf/augmentation.json
rename to examples/callcenter/asr1/conf/augmentation.json
diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml
similarity index 97%
rename from examples/callcenter/s1/conf/chunk_conformer.yaml
rename to examples/callcenter/asr1/conf/chunk_conformer.yaml
index a853658a859c409cb7109e08b4a9c74d4610fe87..b18b46fe6aa0a91476f5b6fcac0c8e03d3745f42 100644
--- a/examples/callcenter/s1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml
@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml
similarity index 97%
rename from examples/callcenter/s1/conf/conformer.yaml
rename to examples/callcenter/asr1/conf/conformer.yaml
index bd4f45788ef039e8bc302936ca4167dfd86c5585..47c438a6d1b2f453500540e102e54d48dbe8cd5f 100644
--- a/examples/callcenter/s1/conf/conformer.yaml
+++ b/examples/callcenter/asr1/conf/conformer.yaml
@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -37,7 +37,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c
--- /dev/null
+++ b/examples/callcenter/asr1/conf/preprocess.yaml
@@ -0,0 +1,29 @@
+process:
+ # extract kaldi fbank from PCM
+ - type: fbank_kaldi
+ fs: 16000
+ n_mels: 80
+ n_shift: 160
+ win_length: 400
+ dither: true
+ - type: cmvn_json
+ cmvn_path: data/mean_std.json
+ # these three processes are a.k.a. SpecAugment
+ - type: time_warp
+ max_time_warp: 5
+ inplace: true
+ mode: PIL
+ - type: freq_mask
+ F: 30
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+ - type: time_mask
+ T: 40
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+
+
+
+
diff --git a/examples/callcenter/s1/local/align.sh b/examples/callcenter/asr1/local/align.sh
similarity index 100%
rename from examples/callcenter/s1/local/align.sh
rename to examples/callcenter/asr1/local/align.sh
diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/asr1/local/data.sh
similarity index 96%
rename from examples/callcenter/s1/local/data.sh
rename to examples/callcenter/asr1/local/data.sh
index 634bb8d0eb24c660fb17ba80fa78082192b33f03..c40c752abc981ee8354d32d2ad99c1326173bcf8 100755
--- a/examples/callcenter/s1/local/data.sh
+++ b/examples/callcenter/asr1/local/data.sh
@@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
- --stride_ms=10.0 \
- --window_ms=25.0 \
+ --stride_ms=10 \
+ --window_ms=25 \
--sample_rate=8000 \
--use_dB_normalization=False \
--num_samples=-1 \
@@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
diff --git a/examples/callcenter/s1/local/download_lm_ch.sh b/examples/callcenter/asr1/local/download_lm_ch.sh
similarity index 100%
rename from examples/callcenter/s1/local/download_lm_ch.sh
rename to examples/callcenter/asr1/local/download_lm_ch.sh
diff --git a/examples/callcenter/s1/local/export.sh b/examples/callcenter/asr1/local/export.sh
similarity index 100%
rename from examples/callcenter/s1/local/export.sh
rename to examples/callcenter/asr1/local/export.sh
diff --git a/examples/callcenter/s1/local/test.sh b/examples/callcenter/asr1/local/test.sh
similarity index 100%
rename from examples/callcenter/s1/local/test.sh
rename to examples/callcenter/asr1/local/test.sh
diff --git a/examples/callcenter/s1/local/train.sh b/examples/callcenter/asr1/local/train.sh
similarity index 100%
rename from examples/callcenter/s1/local/train.sh
rename to examples/callcenter/asr1/local/train.sh
diff --git a/examples/callcenter/s1/path.sh b/examples/callcenter/asr1/path.sh
similarity index 100%
rename from examples/callcenter/s1/path.sh
rename to examples/callcenter/asr1/path.sh
diff --git a/examples/callcenter/s1/run.sh b/examples/callcenter/asr1/run.sh
similarity index 100%
rename from examples/callcenter/s1/run.sh
rename to examples/callcenter/asr1/run.sh
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index 61c4972bcd44a781175264ac779a8b3ad02ec8e0..5ebf3cf4e0ce641be92f6c92ad78c38a4a2defd9 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -90,7 +90,7 @@ optional arguments:
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
```
@@ -208,9 +208,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
```
## Pretrained Model
-Pretrained SpeedySpeech model with no silence in the edge of audios[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip).
+Pretrained SpeedySpeech model with no silence at the edges of audios: [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip).
-Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip).
+Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip).
SpeedySpeech checkpoint contains files listed below.
```text
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 6570d33dc6862e9c7ebfd3694046578b1d28dabf..104964c85aee16ba33bf5769adc4433c9b7675ef 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -88,7 +88,7 @@ optional arguments:
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
```
@@ -199,9 +199,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
```
## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip).
+Pretrained FastSpeech2 model with no silence at the edges of audios: [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip).
-Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip).
+Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip).
FastSpeech2 checkpoint contains files listed below.
```text
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
index b9c8a465fe2b79e55ebd032e1c43f1fd0e3397a6..86114a42338a859a3f81ab43cfebc9403b9e5794 100644
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -122,9 +122,9 @@ optional arguments:
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
-Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
+Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip).
-Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip).
+Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip).
Parallel WaveGAN checkpoint contains files listed below.
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index a72f60f1633c4469441f2bbebdd2a94e172d098b..4925b649d747f8d5d6124cf8eb20564a28f94f11 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -113,7 +113,7 @@ The length of mel-spectrograms should align with the length of wavs, so we shoul
But since we are fine-tuning, we should use the statistics computed during the training step.
-You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it.
+You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it.
Assume the path to the dump-dir of training step is `dump`.
Assume the path to the duration result of CSMSC is `durations.txt` (generated during training step's preprocessing).
@@ -147,11 +147,11 @@ TODO:
The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set).
## Pretrained Models
-Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip).
+Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip).
-Finetuned model can ben downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_finetune_ckpt_0.5.zip).
+Finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip).
-Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip)
+Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip)
Multi Band MelGAN checkpoint contains files listed below.
diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
index e32f619e90eaded75aa465fa8bc2ae39b6e77486..85f478c20ddba18d40c175c724370b81c93e46d0 100644
--- a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
+from pathlib import Path
import soundfile
@@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
+ utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
@@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
+ 'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py
index 66e0690138a91d3fd465547a71fcde6d1922b3ef..7431fc08369546f372c93dc923f50300f1da10a3 100644
--- a/examples/dataset/aishell/aishell.py
+++ b/examples/dataset/aishell/aishell.py
@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
+from pathlib import Path
import soundfile
@@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix):
# if no transcription for audio then skipped
if audio_id not in transcript_dict:
continue
+
+ utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
@@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
+ 'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py
index e85bbb3aa44d0bd775698ac0c79ce5b690f83a00..69f0db599e12d0a482a8d7783eb85ce9e04c744d 100644
--- a/examples/dataset/librispeech/librispeech.py
+++ b/examples/dataset/librispeech/librispeech.py
@@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
print("Creating manifest %s ..." % manifest_path)
json_lines = []
total_sec = 0.0
- total_text = 0.0
+ total_char = 0.0
total_num = 0
for subfolder, _, filelist in sorted(os.walk(data_dir)):
@@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path):
text_filepath = os.path.join(subfolder, text_filelist[0])
for line in io.open(text_filepath, encoding="utf8"):
segments = line.strip().split()
+ nchars = len(segments[1:])
text = ' '.join(segments[1:]).lower()
audio_filepath = os.path.abspath(
os.path.join(subfolder, segments[0] + '.flac'))
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
+
+ utt = os.path.splitext(os.path.basename(audio_filepath))[0]
+ utt2spk = '-'.join(utt.split('-')[:2])
+
json_lines.append(
json.dumps({
- 'utt':
- os.path.splitext(os.path.basename(audio_filepath))[0],
- 'feat':
- audio_filepath,
- 'feat_shape': (duration, ), #second
- 'text':
- text
+ 'utt': utt,
+ 'utt2spk': utt2spk,
+ 'feat': audio_filepath,
+ 'feat_shape': (duration, ), # second
+ 'text': text,
}))
total_sec += duration
- total_text += len(text)
+ total_char += nchars
total_num += 1
with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
@@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
print(f"{subset}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
- print(f"{total_text} text", file=f)
- print(f"{total_text / total_sec} text/sec", file=f)
+ print(f"{total_char} char", file=f)
+ print(f"{total_char / total_sec} char/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/examples/dataset/mini_librispeech/mini_librispeech.py
index 65fee81a70bd96e01d0cf119c5081179230e5708..730c73a8b4dc44691351717de1bfe918f3b957ac 100644
--- a/examples/dataset/mini_librispeech/mini_librispeech.py
+++ b/examples/dataset/mini_librispeech/mini_librispeech.py
@@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path):
audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
+
+ utt = os.path.splitext(os.path.basename(audio_filepath))[0]
+ utt2spk = '-'.join(utt.split('-')[:2])
json_lines.append(
json.dumps({
- 'utt':
- os.path.splitext(os.path.basename(audio_filepath))[0],
- 'feat':
- audio_filepath,
+ 'utt': utt,
+ 'utt2spk': utt2spk,
+ 'feat': audio_filepath,
'feat_shape': (duration, ), #second
- 'text':
- text
+ 'text': text,
}))
total_sec += duration
diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py
index 14bef01d2b129a04dc0aac21765321893c470ac8..9a3ba3b31c2f7a9b9e050ceebaa8da9ace0ccb89 100644
--- a/examples/dataset/ted_en_zh/ted_en_zh.py
+++ b/examples/dataset/ted_en_zh/ted_en_zh.py
@@ -72,14 +72,16 @@ def create_manifest(data_dir, manifest_path_prefix):
continue
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
+
+ translation_str = " ".join(translation.split())
+ transcription_str = " ".join(trancription.split())
json_lines.append(
json.dumps(
{
'utt': utt,
'feat': audio_path,
'feat_shape': (duration, ), # second
- 'text': " ".join(translation.split()),
- 'text1': " ".join(trancription.split())
+ 'text': [translation_str, transcription_str],
},
ensure_ascii=False))
diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py
index 77a264cbba1171a027e6548a6176d5d9822515b5..cdfc0a75c0aacfdf89492d2f83642cb7f5decea8 100644
--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
@@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix):
assert os.path.exists(audio_path) and os.path.exists(text_path)
audio_id = os.path.basename(audio_path)[:-4]
+ spk = audio_id.split('_')[0]
+
word_text, syllable_text, phone_text = read_trn(text_path)
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
@@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
+ 'utt2spk': spk,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': word_text, # character
diff --git a/examples/dataset/timit/timit.py b/examples/dataset/timit/timit.py
index 311d445cb3a1e5123889d50b5028ca1aeb85ca19..c4a9f06631809bd4ca1d72755576d631f8590055 100644
--- a/examples/dataset/timit/timit.py
+++ b/examples/dataset/timit/timit.py
@@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': utt_id,
+ 'utt2spk': spk,
+ 'utt2gender': gender,
'feat': str(audio_path),
'feat_shape': (duration, ), # second
'text': word_text, # word
'phone': phone_text,
- 'spk': spk,
- 'gender': gender,
},
ensure_ascii=False))
diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/examples/dataset/timit/timit_kaldi_standard_split.py
index 2b494c06db990e8c932cc28a5a974cddc9e4e943..473fc856f4f78e6ed1f2d145a599d41226e212f9 100644
--- a/examples/dataset/timit/timit_kaldi_standard_split.py
+++ b/examples/dataset/timit/timit_kaldi_standard_split.py
@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
+from pathlib import Path
import soundfile
@@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = phn_dict[audio_id]
+
+ gender_spk = str(Path(audio_path).parent.stem)
+ spk = gender_spk[1:]
+ gender = gender_spk[0]
+ utt_id = '_'.join([spk, gender, audio_id])
json_lines.append(
json.dumps(
{
'utt': audio_id,
+ 'utt2spk': spk,
+ 'utt2gender': gender,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
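The TIMIT kaldi-standard-split script derives the speaker and gender labels from the parent directory name of the audio file, as in the hunk above. A small, hypothetical illustration of that parsing (the path below is made up) follows:

```python
# Hypothetical sketch (not part of this patch): derive speaker and gender
# from a TIMIT-style path, mirroring the logic added in the hunk above.
from pathlib import Path

audio_path = '/data/timit/TEST/DR1/FAKS0/SA1.WAV'  # made-up example path
audio_id = Path(audio_path).stem                   # "SA1"

gender_spk = str(Path(audio_path).parent.stem)     # "FAKS0": gender + speaker id
spk = gender_spk[1:]                               # "AKS0"
gender = gender_spk[0]                             # "F"
utt_id = '_'.join([spk, gender, audio_id])         # "AKS0_F_SA1"
print(spk, gender, utt_id)
```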
diff --git a/examples/dataset/voxforge/voxforge.py b/examples/dataset/voxforge/voxforge.py
index 36282bd609f372aec47080686c102b7f81b02286..373791bffe04114a51d89f6bf84c6dde504be84c 100644
--- a/examples/dataset/voxforge/voxforge.py
+++ b/examples/dataset/voxforge/voxforge.py
@@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path):
audio_data, samplerate = soundfile.read(u)
duration = float(len(audio_data)) / samplerate
+
+ utt = os.path.splitext(os.path.basename(u))[0]
json_lines.append(
json.dumps({
- 'utt': os.path.splitext(os.path.basename(u))[0],
+ 'utt': utt,
+ 'utt2spk': speaker,
'feat': u,
'feat_shape': (duration, ), #second
'text': trans.lower()
diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md
index 5943cf1d7884de4fa8bc39a04e6e8651f32b0a2c..74441fd0915d6cf91473659d2f973c0de60af34e 100644
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@@ -1,8 +1,9 @@
# ASR
-* s0 is for deepspeech2 offline
-* s1 is for transformer/conformer/U2
-* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi
+* asr0 - deepspeech2 Streaming/Non-Streaming
+* asr1 - transformer/conformer Streaming/Non-Streaming
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
+
## Data
| Data Subset | Duration in Seconds |
diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/asr0/README.md
similarity index 100%
rename from examples/librispeech/s0/README.md
rename to examples/librispeech/asr0/README.md
diff --git a/examples/librispeech/s0/conf/augmentation.json b/examples/librispeech/asr0/conf/augmentation.json
similarity index 100%
rename from examples/librispeech/s0/conf/augmentation.json
rename to examples/librispeech/asr0/conf/augmentation.json
diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml
similarity index 100%
rename from examples/librispeech/s0/conf/deepspeech2.yaml
rename to examples/librispeech/asr0/conf/deepspeech2.yaml
diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
similarity index 100%
rename from examples/librispeech/s0/conf/deepspeech2_online.yaml
rename to examples/librispeech/asr0/conf/deepspeech2_online.yaml
diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/asr0/local/data.sh
similarity index 97%
rename from examples/librispeech/s0/local/data.sh
rename to examples/librispeech/asr0/local/data.sh
index fd2b0c0138dceb8352344b3b7beb9b19a2f94c40..0f276cecad316bcb444fddd36df988c0618b7152 100755
--- a/examples/librispeech/s0/local/data.sh
+++ b/examples/librispeech/asr0/local/data.sh
@@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
- --stride_ms=10.0 \
- --window_ms=20.0 \
+ --stride_ms=10 \
+ --window_ms=20 \
--use_dB_normalization=True \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
@@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
diff --git a/examples/librispeech/s0/local/download_lm_en.sh b/examples/librispeech/asr0/local/download_lm_en.sh
similarity index 100%
rename from examples/librispeech/s0/local/download_lm_en.sh
rename to examples/librispeech/asr0/local/download_lm_en.sh
diff --git a/examples/librispeech/s0/local/export.sh b/examples/librispeech/asr0/local/export.sh
similarity index 100%
rename from examples/librispeech/s0/local/export.sh
rename to examples/librispeech/asr0/local/export.sh
diff --git a/examples/librispeech/s0/local/test.sh b/examples/librispeech/asr0/local/test.sh
similarity index 100%
rename from examples/librispeech/s0/local/test.sh
rename to examples/librispeech/asr0/local/test.sh
diff --git a/examples/librispeech/s0/local/test_hub.sh b/examples/librispeech/asr0/local/test_hub.sh
similarity index 100%
rename from examples/librispeech/s0/local/test_hub.sh
rename to examples/librispeech/asr0/local/test_hub.sh
diff --git a/examples/librispeech/s0/local/train.sh b/examples/librispeech/asr0/local/train.sh
similarity index 100%
rename from examples/librispeech/s0/local/train.sh
rename to examples/librispeech/asr0/local/train.sh
diff --git a/examples/librispeech/s0/path.sh b/examples/librispeech/asr0/path.sh
similarity index 100%
rename from examples/librispeech/s0/path.sh
rename to examples/librispeech/asr0/path.sh
diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/asr0/run.sh
similarity index 100%
rename from examples/librispeech/s0/run.sh
rename to examples/librispeech/asr0/run.sh
diff --git a/examples/librispeech/s1/.gitignore b/examples/librispeech/asr1/.gitignore
similarity index 100%
rename from examples/librispeech/s1/.gitignore
rename to examples/librispeech/asr1/.gitignore
diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/asr1/README.md
similarity index 74%
rename from examples/librispeech/s1/README.md
rename to examples/librispeech/asr1/README.md
index b7ec93ebec92020d9089233a218dbb2143f40c37..73f0863ed38746d11eb50a8d86ae46c651926a4b 100644
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/asr1/README.md
@@ -21,7 +21,7 @@
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 |
diff --git a/examples/librispeech/s1/cmd.sh b/examples/librispeech/asr1/cmd.sh
similarity index 100%
rename from examples/librispeech/s1/cmd.sh
rename to examples/librispeech/asr1/cmd.sh
diff --git a/examples/librispeech/s1/conf/augmentation.json b/examples/librispeech/asr1/conf/augmentation.json
similarity index 100%
rename from examples/librispeech/s1/conf/augmentation.json
rename to examples/librispeech/asr1/conf/augmentation.json
diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
similarity index 97%
rename from examples/librispeech/s1/conf/chunk_conformer.yaml
rename to examples/librispeech/asr1/conf/chunk_conformer.yaml
index 4d0e6ceb1c3ec629383a634b727b00bb09623be4..2bfb0fb6f7b939c69372cd6be0bc676edcf92880 100644
--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
similarity index 97%
rename from examples/librispeech/s1/conf/chunk_transformer.yaml
rename to examples/librispeech/asr1/conf/chunk_transformer.yaml
index c7b53f95bd6b0a7d67acaf339972fd09f52ab2b8..fe533777630a3aa1c26577492457ff2e4c06c848 100644
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
similarity index 97%
rename from examples/librispeech/s1/conf/conformer.yaml
rename to examples/librispeech/asr1/conf/conformer.yaml
index 3bc942dc064ff8958280bf90ba7d6fdb8180bd94..c844baaafb3ab4c55a95baa7f4b2a43c9ec40f7c 100644
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
diff --git a/examples/librispeech/asr1/conf/preprocess.yaml b/examples/librispeech/asr1/conf/preprocess.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..021ca4c58022f696c5218dedfe20e7244f09bd7f
--- /dev/null
+++ b/examples/librispeech/asr1/conf/preprocess.yaml
@@ -0,0 +1,25 @@
+process:
+ # extract kaldi fbank from PCM
+ - type: fbank_kaldi
+ fs: 16000
+ n_mels: 80
+ n_shift: 160
+ win_length: 400
+ dither: true
+ - type: cmvn_json
+ cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+ - type: time_warp
+ max_time_warp: 5
+ inplace: true
+ mode: PIL
+ - type: freq_mask
+ F: 30
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+ - type: time_mask
+ T: 40
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
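The new `preprocess.yaml` is referenced by the updated asr1 configs in place of `conf/augmentation.json`. As a rough sketch (not the PaddleSpeech loader itself), the file is simply an ordered list of processing steps that can be read like this:

```python
import yaml  # requires PyYAML

with open('conf/preprocess.yaml') as f:
    conf = yaml.safe_load(f)

# Steps run in order: kaldi fbank extraction, JSON CMVN, then the three
# SpecAugment-style transforms (time_warp, freq_mask, time_mask).
for step in conf['process']:
    params = {k: v for k, v in step.items() if k != 'type'}
    print(step['type'], params)
```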
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
similarity index 97%
rename from examples/librispeech/s1/conf/transformer.yaml
rename to examples/librispeech/asr1/conf/transformer.yaml
index 3cc17004c0ac103efd16e5d4899b910805faeda5..5a158f3ed69ee90f2936cff5016937f7b20932b7 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
diff --git a/examples/librispeech/s1/local/align.sh b/examples/librispeech/asr1/local/align.sh
similarity index 100%
rename from examples/librispeech/s1/local/align.sh
rename to examples/librispeech/asr1/local/align.sh
diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/asr1/local/data.sh
similarity index 66%
rename from examples/librispeech/s1/local/data.sh
rename to examples/librispeech/asr1/local/data.sh
index 56fec8463c9c27eda2853ac83d5b8d030942d8cf..35f4e635fa26b99f89498c761dc1e6906a24899a 100755
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/asr1/local/data.sh
@@ -8,6 +8,11 @@ nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
+stride_ms=10
+window_ms=25
+sample_rate=16000
+feat_dim=80
+
source ${MAIN_ROOT}/utils/parse_options.sh
@@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
exit 1
fi
- for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
- mv data/manifest.${set} data/manifest.${set}.raw
+ for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
+ mv data/manifest.${sub} data/manifest.${sub}.raw
done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
- for set in train-clean-100 train-clean-360 train-other-500; do
- cat data/manifest.${set}.raw >> data/manifest.train.raw
+ for sub in train-clean-100 train-clean-360 train-other-500; do
+ cat data/manifest.${sub}.raw >> data/manifest.train.raw
done
- for set in dev-clean dev-other; do
- cat data/manifest.${set}.raw >> data/manifest.dev.raw
+ for sub in dev-clean dev-other; do
+ cat data/manifest.${sub}.raw >> data/manifest.dev.raw
done
- for set in test-clean test-other; do
- cat data/manifest.${set}.raw >> data/manifest.test.raw
+ for sub in test-clean test-other; do
+ cat data/manifest.${sub}.raw >> data/manifest.test.raw
done
fi
@@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--spectrum_type="fbank" \
- --feat_dim=80 \
+ --feat_dim=${feat_dim} \
--delta_delta=false \
- --sample_rate=16000 \
- --stride_ms=10.0 \
- --window_ms=25.0 \
+ --sample_rate=${sample_rate} \
+ --stride_ms=${stride_ms} \
+ --window_ms=${window_ms} \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
@@ -85,16 +90,15 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
- for set in train dev test dev-clean dev-other test-clean test-other; do
+ for sub in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
- --manifest_path="data/manifest.${set}.raw" \
- --output_path="data/manifest.${set}"
+ --manifest_path="data/manifest.${sub}.raw" \
+ --output_path="data/manifest.${sub}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
@@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
}&
done
wait
+
+ for sub in train dev; do
+ mv data/manifest.${sub} data/manifest.${sub}.fmt
+ done
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ for sub in train dev; do
+ remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
+ done
fi
echo "LibriSpeech Data preparation done."
diff --git a/examples/librispeech/s1/local/download_lm_en.sh b/examples/librispeech/asr1/local/download_lm_en.sh
similarity index 100%
rename from examples/librispeech/s1/local/download_lm_en.sh
rename to examples/librispeech/asr1/local/download_lm_en.sh
diff --git a/examples/librispeech/s1/local/export.sh b/examples/librispeech/asr1/local/export.sh
similarity index 100%
rename from examples/librispeech/s1/local/export.sh
rename to examples/librispeech/asr1/local/export.sh
diff --git a/examples/librispeech/s1/local/test.sh b/examples/librispeech/asr1/local/test.sh
similarity index 100%
rename from examples/librispeech/s1/local/test.sh
rename to examples/librispeech/asr1/local/test.sh
diff --git a/examples/librispeech/s1/local/test_hub.sh b/examples/librispeech/asr1/local/test_hub.sh
similarity index 100%
rename from examples/librispeech/s1/local/test_hub.sh
rename to examples/librispeech/asr1/local/test_hub.sh
diff --git a/examples/librispeech/s1/local/train.sh b/examples/librispeech/asr1/local/train.sh
similarity index 100%
rename from examples/librispeech/s1/local/train.sh
rename to examples/librispeech/asr1/local/train.sh
diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/asr1/path.sh
similarity index 100%
rename from examples/librispeech/s1/path.sh
rename to examples/librispeech/asr1/path.sh
diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/asr1/run.sh
similarity index 100%
rename from examples/librispeech/s1/run.sh
rename to examples/librispeech/asr1/run.sh
diff --git a/examples/librispeech/s1/utils b/examples/librispeech/asr1/utils
similarity index 100%
rename from examples/librispeech/s1/utils
rename to examples/librispeech/asr1/utils
diff --git a/examples/librispeech/s2/.gitignore b/examples/librispeech/asr2/.gitignore
similarity index 100%
rename from examples/librispeech/s2/.gitignore
rename to examples/librispeech/asr2/.gitignore
diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/asr2/README.md
similarity index 100%
rename from examples/librispeech/s2/README.md
rename to examples/librispeech/asr2/README.md
diff --git a/examples/librispeech/s2/cmd.sh b/examples/librispeech/asr2/cmd.sh
similarity index 100%
rename from examples/librispeech/s2/cmd.sh
rename to examples/librispeech/asr2/cmd.sh
diff --git a/examples/librispeech/s2/conf/augmentation.json b/examples/librispeech/asr2/conf/augmentation.json
similarity index 100%
rename from examples/librispeech/s2/conf/augmentation.json
rename to examples/librispeech/asr2/conf/augmentation.json
diff --git a/examples/librispeech/s2/conf/decode/decode.yaml b/examples/librispeech/asr2/conf/decode/decode.yaml
similarity index 100%
rename from examples/librispeech/s2/conf/decode/decode.yaml
rename to examples/librispeech/asr2/conf/decode/decode.yaml
diff --git a/examples/librispeech/s2/conf/decode/decode_att.yaml b/examples/librispeech/asr2/conf/decode/decode_att.yaml
similarity index 100%
rename from examples/librispeech/s2/conf/decode/decode_att.yaml
rename to examples/librispeech/asr2/conf/decode/decode_att.yaml
diff --git a/examples/librispeech/s2/conf/decode/decode_ctc.yaml b/examples/librispeech/asr2/conf/decode/decode_ctc.yaml
similarity index 100%
rename from examples/librispeech/s2/conf/decode/decode_ctc.yaml
rename to examples/librispeech/asr2/conf/decode/decode_ctc.yaml
diff --git a/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml b/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml
similarity index 100%
rename from examples/librispeech/s2/conf/decode/decode_wo_lm.yaml
rename to examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml
diff --git a/examples/librispeech/s2/conf/fbank.conf b/examples/librispeech/asr2/conf/fbank.conf
similarity index 100%
rename from examples/librispeech/s2/conf/fbank.conf
rename to examples/librispeech/asr2/conf/fbank.conf
diff --git a/examples/librispeech/s2/conf/lm/transformer.yaml b/examples/librispeech/asr2/conf/lm/transformer.yaml
similarity index 100%
rename from examples/librispeech/s2/conf/lm/transformer.yaml
rename to examples/librispeech/asr2/conf/lm/transformer.yaml
diff --git a/examples/librispeech/s2/conf/pitch.conf b/examples/librispeech/asr2/conf/pitch.conf
similarity index 100%
rename from examples/librispeech/s2/conf/pitch.conf
rename to examples/librispeech/asr2/conf/pitch.conf
diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml
similarity index 100%
rename from examples/librispeech/s2/conf/transformer.yaml
rename to examples/librispeech/asr2/conf/transformer.yaml
diff --git a/examples/librispeech/s2/local/align.sh b/examples/librispeech/asr2/local/align.sh
similarity index 100%
rename from examples/librispeech/s2/local/align.sh
rename to examples/librispeech/asr2/local/align.sh
diff --git a/examples/librispeech/s2/local/cacu_perplexity.sh b/examples/librispeech/asr2/local/cacu_perplexity.sh
similarity index 100%
rename from examples/librispeech/s2/local/cacu_perplexity.sh
rename to examples/librispeech/asr2/local/cacu_perplexity.sh
diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/asr2/local/data.sh
similarity index 100%
rename from examples/librispeech/s2/local/data.sh
rename to examples/librispeech/asr2/local/data.sh
diff --git a/examples/librispeech/s2/local/data_prep.sh b/examples/librispeech/asr2/local/data_prep.sh
similarity index 100%
rename from examples/librispeech/s2/local/data_prep.sh
rename to examples/librispeech/asr2/local/data_prep.sh
diff --git a/examples/librispeech/s2/local/download_lm_en.sh b/examples/librispeech/asr2/local/download_lm_en.sh
similarity index 100%
rename from examples/librispeech/s2/local/download_lm_en.sh
rename to examples/librispeech/asr2/local/download_lm_en.sh
diff --git a/examples/librispeech/s2/local/espnet_json_to_manifest.py b/examples/librispeech/asr2/local/espnet_json_to_manifest.py
similarity index 100%
rename from examples/librispeech/s2/local/espnet_json_to_manifest.py
rename to examples/librispeech/asr2/local/espnet_json_to_manifest.py
diff --git a/examples/librispeech/s2/local/export.sh b/examples/librispeech/asr2/local/export.sh
similarity index 100%
rename from examples/librispeech/s2/local/export.sh
rename to examples/librispeech/asr2/local/export.sh
diff --git a/examples/librispeech/s2/local/recog.sh b/examples/librispeech/asr2/local/recog.sh
similarity index 100%
rename from examples/librispeech/s2/local/recog.sh
rename to examples/librispeech/asr2/local/recog.sh
diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/asr2/local/test.sh
similarity index 100%
rename from examples/librispeech/s2/local/test.sh
rename to examples/librispeech/asr2/local/test.sh
diff --git a/examples/librispeech/s2/local/train.sh b/examples/librispeech/asr2/local/train.sh
similarity index 100%
rename from examples/librispeech/s2/local/train.sh
rename to examples/librispeech/asr2/local/train.sh
diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/asr2/path.sh
similarity index 100%
rename from examples/librispeech/s2/path.sh
rename to examples/librispeech/asr2/path.sh
diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/asr2/run.sh
similarity index 100%
rename from examples/librispeech/s2/run.sh
rename to examples/librispeech/asr2/run.sh
diff --git a/examples/librispeech/s2/steps b/examples/librispeech/asr2/steps
similarity index 100%
rename from examples/librispeech/s2/steps
rename to examples/librispeech/asr2/steps
diff --git a/examples/librispeech/s2/utils b/examples/librispeech/asr2/utils
similarity index 100%
rename from examples/librispeech/s2/utils
rename to examples/librispeech/asr2/utils
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
index 09fd0c133a8d43afe47947e2639bcd2041defce2..305add2042b0b62a8cb3ca57e05230fdf5778d46 100644
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@@ -80,6 +80,6 @@ optional arguments:
## Pretrained Models
Pretrained Models can be downloaded from links below. We provide 2 models with different configurations.
-1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
+1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)
-2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
+2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the content has been uttered. Guided attention loss is also used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 12e43e2ea42825872a6623ae76e7bd6963ddb8ae..8a43ecd9c2b7685f4457295b2e4c0e4da066d246 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -79,7 +79,7 @@ optional arguments:
## Synthesize
We use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder.
-Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it.
+Download the pretrained WaveFlow model with 128 residual channels from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip) and unzip it.
```bash
unzip waveflow_ljspeech_ckpt_0.3.zip
```
@@ -173,7 +173,7 @@ optional arguments:
6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Model
-Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
+Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)
TransformerTTS checkpoint contains files listed below.
```text
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index cda5354122bdc333c333616868dd97a3008524a8..5bdaf4b82621e548553d05754a4950d2745d6096 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -87,7 +87,7 @@ optional arguments:
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_ljspeech_ckpt_0.5.zip
```
@@ -191,7 +191,7 @@ optional arguments:
6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
+Pretrained FastSpeech2 model trained on audio with no silence at the edges. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
```text
diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md
index 09856c36766dd5dc8cbdf457ab37678282473017..0d4e6c51a00dc1e91238545382972ad977f5d718 100644
--- a/examples/ljspeech/voc0/README.md
+++ b/examples/ljspeech/voc0/README.md
@@ -48,4 +48,4 @@ Synthesize waveform.
6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Model
-Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
+A pretrained model with 128 residual channels can be downloaded here: [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip).
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 0506d5d8f75e35b646bd985b2e47e6f65e85ee55..24f6dbcafe7086230ea6c05c9a8a7624f3cf142a 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -123,7 +123,7 @@ optional arguments:
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
-Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
+Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)
Parallel WaveGAN checkpoint contains files listed below.
diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh
index 0bf35e1f582c424c4634d395d42867b7775673ad..85574260b023f8559c16869666a34b7f42599679 100755
--- a/examples/other/1xt2x/aishell/local/data.sh
+++ b/examples/other/1xt2x/aishell/local/data.sh
@@ -50,7 +50,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.npz" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh
index f0bde77fe4ec4d0abb25634a28bb4bdfae91766f..8e378ff053ba78c970a50ca5d938975f3464f50f 100755
--- a/examples/other/1xt2x/baidu_en8k/local/data.sh
+++ b/examples/other/1xt2x/baidu_en8k/local/data.sh
@@ -65,7 +65,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.npz" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh
index 6f9bc5566cbdb1d482fc00988e87f1a15c2b3647..7387472d53cd0cb6ed6a73eb9eb8e5d3ba6685aa 100755
--- a/examples/other/1xt2x/librispeech/local/data.sh
+++ b/examples/other/1xt2x/librispeech/local/data.sh
@@ -63,7 +63,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.npz" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
diff --git a/examples/other/ge2e/README.md b/examples/other/ge2e/README.md
index d86c8c132d50b7d60069554e95f3f279d1226b0a..d58ca5137b6f39d2b38a5c9cb73872dcd9e5f7ff 100644
--- a/examples/other/ge2e/README.md
+++ b/examples/other/ge2e/README.md
@@ -95,7 +95,7 @@ In `${BIN_DIR}/inference.py`:
## Pretrained Model
The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps.
-Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip).
+Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip).
## References
diff --git a/examples/ted_en_zh/README.md b/examples/ted_en_zh/README.md
index 5664b06b31a299b662df6d87e07222ecae1d1d5f..6d6886daf11089168e8272cd0100b1edeb79d3ee 100644
--- a/examples/ted_en_zh/README.md
+++ b/examples/ted_en_zh/README.md
@@ -1,3 +1,3 @@
# TED En -> Zh
-* t0 for u2 speech translation
+* st0 - conformer/transformer speech translation
diff --git a/examples/ted_en_zh/st0/.gitignore b/examples/ted_en_zh/st0/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..469c61715f4dc74da526f3a4a5a5e2a6d287d716
--- /dev/null
+++ b/examples/ted_en_zh/st0/.gitignore
@@ -0,0 +1,3 @@
+TED-En-Zh
+data
+exp
diff --git a/examples/ted_en_zh/t0/README.md b/examples/ted_en_zh/st0/README.md
similarity index 100%
rename from examples/ted_en_zh/t0/README.md
rename to examples/ted_en_zh/st0/README.md
diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml
similarity index 100%
rename from examples/ted_en_zh/t0/conf/transformer.yaml
rename to examples/ted_en_zh/st0/conf/transformer.yaml
diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml
similarity index 100%
rename from examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
rename to examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml
diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh
similarity index 91%
rename from examples/ted_en_zh/t0/local/data.sh
rename to examples/ted_en_zh/st0/local/data.sh
index b080a5b497e703c6b2f1c2d385315f771b33a1a0..d3acbd4486b3753e70fe7d0c3f71b4f1b3576583 100755
--- a/examples/ted_en_zh/t0/local/data.sh
+++ b/examples/ted_en_zh/st0/local/data.sh
@@ -9,7 +9,7 @@ stop_stage=100
nbpe=8000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
-data_dir=./TED_EnZh
+data_dir=./TED-En-Zh
source ${MAIN_ROOT}/utils/parse_options.sh
@@ -21,7 +21,7 @@ mkdir -p data
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
if [ ! -e ${data_dir} ]; then
- echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
+        echo "Error: ${data_dir} dataset is not available. Please download and unzip the dataset"
echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
echo "The tree of the directory should be:"
echo "."
@@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
- --stride_ms=10.0 \
- --window_ms=25.0 \
+ --stride_ms=10 \
+ --window_ms=25 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
@@ -88,8 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test; do
{
- python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
- --feat_type "raw" \
+ python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
diff --git a/examples/ted_en_zh/t0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh
similarity index 100%
rename from examples/ted_en_zh/t0/local/test.sh
rename to examples/ted_en_zh/st0/local/test.sh
diff --git a/examples/ted_en_zh/t0/local/train.sh b/examples/ted_en_zh/st0/local/train.sh
similarity index 100%
rename from examples/ted_en_zh/t0/local/train.sh
rename to examples/ted_en_zh/st0/local/train.sh
diff --git a/examples/ted_en_zh/t0/path.sh b/examples/ted_en_zh/st0/path.sh
similarity index 100%
rename from examples/ted_en_zh/t0/path.sh
rename to examples/ted_en_zh/st0/path.sh
diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/st0/run.sh
similarity index 93%
rename from examples/ted_en_zh/t0/run.sh
rename to examples/ted_en_zh/st0/run.sh
index ed9ab5f87506aeec6b0edf9ad3f2b4299bb9647c..fb4bc33880b0013d05e19c734554a51348b6a484 100755
--- a/examples/ted_en_zh/t0/run.sh
+++ b/examples/ted_en_zh/st0/run.sh
@@ -22,7 +22,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
- CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/ted_en_zh/t0/.gitignore b/examples/ted_en_zh/t0/.gitignore
deleted file mode 100644
index 123e5174a4e676b2ac3e616673dac984e958c4b5..0000000000000000000000000000000000000000
--- a/examples/ted_en_zh/t0/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-TED_EnZh
-data
-exp
diff --git a/examples/thchs30/README.md b/examples/thchs30/README.md
index 7b3cc3d9547f011610bd823ffc1c8229a926c33f..9a0026a0f0c212b8a0b2b5cd8ec6c2cd6f6bb115 100644
--- a/examples/thchs30/README.md
+++ b/examples/thchs30/README.md
@@ -1,3 +1,3 @@
# thchs30
-* a0 for mfa alignment
+* align0 - mfa alignment
diff --git a/examples/thchs30/a0/README.md b/examples/thchs30/align0/README.md
similarity index 100%
rename from examples/thchs30/a0/README.md
rename to examples/thchs30/align0/README.md
diff --git a/examples/thchs30/a0/data/dict/syllable.lexicon b/examples/thchs30/align0/data/dict/syllable.lexicon
similarity index 100%
rename from examples/thchs30/a0/data/dict/syllable.lexicon
rename to examples/thchs30/align0/data/dict/syllable.lexicon
diff --git a/examples/thchs30/a0/local/data.sh b/examples/thchs30/align0/local/data.sh
similarity index 100%
rename from examples/thchs30/a0/local/data.sh
rename to examples/thchs30/align0/local/data.sh
diff --git a/examples/thchs30/a0/local/gen_word2phone.py b/examples/thchs30/align0/local/gen_word2phone.py
similarity index 100%
rename from examples/thchs30/a0/local/gen_word2phone.py
rename to examples/thchs30/align0/local/gen_word2phone.py
diff --git a/examples/thchs30/a0/local/reorganize_thchs30.py b/examples/thchs30/align0/local/reorganize_thchs30.py
similarity index 100%
rename from examples/thchs30/a0/local/reorganize_thchs30.py
rename to examples/thchs30/align0/local/reorganize_thchs30.py
diff --git a/examples/thchs30/a0/path.sh b/examples/thchs30/align0/path.sh
similarity index 100%
rename from examples/thchs30/a0/path.sh
rename to examples/thchs30/align0/path.sh
diff --git a/examples/thchs30/a0/run.sh b/examples/thchs30/align0/run.sh
similarity index 100%
rename from examples/thchs30/a0/run.sh
rename to examples/thchs30/align0/run.sh
diff --git a/examples/timit/README.md b/examples/timit/README.md
index b7c8b754521a1cbc9a8535e6f5c67854c68f8ba1..51fcfd57c930850c5fbb239436ac0eab5afa47eb 100644
--- a/examples/timit/README.md
+++ b/examples/timit/README.md
@@ -1,3 +1,7 @@
# TIMIT
-* s1 u2 model with phone unit
+ASR models with phone units
+
+* asr0 - deepspeech2 Streaming/Non-Streaming
+* asr1 - transformer/conformer Streaming/Non-Streaming
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
diff --git a/examples/timit/s1/.gitignore b/examples/timit/asr1/.gitignore
similarity index 100%
rename from examples/timit/s1/.gitignore
rename to examples/timit/asr1/.gitignore
diff --git a/examples/timit/s1/README.md b/examples/timit/asr1/README.md
similarity index 100%
rename from examples/timit/s1/README.md
rename to examples/timit/asr1/README.md
diff --git a/examples/timit/s1/conf/augmentation.json b/examples/timit/asr1/conf/augmentation.json
similarity index 100%
rename from examples/timit/s1/conf/augmentation.json
rename to examples/timit/asr1/conf/augmentation.json
diff --git a/examples/timit/s1/conf/dev_spk.list b/examples/timit/asr1/conf/dev_spk.list
similarity index 100%
rename from examples/timit/s1/conf/dev_spk.list
rename to examples/timit/asr1/conf/dev_spk.list
diff --git a/examples/timit/s1/conf/phones.60-48-39.map b/examples/timit/asr1/conf/phones.60-48-39.map
similarity index 100%
rename from examples/timit/s1/conf/phones.60-48-39.map
rename to examples/timit/asr1/conf/phones.60-48-39.map
diff --git a/examples/timit/asr1/conf/preprocess.yaml b/examples/timit/asr1/conf/preprocess.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c
--- /dev/null
+++ b/examples/timit/asr1/conf/preprocess.yaml
@@ -0,0 +1,29 @@
+process:
+ # extract kaldi fbank from PCM
+ - type: fbank_kaldi
+ fs: 16000
+ n_mels: 80
+ n_shift: 160
+ win_length: 400
+ dither: true
+ - type: cmvn_json
+ cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+ - type: time_warp
+ max_time_warp: 5
+ inplace: true
+ mode: PIL
+ - type: freq_mask
+ F: 30
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+ - type: time_mask
+ T: 40
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+
+
+
+
diff --git a/examples/timit/s1/conf/test_spk.list b/examples/timit/asr1/conf/test_spk.list
similarity index 100%
rename from examples/timit/s1/conf/test_spk.list
rename to examples/timit/asr1/conf/test_spk.list
diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml
similarity index 97%
rename from examples/timit/s1/conf/transformer.yaml
rename to examples/timit/asr1/conf/transformer.yaml
index d3ced898ef4b496d10c2410c6528179099c0ad14..1d18468b80025b5ced93c08db0e7f38acc2eb937 100644
--- a/examples/timit/s1/conf/transformer.yaml
+++ b/examples/timit/asr1/conf/transformer.yaml
@@ -14,7 +14,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: "word"
mean_std_filepath: ""
- augmentation_config: ""
+ augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -37,7 +37,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
diff --git a/examples/timit/s1/local/align.sh b/examples/timit/asr1/local/align.sh
similarity index 100%
rename from examples/timit/s1/local/align.sh
rename to examples/timit/asr1/local/align.sh
diff --git a/examples/timit/s1/local/data.sh b/examples/timit/asr1/local/data.sh
similarity index 96%
rename from examples/timit/s1/local/data.sh
rename to examples/timit/asr1/local/data.sh
index ad4ddde3fc9fa5696f0818497126f01da9e36431..e588e48df112c604878b0128251f867ce90905b2 100755
--- a/examples/timit/s1/local/data.sh
+++ b/examples/timit/asr1/local/data.sh
@@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
- --stride_ms=10.0 \
- --window_ms=25.0 \
+ --stride_ms=10 \
+ --window_ms=25 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
diff --git a/examples/timit/s1/local/export.sh b/examples/timit/asr1/local/export.sh
similarity index 100%
rename from examples/timit/s1/local/export.sh
rename to examples/timit/asr1/local/export.sh
diff --git a/examples/timit/s1/local/test.sh b/examples/timit/asr1/local/test.sh
similarity index 100%
rename from examples/timit/s1/local/test.sh
rename to examples/timit/asr1/local/test.sh
diff --git a/examples/timit/s1/local/timit_data_prep.sh b/examples/timit/asr1/local/timit_data_prep.sh
similarity index 100%
rename from examples/timit/s1/local/timit_data_prep.sh
rename to examples/timit/asr1/local/timit_data_prep.sh
diff --git a/examples/timit/s1/local/timit_norm_trans.pl b/examples/timit/asr1/local/timit_norm_trans.pl
similarity index 100%
rename from examples/timit/s1/local/timit_norm_trans.pl
rename to examples/timit/asr1/local/timit_norm_trans.pl
diff --git a/examples/timit/s1/local/train.sh b/examples/timit/asr1/local/train.sh
similarity index 100%
rename from examples/timit/s1/local/train.sh
rename to examples/timit/asr1/local/train.sh
diff --git a/examples/timit/s1/path.sh b/examples/timit/asr1/path.sh
similarity index 100%
rename from examples/timit/s1/path.sh
rename to examples/timit/asr1/path.sh
diff --git a/examples/timit/s1/run.sh b/examples/timit/asr1/run.sh
similarity index 100%
rename from examples/timit/s1/run.sh
rename to examples/timit/asr1/run.sh
diff --git a/examples/tiny/README.md b/examples/tiny/README.md
index 6766f59a24399d54fd0d3a7a76ba5dc82a38d0bd..f36baae6306823dcd87cd92117a886931656f22e 100644
--- a/examples/tiny/README.md
+++ b/examples/tiny/README.md
@@ -1,2 +1,3 @@
-* s0 for deepspeech2
-* s1 for U2
+* asr0 - deepspeech2 Streaming/Non-Streaming
+* asr1 - transformer/conformer Streaming/Non-Streaming
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
diff --git a/examples/tiny/s0/.gitignore b/examples/tiny/asr0/.gitignore
similarity index 100%
rename from examples/tiny/s0/.gitignore
rename to examples/tiny/asr0/.gitignore
diff --git a/examples/tiny/s0/README.md b/examples/tiny/asr0/README.md
similarity index 100%
rename from examples/tiny/s0/README.md
rename to examples/tiny/asr0/README.md
diff --git a/examples/tiny/s0/conf/augmentation.json b/examples/tiny/asr0/conf/augmentation.json
similarity index 100%
rename from examples/tiny/s0/conf/augmentation.json
rename to examples/tiny/asr0/conf/augmentation.json
diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml
similarity index 100%
rename from examples/tiny/s0/conf/deepspeech2.yaml
rename to examples/tiny/asr0/conf/deepspeech2.yaml
diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml
similarity index 100%
rename from examples/tiny/s0/conf/deepspeech2_online.yaml
rename to examples/tiny/asr0/conf/deepspeech2_online.yaml
diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/asr0/local/data.sh
similarity index 96%
rename from examples/tiny/s0/local/data.sh
rename to examples/tiny/asr0/local/data.sh
index 711ebee406b0b101b07bd38453257128995f25bf..f1fb8cb1d093a3adde7f71cebc1ecee50bff7238 100755
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/asr0/local/data.sh
@@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
- --stride_ms=10.0 \
- --window_ms=20.0 \
+ --stride_ms=10 \
+ --window_ms=20 \
--use_dB_normalization=False \
--num_workers=2 \
--output_path="data/mean_std.json"
@@ -63,7 +63,6 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
diff --git a/examples/tiny/s0/local/download_lm_en.sh b/examples/tiny/asr0/local/download_lm_en.sh
similarity index 100%
rename from examples/tiny/s0/local/download_lm_en.sh
rename to examples/tiny/asr0/local/download_lm_en.sh
diff --git a/examples/tiny/s0/local/export.sh b/examples/tiny/asr0/local/export.sh
similarity index 100%
rename from examples/tiny/s0/local/export.sh
rename to examples/tiny/asr0/local/export.sh
diff --git a/examples/tiny/s0/local/test.sh b/examples/tiny/asr0/local/test.sh
similarity index 100%
rename from examples/tiny/s0/local/test.sh
rename to examples/tiny/asr0/local/test.sh
diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/asr0/local/train.sh
similarity index 100%
rename from examples/tiny/s0/local/train.sh
rename to examples/tiny/asr0/local/train.sh
diff --git a/examples/tiny/s0/path.sh b/examples/tiny/asr0/path.sh
similarity index 100%
rename from examples/tiny/s0/path.sh
rename to examples/tiny/asr0/path.sh
diff --git a/examples/tiny/s0/run.sh b/examples/tiny/asr0/run.sh
similarity index 100%
rename from examples/tiny/s0/run.sh
rename to examples/tiny/asr0/run.sh
diff --git a/examples/tiny/s1/.gitignore b/examples/tiny/asr1/.gitignore
similarity index 100%
rename from examples/tiny/s1/.gitignore
rename to examples/tiny/asr1/.gitignore
diff --git a/examples/tiny/s1/conf/augmentation.json b/examples/tiny/asr1/conf/augmentation.json
similarity index 100%
rename from examples/tiny/s1/conf/augmentation.json
rename to examples/tiny/asr1/conf/augmentation.json
diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml
similarity index 98%
rename from examples/tiny/s1/conf/chunk_confermer.yaml
rename to examples/tiny/asr1/conf/chunk_confermer.yaml
index c518666977faef8c0862be3e7c7f4d5b5244a5fc..6bed27f5c9caba478f127064d2fcce102eccf1f7 100644
--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/asr1/conf/chunk_confermer.yaml
@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml
similarity index 98%
rename from examples/tiny/s1/conf/chunk_transformer.yaml
rename to examples/tiny/asr1/conf/chunk_transformer.yaml
index 29c30b262048b46bf08d132aebbb24bd7186bf71..7aed1b1933ca1edcf34e6c45a49dbc68eed91527 100644
--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/asr1/conf/chunk_transformer.yaml
@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml
similarity index 98%
rename from examples/tiny/s1/conf/conformer.yaml
rename to examples/tiny/asr1/conf/conformer.yaml
index 8487da771930e6f615ac9fe0e718bab310f66970..2c09b3ae6954cff537a7f1c934b4193e56f3243f 100644
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/asr1/conf/conformer.yaml
@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
diff --git a/examples/tiny/asr1/conf/preprocess.yaml b/examples/tiny/asr1/conf/preprocess.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c
--- /dev/null
+++ b/examples/tiny/asr1/conf/preprocess.yaml
@@ -0,0 +1,29 @@
+process:
+ # extract kaldi fbank from PCM
+ - type: fbank_kaldi
+ fs: 16000
+ n_mels: 80
+ n_shift: 160
+ win_length: 400
+ dither: true
+ - type: cmvn_json
+ cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+ - type: time_warp
+ max_time_warp: 5
+ inplace: true
+ mode: PIL
+ - type: freq_mask
+ F: 30
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+ - type: time_mask
+ T: 40
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+
+
+
+
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml
similarity index 96%
rename from examples/tiny/s1/conf/transformer.yaml
rename to examples/tiny/asr1/conf/transformer.yaml
index cc9b5c5158adf2ca74ccf715e6edaf61cb320953..1378e848dceee2565e1d4de1b31d6e887ba65103 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/asr1/conf/transformer.yaml
@@ -11,11 +11,11 @@ data:
max_output_input_ratio: 10.0
collator:
- mean_std_filepath: ""
+ mean_std_filepath: data/mean_std.json
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
@@ -37,7 +37,7 @@ collator:
# network architecture
model:
- cmvn_file: "data/mean_std.json"
+ cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
diff --git a/examples/tiny/s1/local/align.sh b/examples/tiny/asr1/local/align.sh
similarity index 100%
rename from examples/tiny/s1/local/align.sh
rename to examples/tiny/asr1/local/align.sh
diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/asr1/local/data.sh
similarity index 96%
rename from examples/tiny/s1/local/data.sh
rename to examples/tiny/asr1/local/data.sh
index b25f993f6107d222ec7feea63967e5bdea1b291a..87539d5ed33b3ebbb21d398dc78349ead6cd27e3 100755
--- a/examples/tiny/s1/local/data.sh
+++ b/examples/tiny/asr1/local/data.sh
@@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
- --stride_ms=10.0 \
- --window_ms=25.0 \
+ --stride_ms=10 \
+ --window_ms=25 \
--use_dB_normalization=False \
--num_workers=2 \
--output_path="data/mean_std.json"
@@ -69,7 +69,6 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
diff --git a/examples/tiny/s1/local/export.sh b/examples/tiny/asr1/local/export.sh
similarity index 100%
rename from examples/tiny/s1/local/export.sh
rename to examples/tiny/asr1/local/export.sh
diff --git a/examples/tiny/s1/local/test.sh b/examples/tiny/asr1/local/test.sh
similarity index 100%
rename from examples/tiny/s1/local/test.sh
rename to examples/tiny/asr1/local/test.sh
diff --git a/examples/tiny/s1/local/train.sh b/examples/tiny/asr1/local/train.sh
similarity index 100%
rename from examples/tiny/s1/local/train.sh
rename to examples/tiny/asr1/local/train.sh
diff --git a/examples/tiny/s1/path.sh b/examples/tiny/asr1/path.sh
similarity index 100%
rename from examples/tiny/s1/path.sh
rename to examples/tiny/asr1/path.sh
diff --git a/examples/tiny/s1/run.sh b/examples/tiny/asr1/run.sh
similarity index 100%
rename from examples/tiny/s1/run.sh
rename to examples/tiny/asr1/run.sh
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index 334372f9549807e0f7da09af853e15025fa8ead2..894d6b1476243736c0b540c285ae72824147f573 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -90,7 +90,7 @@ optional arguments:
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)and unzip it.
+Download the pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_vctk_ckpt_0.5.zip
```
@@ -196,7 +196,7 @@ optional arguments:
6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)
+Pretrained FastSpeech2 model trained on audio with no silence at the edges. [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
```text
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md
index 5063b869c872edbd015b85e5da2b66cb827ced4e..8692f0104358455a7c10617c1f08a8822d861d34 100644
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
@@ -127,7 +127,7 @@ optional arguments:
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
-Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip).
+Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip).
Parallel WaveGAN checkpoint contains files listed below.
diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cbd01eb8c443494e0fcb07cc7b5834cd8d1648b2
--- /dev/null
+++ b/examples/wenetspeech/README.md
@@ -0,0 +1,58 @@
+* asr0 - deepspeech2 Streaming/Non-Streaming
+* asr1 - transformer/conformer Streaming/Non-Streaming
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
+
+# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech)
+
+A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition
+
+## Description
+
+### Creation
+
+All the data are collected from YouTube and Podcast. Optical character recognition (OCR) and automatic speech recognition (ASR) techniques are adopted to label each YouTube and Podcast recording, respectively. To improve the quality of the corpus, we use a novel end-to-end label error detection method to further validate and filter the data.
+
+### Categories
+
+In summary, WenetSpeech groups all data into 3 categories, as the following table shows:
+
+| Set | Hours | Confidence | Usage |
+|------------|-------|-------------|---------------------------------------|
+| High Label | 10005 | >=0.95 | Supervised Training |
+| Weak Label | 2478 | [0.6, 0.95] | Semi-supervised or noise training |
+| Unlabel | 9952 | / | Unsupervised training or Pre-training |
+| In Total | 22435 | / | All above |
+
+### High Label Data
+
+We classify the high-label data into 10 groups according to domain, speaking style, and scenario.
+
+| Domain | Youtube | Podcast | Total |
+|-------------|---------|---------|--------|
+| audiobook | 0 | 250.9 | 250.9 |
+| commentary | 112.6 | 135.7 | 248.3 |
+| documentary | 386.7 | 90.5 | 477.2 |
+| drama | 4338.2 | 0 | 4338.2 |
+| interview | 324.2 | 614 | 938.2 |
+| news | 0 | 868 | 868 |
+| reading | 0 | 1110.2 | 1110.2 |
+| talk | 204 | 90.7 | 294.7 |
+| variety | 603.3 | 224.5 | 827.8 |
+| others | 144 | 507.5 | 651.5 |
+| Total | 6113 | 3892 | 10005 |
+
+As shown in the following table, we provide 3 training subsets, namely `S`, `M` and `L` for building ASR systems on different data scales.
+
+| Training Subsets | Confidence | Hours |
+|------------------|-------------|-------|
+| L | [0.95, 1.0] | 10005 |
+| M | 1.0 | 1000 |
+| S | 1.0 | 100 |
+
+### Evaluation Sets
+
+| Evaluation Sets | Hours | Source | Description |
+|-----------------|-------|--------------|-----------------------------------------------------------------------------------------|
+| DEV             | 20    | Internet     | Specially designed for speech tools that require a cross-validation set during training |
+| TEST\_NET       | 23    | Internet     | Matched test set |
+| TEST\_MEETING   | 15    | Real meeting | Mismatched test set: a far-field, conversational, spontaneous meeting dataset |
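The `S`, `M`, and `L` training subsets above are materialized during data preparation by filtering the `utt2subsets` file that `local/extract_meta.py` writes (one segment id per line, followed by the subset labels it belongs to); `local/wenetspeech_data_prep.sh` later in this patch does the same filtering with an awk one-liner. Below is a minimal Python sketch of that step, assuming the whitespace-separated layout; the script name `filter_subset.py` is only illustrative.

```python
# filter_subset.py: keep the utterances that belong to a given WenetSpeech subset.
# Sketch of what local/wenetspeech_data_prep.sh does with awk; the utt2subsets
# layout (utt id, then subset labels, whitespace separated) comes from
# local/extract_meta.py in this patch.
import sys


def filter_utt2subsets(utt2subsets_path, label):
    """Yield utterance ids whose subset list contains `label` (e.g. L, M, S, DEV)."""
    with open(utt2subsets_path, encoding="utf-8") as fin:
        for line in fin:
            fields = line.split()
            if len(fields) < 2:
                continue
            utt_id, subsets = fields[0], fields[1:]
            if label in subsets:
                yield utt_id


if __name__ == "__main__":
    # usage: python3 filter_subset.py data/corpus/utt2subsets M > train_m_utt_list
    for utt in filter_utt2subsets(sys.argv[1], sys.argv[2]):
        print(utt)
```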
diff --git a/examples/wenetspeech/asr1/.gitignore b/examples/wenetspeech/asr1/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..02a229225b1bd83122ea8a3945166876d7183447
--- /dev/null
+++ b/examples/wenetspeech/asr1/.gitignore
@@ -0,0 +1,3 @@
+data
+exp
+*.profile
diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c08b94e29b92a80a8487e171330e5928b492ece4
--- /dev/null
+++ b/examples/wenetspeech/asr1/README.md
@@ -0,0 +1,14 @@
+## Pack Model
+
+Pack the model into a tar.gz archive, e.g.
+
+```bash
+./utils/pack_model.sh --preprocess_conf conf/preprocess.yaml --dict data/vocab.txt \
+    conf/conformer.yaml '' data/mean_std.json exp/conformer/checkpoints/wenetspeech.pdparams
+
+```
+
+List the contents of `model.tar.gz`:
+```bash
+tar tf model.tar.gz
+```
diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md
new file mode 100644
index 0000000000000000000000000000000000000000..5c2b8143ca31b2c6f12a8db53db4abc38a0af748
--- /dev/null
+++ b/examples/wenetspeech/asr1/RESULTS.md
@@ -0,0 +1,24 @@
+# WenetSpeech
+
+
+## Conformer
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | |
+
+
+
+## Conformer Pretrained Model
+
+Pretrained model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
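The decoding config in this recipe sets `error_rate_type: cer`, so the figures in the last column are most likely character error rates even though the column is headed WER. A minimal sketch of that metric, assuming plain character-level Levenshtein distance divided by the reference length; the recipe itself calls the toolkit's `error_rate.cer` helper (see `compute_metrics` later in this patch).

```python
# cer_sketch.py: character error rate = edit distance / reference length.
# Illustrative only; the toolkit's own implementation is error_rate.cer.


def edit_distance(ref, hyp):
    """Classic dynamic-programming Levenshtein distance over characters."""
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        cur = [i] + [0] * len(hyp)
        for j, h in enumerate(hyp, start=1):
            cur[j] = min(prev[j] + 1,             # deletion
                         cur[j - 1] + 1,          # insertion
                         prev[j - 1] + (r != h))  # substitution
        prev = cur
    return prev[-1]


def cer(ref, hyp):
    # spaces are usually stripped before scoring Chinese transcripts
    ref, hyp = ref.replace(" ", ""), hyp.replace(" ", "")
    return edit_distance(ref, hyp) / max(len(ref), 1)


if __name__ == "__main__":
    print(cer("今天天气很好", "今天天汽很好"))  # one substitution over 6 chars ~ 0.167
```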
diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0340dc85dcd6d4ca7ca0eedd08b70d16c3846e01
--- /dev/null
+++ b/examples/wenetspeech/asr1/conf/conformer.yaml
@@ -0,0 +1,113 @@
+# network architecture
+model:
+ # encoder related
+ encoder: conformer
+ encoder_conf:
+ output_size: 512 # dimension of attention
+ attention_heads: 8
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: True
+ use_cnn_module: True
+ cnn_module_kernel: 15
+ cnn_module_norm: layer_norm
+ activation_type: swish
+ pos_enc_layer_type: rel_pos
+ selfattention_layer_type: rel_selfattn
+
+ # decoder related
+ decoder: transformer
+ decoder_conf:
+ attention_heads: 8
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+
+ # hybrid CTC/attention
+ model_conf:
+ ctc_weight: 0.3
+ ctc_dropoutrate: 0.0
+ ctc_grad_norm_type: null
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false
+
+# https://yaml.org/type/float.html
+data:
+ train_manifest: data/manifest.train
+ dev_manifest: data/manifest.dev
+ test_manifest: data/manifest.test
+ min_input_len: 0.1 # second
+ max_input_len: 12.0 # second
+ min_output_len: 1.0
+ max_output_len: 400.0
+ min_output_input_ratio: 0.05
+ max_output_input_ratio: 10.0
+
+collator:
+ vocab_filepath: data/vocab.txt
+ unit_type: 'char'
+ spm_model_prefix: ''
+ augmentation_config: conf/preprocess.yaml
+ batch_size: 64
+ raw_wav: True # use raw_wav or kaldi feature
+ spectrum_type: fbank #linear, mfcc, fbank
+ feat_dim: 80
+ delta_delta: False
+ dither: 1.0
+ target_sample_rate: 16000
+ max_freq: None
+ n_fft: None
+ stride_ms: 10.0
+ window_ms: 25.0
+ use_dB_normalization: True
+ target_dB: -20
+ random_seed: 0
+ keep_transcription_text: False
+ sortagrad: True
+ shuffle_method: batch_shuffle
+ num_workers: 2
+
+
+training:
+ n_epoch: 240
+ accum_grad: 16
+ global_grad_clip: 5.0
+ log_interval: 100
+ checkpoint:
+ kbest_n: 50
+ latest_n: 5
+ optim: adam
+ optim_conf:
+ lr: 0.001
+ weight_decay: 1e-6
+ scheduler: warmuplr # pytorch v1.1.0+ required
+ scheduler_conf:
+ warmup_steps: 5000
+ lr_decay: 1.0
+
+
+decoding:
+ batch_size: 128
+ error_rate_type: cer
+ decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+ lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+ alpha: 2.5
+ beta: 0.3
+ beam_size: 10
+ cutoff_prob: 1.0
+ cutoff_top_n: 0
+ num_proc_bsearch: 8
+ ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+ decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+ # <0: for decoding, use full chunk.
+ # >0: for decoding, use fixed chunk size as set.
+ # 0: used for training, it's prohibited here.
+ num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+ simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
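The training and decoding entry points read this file into a nested configuration object (the experiment code later in this patch wraps it in a yacs `CfgNode`) and address the groups as `config.model.*`, `config.data.*`, `config.collator.*`, `config.training.*`, and `config.decoding.*`. A minimal sketch that inspects the same fields with plain PyYAML, assuming the recipe directory is the working directory:

```python
# inspect_conf.py: peek at conf/conformer.yaml the way the recipe's code does,
# but with plain PyYAML instead of a yacs CfgNode.
import yaml

with open("conf/conformer.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

# encoder/decoder hyper-parameters and the hybrid CTC/attention weight
print("encoder:", conf["model"]["encoder"],
      conf["model"]["encoder_conf"]["num_blocks"], "blocks")
print("ctc_weight:", conf["model"]["model_conf"]["ctc_weight"])

# the groups accessed as config.collator.*, config.training.*, config.decoding.*
print("batch_size:", conf["collator"]["batch_size"])
print("epochs:", conf["training"]["n_epoch"], "accum_grad:", conf["training"]["accum_grad"])
print("decoding:", conf["decoding"]["decoding_method"],
      "chunk:", conf["decoding"]["decoding_chunk_size"])
```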
diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c
--- /dev/null
+++ b/examples/wenetspeech/asr1/conf/preprocess.yaml
@@ -0,0 +1,29 @@
+process:
+ # extract kaldi fbank from PCM
+ - type: fbank_kaldi
+ fs: 16000
+ n_mels: 80
+ n_shift: 160
+ win_length: 400
+ dither: true
+ - type: cmvn_json
+ cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+ - type: time_warp
+ max_time_warp: 5
+ inplace: true
+ mode: PIL
+ - type: freq_mask
+ F: 30
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+ - type: time_mask
+ T: 40
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+
+
+
+
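The last three stages above are the SpecAugment policy applied to the (time, mel) fbank matrix: a small time warp plus two frequency masks of width at most 30 bins and two time masks of width at most 40 frames. The sketch below reproduces only the two masking stages with NumPy under those parameters; the time warp and the actual transforms live in `paddlespeech.s2t.transform` and are not reproduced here.

```python
# spec_augment_sketch.py: freq_mask / time_mask roughly as configured above.
# Illustrative only; the recipe applies paddlespeech.s2t.transform.Transformation.
import numpy as np


def mask_along_axis(spec, max_width, n_mask, axis, rng):
    """Zero out `n_mask` random bands of width <= max_width along `axis` of a (T, F) fbank."""
    out = spec.copy()
    size = out.shape[axis]
    for _ in range(n_mask):
        width = int(rng.integers(0, max_width + 1))
        start = int(rng.integers(0, max(size - width, 1)))
        if axis == 0:   # time mask
            out[start:start + width, :] = 0.0
        else:           # frequency mask
            out[:, start:start + width] = 0.0
    return out


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    fbank = rng.standard_normal((300, 80)).astype("float32")   # (frames, n_mels)
    fbank = mask_along_axis(fbank, max_width=30, n_mask=2, axis=1, rng=rng)  # freq_mask F: 30
    fbank = mask_along_axis(fbank, max_width=40, n_mask=2, axis=0, rng=rng)  # time_mask T: 40
    print(fbank.shape, "masked fraction:", float((fbank == 0).mean()))
```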
diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..67b3d5a55ff6fa8c842d540c3814fb35aafa48e1
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/data.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+# NPU, ASLP Group (Author: Qijie Shao)
+
+stage=-1
+stop_stage=100
+
+# Use your own data path. You need to download the WenetSpeech dataset by yourself.
+wenetspeech_data_dir=./wenetspeech
+# Make sure you have 1.2T for ${shards_dir}
+shards_dir=./wenetspeech_shards
+
+#wenetspeech training set
+set=L
+train_set=train_`echo $set | tr 'A-Z' 'a-z'`
+dev_set=dev
+test_sets="test_net test_meeting"
+
+cmvn=true
+cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn
+
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+set -u
+set -o pipefail
+
+
+mkdir -p data
+TARGET_DIR=${MAIN_ROOT}/examples/dataset
+mkdir -p ${TARGET_DIR}
+
+if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
+ # download data
+ echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
+ exit 0;
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ echo "Data preparation"
+ local/wenetspeech_data_prep.sh \
+ --train-subset $set \
+ $wenetspeech_data_dir \
+ data || exit 1;
+fi
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+ # generate manifests
+ python3 ${TARGET_DIR}/aishell/aishell.py \
+ --manifest_prefix="data/manifest" \
+ --target_dir="${TARGET_DIR}/aishell"
+
+ if [ $? -ne 0 ]; then
+ echo "Prepare Aishell failed. Terminated."
+ exit 1
+ fi
+
+ for dataset in train dev test; do
+ mv data/manifest.${dataset} data/manifest.${dataset}.raw
+ done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # compute mean and stddev for normalizer
+ if $cmvn; then
+ full_size=`cat data/${train_set}/wav.scp | wc -l`
+ sampling_size=$((full_size / cmvn_sampling_divisor))
+ shuf -n $sampling_size data/$train_set/wav.scp \
+ > data/$train_set/wav.scp.sampled
+ num_workers=$(nproc)
+
+ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+ --manifest_path="data/manifest.train.raw" \
+ --spectrum_type="fbank" \
+ --feat_dim=80 \
+ --delta_delta=false \
+ --stride_ms=10 \
+ --window_ms=25 \
+ --sample_rate=16000 \
+ --use_dB_normalization=False \
+ --num_samples=-1 \
+ --num_workers=${num_workers} \
+ --output_path="data/mean_std.json"
+
+ if [ $? -ne 0 ]; then
+ echo "Compute mean and stddev failed. Terminated."
+ exit 1
+ fi
+ fi
+fi
+
+dict=data/dict/lang_char.txt
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary (char-level) from the training manifest
+    # output file: data/vocab.txt
+ python3 ${MAIN_ROOT}/utils/build_vocab.py \
+ --unit_type="char" \
+ --count_threshold=0 \
+ --vocab_path="data/vocab.txt" \
+ --manifest_paths "data/manifest.train.raw"
+
+ if [ $? -ne 0 ]; then
+ echo "Build vocabulary failed. Terminated."
+ exit 1
+ fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # format manifest with tokenids, vocab size
+ for dataset in train dev test; do
+ {
+ python3 ${MAIN_ROOT}/utils/format_data.py \
+ --cmvn_path "data/mean_std.json" \
+ --unit_type "char" \
+ --vocab_path="data/vocab.txt" \
+ --manifest_path="data/manifest.${dataset}.raw" \
+ --output_path="data/manifest.${dataset}"
+
+ if [ $? -ne 0 ]; then
+            echo "Format manifest failed. Terminated."
+ exit 1
+ fi
+ } &
+ done
+ wait
+fi
+
+echo "Aishell data preparation done."
+exit 0
diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1b2727838052740e5e89593dcdab04ffe387c9
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
+# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+import sys
+
+
+def get_args():
+ parser = argparse.ArgumentParser(description="""
+ This script is used to process raw json dataset of WenetSpeech,
+        where the long wav is split into segments and
+ data of wenet format is generated.
+ """)
+ parser.add_argument('input_json', help="""Input json file of WenetSpeech""")
+ parser.add_argument('output_dir', help="""Output dir for prepared data""")
+
+ args = parser.parse_args()
+ return args
+
+
+def meta_analysis(input_json, output_dir):
+ input_dir = os.path.dirname(input_json)
+
+ if not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+
+ try:
+ with open(input_json, 'r') as injson:
+ json_data = json.load(injson)
+ except Exception:
+ sys.exit(f'Failed to load input json file: {input_json}')
+ else:
+ if json_data['audios'] is not None:
+ with open(f'{output_dir}/text', 'w') as utt2text, \
+ open(f'{output_dir}/segments', 'w') as segments, \
+ open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
+ open(f'{output_dir}/wav.scp', 'w') as wavscp, \
+ open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
+ open(f'{output_dir}/reco2dur', 'w') as reco2dur:
+ for long_audio in json_data['audios']:
+ try:
+ long_audio_path = os.path.realpath(
+ os.path.join(input_dir, long_audio['path']))
+ aid = long_audio['aid']
+ segments_lists = long_audio['segments']
+ duration = long_audio['duration']
+ assert (os.path.exists(long_audio_path))
+ except AssertionError:
+ print(f'''Warning: {aid} something is wrong,
+ maybe AssertionError, skipped''')
+ continue
+ except Exception:
+ print(f'''Warning: {aid} something is wrong, maybe the
+ error path: {long_audio_path}, skipped''')
+ continue
+ else:
+ wavscp.write(f'{aid}\t{long_audio_path}\n')
+ reco2dur.write(f'{aid}\t{duration}\n')
+ for segment_file in segments_lists:
+ try:
+ sid = segment_file['sid']
+ start_time = segment_file['begin_time']
+ end_time = segment_file['end_time']
+ dur = end_time - start_time
+ text = segment_file['text']
+ segment_subsets = segment_file["subsets"]
+ except Exception:
+ print(f'''Warning: {segment_file} something
+ is wrong, skipped''')
+ continue
+ else:
+ utt2text.write(f'{sid}\t{text}\n')
+ segments.write(
+ f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
+ utt2dur.write(f'{sid}\t{dur}\n')
+ segment_sub_names = " ".join(segment_subsets)
+ utt2subsets.write(
+ f'{sid}\t{segment_sub_names}\n')
+
+
+def main():
+ args = get_args()
+
+ meta_analysis(args.input_json, args.output_dir)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/wenetspeech/asr1/local/process_opus.py b/examples/wenetspeech/asr1/local/process_opus.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1b9287edbdcda1270fcd5192a3fc1328d492bbe
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/process_opus.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# process_opus.py: segmentation and downsampling of opus audio
+# usage: python3 process_opus.py wav.scp segments output_wav.scp
+import os
+import sys
+
+from pydub import AudioSegment
+
+
+def read_file(wav_scp, segments):
+ wav_scp_dict = {}
+ with open(wav_scp, 'r', encoding='UTF-8') as fin:
+ for line_str in fin:
+ wav_id, path = line_str.strip().split()
+ wav_scp_dict[wav_id] = path
+
+ utt_list = []
+ seg_path_list = []
+ start_time_list = []
+ end_time_list = []
+ with open(segments, 'r', encoding='UTF-8') as fin:
+ for line_str in fin:
+ arr = line_str.strip().split()
+ assert len(arr) == 4
+ utt_list.append(arr[0])
+ seg_path_list.append(wav_scp_dict[arr[1]])
+ start_time_list.append(float(arr[2]))
+ end_time_list.append(float(arr[3]))
+ return utt_list, seg_path_list, start_time_list, end_time_list
+
+
+# TODO(Qijie): Fix the process logic
+def output(output_wav_scp, utt_list, seg_path_list, start_time_list,
+ end_time_list):
+ num_utts = len(utt_list)
+ step = int(num_utts * 0.01)
+ with open(output_wav_scp, 'w', encoding='UTF-8') as fout:
+ previous_wav_path = ""
+ for i in range(num_utts):
+ utt_id = utt_list[i]
+ current_wav_path = seg_path_list[i]
+ output_dir = (os.path.dirname(current_wav_path)) \
+ .replace("audio", 'audio_seg')
+ seg_wav_path = os.path.join(output_dir, utt_id + '.wav')
+
+ # if not os.path.exists(output_dir):
+ # os.makedirs(output_dir)
+
+ if current_wav_path != previous_wav_path:
+ source_wav = AudioSegment.from_file(current_wav_path)
+ previous_wav_path = current_wav_path
+
+ start = int(start_time_list[i] * 1000)
+ end = int(end_time_list[i] * 1000)
+ target_audio = source_wav[start:end].set_frame_rate(16000)
+ target_audio.export(seg_wav_path, format="wav")
+
+ fout.write("{} {}\n".format(utt_id, seg_wav_path))
+ if i % step == 0:
+ print("seg wav finished: {}%".format(int(i / step)))
+
+
+def main():
+ wav_scp = sys.argv[1]
+ segments = sys.argv[2]
+ output_wav_scp = sys.argv[3]
+
+ utt_list, seg_path_list, start_time_list, end_time_list \
+ = read_file(wav_scp, segments)
+ output(output_wav_scp, utt_list, seg_path_list, start_time_list,
+ end_time_list)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..47bd2f6338a7d062b094c327f39f5362fae39865
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/test.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+if [ $# != 2 ];then
+ echo "usage: ${0} config_path ckpt_path_prefix"
+ exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_prefix=$2
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+ chunk_mode=true
+fi
+
+# download language model
+#bash local/download_lm_ch.sh
+#if [ $? -ne 0 ]; then
+# exit 1
+#fi
+
+
+for type in attention ctc_greedy_search; do
+ echo "decoding ${type}"
+ if [ ${chunk_mode} == true ];then
+        # streaming decoding only supports batch_size=1
+ batch_size=1
+ else
+ batch_size=64
+ fi
+ output_dir=${ckpt_prefix}
+ mkdir -p ${output_dir}
+ python3 -u ${BIN_DIR}/test.py \
+ --nproc ${ngpu} \
+ --config ${config_path} \
+ --result_file ${output_dir}/${type}.rsl \
+ --checkpoint_path ${ckpt_prefix} \
+ --opts decoding.decoding_method ${type} \
+ --opts decoding.batch_size ${batch_size}
+
+ if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+ fi
+done
+
+for type in ctc_prefix_beam_search attention_rescoring; do
+ echo "decoding ${type}"
+ batch_size=1
+ output_dir=${ckpt_prefix}
+ mkdir -p ${output_dir}
+ python3 -u ${BIN_DIR}/test.py \
+ --nproc ${ngpu} \
+ --config ${config_path} \
+ --result_file ${output_dir}/${type}.rsl \
+ --checkpoint_path ${ckpt_prefix} \
+ --opts decoding.decoding_method ${type} \
+ --opts decoding.batch_size ${batch_size}
+
+ if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+ fi
+done
+
+exit 0
diff --git a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh
new file mode 100755
index 0000000000000000000000000000000000000000..858530534efdaf28818ea6a6f1cc742667d6e71b
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+
+# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
+# Seasalt AI, Inc (Author: Guoguo Chen)
+# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+# NPU, ASLP Group (Author: Qijie Shao)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set -o pipefail
+
+stage=1
+prefix=
+train_subset=L
+
+. ./tools/parse_options.sh || exit 1;
+
+filter_by_id () {
+ idlist=$1
+ input=$2
+ output=$3
+ field=1
+ if [ $# -eq 4 ]; then
+ field=$4
+ fi
+ cat $input | perl -se '
+ open(F, "<$idlist") || die "Could not open id-list file $idlist";
+    while(<F>) {
+ @A = split;
+ @A>=1 || die "Invalid id-list file line $_";
+ $seen{$A[0]} = 1;
+ }
+ while(<>) {
+ @A = split;
+ @A > 0 || die "Invalid file line $_";
+ @A >= $field || die "Invalid file line $_";
+ if ($seen{$A[$field-1]}) {
+ print $_;
+ }
+ }' -- -idlist="$idlist" -field="$field" > $output ||\
+ (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
+}
+
+subset_data_dir () {
+ utt_list=$1
+ src_dir=$2
+ dest_dir=$3
+ mkdir -p $dest_dir || exit 1;
+ # wav.scp text segments utt2dur
+ filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
+ (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
+ filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
+ (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
+ filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
+ (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
+ awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
+ filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
+ (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
+ rm -f $dest_dir/reco
+}
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"
+ echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/"
+ echo ""
+ echo "This script takes the WenetSpeech source directory, and prepares the"
+ echo "WeNet format data directory."
+  echo "  --prefix <prefix>             # Prefix for output data directory."
+  echo "  --stage <stage>               # Processing stage."
+  echo "  --train-subset <L|M|S|W>      # Train subset to be created."
+ exit 1
+fi
+
+wenetspeech_dir=$1
+data_dir=$2
+
+declare -A subsets
+subsets=(
+ [L]="train_l"
+ [M]="train_m"
+ [S]="train_s"
+ [W]="train_w"
+ [DEV]="dev"
+ [TEST_NET]="test_net"
+ [TEST_MEETING]="test_meeting")
+
+prefix=${prefix:+${prefix}_}
+
+corpus_dir=$data_dir/${prefix}corpus/
+if [ $stage -le 1 ]; then
+ echo "$0: Extract meta into $corpus_dir"
+ # Sanity check.
+ [ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\
+ echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1;
+ [ ! -d $wenetspeech_dir/audio ] &&\
+ echo "$0: Please download $wenetspeech_dir/audio!" && exit 1;
+
+ [ ! -d $corpus_dir ] && mkdir -p $corpus_dir
+
+ # Files to be created:
+ # wav.scp text segments utt2dur
+ python3 local/extract_meta.py \
+ $wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+ echo "$0: Split data to train, dev, test_net, and test_meeting"
+ [ ! -f $corpus_dir/utt2subsets ] &&\
+ echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
+ for label in $train_subset DEV TEST_NET TEST_MEETING; do
+ if [ ! ${subsets[$label]+set} ]; then
+ echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1;
+ fi
+ subset=${subsets[$label]}
+ [ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
+ cat $corpus_dir/utt2subsets | \
+ awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \
+        > $corpus_dir/${prefix}${subset}_utt_list || exit 1;
+ subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
+ $corpus_dir $data_dir/${prefix}$subset || exit 1;
+ done
+fi
+
+echo "$0: Done"
\ No newline at end of file
diff --git a/examples/wenetspeech/asr1/path.sh b/examples/wenetspeech/asr1/path.sh
new file mode 100644
index 0000000000000000000000000000000000000000..666b29bce2611ae6a23b71b4d8f460cbc58c6c1e
--- /dev/null
+++ b/examples/wenetspeech/asr1/path.sh
@@ -0,0 +1,15 @@
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+# model exp
+MODEL=u2
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8c4a12cb46056113781e40fc475388426e0c92bf
--- /dev/null
+++ b/examples/wenetspeech/asr1/run.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+. path.sh || exit 1;
+set -e
+
+gpus=0,1,2,3,4,5,6,7
+stage=0
+stop_stage=100
+conf_path=conf/conformer.yaml
+
+average_checkpoint=true
+avg_num=10
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+avg_ckpt=avg_${avg_num}
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+echo "checkpoint name ${ckpt}"
+
+audio_file="data/tmp.wav"
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ bash ./local/data.sh || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `exp` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # avg n best model
+ avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # test ckpt avg_n
+ CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # ctc alignment of test data
+ CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+ # export ckpt avg_n
+ CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi
+
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+ # test a single .wav file
+ CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+fi
diff --git a/examples/wenetspeech/asr1/utils b/examples/wenetspeech/asr1/utils
new file mode 120000
index 0000000000000000000000000000000000000000..973afe674f2c85f7a400600f963e0709767602dc
--- /dev/null
+++ b/examples/wenetspeech/asr1/utils
@@ -0,0 +1 @@
+../../../utils
\ No newline at end of file
diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py
index 177d710b066d4b05bb5e60ee0b040aa51f823613..e827414d3c67a5790a381f4877bf6a7618ff7d46 100644
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@@ -409,7 +409,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
@paddle.no_grad()
def test(self):
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
from paddlespeech.s2t.utils.log import Autolog
self.autolog = Autolog(
batch_size=self.config.decoding.batch_size,
@@ -438,7 +438,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
msg += "Final error rate [%s] (%d/%d) = %f" % (
error_rate_type, num_ins, num_ins, errors_sum / len_refs)
logger.info(msg)
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
self.autolog.report()
def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
@@ -512,7 +512,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
x_len_list = np.split(x_len_batch, batch_size, axis=0)
for x, x_len in zip(x_list, x_len_list):
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
self.autolog.times.start()
x_len = x_len[0]
assert (chunk_size <= x_len)
@@ -547,7 +547,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
probs_chunk_list = []
probs_chunk_lens_list = []
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
# record the model preprocessing time
self.autolog.times.stamp()
@@ -606,7 +606,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
[output_probs, output_probs_padding], axis=1)
output_probs_list.append(output_probs)
output_lens_list.append(output_lens)
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
# record the model inference time
self.autolog.times.stamp()
# record the post processing time
@@ -641,12 +641,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
audio_len_handle.reshape(x_len.shape)
audio_len_handle.copy_from_cpu(x_len)
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
self.autolog.times.start()
# record the prefix processing time
self.autolog.times.stamp()
self.predictor.run()
- if self.args.enable_auto_log == True:
+ if self.args.enable_auto_log is True:
# record the model inference time
self.autolog.times.stamp()
# record the post processing time
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 22d4238ac832fa79fec3b3286ef3e64c16937441..27bc47d2baa537496bef0e3f1d1b18a23cf1d1f2 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -24,13 +24,10 @@ import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist
-from paddle.io import DataLoader
from yacs.config import CfgNode
-from paddlespeech.s2t.io.collator import SpeechCollator
-from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.io.sampler import SortagradBatchSampler
-from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
+from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.models.u2 import U2Model
from paddlespeech.s2t.training.optimizer import OptimizerFactory
from paddlespeech.s2t.training.reporter import ObsScope
@@ -213,7 +210,7 @@ class U2Trainer(Trainer):
msg += f"{v:>.8f}" if isinstance(v,
float) else f"{v}"
msg += f" {k.split(',')[1]}" if len(
- k.split(',')) == 2 else f""
+ k.split(',')) == 2 else ""
msg += ","
msg = msg[:-1] # remove the last ","
if (batch_index + 1
@@ -249,92 +246,103 @@ class U2Trainer(Trainer):
def setup_dataloader(self):
config = self.config.clone()
- config.defrost()
- config.collator.keep_transcription_text = False
- # train/valid dataset, return token ids
- config.data.manifest = config.data.train_manifest
- train_dataset = ManifestDataset.from_config(config)
-
- config.data.manifest = config.data.dev_manifest
- dev_dataset = ManifestDataset.from_config(config)
-
- collate_fn_train = SpeechCollator.from_config(config)
-
- config.collator.augmentation_config = ""
- collate_fn_dev = SpeechCollator.from_config(config)
-
- if self.parallel:
- batch_sampler = SortagradDistributedBatchSampler(
- train_dataset,
+ if self.train:
+ # train/valid dataset, return token ids
+ self.train_loader = BatchDataLoader(
+ json_file=config.data.train_manifest,
+ train_mode=True,
+ sortagrad=False,
batch_size=config.collator.batch_size,
- num_replicas=None,
- rank=None,
- shuffle=True,
- drop_last=True,
- sortagrad=config.collator.sortagrad,
- shuffle_method=config.collator.shuffle_method)
- else:
- batch_sampler = SortagradBatchSampler(
- train_dataset,
- shuffle=True,
+ maxlen_in=float('inf'),
+ maxlen_out=float('inf'),
+ minibatches=0,
+ mini_batch_size=self.args.nprocs,
+ batch_count='auto',
+ batch_bins=0,
+ batch_frames_in=0,
+ batch_frames_out=0,
+ batch_frames_inout=0,
+ preprocess_conf=config.collator.
+ augmentation_config, # aug will be off when train_mode=False
+ n_iter_processes=config.collator.num_workers,
+ subsampling_factor=1,
+ num_encs=1)
+
+ self.valid_loader = BatchDataLoader(
+ json_file=config.data.dev_manifest,
+ train_mode=False,
+ sortagrad=False,
batch_size=config.collator.batch_size,
- drop_last=True,
- sortagrad=config.collator.sortagrad,
- shuffle_method=config.collator.shuffle_method)
- self.train_loader = DataLoader(
- train_dataset,
- batch_sampler=batch_sampler,
- collate_fn=collate_fn_train,
- num_workers=config.collator.num_workers, )
- self.valid_loader = DataLoader(
- dev_dataset,
- batch_size=config.collator.batch_size,
- shuffle=False,
- drop_last=False,
- collate_fn=collate_fn_dev,
- num_workers=config.collator.num_workers, )
-
- # test dataset, return raw text
- config.data.manifest = config.data.test_manifest
- # filter test examples, will cause less examples, but no mismatch with training
- # and can use large batch size , save training time, so filter test egs now.
- config.data.min_input_len = 0.0 # second
- config.data.max_input_len = float('inf') # second
- config.data.min_output_len = 0.0 # tokens
- config.data.max_output_len = float('inf') # tokens
- config.data.min_output_input_ratio = 0.00
- config.data.max_output_input_ratio = float('inf')
-
- test_dataset = ManifestDataset.from_config(config)
- # return text ord id
- config.collator.keep_transcription_text = True
- config.collator.augmentation_config = ""
- self.test_loader = DataLoader(
- test_dataset,
- batch_size=config.decoding.batch_size,
- shuffle=False,
- drop_last=False,
- collate_fn=SpeechCollator.from_config(config),
- num_workers=config.collator.num_workers, )
- # return text token id
- config.collator.keep_transcription_text = False
- self.align_loader = DataLoader(
- test_dataset,
- batch_size=config.decoding.batch_size,
- shuffle=False,
- drop_last=False,
- collate_fn=SpeechCollator.from_config(config),
- num_workers=config.collator.num_workers, )
- logger.info("Setup train/valid/test/align Dataloader!")
+ maxlen_in=float('inf'),
+ maxlen_out=float('inf'),
+ minibatches=0,
+ mini_batch_size=self.args.nprocs,
+ batch_count='auto',
+ batch_bins=0,
+ batch_frames_in=0,
+ batch_frames_out=0,
+ batch_frames_inout=0,
+ preprocess_conf=config.collator.
+ augmentation_config, # aug will be off when train_mode=False
+ n_iter_processes=config.collator.num_workers,
+ subsampling_factor=1,
+ num_encs=1)
+ logger.info("Setup train/valid Dataloader!")
+ else:
+ # test dataset, return raw text
+ self.test_loader = BatchDataLoader(
+ json_file=config.data.test_manifest,
+ train_mode=False,
+ sortagrad=False,
+ batch_size=config.decoding.batch_size,
+ maxlen_in=float('inf'),
+ maxlen_out=float('inf'),
+ minibatches=0,
+ mini_batch_size=1,
+ batch_count='auto',
+ batch_bins=0,
+ batch_frames_in=0,
+ batch_frames_out=0,
+ batch_frames_inout=0,
+ preprocess_conf=config.collator.
+ augmentation_config, # aug will be off when train_mode=False
+ n_iter_processes=1,
+ subsampling_factor=1,
+ num_encs=1)
+
+ self.align_loader = BatchDataLoader(
+ json_file=config.data.test_manifest,
+ train_mode=False,
+ sortagrad=False,
+ batch_size=config.decoding.batch_size,
+ maxlen_in=float('inf'),
+ maxlen_out=float('inf'),
+ minibatches=0,
+ mini_batch_size=1,
+ batch_count='auto',
+ batch_bins=0,
+ batch_frames_in=0,
+ batch_frames_out=0,
+ batch_frames_inout=0,
+ preprocess_conf=config.collator.
+ augmentation_config, # aug will be off when train_mode=False
+ n_iter_processes=1,
+ subsampling_factor=1,
+ num_encs=1)
+ logger.info("Setup test/align Dataloader!")
def setup_model(self):
config = self.config
model_conf = config.model
with UpdateConfig(model_conf):
- model_conf.input_dim = self.train_loader.collate_fn.feature_size
- model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+ if self.train:
+ model_conf.input_dim = self.train_loader.feat_dim
+ model_conf.output_dim = self.train_loader.vocab_size
+ else:
+ model_conf.input_dim = self.test_loader.feat_dim
+ model_conf.output_dim = self.test_loader.vocab_size
model = U2Model.from_config(model_conf)
@@ -343,6 +351,11 @@ class U2Trainer(Trainer):
logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
+ self.model = model
+ logger.info("Setup model!")
+
+ if not self.train:
+ return
train_config = config.training
optim_type = train_config.optim
@@ -383,10 +396,9 @@ class U2Trainer(Trainer):
optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
- self.model = model
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
- logger.info("Setup model/optimizer/lr_scheduler!")
+ logger.info("Setup optimizer/lr_scheduler!")
class U2Tester(U2Trainer):
@@ -421,14 +433,19 @@ class U2Tester(U2Trainer):
def __init__(self, config, args):
super().__init__(config, args)
+ self.text_feature = TextFeaturizer(
+ unit_type=self.config.collator.unit_type,
+ vocab_filepath=self.config.collator.vocab_filepath,
+ spm_model_prefix=self.config.collator.spm_model_prefix)
+ self.vocab_list = self.text_feature.vocab_list
- def ordid2token(self, texts, texts_len):
+ def id2token(self, texts, texts_len, text_feature):
""" ord() id to chr() chr """
trans = []
for text, n in zip(texts, texts_len):
n = n.numpy().item()
ids = text[:n]
- trans.append(''.join([chr(i) for i in ids]))
+ trans.append(text_feature.defeaturize(ids.numpy().tolist()))
return trans
def compute_metrics(self,
@@ -444,12 +461,11 @@ class U2Tester(U2Trainer):
error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
start_time = time.time()
- text_feature = self.test_loader.collate_fn.text_feature
- target_transcripts = self.ordid2token(texts, texts_len)
+ target_transcripts = self.id2token(texts, texts_len, self.text_feature)
result_transcripts, result_tokenids = self.model.decode(
audio,
audio_len,
- text_feature=text_feature,
+ text_feature=self.text_feature,
decoding_method=cfg.decoding_method,
lang_model_path=cfg.lang_model_path,
beam_alpha=cfg.alpha,
@@ -499,7 +515,7 @@ class U2Tester(U2Trainer):
self.model.eval()
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
- stride_ms = self.test_loader.collate_fn.stride_ms
+ stride_ms = self.config.collator.stride_ms
error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0
@@ -558,8 +574,7 @@ class U2Tester(U2Trainer):
def align(self):
ctc_utils.ctc_align(self.config, self.model, self.align_loader,
self.config.decoding.batch_size,
- self.align_loader.collate_fn.stride_ms,
- self.align_loader.collate_fn.vocab_list,
+ self.config.collator.stride_ms, self.vocab_list,
self.args.result_file)
def load_inferspec(self):
@@ -573,7 +588,7 @@ class U2Tester(U2Trainer):
infer_model = U2InferModel.from_pretrained(self.test_loader,
self.config.model.clone(),
self.args.checkpoint_path)
- feat_dim = self.test_loader.collate_fn.feature_size
+ feat_dim = self.test_loader.feat_dim
input_spec = [
paddle.static.InputSpec(shape=[1, None, feat_dim],
dtype='float32'), # audio, [B,T,D]
diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py
index 0d8508c205f7f06cc5daabd6ff8c3cbf6205864b..d82034c8234df7bb621f2756dd047f40e0475e5c 100644
--- a/paddlespeech/s2t/exps/u2_kaldi/model.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/model.py
@@ -392,6 +392,7 @@ class U2Tester(U2Trainer):
unit_type=self.config.collator.unit_type,
vocab_filepath=self.config.collator.vocab_filepath,
spm_model_prefix=self.config.collator.spm_model_prefix)
+ self.vocab_list = self.text_feature.vocab_list
def id2token(self, texts, texts_len, text_feature):
""" ord() id to chr() chr """
@@ -529,8 +530,7 @@ class U2Tester(U2Trainer):
def align(self):
ctc_utils.ctc_align(self.config, self.model, self.align_loader,
self.config.decoding.batch_size,
- self.align_loader.collate_fn.stride_ms,
- self.align_loader.collate_fn.vocab_list,
+ self.config.collator.stride_ms, self.vocab_list,
self.args.result_file)
def load_inferspec(self):
diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py
index 13dc3a44d49b1e2f98b457086190a09c16f38fd9..65dccad385c07ee59edf585a603ab822d8d38607 100644
--- a/paddlespeech/s2t/frontend/audio.py
+++ b/paddlespeech/s2t/frontend/audio.py
@@ -24,6 +24,8 @@ import soundfile
import soxbindings as sox
from scipy import signal
+from .utility import convert_samples_from_float32
+from .utility import convert_samples_to_float32
from .utility import subfile_from_tar
@@ -689,15 +691,7 @@ class AudioSegment():
Audio sample type is usually integer or float-point.
Integers will be scaled to [-1, 1] in float32.
"""
- float32_samples = samples.astype('float32')
- if samples.dtype in np.sctypes['int']:
- bits = np.iinfo(samples.dtype).bits
- float32_samples *= (1. / 2**(bits - 1))
- elif samples.dtype in np.sctypes['float']:
- pass
- else:
- raise TypeError("Unsupported sample type: %s." % samples.dtype)
- return float32_samples
+ return convert_samples_to_float32(samples)
def _convert_samples_from_float32(self, samples, dtype):
"""Convert sample type from float32 to dtype.
@@ -708,20 +702,4 @@ class AudioSegment():
This is for writing a audio file.
"""
- dtype = np.dtype(dtype)
- output_samples = samples.copy()
- if dtype in np.sctypes['int']:
- bits = np.iinfo(dtype).bits
- output_samples *= (2**(bits - 1) / 1.)
- min_val = np.iinfo(dtype).min
- max_val = np.iinfo(dtype).max
- output_samples[output_samples > max_val] = max_val
- output_samples[output_samples < min_val] = min_val
- elif samples.dtype in np.sctypes['float']:
- min_val = np.finfo(dtype).min
- max_val = np.finfo(dtype).max
- output_samples[output_samples > max_val] = max_val
- output_samples[output_samples < min_val] = min_val
- else:
- raise TypeError("Unsupported sample type: %s." % samples.dtype)
- return output_samples.astype(dtype)
+ return convert_samples_from_float32(samples, dtype)
diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
index 7f3bd9e1253fbcf491743ba912472b22b4d8f0e9..21f512e9b9e2827b6c6e23b53f16badf3ec8c958 100644
--- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
@@ -92,7 +92,9 @@ class TextFeaturizer():
tokens = self.tokenize(text)
ids = []
for token in tokens:
- token = token if token in self.vocab_dict else self.unk
+ if token not in self.vocab_dict:
+ logger.debug(f"Text Token: {token} -> {self.unk}")
+ token = self.unk
ids.append(self.vocab_dict[token])
return ids
diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py
index 089890d2a2b5f2f5e024fcfb29667b23a0da232b..703f2127d7e71b093030658f6d88c45979770c61 100644
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
__all__ = [
"load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
- "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
+ "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
+ "convert_samples_from_float32"
]
IGNORE_ID = -1
@@ -342,3 +343,50 @@ def load_cmvn(cmvn_file: str, filetype: str):
else:
raise ValueError(f"cmvn file type no support: {filetype}")
return cmvn[0], cmvn[1]
+
+
+def convert_samples_to_float32(samples):
+ """Convert sample type to float32.
+
+    Audio sample type is usually integer or floating-point.
+ Integers will be scaled to [-1, 1] in float32.
+
+ PCM16 -> PCM32
+ """
+ float32_samples = samples.astype('float32')
+ if samples.dtype in np.sctypes['int']:
+ bits = np.iinfo(samples.dtype).bits
+ float32_samples *= (1. / 2**(bits - 1))
+ elif samples.dtype in np.sctypes['float']:
+ pass
+ else:
+ raise TypeError("Unsupported sample type: %s." % samples.dtype)
+ return float32_samples
+
+
+def convert_samples_from_float32(samples, dtype):
+ """Convert sample type from float32 to dtype.
+
+    Audio sample type is usually integer or floating-point. For integer
+ type, float32 will be rescaled from [-1, 1] to the maximum range
+ supported by the integer type.
+
+ PCM32 -> PCM16
+ """
+ dtype = np.dtype(dtype)
+ output_samples = samples.copy()
+ if dtype in np.sctypes['int']:
+ bits = np.iinfo(dtype).bits
+ output_samples *= (2**(bits - 1) / 1.)
+ min_val = np.iinfo(dtype).min
+ max_val = np.iinfo(dtype).max
+ output_samples[output_samples > max_val] = max_val
+ output_samples[output_samples < min_val] = min_val
+ elif samples.dtype in np.sctypes['float']:
+ min_val = np.finfo(dtype).min
+ max_val = np.finfo(dtype).max
+ output_samples[output_samples > max_val] = max_val
+ output_samples[output_samples < min_val] = min_val
+ else:
+ raise TypeError("Unsupported sample type: %s." % samples.dtype)
+ return output_samples.astype(dtype)
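The two helpers above centralize the PCM scaling that `AudioSegment` previously did inline. A short usage sketch, assuming a PaddleSpeech checkout that already contains this patch so the new names are importable from `paddlespeech.s2t.frontend.utility`:

```python
# Round-trip a block of int16 PCM through the helpers added above.
import numpy as np

from paddlespeech.s2t.frontend.utility import (convert_samples_from_float32,
                                                convert_samples_to_float32)

pcm16 = np.array([0, 1, -1, 16384, -16384, 32767, -32768], dtype='int16')

# int16 -> float32: every sample is scaled by 1 / 2**15 into [-1, 1)
float32 = convert_samples_to_float32(pcm16)

# float32 -> int16: rescaled by 2**15 and clipped back to the int16 range
restored = convert_samples_from_float32(float32, 'int16')
print(np.array_equal(pcm16, restored))  # True
```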
diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py
index cb7349d00d3c59cf50288c4f4b5511f19bab3aaf..5f2335496c6c1cfddb8a62874d328fd51084aec9 100644
--- a/paddlespeech/s2t/io/collator.py
+++ b/paddlespeech/s2t/io/collator.py
@@ -199,8 +199,8 @@ class SpeechCollatorBase():
for idx, item in enumerate(batch):
utts.append(item['utt'])
- audio = item['feat']
- text = item['text']
+ audio = item['input'][0]['feat']
+ text = item['output'][0]['text']
audio, text = self.process_utterance(audio, text)
audios.append(audio) # [T, D]
@@ -343,9 +343,10 @@ class TripletSpeechCollator(SpeechCollator):
for idx, item in enumerate(batch):
utts.append(item['utt'])
- audio = item['feat']
- translation = item['text']
- transcription = item['text1']
+ audio = item['input'][0]['feat']
+ translation = item['output'][0]['text']
+ transcription = item['output'][1]['text']
+
audio, translation, transcription = self.process_utterance(
audio, translation, transcription)
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index c503107a01534bf5b1c05b5b6ab51aae418c6217..61eeb00f1c41b586762035fe1a097b2197a74008 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -103,7 +103,7 @@ class ManifestDataset(Dataset):
min_output_len=min_output_len,
max_output_input_ratio=max_output_input_ratio,
min_output_input_ratio=min_output_input_ratio)
- self._manifest.sort(key=lambda x: x["feat_shape"][0])
+ self._manifest.sort(key=lambda x: x["input"][0]["shape"][0])
def __len__(self):
return len(self._manifest)
@@ -188,34 +188,16 @@ class AudioDataset(Dataset):
if sort:
data = sorted(data, key=lambda x: x["feat_shape"][0])
if raw_wav:
- assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark',
- '.scp')
- data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms))
+ path_suffix = data[0]['feat'].split(':')[0].splitext()[-1]
+ assert path_suffix not in ('.ark', '.scp')
+ # m second to n frame
+ data = list(
+ map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms),
+ data))
self.input_dim = data[0]['feat_shape'][1]
self.output_dim = data[0]['token_shape'][1]
- # with open(data_file, 'r') as f:
- # for line in f:
- # arr = line.strip().split('\t')
- # if len(arr) != 7:
- # continue
- # key = arr[0].split(':')[1]
- # tokenid = arr[5].split(':')[1]
- # output_dim = int(arr[6].split(':')[1].split(',')[1])
- # if raw_wav:
- # wav_path = ':'.join(arr[1].split(':')[1:])
- # duration = int(float(arr[2].split(':')[1]) * 1000 / 10)
- # data.append((key, wav_path, duration, tokenid))
- # else:
- # feat_ark = ':'.join(arr[1].split(':')[1:])
- # feat_info = arr[2].split(':')[1].split(',')
- # feat_dim = int(feat_info[1].strip())
- # num_frames = int(feat_info[0].strip())
- # data.append((key, feat_ark, num_frames, tokenid))
- # self.input_dim = feat_dim
- # self.output_dim = output_dim
-
valid_data = []
for i in range(len(data)):
length = data[i]['feat_shape'][0]
@@ -223,17 +205,17 @@ class AudioDataset(Dataset):
# remove too lang or too short utt for both input and output
# to prevent from out of memory
if length > max_length or length < min_length:
- # logging.warn('ignore utterance {} feature {}'.format(
- # data[i][0], length))
pass
elif token_length > token_max_length or token_length < token_min_length:
pass
else:
valid_data.append(data[i])
+ logger.info(f"raw dataset len: {len(data)}")
data = valid_data
+ num_data = len(data)
+ logger.info(f"dataset len after filter: {num_data}")
self.minibatch = []
- num_data = len(data)
# Dynamic batch size
if batch_type == 'dynamic':
assert (max_frames_in_batch > 0)
@@ -258,7 +240,9 @@ class AudioDataset(Dataset):
cur = end
def __len__(self):
+ """number of example(batch)"""
return len(self.minibatch)
def __getitem__(self, idx):
+ """batch example of idx"""
return self.minibatch[idx]
diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py
index e810662df377af2db05e128c1cbda9d2042afa89..38ff1396389f55c5dfcd8f42656483e5981a3e54 100644
--- a/paddlespeech/s2t/io/reader.py
+++ b/paddlespeech/s2t/io/reader.py
@@ -18,8 +18,10 @@ import kaldiio
import numpy as np
import soundfile
-from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation
+from .utility import feat_type
+from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.s2t.utils.log import Log
+# from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation
__all__ = ["LoadInputsAndTargets"]
@@ -322,20 +324,7 @@ class LoadInputsAndTargets():
"Not supported: loader_type={}".format(filetype))
def file_type(self, filepath):
- suffix = filepath.split(":")[0].split('.')[-1].lower()
- if suffix == 'ark':
- return 'mat'
- elif suffix == 'scp':
- return 'scp'
- elif suffix == 'npy':
- return 'npy'
- elif suffix == 'npz':
- return 'npz'
- elif suffix in ['wav', 'flac']:
- # PCM16
- return 'sound'
- else:
- raise ValueError(f"Not support filetype: {suffix}")
+ return feat_type(filepath)
class SoundHDF5File():
diff --git a/paddlespeech/s2t/io/utility.py b/paddlespeech/s2t/io/utility.py
index 392031ba81c5b6e641538891b81b36767805d2ff..1a90e3d0461ba9bb278b1c4be358449c2b2c6191 100644
--- a/paddlespeech/s2t/io/utility.py
+++ b/paddlespeech/s2t/io/utility.py
@@ -17,7 +17,7 @@ import numpy as np
from paddlespeech.s2t.utils.log import Log
-__all__ = ["pad_list", "pad_sequence"]
+__all__ = ["pad_list", "pad_sequence", "feat_type"]
logger = Log(__name__).getlog()
@@ -85,3 +85,20 @@ def pad_sequence(sequences: List[np.ndarray],
out_tensor[:length, i, ...] = tensor
return out_tensor
+
+
+def feat_type(filepath):
+ suffix = filepath.split(":")[0].split('.')[-1].lower()
+ if suffix == 'ark':
+ return 'mat'
+ elif suffix == 'scp':
+ return 'scp'
+ elif suffix == 'npy':
+ return 'npy'
+ elif suffix == 'npz':
+ return 'npz'
+ elif suffix in ['wav', 'flac']:
+ # PCM16
+ return 'sound'
+ else:
+ raise ValueError(f"Not support filetype: {suffix}")
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 9977cecc4bb2ecc337f7126cd7c1a38d95424358..4f833372a9ace05d9228c5ccad537a9e627dae88 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
# cmvn
- if 'cmvn_file' in configs and configs['cmvn_file'] is not None:
+ if 'cmvn_file' in configs and configs['cmvn_file']:
mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type'])
global_cmvn = GlobalCMVN(
@@ -934,8 +934,8 @@ class U2Model(U2DecodeModel):
DeepSpeech2Model: The model built from pretrained result.
"""
with UpdateConfig(config):
- config.input_dim = dataloader.collate_fn.feature_size
- config.output_dim = dataloader.collate_fn.vocab_size
+ config.input_dim = dataloader.feat_dim
+ config.output_dim = dataloader.vocab_size
model = cls.from_config(config)
diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index 80eaf97542b090adec271570ce991e693b61e124..3d5f8cd1d3aaff3841a8b519bb7b3af178c700ef 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py
index 6e97f82458921bf287778b0e95a869ca54950138..67f71b6678e9908613b0fe867a44453fb204297a 100644
--- a/paddlespeech/s2t/modules/cmvn.py
+++ b/paddlespeech/s2t/modules/cmvn.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py
index 7601a5cca607df2820ba45583190516b2c80a48c..7ec92554eec73b8889335b3a16fd1a34692bb021 100644
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index b0ab869a212b2fb3f9b9216e9f41f4b8b07b0fe0..6b4d959123b19cc23cd42bdcf68491ac6e5f61de 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py
index 4d516068239beac686c116846d75a3dbebcd8194..520b18dea17928b6fe95bbda804bd89ef28aa904 100644
--- a/paddlespeech/s2t/modules/decoder_layer.py
+++ b/paddlespeech/s2t/modules/decoder_layer.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py
index 9207658f99bec9d9bcec8425d7f8a74f2bc146f5..5d4e91753b38129a9c2c71d706787af9d14a903d 100644
--- a/paddlespeech/s2t/modules/embedding.py
+++ b/paddlespeech/s2t/modules/embedding.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 0cde5b9f2360f005565d17c68cd7384b74039a2c..5c8ba0810d00db66a3c96238cf5d243802eb9d7b 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py
index 29d5a2d872e61a7f8aec7ace9bde05829043b08c..d39c0695a044cd9cdc5969b547be911565015672 100644
--- a/paddlespeech/s2t/modules/encoder_layer.py
+++ b/paddlespeech/s2t/modules/encoder_layer.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py
index 5750f5a0f79e8b2bce0f89b7f68c4c696d9e85f8..c7d9bd45dd2bf005a575098456c435a173678d26 100644
--- a/paddlespeech/s2t/modules/loss.py
+++ b/paddlespeech/s2t/modules/loss.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py
index 6576cb9221e027feeaee4651b2f73795fa174225..d6b63761b49b530db68a7ff0bb342675124c9fca 100644
--- a/paddlespeech/s2t/modules/mask.py
+++ b/paddlespeech/s2t/modules/mask.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py
index 347264e9d6cd9211f2c54ca556e99ba36cd1ae71..e2619cd49dc15ef7d9ddb1fbbb991f3fe3eb1c35 100644
--- a/paddlespeech/s2t/modules/positionwise_feed_forward.py
+++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py
index 759bd540f5b0149a03ef70ab6353bdcd6817d16d..99a8300f246149e924fe741f53934259d404e4e8 100644
--- a/paddlespeech/s2t/modules/subsampling.py
+++ b/paddlespeech/s2t/modules/subsampling.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/transform/cmvn.py b/paddlespeech/s2t/transform/cmvn.py
index 4d2d2324f660dd676d6774e8aedc37f0e7cb6585..aa1e6b4450f41103c1f0c9f2e723bfcdbe2cde9d 100644
--- a/paddlespeech/s2t/transform/cmvn.py
+++ b/paddlespeech/s2t/transform/cmvn.py
@@ -13,6 +13,7 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
import io
+import json
import h5py
import kaldiio
@@ -157,3 +158,40 @@ class UtteranceCMVN():
x = np.divide(x, std)
return x
+
+
+class GlobalCMVN():
+ "Apply Global CMVN"
+
+ def __init__(self,
+ cmvn_path,
+ norm_means=True,
+ norm_vars=True,
+ std_floor=1.0e-20):
+ self.cmvn_path = cmvn_path
+ self.norm_means = norm_means
+ self.norm_vars = norm_vars
+ self.std_floor = std_floor
+
+ with open(cmvn_path) as f:
+ cmvn_stats = json.load(f)
+ self.count = cmvn_stats['frame_num']
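+ # mean = sum(x)/N, var = sum(x^2)/N - mean^2; std is floored to avoid division by zero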
+ self.mean = np.array(cmvn_stats['mean_stat']) / self.count
+ self.square_sums = np.array(cmvn_stats['var_stat'])
+ self.var = self.square_sums / self.count - self.mean**2
+ self.std = np.maximum(np.sqrt(self.var), self.std_floor)
+
+ def __repr__(self):
+ return f"""{self.__class__.__name__}(
+ cmvn_path={self.cmvn_path},
+ norm_means={self.norm_means},
+ norm_vars={self.norm_vars},)"""
+
+ def __call__(self, x, uttid=None):
+ # x: [Time, Dim]
+ if self.norm_means:
+ x = np.subtract(x, self.mean)
+
+ if self.norm_vars:
+ x = np.divide(x, self.std)
+ return x
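+
+
+# Illustrative usage of GlobalCMVN (the stats path below is an assumption, not a repo file):
+#   cmvn = GlobalCMVN(cmvn_path="data/mean_std.json")
+#   feat = cmvn(feat)  # feat: np.ndarray of shape (Time, Dim)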
diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py
index 153d494bf76b7d1ea99c79caf16af88ddc70638c..873adb0b8ab2e5d67bb434fb6c1ab907114bc35d 100644
--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@@ -16,6 +16,7 @@ import librosa
import numpy
import scipy
import soundfile
+import soxbindings as sox
from paddlespeech.s2t.io.reader import SoundHDF5File
@@ -82,7 +83,6 @@ class SpeedPerturbation():
def __call__(self, x, uttid=None, train=True):
if not train:
return x
-
x = x.astype(numpy.float32)
if self.accept_uttid:
ratio = self.utt2ratio[uttid]
@@ -108,6 +108,110 @@ class SpeedPerturbation():
return y
+class SpeedPerturbationSox():
+ """SpeedPerturbationSox
+
+ The speed perturbation in kaldi uses sox-speed instead of sox-tempo,
+ and sox-speed simply resamples the input,
+ i.e. both pitch and tempo are changed.
+
+ To speed up or slow down the sound of a file,
+ use speed to modify both the pitch and the duration of the file.
+ The default factor is 1.0, which makes no change to the audio.
+ A factor of 2.0 doubles the speed, so the duration is halved and the pitch is one octave higher.
+
+ "Why use speed option instead of tempo -s in SoX for speed perturbation"
+ https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8
+
+ tempo option:
+ sox -t wav input.wav -t wav output.tempo0.9.wav tempo -s 0.9
+
+ speed option:
+ sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9
+
+ If we use the speed option as above, the pitch of the audio is also changed,
+ but the tempo option does not change the pitch.
+ """
+
+ def __init__(
+ self,
+ lower=0.9,
+ upper=1.1,
+ utt2ratio=None,
+ keep_length=True,
+ sr=16000,
+ seed=None, ):
+ self.sr = sr
+ self.keep_length = keep_length
+ self.state = numpy.random.RandomState(seed)
+
+ if utt2ratio is not None:
+ self.utt2ratio = {}
+ # Use the scheduled ratio for each utterance
+ self.utt2ratio_file = utt2ratio
+ self.lower = None
+ self.upper = None
+ self.accept_uttid = True
+
+ with open(utt2ratio, "r") as f:
+ for line in f:
+ utt, ratio = line.rstrip().split(None, 1)
+ ratio = float(ratio)
+ self.utt2ratio[utt] = ratio
+ else:
+ self.utt2ratio = None
+ self.accept_uttid = False
+ # The ratio is chosen randomly at runtime
+ self.lower = lower
+ self.upper = upper
+
+ def __repr__(self):
+ if self.utt2ratio is None:
+ return f"""{self.__class__.__name__}(
+ lower={self.lower},
+ upper={self.upper},
+ keep_length={self.keep_length},
+ sample_rate={self.sr})"""
+
+ else:
+ return f"""{self.__class__.__name__}(
+ utt2ratio={self.utt2ratio_file},
+ sample_rate={self.sr})"""
+
+ def __call__(self, x, uttid=None, train=True):
+ if not train:
+ return x
+
+ x = x.astype(numpy.float32)
+ if self.accept_uttid:
+ ratio = self.utt2ratio[uttid]
+ else:
+ ratio = self.state.uniform(self.lower, self.upper)
+
+ tfm = sox.Transformer()
+ tfm.set_globals(multithread=False)
+ tfm.speed(ratio)
+ y = tfm.build_array(input_array=x, sample_rate_in=self.sr)
+
+ if self.keep_length:
+ diff = abs(len(x) - len(y))
+ if len(y) > len(x):
+ # Truncate the perturbed signal back to the original length
+ y = y[diff // 2:-((diff + 1) // 2)]
+ elif len(y) < len(x):
+ # Assume the time-axis is the first: (Time, Channel)
+ pad_width = [(diff // 2, (diff + 1) // 2)] + [
+ (0, 0) for _ in range(y.ndim - 1)
+ ]
+ y = numpy.pad(
+ y, pad_width=pad_width, constant_values=0, mode="constant")
+
+ if y.ndim == 2 and x.ndim == 1:
+ # (T, C) -> (T)
+ y = y.squeeze(1)
+ return y
+
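+# Illustrative usage of SpeedPerturbationSox (assumes a mono float32 waveform at 16 kHz):
+#   speed_perturb = SpeedPerturbationSox(lower=0.9, upper=1.1, sr=16000)
+#   y = speed_perturb(x, train=True)  # randomly speed-perturbed copy of x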
+
class BandpassPerturbation():
"""BandpassPerturbation
diff --git a/paddlespeech/s2t/transform/spec_augment.py b/paddlespeech/s2t/transform/spec_augment.py
index 83e4e2e7502390dd2610c15923eabe0be694b802..5ce950851a4ee6dbaa2bcbe529cbc89ce714a60b 100644
--- a/paddlespeech/s2t/transform/spec_augment.py
+++ b/paddlespeech/s2t/transform/spec_augment.py
@@ -34,6 +34,9 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"):
:returns numpy.ndarray: time warped spectrogram (time, freq)
"""
window = max_time_warp
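+ # a max_time_warp of 0 disables warping and returns the input unchanged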
+ if window == 0:
+ return x
+
if mode == "PIL":
t = x.shape[0]
if t - window <= window:
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index df3130dadcd68e2416cc860cd41bb0e6c3d2c651..da91ef92174b817bd8778b6a53518eed2b9e6f1b 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -14,6 +14,7 @@
# Modified from espnet(https://github.com/espnet/espnet)
import librosa
import numpy as np
+from python_speech_features import logfbank
def stft(x,
@@ -304,3 +305,94 @@ class IStft():
win_length=self.win_length,
window=self.window,
center=self.center, )
+
+
+class LogMelSpectrogramKaldi():
+ def __init__(
+ self,
+ fs=16000,
+ n_mels=80,
+ n_fft=512, # fft point
+ n_shift=160, # unit:sample, 10ms
+ win_length=400, # unit:sample, 25ms
+ window="povey",
+ fmin=20,
+ fmax=None,
+ eps=1e-10,
+ dither=False):
+ self.fs = fs
+ self.n_mels = n_mels
+ self.n_fft = n_fft
+ if n_shift > win_length:
+ raise ValueError("Stride size must not be greater than "
+ "window size.")
+ self.n_shift = n_shift / fs # unit: second
+ self.win_length = win_length / fs # unit: second
+
+ self.window = window
+ self.fmin = fmin
+ if fmax is None:
+ fmax_ = self.fs / 2
+ elif fmax > int(self.fs / 2):
+ raise ValueError("fmax must not be greater than half of "
+ "sample rate.")
+ else:
+ fmax_ = fmax
+ self.fmax = fmax_
+
+ self.eps = eps
+ self.remove_dc_offset = True
+ self.preemph = 0.97
+ self.dither = dither
+
+ def __repr__(self):
+ return (
+ "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
+ "n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, "
+ "fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither}))".format(
+ name=self.__class__.__name__,
+ fs=self.fs,
+ n_mels=self.n_mels,
+ n_fft=self.n_fft,
+ n_shift=self.n_shift,
+ preemph=self.preemph,
+ win_length=self.win_length,
+ window=self.window,
+ fmin=self.fmin,
+ fmax=self.fmax,
+ eps=self.eps,
+ dither=self.dither, ))
+
+ def __call__(self, x):
+ """
+
+ Args:
+ x (np.ndarray): shape (Ti,)
+
+ Raises:
+ ValueError: not support (Ti, C)
+
+ Returns:
+ np.ndarray: (T, D)
+ """
+ if x.ndim != 1:
+ raise ValueError("Not support x: [Time, Channel]")
+
+ if x.dtype in np.sctypes['float']:
+ # scale float samples (PCM32) to the PCM16 range expected by logfbank
+ bits = np.iinfo(np.int16).bits
+ x = x * 2**(bits - 1)
+
+ # logfbank need PCM16 input
+ y = logfbank(
+ signal=x,
+ samplerate=self.fs,
+ winlen=self.win_length, # unit: second
+ winstep=self.n_shift, # unit: second
+ nfilt=self.n_mels,
+ nfft=self.n_fft,
+ lowfreq=self.fmin,
+ highfreq=self.fmax,
+ dither=self.dither,
+ remove_dc_offset=self.remove_dc_offset,
+ preemph=self.preemph,
+ wintype=self.window)
+ return y
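+
+
+# Illustrative usage of LogMelSpectrogramKaldi (a 16 kHz mono waveform is assumed):
+#   fbank = LogMelSpectrogramKaldi(fs=16000, n_mels=80)
+#   feat = fbank(wav)  # -> np.ndarray of shape (num_frames, 80)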
diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py
index 1aee4b36f79479d5aa60d4d2148bb3431697bcde..381b0cdc9d92c9d583bf357935dcf8ac9759c9aa 100644
--- a/paddlespeech/s2t/transform/transformation.py
+++ b/paddlespeech/s2t/transform/transformation.py
@@ -45,7 +45,8 @@ import_alias = dict(
stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram",
wpe="paddlespeech.s2t.transform.wpe:WPE",
channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector",
-)
+ fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi",
+ cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN")
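+# With these aliases registered, a preprocess config can reference the new transforms by
+# name. The snippet below is illustrative only (keys follow each transform's constructor):
+#   - type: fbank_kaldi
+#     fs: 16000
+#     n_mels: 80
+#   - type: cmvn_json
+#     cmvn_path: data/mean_std.json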
class Transformation():
diff --git a/tests/chains/speedyspeech/prepare.sh b/tests/chains/speedyspeech/prepare.sh
index fb6ef285ca5d2957d54c6225eb92d87188037c5e..1ddcd67760c397cb0d296d4f5010e9fa7610682e 100755
--- a/tests/chains/speedyspeech/prepare.sh
+++ b/tests/chains/speedyspeech/prepare.sh
@@ -32,7 +32,7 @@ trainer_list=$(func_parser_value "${lines[14]}")
# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer']
if [ ${MODE} = "lite_train_infer" ];then
# pretrain lite train data
- wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+ wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
(cd ./pretrain_models && unzip pwg_baker_ckpt_0.4.zip)
# download data
rm -rf ./train_data/mini_BZNSYP
@@ -40,7 +40,7 @@ if [ ${MODE} = "lite_train_infer" ];then
cd ./train_data/ && tar xzf mini_BZNSYP.tar.gz
cd ../
elif [ ${MODE} = "whole_train_infer" ];then
- wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip
+ wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip
wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
(cd ./pretrain_models && unzip speedyspeech_nosil_baker_ckpt_0.5.zip && unzip pwg_baker_ckpt_0.4.zip)
rm -rf ./train_data/processed_BZNSYP
diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py
index 296d272a46525fc3fdbe42b572e12924fb5557a2..e47554dcabbfb596b04e7037c6fb6a151aded2e2 100755
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -33,8 +33,8 @@ add_arg('spectrum_type', str,
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
-add_arg('stride_ms', float, 10.0, "stride length in ms.")
-add_arg('window_ms', float, 20.0, "stride length in ms.")
+add_arg('stride_ms', int, 10, "stride length in ms.")
+add_arg('window_ms', int, 20, "window length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
@@ -61,8 +61,8 @@ def main():
spectrum_type=args.spectrum_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta,
- stride_ms=args.stride_ms,
- window_ms=args.window_ms,
+ stride_ms=float(args.stride_ms),
+ window_ms=float(args.window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=args.sample_rate,
diff --git a/utils/format_data.py b/utils/format_data.py
index 6fe36997a6513121e2878a00306e9d09018af47c..2fa1924a072faa67ad559f68174896846f8cbdf9 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -20,13 +20,13 @@ import json
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
+from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
-add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), mat(ark), scp")
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
@@ -62,27 +62,76 @@ def main():
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
+ # jsonlines like this
+ # {
+ # "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+ # "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
+ # "utt2spk": "111-2222",
+ # "utt": "111-2222-333"
+ # }
count = 0
for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
+ output_json = {
+ "input": [],
+ "output": [],
+ 'utt': line_json['utt'],
+ 'utt2spk': line_json.get('utt2spk', 'global'),
+ }
+
+ # output
line = line_json['text']
- tokens = text_feature.tokenize(line)
- tokenids = text_feature.featurize(line)
- line_json['token'] = tokens
- line_json['token_id'] = tokenids
- line_json['token_shape'] = (len(tokenids), vocab_size)
- feat_shape = line_json['feat_shape']
- assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
- if args.feat_type == 'raw':
- feat_shape.append(feat_dim)
- line_json['filetype'] = 'sound'
- else: # kaldi
- raise NotImplementedError('no support kaldi feat now!')
- fout.write(json.dumps(line_json) + '\n')
+ if isinstance(line, str):
+ # only one target
+ tokens = text_feature.tokenize(line)
+ tokenids = text_feature.featurize(line)
+ output_json['output'].append({
+ 'name': 'target1',
+ 'shape': (len(tokenids), vocab_size),
+ 'text': line,
+ 'token': ' '.join(tokens),
+ 'tokenid': ' '.join(map(str, tokenids)),
+ })
+ else:
+ # isinstance(line, list), multi target in one vocab
+ for i, item in enumerate(line, 1):
+ tokens = text_feature.tokenize(item)
+ tokenids = text_feature.featurize(item)
+ output_json['output'].append({
+ 'name': f'target{i}',
+ 'shape': (len(tokenids), vocab_size),
+ 'text': item,
+ 'token': ' '.join(tokens),
+ 'tokenid': ' '.join(map(str, tokenids)),
+ })
+
+ # input
+ line = line_json['feat']
+ if isinstance(line, str):
+ # only one input
+ feat_shape = line_json['feat_shape']
+ assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
+ filetype = feat_type(line)
+ if filetype == 'sound':
+ feat_shape.append(feat_dim)
+ else: # kaldi
+ raise NotImplementedError('no support kaldi feat now!')
+
+ output_json['input'].append({
+ "name": "input1",
+ "shape": feat_shape,
+ "feat": line,
+ "filetype": filetype,
+ })
+ else:
+ # isinstance(line, list), multi input
+ raise NotImplementedError("not support multi input now!")
+
+ fout.write(json.dumps(output_json) + '\n')
count += 1
- print(f"Examples number: {count}")
+ print(f"{args.manifest_paths} Examples number: {count}")
fout.close()
diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py
index 79b3d2cb2dd8b4cb2edea6f055b109d35dc0cae7..e0b5ece37353dc9cc592d440ce11ba486569bfaf 100755
--- a/utils/format_triplet_data.py
+++ b/utils/format_triplet_data.py
@@ -20,13 +20,13 @@ import json
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
+from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
-add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi")
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
@@ -79,9 +79,11 @@ def main():
line_json['token1'] = tokens
line_json['token_id1'] = tokenids
line_json['token_shape1'] = (len(tokenids), vocab_size)
+
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
- if args.feat_type == 'raw':
+ filetype = feat_type(line_json['feat'])
+ if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('no support kaldi feat now!')
diff --git a/utils/pack_model.sh b/utils/pack_model.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8acd59a640bb6fe10e22a38fc1c2f6d6c71ca46a
--- /dev/null
+++ b/utils/pack_model.sh
@@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+
+# Copyright 2019 Johns Hopkins University (Shinji Watanabe)
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+[ -f ./path.sh ] && . ./path.sh
+
+results=""
+# e.g., "exp/tr_it_pytorch_train/decode_dt_it_decode/result.wrd.txt
+# exp/tr_it_pytorch_train/decode_et_it_decode/result.wrd.txt"'
+lm=""
+dict=""
+etc=""
+outfile="model"
+preprocess_conf=""
+
+help_message=$(cat <<EOF
+Usage: $0 --lm <lm> --dict <dict> <tr_conf> <dec_conf> <cmvn> <e2e>, for example:
+<lm>: exp/train_rnnlm/rnnlm.model.best
+<dict>: data/lang_char
+<tr_conf>: conf/train.yaml
+<dec_conf>: conf/decode.yaml
+<cmvn>: data/tr_it/cmvn.ark
+<e2e>: exp/tr_it_pytorch_train/results/model.last10.avg.best
+EOF
+)
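+
+# Typical invocation (paths are illustrative, taken from the help message above):
+#   utils/pack_model.sh --lm exp/train_rnnlm/rnnlm.model.best --dict data/lang_char \
+#     conf/train.yaml conf/decode.yaml data/tr_it/cmvn.ark exp/tr_it_pytorch_train/results/model.last10.avg.best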
+
+. utils/parse_options.sh
+
+if [ $# != 4 ]; then
+ echo "${help_message}"
+ exit 1
+fi
+
+tr_conf=$1
+dec_conf=$2
+cmvn=$3
+e2e=$4
+
+echo " - Model files (archived to ${outfile}.tar.gz by \`\$ pack_model.sh\`)"
+echo " - model link: (put the model link manually.)"
+
+# configs
+if [ -e ${tr_conf} ]; then
+ tar cfh ${outfile}.tar ${tr_conf}
+ echo -n " - training config file: \`"
+ echo ${tr_conf} | sed -e "s/$/\`/"
+else
+ echo "missing ${tr_conf}"
+ exit 1
+fi
+if [ -e ${dec_conf} ]; then
+ tar rfh ${outfile}.tar ${dec_conf}
+ echo -n " - decoding config file: \`"
+ echo ${dec_conf} | sed -e "s/$/\`/"
+else
+ echo "missing ${dec_conf}"
+ exit 1
+fi
+# NOTE(kan-bayashi): preprocess conf is optional
+if [ -n "${preprocess_conf}" ]; then
+ tar rfh ${outfile}.tar ${preprocess_conf}
+ echo -n " - preprocess config file: \`"
+ echo ${preprocess_conf} | sed -e "s/$/\`/"
+fi
+
+# cmvn
+if [ -e ${cmvn} ]; then
+ tar rfh ${outfile}.tar ${cmvn}
+ echo -n " - cmvn file: \`"
+ echo ${cmvn} | sed -e "s/$/\`/"
+else
+ echo "missing ${cmvn}"
+ exit 1
+fi
+
+# e2e
+if [ -e ${e2e} ]; then
+ tar rfh ${outfile}.tar ${e2e}
+ echo -n " - e2e file: \`"
+ echo ${e2e} | sed -e "s/$/\`/"
+
+ e2e_conf=$(dirname ${e2e})/model.json
+ if [ ! -e ${e2e_conf} ]; then
+ echo missing ${e2e_conf}
+ #exit 1
+ else
+ echo -n " - e2e JSON file: \`"
+ echo ${e2e_conf} | sed -e "s/$/\`/"
+ tar rfh ${outfile}.tar ${e2e_conf}
+ fi
+else
+ echo "missing ${e2e}"
+ exit 1
+fi
+
+# lm
+if [ -n "${lm}" ]; then
+ if [ -e ${lm} ]; then
+ tar rfh ${outfile}.tar ${lm}
+ echo -n " - lm file: \`"
+ echo ${lm} | sed -e "s/$/\`/"
+
+ lm_conf=$(dirname ${lm})/model.json
+ if [ ! -e ${lm_conf} ]; then
+ echo missing ${lm_conf}
+ exit 1
+ else
+ echo -n " - lm JSON file: \`"
+ echo ${lm_conf} | sed -e "s/$/\`/"
+ tar rfh ${outfile}.tar ${lm_conf}
+ fi
+ else
+ echo "missing ${lm}"
+ exit 1
+ fi
+fi
+
+# dict
+if [ -n "${dict}" ]; then
+ if [ -e ${dict} ]; then
+ tar rfh ${outfile}.tar ${dict}
+ echo -n " - dict file: \`"
+ echo ${dict} | sed -e "s/$/\`/"
+ else
+ echo "missing ${dict}"
+ exit 1
+ fi
+fi
+
+# etc
+for x in ${etc}; do
+ if [ -e ${x} ]; then
+ tar rfh ${outfile}.tar ${x}
+ echo -n " - etc file: \`"
+ echo ${x} | sed -e "s/$/\`/"
+ else
+ echo "missing ${x}"
+ exit 1
+ fi
+done
+
+# finally compress the tar file
+gzip -f ${outfile}.tar
+
+# results
+if [ -n "${results}" ]; then
+ echo " - Results (paste them by yourself or obtained by \`\$ pack_model.sh --results \`)"
+ echo "\`\`\`"
+fi
+for x in ${results}; do
+ if [ -e ${x} ]; then
+ echo "${x}"
+ grep -e Avg -e SPKR -m 2 ${x}
+ else
+ echo "missing ${x}"
+ exit 1
+ fi
+done
+if [ -n "${results}" ]; then
+ echo "\`\`\`"
+fi
+
+exit 0
diff --git a/utils/remove_longshortdata.py b/utils/remove_longshortdata.py
new file mode 100755
index 0000000000000000000000000000000000000000..131b4a5828bee7dc3e2520ed1694e80230c57f56
--- /dev/null
+++ b/utils/remove_longshortdata.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""remove longshort data from manifest"""
+import argparse
+import logging
+
+import jsonlines
+
+from paddlespeech.s2t.utils.cli_utils import get_commandline_args
+
+# manifest after format
+# jsonlines like this
+# {
+# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
+# "utt2spk": "111-2222",
+# "utt": "111-2222-333"
+# }
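+# Typical invocation (paths are illustrative):
+#   python3 utils/remove_longshortdata.py --maxframes 2000 --maxchars 200 \
+#       data/manifest.train.fmt data/manifest.train.filtered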
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ description="remove longshort data from format manifest",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+ parser.add_argument(
+ "--verbose", "-V", default=0, type=int, help="Verbose option")
+ parser.add_argument(
+ "--iaxis",
+ default=0,
+ type=int,
+ help="multi inputs index, 0 is the first")
+ parser.add_argument(
+ "--oaxis",
+ default=0,
+ type=int,
+ help="multi outputs index, 0 is the first")
+ parser.add_argument("--maxframes", default=2000, type=int, help="maxframes")
+ parser.add_argument("--minframes", default=10, type=int, help="minframes")
+ parser.add_argument("--maxchars", default=200, type=int, help="max tokens")
+ parser.add_argument("--minchars", default=0, type=int, help="min tokens")
+ parser.add_argument(
+ "--stride_ms", default=10, type=int, help="stride in ms unit.")
+ parser.add_argument(
+ "rspecifier",
+ type=str,
+ help="jsonl format manifest. e.g. manifest.jsonl")
+ parser.add_argument(
+ "wspecifier_or_wxfilename",
+ type=str,
+ help="Write specifier. e.g. manifest.jsonl")
+ return parser
+
+
+def filter_input(args, line):
+ tmp = line['input'][args.iaxis]
+ if args.sound:
+ # second to frame
+ nframe = tmp['shape'][0] * 1000 / args.stride_ms
+ else:
+ nframe = tmp['shape'][0]
+
+ if nframe < args.minframes or nframe > args.maxframes:
+ return True
+ else:
+ return False
+
+
+def filter_output(args, line):
+ nchars = len(line['output'][args.oaxis]['text'])
+ if nchars < args.minchars or nchars > args.maxchars:
+ return True
+ else:
+ return False
+
+
+def main():
+ args = get_parser().parse_args()
+
+ logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+ if args.verbose > 0:
+ logging.basicConfig(level=logging.INFO, format=logfmt)
+ else:
+ logging.basicConfig(level=logging.WARN, format=logfmt)
+ logging.info(get_commandline_args())
+
+ with jsonlines.open(args.rspecifier, 'r') as reader:
+ lines = list(reader)
+ logging.info(f"Example: {len(lines)}")
+ feat = lines[0]['input'][args.iaxis]['feat']
+ args.soud = False
+ if feat.split('.')[-1] not in 'ark, scp':
+ args.sound = True
+
+ count = 0
+ filtered = 0
+ with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
+ for line in lines:
+ if filter_input(args, line) or filter_output(args, line):
+ filtered += 1
+ continue
+ writer.write(line)
+ count += 1
+ logging.info(f"Examples after filter: kept {count}, filtered {filtered}")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/utils/show_results.sh b/utils/show_results.sh
new file mode 100755
index 0000000000000000000000000000000000000000..42f80ee6300b26cf6116ff6017cd3de65777d115
--- /dev/null
+++ b/utils/show_results.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+mindepth=0
+maxdepth=1
+
+. utils/parse_options.sh
+
+if [ $# -gt 1 ]; then
+ echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2
+ echo ""
+ echo "Show the system environments and the evaluation results in Markdown format."
+ echo 'The default of is "exp/".'
+ exit 1
+fi
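+
+# Typical invocation (illustrative): utils/show_results.sh exp > RESULTS.md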
+
+[ -f ./path.sh ] && . ./path.sh
+set -euo pipefail
+if [ $# -eq 1 ]; then
+ exp=$1
+else
+ exp=exp
+fi
+
+
+cat << EOF
+
+# RESULTS
+## Environments
+- date: \`$(LC_ALL=C date)\`
+EOF
+
+python3 << EOF
+import sys, paddle
+pyversion = sys.version.replace('\n', ' ')
+
+print(f"""- python version: \`{pyversion}\`
+- paddle version: \`paddle {paddle.__version__}\`""")
+EOF
+
+cat << EOF
+- Git hash: \`$(git rev-parse HEAD)\`
+ - Commit date: \`$(git log -1 --format='%cd')\`
+
+EOF
+
+while IFS= read -r expdir; do
+ if ls ${expdir}/decode_*/result.txt &> /dev/null; then
+ # 1. Show the result table
+ cat << EOF
+## $(basename ${expdir})
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+EOF
+ grep -e Avg ${expdir}/decode_*/result.txt \
+ | sed -e "s#${expdir}/\([^/]*\)/result.txt:#|\1#g" \
+ | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
+ echo
+
+ # 2. Show the result table for WER
+ if ls ${expdir}/decode_*/result.wrd.txt &> /dev/null; then
+ cat << EOF
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+EOF
+ grep -e Avg ${expdir}/decode_*/result.wrd.txt \
+ | sed -e "s#${expdir}/\([^/]*\)/result.wrd.txt:#|\1#g" \
+ | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
+ echo
+ fi
+ fi
+done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d)